In [47]:
import glob
import logging
import os
import pdb
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymongo
import xarray as xr
from dateutil.relativedelta import *
from scipy.interpolate import griddata
from scipy.io import loadmat

In [48]:
def transform_lon(lon):
    '''
    Map a longitude given on a 0 to 360 deg scale onto the -180 to 180 deg scale.

    Values of 180 or more are shifted down by 360; smaller values are
    returned unchanged.
    '''
    return lon - 360 if lon >= 180 else lon

def adjust_lon(lon, delta, lon_0):
    '''
    Unimplemented placeholder: the body contains only this docstring, so a
    call returns None and none of lon, delta or lon_0 is used.

    NOTE(review): the original description read "BSOSE grid lon should be
    uniform. This ensures g" and stops mid-sentence. Presumably the function
    was meant to snap a longitude onto a uniform grid defined by a spacing
    (delta) and an origin (lon_0) -- confirm before implementing or remove.
    '''

def make_docs(df, date, presLevel, dataVariable, param, measurement, gridName, units, nChunks, sortBy=['lon', 'lat'], sparse=False):
    '''
    Build the MongoDB documents for one gridded field.

    The frame is optionally thinned to its non-zero rows, the data column is
    renamed to 'value', the rows are ordered by sortBy, and the resulting
    {'lat', 'lon', 'value'} records are split into nChunks nearly equal
    pieces (np.array_split). One document is produced per piece.

    Parameters:
        df: DataFrame holding 'lat', 'lon' and dataVariable columns; any
            other column is discarded.
        date: stored unchanged under 'date'.
        presLevel: stored unchanged under 'pres'.
        dataVariable: name of the data column in df; stored under 'variable'.
        param, measurement, gridName, units: stored unchanged under the keys
            of the same name.
        nChunks: number of documents to produce.
        sortBy: column name(s) handed to DataFrame.sort_values.
        sparse: when True, rows whose data value equals 0 are left out.

    Returns:
        list of dict, one per chunk, numbered by the 'chunk' key.
    '''
    if sparse:
        df = df[df[dataVariable] != 0]
    df = df.rename(columns={dataVariable: 'value'})
    df = df.sort_values(by=sortBy) # sort by (lon, lat) or by (lat, _then_ lon).
    df = df[['lat', 'lon', 'value']] # drop other columns if exist
    records = df.to_dict(orient='records')
    dataChunks = np.array_split(np.array(records), nChunks)

    docs = []
    for idx, chunk in enumerate(dataChunks):
        docs.append({
            'gridName': gridName,
            'measurement': measurement, # temperature or psal
            'units': units, # degrees celsius or psu
            'data': chunk.tolist(),
            'variable': dataVariable, # ARGO_TEMPERATURE_ANOMALY or ARGO_TEMPERATURE_MEAN or total
            'date': date,
            'pres': presLevel,
            'param': param, # anomaly, mean, or total
            # Hard-coded. NOTE(review): the daily BSOSE grids in this notebook are
            # spaced about 1/6 deg -- confirm consumers do not rely on this field.
            'cellsize': 1, # Degree
            # np.nan rather than the np.NaN alias, which NumPy 2.0 removed.
            'NODATA_value': np.nan,
            'chunk': idx,
        })
    return docs

def create_grid_df(chunk, latName, longName):
    '''
    Flatten a grid chunk into a DataFrame with standardised column names.

    chunk must expose to_dataframe() (the callers in this notebook pass the
    per-time groups of an xarray object). After the index is reset, the
    latName column becomes 'lat', the longName column becomes 'lon' and a 'Z'
    column, if present, becomes 'pres'. Each longitude is then passed through
    transform_lon.
    '''
    column_map = {latName: 'lat', longName: 'lon', 'Z': 'pres'}
    grid = chunk.to_dataframe().reset_index().rename(columns=column_map)
    grid['lon'] = grid['lon'].apply(transform_lon)
    return grid

def format_date(date):
    '''Reduce date to its year and month: the result is midnight on day 1 of that month.'''
    year_month = f'{date.year}-{date.month}'
    return datetime.strptime(year_month, '%Y-%m')
    
def insert_pres_time_grid(da, coll, param, measurement, gridName, latName, longName, units,
                          dataVariable, maskName, nChunks=5, sortBy=['lon', 'lat'], sparse=False):
    '''
    Insert a field that varies in time and depth into a MongoDB collection.

    da is grouped by its 'time' coordinate. Each time value is read as a
    number of seconds after 2008-01-01 and reduced to year/month with
    format_date. Within a time step the rows are grouped by the 'pres' column
    (the renamed 'Z' coordinate) and only a fixed set of depth levels is
    exported. Each exported level is converted to documents by make_docs and
    written with coll.insert_many.

    Parameters:
        da: object with groupby('time') whose groups expose to_dataframe()
            (an xarray DataArray in this notebook).
        coll: target collection; must provide insert_many.
        param: column that receives NaN wherever the mask is not 1.
        maskName: name of the mask column, or None to skip masking.
        latName, longName: source names of the latitude/longitude columns.
        measurement, gridName, units, dataVariable, nChunks, sortBy, sparse:
            passed through to make_docs.

    The value stored under 'pres' is the positive depth, i.e. the group key
    negated and rounded to 5 decimals, as a plain Python float.
    '''
    # Depth levels (values of the negative 'Z' coordinate) that are exported.
    keepLevels = (-2.1, -12.15, -95, -1800)
    baseDate = datetime.strptime('2008-01-01', '%Y-%m-%d')
    for sec, chunk in da.groupby('time'):
        date = format_date(baseDate + timedelta(seconds=int(sec)))
        print(date)
        df = create_grid_df(chunk, latName, longName)
        if maskName is not None:
            # Assign to the masked rows only. Writing this as a chained
            # `df[param] = ... = np.nan` would blank the whole column first.
            df.loc[df[maskName] != 1, param] = np.nan
        for pdx, presDf in df.groupby('pres'):
            # Tolerant comparison: a float32 coordinate value such as -2.1 is not
            # guaranteed to compare equal to the Python literal -2.1.
            if not any(np.isclose(pdx, level) for level in keepLevels):
                continue
            # Positive, rounded pressure as a built-in float (BSON cannot encode
            # numpy float32 scalars).
            presLevel = round(-1 * float(pdx), 5)
            # No column clean-up is needed here: make_docs keeps only
            # 'lat', 'lon' and the data column.
            docs = make_docs(presDf, date, presLevel, dataVariable, param, measurement, gridName, units, nChunks, sortBy, sparse)
            coll.insert_many(docs)


In [6]:
def create_collection(dbName, collectionName):
    '''
    Return collection collectionName of database dbName on the MongoDB server
    at mongodb://localhost:27017/, after passing it through
    init_profiles_collection to set up its indexes.
    '''
    client = pymongo.MongoClient('mongodb://localhost:27017/')
    return init_profiles_collection(client[dbName][collectionName])

def init_profiles_collection(coll):
    '''
    Create the indexes used for grid documents on coll and return coll.

    Indexes: 'date', 'pres' and 'gridName' individually, the compound
    (date, pres, gridName), 'data.lat' (descending) and 'data.lon'.

    Index creation is best effort: if it fails, the problem is printed and
    logged as a warning, and coll is still returned. KeyboardInterrupt and
    SystemExit are not intercepted.
    '''
    indexSpecs = [
        [('date', pymongo.ASCENDING)],
        [('pres', pymongo.ASCENDING)],
        [('gridName', pymongo.ASCENDING)],
        [('date', pymongo.ASCENDING), ('pres', pymongo.ASCENDING), ('gridName', pymongo.ASCENDING)],
        [('data.lat', pymongo.DESCENDING)],
        [('data.lon', pymongo.ASCENDING)],
    ]
    try:
        for spec in indexSpecs:
            coll.create_index(spec)
    except Exception as err:
        msg = 'not able to get collections or set indexes'
        print(msg)
        # Include the cause so a failed index build can be diagnosed.
        logging.warning('%s: %s', msg, err)
    return coll

In [7]:
# Monthly BSOSE O2 file for 2008-2012 (per its filename). decode_times=False keeps
# the time coordinate as raw numbers; insert_pres_time_grid adds them to a base
# date as seconds.
soFilename = '/storage/bsose/bsose_i105_2008to2012_monthly_O2.nc'
so = xr.open_dataset(soFilename, decode_times=False)

In [8]:
# Settings for ingesting the TRAC03 variable of the opened dataset as the
# 'sose_doxy' grid.
dataVariable = 'TRAC03'
soDataArray = so[dataVariable]
param = 'TRAC03'
maskName = 'hFacC'
measurement = 'doxy'
latName = 'YC'
longName = 'XC'
gridName = 'sose_doxy'
# NOTE(review): this unit string looks truncated (perhaps 'mol O/m^3') -- confirm
# against the units attribute of the source file before changing it.
units = 'mol O/m'
dbName = 'argo'
nChunks = 5

coll = create_collection(dbName, gridName)
coll.drop()  # start from an empty collection
# drop() removes the collection together with the indexes create_collection just
# built, so they are created again before any data is inserted.
coll = init_profiles_collection(coll)
insert_pres_time_grid(soDataArray, coll, param, measurement, gridName, latName, longName, units,
                      dataVariable, maskName, nChunks=nChunks)

# Sea Ice

In [87]:
def format_date(date):
    '''Return a datetime carrying only the year and month of date (day 1, midnight).'''
    stamp = '-'.join([str(date.year), str(date.month)])
    return datetime.strptime(stamp, '%Y-%m')

def insert_time_grid(da, coll, param, measurement, gridName, latName, longName, timeName, units,
                          dataVariable, maskName, nChunks=5, test=False, baseDate='2012-12-01', sortBy=['lon', 'lat'], sparse=True):
    '''
    Insert a surface field, one set of documents per time step, into coll.

    da is grouped by timeName. Each time value is read as a number of seconds
    after baseDate and stored as a 'YYYY-MM-DD' string. Latitudes and
    longitudes are rounded to 3 decimals, the documents are built with
    make_docs (with 'pres' fixed at 0) and written with coll.insert_many.

    Parameters:
        da: object with groupby(timeName) whose groups expose to_dataframe().
        coll: target collection; must provide insert_many.
        timeName: name of the time coordinate to group by.
        baseDate: 'YYYY-MM-DD' origin of the time axis.
        test: when True, stop after the step dated '2013-01-04'.
        maskName: accepted for signature parity with the other insert
            functions; not used here.
        remaining arguments: passed through to make_docs.

    A failed insert does not stop the loop; the date and the error are printed
    and the next time step is processed.
    '''
    presLevel = 0
    baseDate = datetime.strptime(baseDate, '%Y-%m-%d')
    for sec, chunk in da.groupby(timeName):
        date = (baseDate + timedelta(seconds=int(sec))).strftime('%Y-%m-%d')
        print(date)
        df = create_grid_df(chunk, latName, longName)
        df = df.rename(columns={'t': 'time'}) # rename for monthly
        df = df.drop(['time'], axis=1)
        df['lat'] = df['lat'].round(3)
        df['lon'] = df['lon'].round(3)
        docs = make_docs(df, date, presLevel, dataVariable, param, measurement, gridName, units, nChunks, sortBy, sparse)
        try:
            coll.insert_many(docs)
        except Exception as err:
            # Best effort per time step, but report why the step was lost and let
            # KeyboardInterrupt / SystemExit propagate.
            print(f'date {date} was not added: {err}')
        if test and date == '2013-01-04':
            break

def create_grid_coord_collection(da, gridName, coll):
    '''
    Store the grid's distinct coordinates as a single document in coll.

    The unique values of da.XC (passed through transform_lon) and of da.YC are
    rounded to 3 decimals, sorted ascending and written as
    {'gridName': gridName, 'lats': [...], 'lons': [...]} with coll.insert_one.
    '''
    rawLons = np.unique(da.XC.values.round(3)).tolist()
    shiftedLons = np.array([transform_lon(value) for value in rawLons])
    # Shifting can break the ascending order, so the longitudes are re-sorted.
    lonList = np.sort(shiftedLons).astype(np.float64).round(3).tolist()
    # np.unique already returns its values sorted.
    latList = np.unique(da.YC.values.round(3)).astype(np.float64).round(3).tolist()
    coll.insert_one({'gridName': gridName, 'lats': latList, 'lons': lonList})
    
            
def insert_monthly_grid(da, coll, param, measurement, gridName, latName, longName, timeName, units,
                          dataVariable, maskName, nChunks=1, baseDate='2012-01-01', sortBy=['lon', 'lat'], sparse=True):
    '''
    Insert a monthly surface field, one set of documents per month, into coll.

    da is grouped by timeName. Each time value is read as a number of seconds
    after baseDate and reduced to year/month with format_date. Latitudes and
    longitudes are rounded to 3 decimals, the documents are built with
    make_docs (with 'pres' fixed at 0) and written with coll.insert_many.

    Parameters:
        da: object with groupby(timeName) whose groups expose to_dataframe().
        coll: target collection; must provide insert_many.
        timeName: name of the time coordinate to group by.
        baseDate: 'YYYY-MM-DD' origin of the time axis.
        maskName: accepted for signature parity with the other insert
            functions; not used here.
        remaining arguments: passed through to make_docs.

    Every month found in da is inserted. An earlier date filter
    (`not year == 2012 and month == 1 and day <= 5`) bound `not` to the year
    test only and therefore skipped each January outside 2012; it has been
    removed.
    '''
    baseDate = datetime.strptime(baseDate, '%Y-%m-%d')
    presLevel = 0
    for sec, chunk in da.groupby(timeName):
        date = format_date(baseDate + timedelta(seconds=int(sec)))
        print(date)
        df = create_grid_df(chunk, latName, longName)
        # Remove the time column under either of its possible names.
        df = df.drop(columns=list({timeName, 'time'}), errors='ignore')
        df['lat'] = df['lat'].round(3)
        df['lon'] = df['lon'].round(3)
        docs = make_docs(df, date, presLevel, dataVariable, param, measurement, gridName, units, nChunks, sortBy, sparse)
        coll.insert_many(docs)

In [88]:
# Daily BSOSE sea-ice-area file for 2013-2018 (per its filename). Time decoding is
# switched off so the time coordinate stays numeric; coordinate decoding is
# switched off as well.
soFilename = '/storage/bsose/bsose_i133_2013to2018_1dy_SeaIceArea.nc'
so = xr.open_dataset(soFilename, decode_times=False, decode_coords=False)
def correct_longitude(lons, dlon=1/6):
    '''
    Rebuild a longitude axis as an evenly spaced sequence.

    longitude should be linear, but data is not uniformly spaced. Only the
    first element and the length of lons are used; element i of the result is
    lons[0] + i * dlon.

    Parameters:
        lons: sequence of longitudes.
        dlon: spacing of the rebuilt axis in degrees (default 1/6).

    Returns:
        float64 numpy array with len(lons) elements (empty for empty input).
    '''
    count = len(lons)
    if count == 0:
        return np.array([])
    # Convert to a built-in float so the result is float64 regardless of the
    # input dtype.
    lon_0 = float(lons[0])
    return lon_0 + dlon * np.arange(count)
    
# Replace the stored longitudes with an evenly spaced axis before anything is
# derived from them.
so['XC'] = correct_longitude(so.XC.values)

dataVariable = 'SIarea'
soDataArray = so[dataVariable]
param = 'SIarea'
maskName = None
measurement = 'SIarea'
gridName = 'sose_si_area_1_day_sparse'
# for 1 day
latName = 'YC'
longName = 'XC'
timeName = 'time'
units = 'm^2/m^2'
dbName = 'argo'
nChunks = 5


dbUrl = 'mongodb://localhost:27017/'
client = pymongo.MongoClient(dbUrl)
db = client[dbName]
coll = db['grid_coords']
# NOTE(review): this drops the coordinate documents of every grid kept in
# grid_coords, not only the one for gridName -- confirm that is intended.
coll.drop()
create_grid_coord_collection(soDataArray, gridName, coll)

so.info()

coll = create_collection(dbName, gridName)
coll.drop()  # start from an empty collection
# drop() removes the collection together with the indexes create_collection just
# built, so they are created again before any data is inserted.
coll = init_profiles_collection(coll)

In [93]:
# Keyword arguments make the mapping explicit; the data column is passed as
# dataVariable rather than reusing param in that position.
insert_time_grid(soDataArray, coll, param=param,
                 measurement=measurement, gridName=gridName, latName=latName,
                 longName=longName, timeName=timeName, units=units,
                 dataVariable=dataVariable, maskName=maskName, nChunks=nChunks)

2013-01-01
2013-01-02
2013-01-03
2013-01-04
2013-01-05
2013-01-06
2013-01-07
2013-01-08
2013-01-09
2013-01-10
2013-01-11
2013-01-12
2013-01-13
2013-01-14
2013-01-15
2013-01-16
2013-01-17
2013-01-18
2013-01-19
2013-01-20
2013-01-21
2013-01-22
2013-01-23
2013-01-24
2013-01-25
2013-01-26
2013-01-27
2013-01-28
2013-01-29
2013-01-30
2013-01-31
2013-02-01
2013-02-02
2013-02-03
2013-02-04
2013-02-05
2013-02-06
2013-02-07
2013-02-08
2013-02-09
2013-02-10
2013-02-11
2013-02-12
2013-02-13
2013-02-14
2013-02-15
2013-02-16
2013-02-17
2013-02-18
2013-02-19
2013-02-20
2013-02-21
2013-02-22
2013-02-23
2013-02-24
2013-02-25
2013-02-26
2013-02-27
2013-02-28
2013-03-01
2013-03-02
2013-03-03
2013-03-04
2013-03-05
2013-03-06
2013-03-07
2013-03-08
2013-03-09
2013-03-10
2013-03-11
2013-03-12
2013-03-13
2013-03-14
2013-03-15
2013-03-16
2013-03-17
2013-03-18
2013-03-19
2013-03-20
2013-03-21
2013-03-22
2013-03-23
2013-03-24
2013-03-25
2013-03-26
2013-03-27
2013-03-28
2013-03-29
2013-03-30
2013-03-31
2013-04-01

# Monthly 1-degree BSOSE sea-ice-area file for 2013-2018 (per its filename); the
# time coordinate is kept numeric.
soFilename = '/storage/bsose/bsose_i133_2013to2018_1deg_monthly_SeaIceArea.nc'
so = xr.open_dataset(soFilename, decode_times=False)

dataVariable = 'SIarea'
soDataArray = so[dataVariable]
param = 'SIarea'
maskName = None
measurement = 'SIarea'
gridName = 'sose_si_area_monthly'
# for monthly
latName = 'y'
longName = 'x'
timeName = 't'
units = 'm^2/m^2'
dbName = 'argo'
nChunks = 1

so.info()

coll = create_collection(dbName, gridName)
coll.drop()  # start from an empty collection
# drop() removes the collection together with the indexes create_collection just
# built, so they are created again before any data is inserted.
coll = init_profiles_collection(coll)
insert_monthly_grid(soDataArray, coll, param=param,
                    measurement=measurement, gridName=gridName, latName=latName,
                    longName=longName, timeName=timeName, units=units,
                    dataVariable=dataVariable, maskName=maskName, nChunks=nChunks)

# 3-day BSOSE sea-ice-area file for 2008-2012 (per its filename); the time
# coordinate is kept numeric.
soFilename = '/storage/bsose/bsose_i105_2008to2012_3day_SeaIceArea.nc'
so = xr.open_dataset(soFilename, decode_times=False)

dataVariable = 'SIarea'
soDataArray = so[dataVariable]
param = 'SIarea'
maskName = None
measurement = 'SIarea'
gridName = 'sose_si_area_3_day'
# for 3 day
latName = 'YC'
longName = 'XC'
timeName = 'time'
units = 'm^2/m^2'
dbName = 'argo'
nChunks = 5

so.info()

# Distinct spacings between consecutive unique longitudes.
np.unique(np.diff(np.unique(so.XC.values)))

coll = create_collection(dbName, gridName)
coll.drop()  # start from an empty collection
# drop() removes the collection together with the indexes create_collection just
# built, so they are created again before any data is inserted.
coll = init_profiles_collection(coll)
insert_time_grid(soDataArray, coll, param=param,
                 measurement=measurement, gridName=gridName, latName=latName,
                 longName=longName, timeName=timeName, units=units,
                 dataVariable=dataVariable, maskName=maskName, nChunks=nChunks)

# Short test load into the 'argo-express-test' database: test=True makes
# insert_time_grid stop after the step dated 2013-01-04.
coll = create_collection('argo-express-test', gridName)
coll.drop()  # start from an empty collection
# drop() removes the collection together with the indexes create_collection just
# built, so they are created again before any data is inserted.
coll = init_profiles_collection(coll)
insert_time_grid(soDataArray, coll, param=param,
                 measurement=measurement, gridName=gridName, latName=latName,
                 longName=longName, timeName=timeName, units=units,
                 dataVariable=dataVariable, maskName=maskName, nChunks=nChunks, test=True)

In [10]:
# Distinct spacings between consecutive unique XC values; more than one distinct
# value means the longitude axis is not exactly uniform.
np.unique(np.diff(np.unique(so.XC.values)))

array([0.1666565 , 0.16666412, 0.16666603, 0.16666651, 0.16666663,
       0.16666666, 0.16666669, 0.16666675, 0.16666698, 0.16666794,
       0.16667175, 0.16668701], dtype=float32)

In [8]:
# Re-open the target collection; create_collection also runs
# init_profiles_collection on it, which (re)creates its indexes.
coll = create_collection(dbName, gridName)