In [20]:
import numpy as np
import pandas as pd
import xarray as xr
import pymongo
import pdb
from datetime import datetime, timedelta
from dateutil.relativedelta import *
from scipy.io import loadmat
import os
import glob
import matplotlib.pyplot as plt
from scipy.interpolate import griddata

In [21]:
def transform_lon(lon):
    '''
    Transforms longitude from absolute to -180 to 180 deg
    '''
    if lon >= 180:
        lon -= 360
    return lon

def make_docs(df, date, presLevel, dataVariable, param, measurement, gridName, units, nChunks, sortBy=['lon', 'lat']):
    '''
    Takes df and converts it into a document for mongodb
    '''
    docs = []
    df = df.rename(index=str, columns={dataVariable: 'value'})
    df = df.sort_values(by=sortBy) # sort by (lon, lat) or by (lat, _then_ lon).
    df = df[['lat', 'lon', 'value']] # drop other columns if exist
    dataDict = df.to_dict(orient='records')
    dataChunks = np.split(np.array(dataDict), nChunks)
    for idx in range(0, nChunks):
        doc = {}
        doc['gridName'] = gridName
        doc['measurement'] = measurement #temperature or psal
        doc['units'] = units # degrees celsius or psu
        doc['data'] = dataChunks[idx].tolist()
        doc['variable'] = dataVariable # ARGO_TEMPERATURE_ANOMALY or ARGO_TEMPERATURE_MEAN or total
        doc['date'] = date
        doc['pres'] = presLevel
        doc['param'] = param # anomaly, mean, or total
        doc['cellsize'] = 1  #  Degree
        doc['NODATA_value'] = np.NaN
        doc['chunk'] = idx
        docs.append(doc)
    return docs

def create_grid_df(chunk, latName, longName):
    df = chunk.to_dataframe()
    df = df.reset_index()
    df = df.rename(columns={latName:'lat', longName: 'lon', 'Z': 'pres'})
    df['lon'] = df['lon'].apply(lambda lon: transform_lon(lon))
    return df

def format_date(date):
    return datetime.strptime(str(date.year)+'-'+str(date.month), '%Y-%m')
    
def insert_pres_time_grid(da, coll, param, measurement, gridName, latName, longName, units,
                          dataVariable, maskName, nChunks=5):
    baseDate = datetime.strptime('2008-01-01', '%Y-%m-%d')
    for sec, chunk in da.groupby('time'):
        date = baseDate + timedelta(seconds=int(sec))
        date = format_date(date)
        print(date)
        df = create_grid_df(chunk, latName, longName)
        df[param] = df[param] = df.loc[(df[maskName] != 1), param] = np.NaN
        for pdx, presDf in df.groupby('pres'):
            if not pdx in [-2.1, -12.15, -95, -1800]:
                continue
            presDf['pres'] = presDf['pres'].apply(lambda x: np.round(-1 * x, 5))
            presDf = presDf.drop(['time', 'pres', 'iter', 'drF', 'Depth', 'rA', 'hFacC'], axis=1)
            docs = make_docs(presDf, date, pdx, dataVariable, param, measurement, gridName, units, nChunks)
            coll.insert_many(docs)


In [22]:
def create_collection(dbName, collectionName):
    dbUrl = 'mongodb://localhost:27017/'
    client = pymongo.MongoClient(dbUrl)
    db = client[dbName]
    coll = db[collectionName]
    coll = init_profiles_collection(coll)
    return coll

def init_profiles_collection(coll):
    try:
        coll.create_index([('date', pymongo.DESCENDING)])
        coll.create_index([('pres', pymongo.DESCENDING)])
        coll.create_index([('data.lat', pymongo.DESCENDING)])
        coll.create_index([('data.lon', pymongo.ASCENDING)])
        
        #may want to store as geojson feature collection one day
        #coll.create_index([('data.geometries', pymongo.GEOSPHERE)])

    except:
        msg = 'not able to get collections or set indexes'
        print(msg)
        logging.warning(msg)
    return coll

In [23]:
soFilename = '/storage/bsose/bsose_i105_2008to2012_monthly_O2.nc'
so = xr.open_dataset(soFilename, decode_times=False)
so.info()

xarray.Dataset {
dimensions:
	XC = 1080 ;
	YC = 294 ;
	Z = 52 ;
	time = 60 ;

variables:
	int64 iter(time) ;
		iter:long_name = model timestep number ;
		iter:standard_name = timestep ;
	int64 time(time) ;
		time:long_name = Time ;
		time:standard_name = time ;
		time:axis = T ;
		time:units = seconds since 2008-01-01 ;
		time:calendar = proleptic_gregorian ;
	float32 XC(XC) ;
		XC:coordinate = YC XC ;
		XC:units = degrees_east ;
		XC:standard_name = longitude ;
		XC:long_name = longitude ;
		XC:axis = X ;
	float32 YC(YC) ;
		YC:coordinate = YC XC ;
		YC:units = degrees_north ;
		YC:standard_name = latitude ;
		YC:long_name = latitude ;
		YC:axis = Y ;
	float32 Z(Z) ;
		Z:units = m ;
		Z:positive = down ;
		Z:standard_name = depth ;
		Z:long_name = vertical coordinate of cell center ;
		Z:axis = Z ;
	float32 Depth(YC, XC) ;
		Depth:coordinate = XC YC ;
		Depth:units = m ;
		Depth:standard_name = ocean_depth ;
		Depth:long_name = ocean depth ;
	float32 rA(YC, XC) ;
		rA:coordinate = YC 

In [24]:
dataVariable='TRAC03'
soDataArray = so[dataVariable]
param='TRAC03'
maskName='hFacC'
measurement='doxy'
latName = 'YC'
longName = 'XC'
gridName='sose_doxy'
units = 'mol O/m'
dbName = 'argo'
nChunks = 5

In [49]:
coll = create_collection(dbName, gridName)
coll.drop()
insert_pres_time_grid(soDataArray, coll, param, measurement, gridName, latName, longName, units,
                          dataVariable, maskName, nChunks=5)

2008-01-01 00:00:00


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


2008-03-01 00:00:00
2008-04-01 00:00:00
2008-05-01 00:00:00
2008-06-01 00:00:00
2008-07-01 00:00:00
2008-07-01 00:00:00
2008-08-01 00:00:00
2008-09-01 00:00:00
2008-10-01 00:00:00
2008-11-01 00:00:00
2008-12-01 00:00:00
2009-01-01 00:00:00
2009-03-01 00:00:00
2009-04-01 00:00:00
2009-05-01 00:00:00
2009-06-01 00:00:00
2009-07-01 00:00:00
2009-08-01 00:00:00
2009-08-01 00:00:00
2009-09-01 00:00:00
2009-10-01 00:00:00
2009-11-01 00:00:00
2009-12-01 00:00:00
2010-01-01 00:00:00
2010-03-01 00:00:00
2010-04-01 00:00:00
2010-05-01 00:00:00
2010-06-01 00:00:00
2010-07-01 00:00:00
2010-08-01 00:00:00
2010-08-01 00:00:00
2010-10-01 00:00:00
2010-10-01 00:00:00
2010-11-01 00:00:00
2010-12-01 00:00:00
2011-01-01 00:00:00
2011-03-01 00:00:00
2011-04-01 00:00:00
2011-05-01 00:00:00
2011-06-01 00:00:00
2011-07-01 00:00:00
2011-08-01 00:00:00
2011-08-01 00:00:00
2011-10-01 00:00:00
2011-10-01 00:00:00
2011-11-01 00:00:00
2011-12-01 00:00:00
2012-01-01 00:00:00
2012-03-01 00:00:00
2012-03-01 00:00:00


# Sea Ice

In [25]:
def format_date(date):
    '''keeps only the year and month of the date object'''
    return datetime.strptime(str(date.year)+'-'+str(date.month), '%Y-%m')

def insert_time_grid(da, coll, param, measurement, gridName, latName, longName, timeName, units,
                          dataVariable, maskName, nChunks=5, test=False):
    baseDate = datetime.strptime('2013-01-01', '%Y-%m-%d')
    presLevel = 0
    for sec, chunk in da.groupby(timeName):
        date = baseDate + timedelta(seconds=int(sec))
        date = datetime.strftime(date, '%Y-%m-%d')
        print(date)
        df = create_grid_df(chunk, latName, longName)
        df = df.rename(columns={'t': 'time'}) # rename for monthly
        df = df.drop(['time'], axis=1)
        df.lat = df.lat.round(3)
        df.lon = df.lon.round(3)
        docs = make_docs(df, date, presLevel, dataVariable, param, measurement, gridName, units, nChunks)
        coll.insert_many(docs)
        if test and date == '2013-01-04':
            break
        
def insert_monthly_grid(da, coll, param, measurement, gridName, latName, longName, timeName, units,
                          dataVariable, maskName, nChunks=1):
    baseDate = datetime.strptime('2013-01-01', '%Y-%m-%d')
    presLevel = 0
    for sec, chunk in da.groupby('t'):
        date = baseDate + timedelta(seconds=int(sec))
        date = format_date(date)
        print(date)
        df = create_grid_df(chunk, latName, longName)
        df = df.rename(columns={'t': 'time'}) # rename for monthly
        df = df.drop(['time'], axis=1)
        df.lat = df.lat.round(3)
        df.lon = df.lon.round(3)
        docs = make_docs(df, date, presLevel, dataVariable, param, measurement, gridName, units, nChunks)
        coll.insert_many(docs)

In [26]:
soFilename = '/storage/bsose/bsose_i133_2013to2018_1deg_monthly_SeaIceArea.nc'
so = xr.open_dataset(soFilename, decode_times=False)
#so.info()

dataVariable='SIarea'
soDataArray = so[dataVariable]
param='SIarea'
maskName=None
measurement='SIarea'
gridName='sose_si_area_monthly'
# for monthly
latName = 'y'
longName = 'x'
timeName = 't'
units = 'm^2/m^2'
dbName = 'argo'
nChunks = 1

In [13]:
coll = create_collection(dbName, gridName)
coll.drop()
insert_monthly_grid(soDataArray, coll, param, 
                      measurement, gridName, latName, longName, timeName, units,
                      param, maskName, nChunks)

2013-01-01 00:00:00
2013-02-01 00:00:00
2013-03-01 00:00:00
2013-04-01 00:00:00
2013-05-01 00:00:00
2013-06-01 00:00:00
2013-07-01 00:00:00
2013-08-01 00:00:00
2013-09-01 00:00:00
2013-10-01 00:00:00
2013-11-01 00:00:00
2013-12-01 00:00:00
2014-01-01 00:00:00
2014-02-01 00:00:00
2014-03-01 00:00:00
2014-04-01 00:00:00
2014-05-01 00:00:00
2014-06-01 00:00:00
2014-07-01 00:00:00
2014-08-01 00:00:00
2014-09-01 00:00:00
2014-10-01 00:00:00
2014-11-01 00:00:00
2014-12-01 00:00:00
2015-01-01 00:00:00
2015-02-01 00:00:00
2015-03-01 00:00:00
2015-04-01 00:00:00
2015-05-01 00:00:00
2015-06-01 00:00:00
2015-07-01 00:00:00
2015-08-01 00:00:00
2015-09-01 00:00:00
2015-10-01 00:00:00
2015-11-01 00:00:00
2015-12-01 00:00:00
2016-01-01 00:00:00
2016-02-01 00:00:00
2016-03-01 00:00:00
2016-04-01 00:00:00
2016-05-01 00:00:00
2016-06-01 00:00:00
2016-07-01 00:00:00
2016-08-01 00:00:00
2016-09-01 00:00:00
2016-10-01 00:00:00
2016-11-01 00:00:00
2016-12-01 00:00:00
2017-01-01 00:00:00
2017-02-01 00:00:00


In [27]:
soFilename = '/storage/bsose/bsose_i105_2008to2012_3day_SeaIceArea.nc'
so = xr.open_dataset(soFilename, decode_times=False)
#so.info()

dataVariable='SIarea'
soDataArray = so[dataVariable]
param='SIarea'
maskName=None
measurement='SIarea'
gridName='sose_si_area_3_day'
# for 3 day
latName = 'YC'
longName = 'XC'
timeName = 'time'
units = 'm^2/m^2'
dbName = 'argo'
nChunks = 5

In [28]:
coll = create_collection(dbName, gridName)
coll.drop()
insert_time_grid(soDataArray, coll, param, 
                      measurement, gridName, latName, longName, timeName, units,
                      param, maskName, nChunks)

2013-01-04
2013-01-07
2013-01-10
2013-01-13
2013-01-16
2013-01-19
2013-01-22
2013-01-25
2013-01-28
2013-01-31
2013-02-03
2013-02-06
2013-02-09
2013-02-12
2013-02-15
2013-02-18
2013-02-21
2013-02-24
2013-02-27
2013-03-02
2013-03-05
2013-03-08
2013-03-11
2013-03-14
2013-03-17
2013-03-20
2013-03-23
2013-03-26
2013-03-29
2013-04-01
2013-04-04
2013-04-07
2013-04-10
2013-04-13
2013-04-16
2013-04-19
2013-04-22
2013-04-25
2013-04-28
2013-05-01
2013-05-04
2013-05-07
2013-05-10
2013-05-13
2013-05-16
2013-05-19
2013-05-22
2013-05-25
2013-05-28
2013-05-31
2013-06-03
2013-06-06
2013-06-09
2013-06-12
2013-06-15
2013-06-18
2013-06-21
2013-06-24
2013-06-27
2013-06-30
2013-07-03
2013-07-06
2013-07-09
2013-07-12
2013-07-15
2013-07-18
2013-07-21
2013-07-24
2013-07-27
2013-07-30
2013-08-02
2013-08-05
2013-08-08
2013-08-11
2013-08-14
2013-08-17
2013-08-20
2013-08-23
2013-08-26
2013-08-29
2013-09-01
2013-09-04
2013-09-07
2013-09-10
2013-09-13
2013-09-16
2013-09-19
2013-09-22
2013-09-25
2013-09-28
2013-10-01

In [19]:
coll = create_collection('argo-express-test', gridName)
coll.drop()
insert_time_grid(soDataArray, coll, param, 
                      measurement, gridName, latName, longName, timeName, units,
                      param, maskName, nChunks, test=True)

2013-01-04


In [None]:
#soFilename = '/storage/bsose/bsose_i105_2008to2012_3day_SeaIceArea.nc'

In [32]:
datetime.strptime('2013-1', '%Y-%m')

datetime.datetime(2013, 1, 1, 0, 0)