In [3]:
import numpy as np
import pandas as pd
import xarray as xr
import pymongo
import pdb
from datetime import datetime, timedelta
from dateutil.relativedelta import *
from scipy.io import loadmat
import os
import glob
import itertools

In [4]:
class KSToDb(object):
    def __init__(self, jgBool=False):
        self.jgBool = jgBool
        print('init KSToDb. JG bool is {}'.format(self.jgBool))
        self.spaceTimeParams  = ['nResGrid', 'nll', 'sigmaOpt', 'thetaLatOpt', 'thetaLongOpt', 'thetasOpt', 'thetatOpt']
        self.spaceParams = ['aOpt', 'latGrid', 'longGrid', 'nResGrid', 'nll', 'sigmaOpt', 'theta1Opt', 'theta2Opt']
        self.units = {
                    'nResGrid': 'number of profiles used',
                    'nll': 'negative log likilihood',
                    'sigmaOpt': 'Degrees Celsius',
                    'thetaLatOpt': 'Degrees',
                    'thetaLongOpt': 'Degrees',
                    'thetasOpt': '[]'
                    }
    @staticmethod
    def create_collection(dbName, collectionName, init_collection):
        dbUrl = 'mongodb://localhost:27017/'
        client = pymongo.MongoClient(dbUrl)
        db = client[dbName]
        coll = db[collectionName]
        coll = init_collection(coll)
        return coll

    @staticmethod
    def init_grid_collection(coll):
        coll.create_index([('date', pymongo.DESCENDING)])
        coll.create_index([('pres', pymongo.DESCENDING)])
        coll.create_index([('data.lat', pymongo.DESCENDING)])
        coll.create_index([('data.lon', pymongo.ASCENDING)])
        coll.create_index([('trend', pymongo.DESCENDING)])

        return coll

    @staticmethod
    def init_param_collection(coll):
        coll.create_index([('pres', pymongo.DESCENDING)])
        coll.create_index([('data.lon', pymongo.DESCENDING)])
        coll.create_index([('data.lat', pymongo.ASCENDING)])
        coll.create_index([('param', pymongo.DESCENDING),
                           ('trend', pymongo.DESCENDING),
                           ('model', pymongo.DESCENDING),
                           ('modelParam', pymongo.DESCENDING)])

        return coll

    @staticmethod
    def transform_lon(lon):
        '''
        Transforms longitude from absolute to -180 to 180 deg
        '''
        if lon >= 180:
            lon -= 360
        return lon

    def make_doc(self, df, date, trend, presLevel, dataVariable, param, measurement, gridName, units):
        '''
        Takes df and converts it into a document for mongodb
        '''
        doc = {}
        df = df.rename(index=str, columns={dataVariable: 'value'})
        dataDict = df.to_dict(orient='records')
        doc['gridName'] = gridName
        doc['measurement'] = measurement #temperature or psal
        doc['units'] = units # degrees celsius or psu
        doc['param'] = param # anomaly or mean
        doc['data'] = dataDict 
        doc['variable'] = dataVariable # ARGO_TEMPERATURE_ANOMALY or ARGO_TEMPERATURE_MEAN or predGrid
        doc['date'] = date
        doc['pres'] = float(presLevel)
        doc['cellsize'] = 1  #  Degree
        doc['NODATA_value'] = np.NaN
        doc['trend'] = trend
        return doc

    def make_grid_docs(self, files, gridName, trend, param='anomaly', dataVariable='predGrid'):
        docs = []
        for file in files:
            doc = {}
            anomData = loadmat(file)
            fa = file.split('/')[-1].split('_')
            year = fa[-1].replace('.mat', '')
            month = fa[-2]
            year_month = year + month
            date = datetime.strptime(year_month, '%Y%m')
            presLevel = float(fa[-6].replace('at', '').replace('dbar', ''))
            latGrid = anomData['latGrid'].flatten()
            lonGrid = anomData['longGrid'].flatten()
            values = anomData[dataVariable].flatten()
            df = pd.DataFrame()
            df['lat'] = latGrid
            df['lon'] = lonGrid
            df['lon'] = df['lon'].apply(lambda lon: self.transform_lon(lon))
            df['value'] = values
            
            #filter out outliers
            df['value'][df['value'] > 50.0] = np.nan
            df['value'][df['value'] < -5.0] = np.nan
            
            doc = self.make_doc(df, date, trend,
                                presLevel, dataVariable, param,
                                'temperature', gridName, 'Degrees Celcius')
            docs.append(doc)
        return docs

    def make_param_docs(self, grids):
        docs = []
        for pres, measurement, model, trend in grids:
            paramFiles = '{0}{1}{2}*.mat'.format(model, trend, measurement)
            presDir = pres
            if self.jgBool:
                presDir += '-jg'
            path = os.path.join(kuuselaBase, presDir,\
                                    'outliers_removed', trend, 'Results',\
                                    'localMLE' + paramFiles)
            filename = glob.glob(path) # should be one
            print(paramFiles)
            if not filename:
                print('file not found: {}'.format(path))
                continue
            print(filename)
            paramData = loadmat(filename[0])
            if model == 'Space':
                modelParams = self.spaceParams
            elif model == 'SpaceTime':
                modelParams = self.spaceTimeParams
            else:
                raise('modelParams not found')

            for modelParam in modelParams:
                print(pres, measurement, model, trend, modelParam)
                doc = self.make_param_doc(paramData, modelParam,
                                          model, trend, measurement,
                                          pres)
                docs.append(doc)
        return docs

    def make_param_doc(self, paramData, modelParam, model, trend, measurement, pres):
        values = paramData[modelParam].flatten()
        latGrid = paramData['latGrid'].flatten()
        lonGrid = paramData['longGrid'].flatten()
        gridName = 'ks' + model + measurement + trend

        df = pd.DataFrame()
        df['lat'] = latGrid
        df['lon'] = lonGrid
        df['lon'] = df['lon'].apply(lambda lon: self.transform_lon(lon))
        df['value'] = values
        dataDict = df.to_dict(orient='records')
        doc = {}
        if self.jgBool:
            gridName += 'JG'
        doc['gridName'] = gridName
        doc['model'] = model
        doc['param'] = modelParam
        doc['units'] = '' #TODO: figure out what to add for units
        doc['trend'] = trend
        doc['measurement'] = measurement
        doc['data'] = dataDict
        doc['pres'] = float(pres)
        doc['cellsize'] = 1  #  Degree
        doc['NODATA_value'] = np.NaN
        return doc

In [5]:
kuuselaBase = os.path.join('/home','tyler','Kuusela-Stein', 'Data', 'Data')
pressures = [str(10.0)]
measurements = ['Temp']
trends = ['Trend', 'NoTrend', 'Trend2']
models = ['Space', 'SpaceTime']

In [4]:
ksToDb = KSToDb()

init KSToDb. JG bool is False


# Make param collection

In [21]:
allIters = [pressures, measurements, models, trends]
grids = list(itertools.product(*allIters))
docs = ksToDb.make_param_docs(grids)
collName = 'ksTParams'
coll = ksToDb.create_collection('argo', collName, init_param_collection)
coll.drop()
coll.insert_many(docs)

SpaceTrendTemp*.mat
['/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/Trend/Results/localMLESpaceTrendTemp_at10.0dbar_5_20_02_2007_2018.mat']
10.0 Temp Space Trend aOpt
10.0 Temp Space Trend latGrid
10.0 Temp Space Trend longGrid
10.0 Temp Space Trend nResGrid
10.0 Temp Space Trend nll
10.0 Temp Space Trend sigmaOpt
10.0 Temp Space Trend theta1Opt
10.0 Temp Space Trend theta2Opt
SpaceNoTrendTemp*.mat
['/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/NoTrend/Results/localMLESpaceNoTrendTemp_at10.0dbar_5_20_02_2007_2018.mat']
10.0 Temp Space NoTrend aOpt
10.0 Temp Space NoTrend latGrid
10.0 Temp Space NoTrend longGrid
10.0 Temp Space NoTrend nResGrid
10.0 Temp Space NoTrend nll
10.0 Temp Space NoTrend sigmaOpt
10.0 Temp Space NoTrend theta1Opt
10.0 Temp Space NoTrend theta2Opt
SpaceTrend2Temp*.mat
['/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/Trend2/Results/localMLESpaceTrend2Temp_at10.0dbar_5_20_02_2007_2018.mat']
10.0 Temp Space Trend2 aOpt
10.0 Temp S

<pymongo.results.InsertManyResult at 0x7fc7258c8a08>

In [22]:
jgBool = True
ksToDb = KSToDb(jgBool)
jgDocs = ksToDb.make_param_docs(grids)
coll.insert_many(jgDocs)

SpaceTrendTemp*.mat
['/home/tyler/Kuusela-Stein/Data/Data/10.0-jg/outliers_removed/Trend/Results/localMLESpaceTrendTemp_at10.0dbar_5_20_02_2007_2018.mat']
10.0 Temp Space Trend aOpt
10.0 Temp Space Trend latGrid
10.0 Temp Space Trend longGrid
10.0 Temp Space Trend nResGrid
10.0 Temp Space Trend nll
10.0 Temp Space Trend sigmaOpt
10.0 Temp Space Trend theta1Opt
10.0 Temp Space Trend theta2Opt
SpaceNoTrendTemp*.mat
file not found: /home/tyler/Kuusela-Stein/Data/Data/10.0-jg/outliers_removed/NoTrend/Results/localMLESpaceNoTrendTemp*.mat
SpaceTrend2Temp*.mat
file not found: /home/tyler/Kuusela-Stein/Data/Data/10.0-jg/outliers_removed/Trend2/Results/localMLESpaceTrend2Temp*.mat
SpaceTimeTrendTemp*.mat
file not found: /home/tyler/Kuusela-Stein/Data/Data/10.0-jg/outliers_removed/Trend/Results/localMLESpaceTimeTrendTemp*.mat
SpaceTimeNoTrendTemp*.mat
file not found: /home/tyler/Kuusela-Stein/Data/Data/10.0-jg/outliers_removed/NoTrend/Results/localMLESpaceTimeNoTrendTemp*.mat
SpaceTimeTrend2Tem

<pymongo.results.InsertManyResult at 0x7fc725880548>

# Make anomaly collection

In [6]:
class KSGridToDb(KSToDb):
    def __init__(self, jgBool=False):
        self.jgBool = jgBool
        self.dropCollection = not self.jgBool
        KSToDb.__init__(self, self.jgBool)
        self.testDoc=None
        print('init KSGridToDb. JG bool is {}'.format(self.jgBool))
        
    def make_path_collection(self, pres, measurement, model, trend, param):
        modelLabel = model.strip('localMLE')
        presLabel = '_at{}dbar'.format(pres)
        files = '{0}*{1}{2}{3}{4}*.mat'.format(param, modelLabel, trend, measurement, presLabel)
        if self.jgBool:
            pres += '-jg'
        path = os.path.join(kuuselaBase, pres,\
                            'outliers_removed', trend, 'Results',\
                            files)
        print(path)
        return modelLabel, path

    def make_grid_collection(self, collName, dropCollection=True):
        coll = self.create_collection('argo', collName, self.init_grid_collection)
        if dropCollection:
            coll.drop()
        return coll

    def add_doc_for_test(self, collName, doc, dbName='argo-express-test'):
        testColl = self.create_collection('argo-express-test', collName, self.init_grid_collection)
        testColl.drop()
        try:
            testColl.insert_one(doc)  
        except Exception as err:
            pdb.set_trace()
            print(err)
    
    def insert_many_grid_docs(self, coll, pres, measurement, model, trend, param, dataVariable):
        modelLabel, path = self.make_path_collection(pres, measurement, model, trend, param)
        anomMats = glob.glob(path)
        gridName = 'ks' + model + measurement + trend
        if self.jgBool:
            gridName += 'JG'
        print(gridName)
        if not anomMats:
            print('file not found: {}'.format(path))
            return
        for fileChunk in np.array_split(anomMats, 3):
            docs = self.make_grid_docs(fileChunk, gridName, trend, param, dataVariable)
            coll.insert_many(docs)
        if docs:
            self.testDoc = docs[0]

    def make_anomaly_collection(self):
        allIters = [pressures, measurements, models, trends]
        grids = list(itertools.product(*allIters))
        param = 'anomaly'
        dataVariable = 'predGrid'
        collName = 'ksTempAnom'
        coll = self.make_grid_collection(collName, self.dropCollection)
        for pres, measurement, model, trend in grids:
            self.insert_many_grid_docs(coll, pres, measurement, model, trend, param, dataVariable)
        # make for express testing
        if self.testDoc:
            self.add_doc_for_test(collName, self.testDoc, dbName='argo-express-test')
    
    def make_mean_collection(self):
        param = 'mean'
        dataVariable = 'meanGrid'
        allIters = [pressures, measurements, trends]
        grids = list(itertools.product(*allIters))
        collName = 'ksTempMean'
        dropCollection = not jgBool
        coll = self.make_grid_collection(collName, dropCollection)
        for pres, measurement, trend in grids:
            self.insert_many_grid_docs(coll, pres, measurement, '', trend, param, dataVariable)
        # make for express testing
        if self.testDoc:
            self.add_doc_for_test(collName, self.testDoc, dbName='argo-express-test')

In [7]:
kSGridToDb = KSGridToDb()

init KSToDb. JG bool is False
init KSGridToDb. JG bool is False


In [66]:
kSGridToDb.make_anomaly_collection()

ksTempAnom
/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/Trend/Results/anomalySpaceTrendTemp_at10.0dbar*.mat
ksSpaceTempTrend
num of docs: 144
48
48
48
making test doc
/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/NoTrend/Results/anomalySpaceNoTrendTemp_at10.0dbar*.mat
ksSpaceTempNoTrend
num of docs: 12
4
4
4
making test doc
/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/Trend2/Results/anomalySpaceTrend2Temp_at10.0dbar*.mat
ksSpaceTempTrend2
num of docs: 12
4
4
4
making test doc
/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/Trend/Results/anomalySpaceTimeTrendTemp_at10.0dbar*.mat
ksSpaceTimeTempTrend
num of docs: 12
4
4
4
making test doc
/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/NoTrend/Results/anomalySpaceTimeNoTrendTemp_at10.0dbar*.mat
ksSpaceTimeTempNoTrend
file not found: /home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/NoTrend/Results/anomalySpaceTimeNoTrendTemp_at10.0dbar*.mat
/home/tyler/Kuusela-Stein/Data/Da

In [68]:
# special case for JG data
jgBool = True
kSGridToDb = KSGridToDb(jgBool)
kSGridToDb.make_anomaly_collection()

init KSToDb. JG bool is True
init KSGridToDb. JG bool is True
ksTempAnom
> <ipython-input-64-26e855f64d31>(14)make_path_collection()
-> pres += '-jg'
(Pdb) c
/home/tyler/Kuusela-Stein/Data/Data/10.0-jg/outliers_removed/Trend/Results/anomalySpaceTrendTemp_at10.0dbar*.mat
ksSpaceTempTrendJG
num of docs: 12
4
4
4
making test doc
> <ipython-input-64-26e855f64d31>(14)make_path_collection()
-> pres += '-jg'
(Pdb) c
/home/tyler/Kuusela-Stein/Data/Data/10.0-jg/outliers_removed/NoTrend/Results/anomalySpaceNoTrendTemp_at10.0dbar*.mat
ksSpaceTempNoTrendJG
file not found: /home/tyler/Kuusela-Stein/Data/Data/10.0-jg/outliers_removed/NoTrend/Results/anomalySpaceNoTrendTemp_at10.0dbar*.mat
> <ipython-input-64-26e855f64d31>(14)make_path_collection()
-> pres += '-jg'
(Pdb) c
/home/tyler/Kuusela-Stein/Data/Data/10.0-jg/outliers_removed/Trend2/Results/anomalySpaceTrend2Temp_at10.0dbar*.mat
ksSpaceTempTrend2JG
file not found: /home/tyler/Kuusela-Stein/Data/Data/10.0-jg/outliers_removed/Trend2/Results/anom

# Make mean collection

In [8]:
jgBool = False
ksGridToDb = KSGridToDb(jgBool)
ksGridToDb.make_mean_collection()

init KSToDb. JG bool is False
init KSGridToDb. JG bool is False
/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/Trend/Results/mean*TrendTemp_at10.0dbar*.mat
ksTempTrend
/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/NoTrend/Results/mean*NoTrendTemp_at10.0dbar*.mat
ksTempNoTrend
/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/Trend2/Results/mean*Trend2Temp_at10.0dbar*.mat
ksTempTrend2


In [33]:
jgBool = True
ksGridToDb = KSGridToDb(jgBool)
ksGridToDb.make_mean_collection()

init KSToDb. JG bool is True
init KSGridToDb. JG bool is True
/home/tyler/Kuusela-Stein/Data/Data/10.0-jg/outliers_removed/Trend/Results/mean*TrendTemp_at10.0dbar*.mat
ksTempTrendJG
/home/tyler/Kuusela-Stein/Data/Data/10.0-jg/outliers_removed/NoTrend/Results/mean*NoTrendTemp_at10.0dbar*.mat
ksTempNoTrendJG
file not found: /home/tyler/Kuusela-Stein/Data/Data/10.0-jg/outliers_removed/NoTrend/Results/mean*NoTrendTemp_at10.0dbar*.mat
/home/tyler/Kuusela-Stein/Data/Data/10.0-jg/outliers_removed/Trend2/Results/mean*Trend2Temp_at10.0dbar*.mat
ksTempTrend2JG
file not found: /home/tyler/Kuusela-Stein/Data/Data/10.0-jg/outliers_removed/Trend2/Results/mean*Trend2Temp_at10.0dbar*.mat


# Make total temperature (anomaly + mean)

In [26]:
class KSTotalGrid(KSGridToDb):
    def __init__(self, mfCollName, anomCollName):
        jgBool=False
        self.dumpTH = 12
        KSGridToDb.__init__(self, jgBool)
        print('init KSTotalGrid. mfCollName: {0}, amCollName {1}'.format(mfCollName, anomCollName))
        self.meanColl = self.make_grid_collection(mfCollName, dropCollection=False)
        self.anomColl = self.make_grid_collection(anomCollName, dropCollection=False)
        totalCollName = 'ksTempTotal'
        self.totalColl = self.make_grid_collection(totalCollName, dropCollection=True)
        

    @staticmethod
    def record_to_array(measurements, xLab='value'):
        x = []
        for meas in measurements:
            x.append(meas[xLab])
        return x
    
    @staticmethod
    def replace_record(measurements, repArray, key='value'):
        outMeasurements = []
        for idx, meas in enumerate(measurements):
            meas[key] = repArray[idx]
            outMeasurements.append(meas)
        return outMeasurements
    
    @staticmethod
    def get_trouble_docs(bwe, docs):
        writeErrors = bwe.details['writeErrors']
        troubleDocs = []
        for we in writeErrors:
            troubleDocs.append(we['op'])
        return troubleDocs
    
    def insert_one_doc(self,docs):
        '''
        Use if bulk write error
        '''
        for doc in docs:
            try:
                self.totalColl.insert_one(doc)
            except Exception as err:
                print('doc _id: {} not added'.format(doc['_id']))
                print(err)
                pass
            
        
    def dump_docs(self, docs, end=False):
        '''
        Adds docs to database if th is large, otherwise return docs for 
        another iteration
        '''
        ld = len(docs)
        if ld == 0 and end:
            print('done adding documents')
            return docs
        if end:
            print('at end')
        if len(docs) >= self.dumpTH or end:
            print('adding: {} to database'.format(ld))
            try:
                self.totalColl.insert_many(docs)
            except pymongo.errors.BulkWriteError as bwe:
                troubleDocs = self.get_trouble_docs(bwe, docs)
                self.insert_one_doc(troubleDocs)
            docs = []
        return docs
        
    def get_mean_fields(self):
        cursor = self.meanColl.find()
        docs = []
        for mf in cursor:
            date = mf['date']
            trend = mf['trend']
            pres = mf['pres']
            meanArray = self.record_to_array(mf['data'])
            print('date: {0}, trend: {1}, pres: {2}'.format(date, trend, pres))
            mfDocs = self.get_total_fields(date, trend, pres, meanArray)
            docs += mfDocs
            docs = self.dump_docs(docs, end=False)
        self.dump_docs(docs, end=True)

    
    def get_total_fields(self, date, trend, pres, meanArray):
        cursor = self.anomColl.find({'date': date, 'trend': trend, 'pres': pres})
        docs = []
        for af in cursor:
            doc = af.copy()
            gridName = doc['gridName'] + 'Total'
            doc['_id'] = gridName + '-' + str(date.year) + '-' + str(date.month)+ '-' + str(pres)
            doc['gridName'] = gridName
            doc['variable'] = doc['variable'] + ' + mean'
            doc['param'] = 'total'
            anomArray = self.record_to_array(af['data'])
            totalArray = np.add(anomArray, meanArray)
            doc['data'] = self.replace_record(doc['data'], totalArray)
            docs.append(doc)
        return docs
        

In [27]:
mfCollName = 'ksTempMean'
anomCollName = 'ksTempAnom'

ksTG = KSTotalGrid(mfCollName, anomCollName)

cursor = ksTG.get_mean_fields()

init KSToDb. JG bool is False
init KSGridToDb. JG bool is False
init KSTotalGrid. mfCollName: ksTempMean, amCollName ksTempAnom
date: 2007-11-01 00:00:00, trend: Trend, pres: 10.0
date: 2007-08-01 00:00:00, trend: Trend, pres: 10.0
adding: 6 to database
date: 2007-06-01 00:00:00, trend: Trend, pres: 10.0
date: 2007-04-01 00:00:00, trend: Trend, pres: 10.0
adding: 6 to database
date: 2007-03-01 00:00:00, trend: Trend, pres: 10.0
date: 2007-10-01 00:00:00, trend: Trend, pres: 10.0
adding: 6 to database
date: 2007-09-01 00:00:00, trend: Trend, pres: 10.0
date: 2007-01-01 00:00:00, trend: Trend, pres: 10.0
adding: 6 to database
date: 2007-02-01 00:00:00, trend: Trend, pres: 10.0
date: 2007-07-01 00:00:00, trend: Trend, pres: 10.0
adding: 6 to database
date: 2007-12-01 00:00:00, trend: Trend, pres: 10.0
date: 2007-05-01 00:00:00, trend: Trend, pres: 10.0
adding: 6 to database
date: 2007-06-01 00:00:00, trend: NoTrend, pres: 10.0
date: 2007-03-01 00:00:00, trend: NoTrend, pres: 10.0
date: 20

In [101]:
meanField = cursor.next()

StopIteration: 

In [96]:
meanField.keys()

dict_keys(['_id', 'gridName', 'measurement', 'units', 'param', 'data', 'variable', 'date', 'pres', 'cellsize', 'NODATA_value', 'trend'])