In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import pymongo
import pdb
from datetime import datetime, timedelta
from dateutil.relativedelta import *
from scipy.io import loadmat
import os
import glob
import itertools

In [2]:
def create_collection(dbName, collectionName, init_collection, dbUrl='mongodb://localhost:27017/'):
    '''
    Opens a MongoDB client and returns an initialised collection handle.

    dbName: name of the database to select on the client
    collectionName: name of the collection to select in that database
    init_collection: callable that receives the collection handle and
        returns the handle to hand back to the caller
    dbUrl: MongoDB connection string; defaults to the local server that
        was previously hard-coded, so existing callers are unaffected

    Returns whatever init_collection returns.
    '''
    client = pymongo.MongoClient(dbUrl)
    coll = client[dbName][collectionName]
    return init_collection(coll)

def init_grid_collection(coll):
    '''
    Creates one single-field index per entry below, in order, and
    returns the same collection handle.
    '''
    singleFieldIndexes = (
        ('date', pymongo.DESCENDING),
        ('pres', pymongo.DESCENDING),
        ('data.lat', pymongo.DESCENDING),
        ('data.lon', pymongo.ASCENDING),
        ('trend', pymongo.DESCENDING),
    )
    for fieldSpec in singleFieldIndexes:
        coll.create_index([fieldSpec])
    return coll

def init_param_collection(coll):
    '''
    Creates three single-field indexes followed by one compound index
    over param/trend/model/modelParam, then returns the same handle.
    '''
    for fieldName, direction in (('pres', pymongo.DESCENDING),
                                 ('data.lon', pymongo.DESCENDING),
                                 ('data.lat', pymongo.ASCENDING)):
        coll.create_index([(fieldName, direction)])

    compoundKeys = [(fieldName, pymongo.DESCENDING)
                    for fieldName in ('param', 'trend', 'model', 'modelParam')]
    coll.create_index(compoundKeys)
    return coll

In [21]:
def transform_lon(lon):
    '''
    Shifts a longitude of 180 or more down by 360 degrees; any smaller
    value is returned unchanged.
    '''
    return lon - 360 if lon >= 180 else lon

def make_doc(df, date, trend, presLevel, dataVariable, param, measurement, gridName, units):
    '''
    Takes df and converts it into a document for mongodb

    df: DataFrame whose rows become the entries of doc['data']; a column
        named dataVariable, if present, is renamed to 'value'
    date, trend, param, measurement, gridName, units: stored as given
    presLevel: stored under 'pres' after conversion with float()
    dataVariable: stored under 'variable'

    Returns the document as a dict.
    '''
    # to_dict(orient='records') does not include the index, so only the
    # column rename affects the stored records.
    records = df.rename(columns={dataVariable: 'value'}).to_dict(orient='records')
    return {
        'gridName': gridName,
        'measurement': measurement,  # temperature or psal
        'units': units,  # degrees celsius or psu
        'param': param,  # anomaly or mean
        'data': records,
        'variable': dataVariable,  # ARGO_TEMPERATURE_ANOMALY or ARGO_TEMPERATURE_MEAN or predGrid
        'date': date,
        'pres': float(presLevel),
        'cellsize': 1,  # Degree
        # np.NaN was removed in NumPy 2.0; np.nan is valid on every version.
        'NODATA_value': np.nan,
        'trend': trend,
    }

def make_grid_docs(files, gridName, trend, param='anomaly', dataVariable='predGrid'):
    '''
    Builds one mongodb document per .mat file.

    files: iterable of paths to .mat files. The file name is split on '_':
        the last token is the year (with the .mat suffix), the one before
        it the month, and the sixth from the end the pressure level,
        written like 'at10.0dbar'.
    gridName, trend, param: passed through to make_doc
    dataVariable: name of the array inside the .mat file holding the values

    Every file must contain 'latGrid', 'longGrid' and dataVariable arrays.
    Returns a list of documents, in the same order as files.
    '''
    docs = []
    for file in files:
        matData = loadmat(file)
        # basename() rather than split('/') so the parsing does not depend
        # on the path separator used by the platform.
        nameTokens = os.path.basename(file).split('_')
        year = nameTokens[-1].replace('.mat', '')
        month = nameTokens[-2]
        date = datetime.strptime(year + month, '%Y%m')
        presLevel = float(nameTokens[-6].replace('at', '').replace('dbar', ''))

        df = pd.DataFrame()
        df['lat'] = matData['latGrid'].flatten()
        df['lon'] = matData['longGrid'].flatten()
        df['lon'] = df['lon'].apply(transform_lon)
        df['value'] = matData[dataVariable].flatten()

        # NOTE(review): measurement and units are hard-coded here, and
        # 'Celcius' is misspelled. The string is kept byte-for-byte because
        # it is written to the database; confirm no consumer matches on it
        # before correcting.
        docs.append(make_doc(df, date, trend, presLevel, dataVariable, param,
                             'temperature', gridName, 'Degrees Celcius'))
    return docs

def make_param_docs(grids):
    '''
    Builds one document per model parameter for every grid combination.

    grids: iterable of (pres, measurement, model, trend) tuples of strings

    For each tuple the matching 'localMLE*.mat' file is looked up under the
    notebook-level kuuselaBase directory; combinations with no file are
    reported and skipped. The parameter names come from the notebook-level
    spaceParams / spaceTimeParams lists, chosen by model.

    Returns a list of documents.
    Raises ValueError if model is neither 'Space' nor 'SpaceTime'.
    '''
    docs = []
    for pres, measurement, model, trend in grids:
        paramFiles = '{0}{1}{2}*.mat'.format(model, trend, measurement)

        path = os.path.join(kuuselaBase, pres,
                            'outliers_removed', trend, 'Results',
                            'localMLE' + paramFiles)
        filename = glob.glob(path)  # should be one
        print(paramFiles)
        if not filename:
            print('file not found: {}'.format(path))
            continue
        print(filename)
        paramData = loadmat(filename[0])
        if model == 'Space':
            modelParams = spaceParams
        elif model == 'SpaceTime':
            modelParams = spaceTimeParams
        else:
            # Raising a bare string is itself a TypeError in Python 3;
            # raise a real exception that names the offending model.
            raise ValueError('modelParams not found for model: {}'.format(model))

        for modelParam in modelParams:
            print(pres, measurement, model, trend, modelParam)
            docs.append(make_param_doc(paramData, modelParam, model, trend, measurement, pres))
    return docs

def make_param_doc(paramData, modelParam, model, trend, measurement, pres):
    '''
    Converts one fitted-parameter grid into a document for mongodb.

    paramData: mapping holding the arrays 'latGrid', 'longGrid' and
        modelParam (as returned by loadmat in make_param_docs)
    modelParam: key of the parameter array to store as the values
    model, trend, measurement: stored as given and combined into gridName
    pres: stored under 'pres' after conversion with float()

    Returns the document as a dict.
    '''
    df = pd.DataFrame()
    df['lat'] = paramData['latGrid'].flatten()
    df['lon'] = paramData['longGrid'].flatten()
    df['lon'] = df['lon'].apply(transform_lon)
    df['value'] = paramData[modelParam].flatten()

    return {
        'gridName': 'ks' + model + measurement + trend,
        'model': model,
        'param': modelParam,
        'units': '',  # TODO: figure out what to add for units
        'trend': trend,
        'measurement': measurement,
        'data': df.to_dict(orient='records'),
        'pres': float(pres),
        'cellsize': 1,  # Degree
        # np.NaN was removed in NumPy 2.0; np.nan is valid on every version.
        'NODATA_value': np.nan,
    }

# Make param collection

In [22]:
# Root directory of the Kuusela-Stein result files. Set the KUUSELA_BASE
# environment variable to run against another location; the default is the
# path that was previously hard-coded.
kuuselaBase = os.environ.get(
    'KUUSELA_BASE',
    os.path.join('/home', 'tyler', 'Kuusela-Stein', 'Data', 'Data'))

# Axes of the grid combinations that the loaders below iterate over.
pressures = [str(10.0)]  # kept as strings: used as directory names and in file names
measurements = ['Temp']
trends = ['Trend', 'NoTrend', 'Trend2']
models = ['Space', 'SpaceTime']

# Names of the arrays stored as parameter documents for each model.
spaceTimeParams  = ['nResGrid', 'nll', 'sigmaOpt', 'thetaLatOpt', 'thetaLongOpt', 'thetasOpt', 'thetatOpt']
# NOTE(review): 'latGrid' and 'longGrid' are listed here, so the coordinate
# grids themselves are stored as parameter documents — confirm this is intended.
spaceParams = ['aOpt', 'latGrid', 'longGrid', 'nResGrid', 'nll', 'sigmaOpt', 'theta1Opt', 'theta2Opt']

# Unit labels per parameter. Not referenced by any function in the cells
# shown here (make_param_doc stores an empty units string).
# NOTE(review): 'likilihood' is misspelled; left unchanged in case the label
# is read elsewhere.
units = {
    'nResGrid': 'number of profiles used',
    'nll': 'negative log likilihood',
    'sigmaOpt': 'Degrees Celsius',
    'thetaLatOpt': 'Degrees',
    'thetaLongOpt': 'Degrees',
    'thetasOpt': '[]'
}

In [23]:
# Build the parameter documents for every pressure/measurement/model/trend
# combination and (re)load them into the 'ksParams' collection of 'argo'.
allIters = [pressures, measurements, models, trends]
grids = list(itertools.product(*allIters))

docs = make_param_docs(grids)
collName = 'ksParams'
coll = create_collection('argo', collName, init_param_collection)
# Dropping a collection also removes its indexes, so the indexes built by
# create_collection() would be lost; rebuild them on the empty collection.
coll.drop()
coll = init_param_collection(coll)
if docs:
    coll.insert_many(docs)
else:
    # insert_many() rejects an empty list; report instead of crashing.
    print('no param docs to insert into {}'.format(collName))

SpaceTrendTemp*.mat
['/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/Trend/Results/localMLESpaceTrendTemp_at10.0dbar_5_20_02_2007_2018.mat']
10.0 Temp Space Trend aOpt
10.0 Temp Space Trend latGrid
10.0 Temp Space Trend longGrid
10.0 Temp Space Trend nResGrid
10.0 Temp Space Trend nll
10.0 Temp Space Trend sigmaOpt
10.0 Temp Space Trend theta1Opt
10.0 Temp Space Trend theta2Opt
SpaceNoTrendTemp*.mat
['/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/NoTrend/Results/localMLESpaceNoTrendTemp_at10.0dbar_5_20_02_2007_2018.mat']
10.0 Temp Space NoTrend aOpt
10.0 Temp Space NoTrend latGrid
10.0 Temp Space NoTrend longGrid
10.0 Temp Space NoTrend nResGrid
10.0 Temp Space NoTrend nll
10.0 Temp Space NoTrend sigmaOpt
10.0 Temp Space NoTrend theta1Opt
10.0 Temp Space NoTrend theta2Opt
SpaceTrend2Temp*.mat
['/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/Trend2/Results/localMLESpaceTrend2Temp_at10.0dbar_5_20_02_2007_2018.mat']
10.0 Temp Space Trend2 aOpt
10.0 Temp S

<pymongo.results.InsertManyResult at 0x7f26c0eff148>

# Make anomaly collection

In [27]:
def make_path_collection(pres, measurement, model, trend, param):
    '''
    Builds the glob pattern for the result files of one grid combination.

    pres: pressure level as a string; used as a directory name and in the
        '_at{pres}dbar' part of the file name
    measurement, trend: name fragments, trend is also a directory name
    model: model name; a leading 'localMLE' is removed if present
    param: file name prefix, e.g. 'anomaly' or 'meanGrid'

    Returns (modelLabel, path), where path is a glob pattern below the
    notebook-level kuuselaBase directory. The pattern is also printed.
    '''
    # str.strip('localMLE') strips any of those *characters* from both ends
    # (e.g. 'Spatial' -> 'Spati'); only the literal prefix must go.
    prefix = 'localMLE'
    modelLabel = model[len(prefix):] if model.startswith(prefix) else model
    presLabel = '_at{}dbar'.format(pres)
    files = '{0}{1}{2}{3}{4}*.mat'.format(param, modelLabel, trend, measurement, presLabel)
    path = os.path.join(kuuselaBase, pres,
                        'outliers_removed', trend, 'Results',
                        files)
    print(path)
    return modelLabel, path

def make_grid_collection(collName):
    '''
    Returns an empty, indexed collection named collName in the 'argo'
    database. Any existing contents are dropped.
    '''
    coll = create_collection('argo', collName, init_grid_collection)
    coll.drop()
    # Dropping a collection also removes its indexes, so the ones built
    # inside create_collection() are gone; rebuild them here.
    coll = init_grid_collection(coll)
    print(collName)
    return coll

def make_anomaly_collection():
    '''
    Loads the anomaly grids for every pressure/measurement/model/trend
    combination into the 'ksTempAnom' collection.

    Files are located with make_path_collection and glob, converted with
    make_grid_docs in up to three batches per combination, and inserted
    batch by batch. Combinations with no files are reported and skipped.

    The last batch that was inserted is also written to a 'ksTempAnom'
    collection in the 'argo-express-test' database; that step is skipped
    when no documents were produced at all.
    '''
    allIters = [pressures, measurements, models, trends]
    grids = list(itertools.product(*allIters))
    param = 'anomaly'
    dataVariable = 'predGrid'
    collName = 'ksTempAnom'
    coll = make_grid_collection(collName)
    lastDocs = []  # most recent non-empty batch, reused for the test database
    for pres, measurement, model, trend in grids:
        _, path = make_path_collection(pres, measurement, model, trend, param)
        anomMats = glob.glob(path)
        gridName = 'ks' + model + measurement + trend
        print(gridName)
        if not anomMats:
            print('file not found: {}'.format(path))
            continue
        print('num of anom mats: {}'.format(len(anomMats)))
        for fileChunk in np.array_split(anomMats, 3):
            # With fewer than three files array_split yields empty chunks,
            # and insert_many() rejects an empty list.
            if len(fileChunk) == 0:
                continue
            docs = make_grid_docs(fileChunk, gridName, trend, param, dataVariable)
            print(len(docs))
            if docs:
                coll.insert_many(docs)
                lastDocs = docs
    # make for express testing
    if not lastDocs:
        print('no documents created; argo-express-test left untouched')
        return
    testColl = create_collection('argo-express-test', collName, init_grid_collection)
    # drop() removes the indexes as well, so rebuild them before inserting.
    testColl.drop()
    testColl = init_grid_collection(testColl)
    testColl.insert_many(lastDocs)
    
# Run the anomaly load. This connects to the MongoDB server configured in
# create_collection() and reads the .mat files found below kuuselaBase.
make_anomaly_collection()

ksTempAnom
/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/Trend/Results/anomalySpaceTrendTemp_at10.0dbar*.mat
ksSpaceTempTrend
num of anom mats: 144
48
48
48
/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/NoTrend/Results/anomalySpaceNoTrendTemp_at10.0dbar*.mat
ksSpaceTempNoTrend
num of anom mats: 12
4
4
4
/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/Trend2/Results/anomalySpaceTrend2Temp_at10.0dbar*.mat
ksSpaceTempTrend2
num of anom mats: 12
4
4
4
/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/Trend/Results/anomalySpaceTimeTrendTemp_at10.0dbar*.mat
ksSpaceTimeTempTrend
num of anom mats: 12
4
4
4
/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/NoTrend/Results/anomalySpaceTimeNoTrendTemp_at10.0dbar*.mat
ksSpaceTimeTempNoTrend
file not found: /home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/NoTrend/Results/anomalySpaceTimeNoTrendTemp_at10.0dbar*.mat
/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/Trend2/Results/anom

# Make mean collection

In [26]:
def make_mean_collection():
    '''
    Loads the mean-field grids for every pressure/measurement/trend
    combination into the 'ksTempMean' collection.

    Files are located with make_path_collection (empty model name,
    'meanGrid' prefix) and glob, converted with make_grid_docs in up to
    three batches per combination, and inserted batch by batch.
    Combinations with no files are reported and skipped.

    The last batch that was inserted is also written to the
    'argo-express-test' database; that step is skipped when no documents
    were produced at all.
    '''
    param = 'mean'
    dataVariable = 'meanGrid'

    allIters = [pressures, measurements, trends]
    grids = list(itertools.product(*allIters))
    collName = 'ksTempMean'
    coll = make_grid_collection(collName)
    lastDocs = []        # most recent non-empty batch, reused for the test database
    lastGridName = None  # grid name that lastDocs belongs to
    for pres, measurement, trend in grids:
        _, path = make_path_collection(pres, measurement, '', trend, param + 'Grid')
        meanMats = glob.glob(path)
        if not meanMats:
            print('file not found: {}'.format(path))
            continue
        print('num of anom mats: {}'.format(len(meanMats)))
        gridName = 'ksMean' + measurement + trend
        print(gridName)
        for fileChunk in np.array_split(meanMats, 3):
            # With fewer than three files array_split yields empty chunks,
            # and insert_many() rejects an empty list.
            if len(fileChunk) == 0:
                continue
            docs = make_grid_docs(fileChunk, gridName, trend, param, dataVariable)
            print(len(docs))
            if docs:
                coll.insert_many(docs)
                lastDocs = docs
                lastGridName = gridName
    # make for express testing
    if not lastDocs:
        print('no documents created; argo-express-test left untouched')
        return
    # NOTE(review): the test collection is named after the last grid
    # (e.g. 'ksMeanTempTrend2'), whereas make_anomaly_collection uses its
    # collName. Kept as-is; confirm which name the express tests expect.
    testColl = create_collection('argo-express-test', lastGridName, init_grid_collection)
    # drop() removes the indexes as well, so rebuild them before inserting.
    testColl.drop()
    testColl = init_grid_collection(testColl)
    testColl.insert_many(lastDocs)
# Run the mean-field load. This connects to the MongoDB server configured in
# create_collection() and reads the .mat files found below kuuselaBase.
make_mean_collection()

ksTempMean
/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/Trend/Results/meanGridTrendTemp_at10.0dbar*.mat
num of anom mats: 12
ksMeanTempTrend
4
4
4
/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/NoTrend/Results/meanGridNoTrendTemp_at10.0dbar*.mat
num of anom mats: 12
ksMeanTempNoTrend
4
4
4
/home/tyler/Kuusela-Stein/Data/Data/10.0/outliers_removed/Trend2/Results/meanGridTrend2Temp_at10.0dbar*.mat
num of anom mats: 12
ksMeanTempTrend2
4
4
4


In [25]:
# Scratch check of str.capitalize(). NOTE(review): `param` is only assigned
# inside functions in the cells shown here, so this cell appears to rely on
# leftover kernel state and would likely fail after Restart & Run All.
param.capitalize()

'Anomaly'