In [1]:
import gzip
import logging
import os
import pdb
from datetime import datetime, timedelta
from glob import glob
from importlib import reload

import numpy as np
import pandas as pd
import psycopg2
import pymongo
from scipy.io import savemat, loadmat

import visualizeProfs as vp
from pchipOceanSlices import PchipOceanSlices

%config InlineBackend.figure_format = 'retina'

# Get reject list of profiles

In [3]:
# Kakapo reject-list and white-list files. glob() returns matches in
# arbitrary order, so sort them to make the `[0]` picks further down
# reproducible from run to run.
rList = sorted(glob('/storage/kakapo/*temp*'))
wl = sorted(glob('/storage/kakapo/*.xml'))
# Column names applied to each whitespace-separated row of those files.
header = ['platform_number', 'cycle', 'lat', 'lon', 'c1', 'c2', 'c3', 'c4', 'c5']

def get_lines(filename):
    '''Read a whitespace-delimited text file into a list of token lists.

    Every non-blank line is split on runs of whitespace, which also removes
    the line ending. Blank lines are skipped so they do not become empty
    rows. The filename is printed once the file has been read.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    list of list of str
        One list of tokens per non-blank line, in file order.
    '''
    with open(filename, "r") as f:
        # str.split() with no argument collapses whitespace runs and strips
        # the trailing newline, so no separate newline clean-up is needed.
        lines = [tokens for tokens in (line.split() for line in f) if tokens]
    print(filename)
    return lines

def parse_kakapo(filename, header):
    '''Load a kakapo reject or white list file into a DataFrame.

    The rows come from get_lines(filename) and are labelled with `header`.
    A `profile_id` column of the form "<platform_number>_<cycle>" is added.
    '''
    table = pd.DataFrame(data=get_lines(filename), columns=header)
    platform = table['platform_number'].map(str)
    cycle = table['cycle'].map(str)
    table['profile_id'] = platform + '_' + cycle
    return table

# Parse the first matching reject-list file, then the first white-list file.
rejectPath = rList[0]
rejectDf = parse_kakapo(rejectPath, header)
whitePath = wl[0]
whiteList = parse_kakapo(whitePath, header)

/storage/kakapo/reject_cycle_fromvar_temp_01x01_finl
/storage/kakapo/ar_whitelist_2019.xml


In [4]:
# Shape of the reject list as parsed, followed by its first rows.
print(rejectDf.shape)
rejectDf.head()

(2135, 10)


Unnamed: 0,platform_number,cycle,lat,lon,c1,c2,c3,c4,c5,profile_id
0,6903190,18,-60.917,304.997,0,2.154,0.257,6.0xtimes1975.0,,6903190_18
1,5903417,88,-37.717,332.474,1,0.343,0.051,6.0xtimes1975.0,,5903417_88
2,6901697,81,-37.674,332.432,1,0.551,0.051,6.0xtimes1975.0,,6901697_81
3,3900529,66,-37.284,285.835,1,0.23,0.038,6.0xtimes1975.0,,3900529_66
4,5904120,53,-36.369,330.982,1,0.64,0.048,6.0xtimes1975.0,,5904120_53


In [5]:
# Profile ids that appear on the white list.
ok_ids = whiteList.profile_id.tolist()

In [6]:
# Drop every reject-list row whose profile id is white-listed, then keep the
# surviving ids as a plain list.
isWhitelisted = rejectDf['profile_id'].isin(ok_ids)
rejectDf = rejectDf.loc[~isWhitelisted]
rejectList = rejectDf['profile_id'].tolist()

In [7]:
# Shape and first rows of the reject list at this point in the notebook.
print(rejectDf.shape)
rejectDf.head()

(1113, 10)


Unnamed: 0,platform_number,cycle,lat,lon,c1,c2,c3,c4,c5,profile_id
0,6903190,18,-60.917,304.997,0,2.154,0.257,6.0xtimes1975.0,,6903190_18
1,5903417,88,-37.717,332.474,1,0.343,0.051,6.0xtimes1975.0,,5903417_88
2,6901697,81,-37.674,332.432,1,0.551,0.051,6.0xtimes1975.0,,6901697_81
3,3900529,66,-37.284,285.835,1,0.23,0.038,6.0xtimes1975.0,,3900529_66
4,5904120,53,-36.369,330.982,1,0.64,0.048,6.0xtimes1975.0,,5904120_53


# Init database

In [5]:
def create_collection(dbName, collectionName):
    '''Return a collection handle on the local MongoDB server.

    Connects to mongodb://localhost:27017/, selects `collectionName` inside
    database `dbName`, and passes it through init_profiles_collection
    before returning it.
    '''
    client = pymongo.MongoClient('mongodb://localhost:27017/')
    return init_profiles_collection(client[dbName][collectionName])

def init_profiles_collection(coll):
    '''Create descending single-field indexes on a profiles collection.

    Indexes are requested on `date`, `dac`, `lat` and `lon`, in that order.
    Index creation is best effort: a failure is logged as a warning (with
    the traceback) and the collection is still returned.

    Parameters
    ----------
    coll : object
        Collection handle exposing `create_index`.

    Returns
    -------
    object
        The same `coll` that was passed in.
    '''
    try:
        for field in ('date', 'dac', 'lat', 'lon'):
            coll.create_index([(field, pymongo.DESCENDING)])
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        logging.warning('not able to get collections or set indexes', exc_info=True)
    return coll

# Handle on the `profiles` collection of the `JG` database.
coll = create_collection(dbName='JG', collectionName='profiles')

# Parse data files

In [10]:
# Gzipped data files. glob() gives no ordering guarantee, so sort the paths
# to keep the ingest order the same on every run.
files = sorted(glob('/storage/kakapo/*_padj.dat.gz'))
# Column names for each whitespace-separated row of those files. This
# rebinds the `header` name used earlier for the reject/white lists.
header = ['latitude', 'longitude', 'day', 'month', 'year', 'hr', 'min',
          'sec', 'pressure', 'temperature', 'potential temperature',
          'salinity', 'DMODE status', 'WMO', 'cycle number', 'original netcdf file']

def get_gzip_lines(filename):
    '''Read a gzipped, whitespace-delimited text file into token lists.

    Each line is decoded as UTF-8 and split on runs of whitespace, which
    also removes the line ending. Blank lines are skipped.

    Parameters
    ----------
    filename : str
        Path of the gzip-compressed file to read.

    Returns
    -------
    list of list of str
        One list of tokens per non-blank line, in file order.
    '''
    lines = []
    with gzip.open(filename, "r") as f:
        for raw in f:
            # split() with no argument strips the newline as well, so no
            # extra clean-up of the last token is required.
            tokens = raw.decode('utf-8').split()
            if tokens:
                lines.append(tokens)
    return lines

def make_docs_from_lines(lines, presRange=(0, 2000)):
    '''Group parsed data-file rows into one document per profile.

    Rows are labelled with the module-level `header`, tagged with a
    `profile_id` of the form "<WMO>_<cycle number>", and dropped when that
    id is in the module-level `rejectList`. The remaining rows are grouped
    by `profile_id`.

    Parameters
    ----------
    lines : list of list of str
        Token lists, one per row, in `header` column order.
    presRange : sequence of two numbers or None, optional
        Inclusive (min, max) pressure bounds; rows outside are discarded.
        A falsy value disables the filter.

    Returns
    -------
    list of dict
        One dict per profile with keys `_id`, `lat`, `lon`, `date` and
        `measurements`. `lat`, `lon` and `date` (year, month, day only) come
        from the profile's first row; `measurements` is a list of
        {'pres', 'temp', 'psal'} records.
    '''
    df = pd.DataFrame(data=lines, columns=header)
    df['profile_id'] = df['WMO'].map(str) + '_' + df['cycle number'].map(str)
    df = df[~df['profile_id'].isin(rejectList)]
    df = df.rename({'latitude': 'lat', 'longitude': 'lon', 'pressure': 'pres',
                    'salinity': 'psal', 'temperature': 'temp'}, axis=1)
    floatCols = ['lat', 'lon', 'pres', 'temp', 'psal']
    df[floatCols] = df[floatCols].astype(float)
    if presRange:
        # `pres` is already float here; between() is inclusive on both ends.
        df = df[df['pres'].between(presRange[0], presRange[1])]

    docs = []
    # groupby never yields an empty group, so every profile has measurements.
    for profile_id, profDf in df.groupby('profile_id'):
        dl = profDf[['year', 'month', 'day']].astype(int).iloc[0]
        docs.append({
            '_id': profile_id,
            'lat': profDf.lat.iloc[0],
            'lon': profDf.lon.iloc[0],
            'date': datetime(*dl),
            'measurements': profDf[['pres', 'temp', 'psal']].to_dict(orient='records'),
        })
    return docs
        
        

In [11]:
# Uncomment to drop the collection before ingesting.
#coll.drop()
rejectProfiles = rejectDf.profile_id.tolist()
for fdx, filename in enumerate(files):
    lines = get_gzip_lines(filename)
    documents = make_docs_from_lines(lines)
    # The count printed after "lines:" is the number of profile documents.
    print('index: {0}, filename: {1} lines: {2}'.format(fdx, filename, len(documents)))
    if not documents:
        # insert_many() rejects an empty list, so skip files with no profiles.
        continue
    try:
        coll.insert_many(documents, ordered=False)
    except pymongo.errors.BulkWriteError as bwe:
        # Each write error reports the index of the offending document in
        # `documents`; write those documents again as replace-or-insert.
        for we in bwe.details['writeErrors']:
            doc = documents[we['index']]
            coll.replace_one({'_id': doc['_id']}, doc, upsert=True)

index: 0, filename: /storage/kakapo/200401_padj.dat.gz lines: 1787
index: 1, filename: /storage/kakapo/200402_padj.dat.gz lines: 1707
index: 2, filename: /storage/kakapo/200403_padj.dat.gz lines: 1914
index: 3, filename: /storage/kakapo/200404_padj.dat.gz lines: 1875
index: 4, filename: /storage/kakapo/200405_padj.dat.gz lines: 2062
index: 5, filename: /storage/kakapo/200406_padj.dat.gz lines: 2009
index: 6, filename: /storage/kakapo/200407_padj.dat.gz lines: 2063
index: 7, filename: /storage/kakapo/200408_padj.dat.gz lines: 2179
index: 8, filename: /storage/kakapo/200409_padj.dat.gz lines: 2340
index: 9, filename: /storage/kakapo/200410_padj.dat.gz lines: 2527
index: 10, filename: /storage/kakapo/200411_padj.dat.gz lines: 2489
index: 11, filename: /storage/kakapo/200412_padj.dat.gz lines: 2701
index: 12, filename: /storage/kakapo/200501_padj.dat.gz lines: 2811
index: 13, filename: /storage/kakapo/200502_padj.dat.gz lines: 2660
index: 14, filename: /storage/kakapo/200503_padj.dat.gz li

index: 121, filename: /storage/kakapo/201402_padj.dat.gz lines: 10111
index: 122, filename: /storage/kakapo/201403_padj.dat.gz lines: 11188
index: 123, filename: /storage/kakapo/201404_padj.dat.gz lines: 11076
index: 124, filename: /storage/kakapo/201405_padj.dat.gz lines: 11399
index: 125, filename: /storage/kakapo/201406_padj.dat.gz lines: 11033
index: 126, filename: /storage/kakapo/201407_padj.dat.gz lines: 11431
index: 127, filename: /storage/kakapo/201408_padj.dat.gz lines: 11358
index: 128, filename: /storage/kakapo/201409_padj.dat.gz lines: 10888
index: 129, filename: /storage/kakapo/201410_padj.dat.gz lines: 11121
index: 130, filename: /storage/kakapo/201411_padj.dat.gz lines: 10910
index: 131, filename: /storage/kakapo/201412_padj.dat.gz lines: 11551
index: 132, filename: /storage/kakapo/201501_padj.dat.gz lines: 11853
index: 133, filename: /storage/kakapo/201502_padj.dat.gz lines: 10736
index: 134, filename: /storage/kakapo/201503_padj.dat.gz lines: 11701
index: 135, filename

# Test that rejected profiles are absent from the database

In [30]:
# Number of stored documents whose _id equals the first rejected profile id.
# count_documents() replaces the deprecated Cursor.count().
print(coll.count_documents({'_id': rejectList[0]}))


0


  """Entry point for launching an IPython kernel.


# Query the database and export the results to MAT files

In [11]:
# Year bounds and the dict that is later handed to savemat.
startYear = 2007
endYear = 2016
JGDict = {}

# Per-profile identifiers and timing.
profFloatIDAggr = []
profCycleNumberAggr = []
profJulDayAggr = []
profYearAggr = []
profMonthAggr = []

# Per-profile position.
profLatAggr = []
profLongAggr = []

# Per-profile measurement columns.
profPresAggr = []
profTempAggr = []
profPsalAggr = []

# Mode is kept as a string rather than a list.
profModeAggr = ''

In [12]:
def repeat_array(val, nTimes):
    '''Return a list holding `val` repeated `nTimes` times.'''
    return [val] * nTimes

def records_to_arrays(meas):
    '''Split measurement records into parallel temp, pres and psal lists.

    `meas` is an iterable of mappings with 'temp', 'pres' and 'psal' keys.
    Returns the tuple (temp, pres, psal), each a list in record order.
    '''
    triples = [(row['temp'], row['pres'], row['psal']) for row in meas]
    if not triples:
        return [], [], []
    temp, pres, psal = (list(column) for column in zip(*triples))
    return temp, pres, psal

def make_file_name(prefix='/storage/s/'):
    '''Build a .mat file path of the form
    "<prefix>prof<Obs>_at<presLevel>dbar_<minYear>_<maxYear>.mat".

    Reads `obs`, `presLevel`, `minYear` and `maxYear` from module scope.
    NOTE(review): none of those names is defined in the visible part of
    this notebook — confirm they are set before this is called.
    '''
    parts = [
        prefix + 'prof',
        str(obs).capitalize(),
        '_at{}dbar'.format(str(presLevel)),
        '_{0}_{1}'.format(minYear, maxYear),
        '.mat',
    ]
    return ''.join(parts)

In [13]:
# Count the profiles dated on or before 1 January of endYear. The count is
# queried once and reused for the printout.
ed = datetime(endYear, 1, 1, 0)
dbLen = coll.count_documents({"date": {"$lte": ed} })
print(dbLen)

1107110


In [24]:
from itertools import product
from calendar import monthrange
# Every (year, month) pair from January 2004 through December 2016,
# ordered by year and then month.
years = list(range(2004, 2016+1))
months = list(range(1, 12+1))
allIters = [years, months]
yearMonth = list(product(years, months))

In [28]:
# Scratch check: take the second (year, month) pair and print what
# calendar.monthrange reports for it.
year, month = yearMonth[1]
monthRange = monthrange(year, month)
print(monthRange)

(6, 29)


In [41]:
# NOTE(review): the month argument is hard-coded to 1 while monthRange was
# computed for `month` — confirm whether `month` was intended here.
datetime(year, 1, monthRange[1], 23, 59, 59)

datetime.datetime(2004, 1, 29, 23, 59, 59)

In [44]:
# Scratch check of floor division; the export loop uses `month//12` to roll
# the year over after December.
12//12

1

In [60]:
# Write one MAT file per (year, month) holding every profile dated in that
# month, ordered by date.
for year, month in yearMonth:
    # Half-open window [first of this month, first of next month); for
    # December the end rolls over to January of the following year.
    start = datetime(year, month, 1, 0)
    end = datetime(year + month//12, month%12 + 1, 1, 0, 0)
    ymQuery = {"date": {"$gte": start, "$lt": end}}
    cursor = coll.find(ymQuery).sort('date')
    print('year: {0}, month: {1}, nProf: {2}'.format(year, month, coll.count_documents(ymQuery)))

    # Fresh accumulators for this month.
    profCycleNumberAggr = []
    profFloatIDAggr = []
    profJulDayAggr = []
    profLatAggr = []
    profLongAggr = []
    profModeAggr = ''  # never populated; saved as an empty string
    profMonthAggr = []
    profPresAggr = []
    profPsalAggr = []
    profTempAggr = []
    profYearAggr = []
    for doc in cursor:
        date = doc['date']
        # _id has the form "<platform>_<cycle>".
        platCyc = doc['_id'].split('_')
        platform = platCyc[0]
        cycle = platCyc[1]

        # records_to_arrays appends one value per record to each list, so the
        # three lists always have len(doc['measurements']) entries.
        temp, pres, psal = records_to_arrays(doc['measurements'])

        profCycleNumberAggr.append(cycle)
        profFloatIDAggr.append(platform)
        profJulDayAggr.append(date.strftime("%d-%b-%Y %H:%M:%S"))
        profLatAggr.append(doc['lat'])
        profLongAggr.append(doc['lon'])
        profYearAggr.append(date.year)
        profMonthAggr.append(date.month)
        profPresAggr.append(pres)
        profPsalAggr.append(psal)
        profTempAggr.append(temp)

    # JGDict is reused across months; every key is overwritten each time.
    JGDict['profCycleNumberAggr'] = profCycleNumberAggr
    JGDict['profFloatIDAggr'] = profFloatIDAggr
    JGDict['profJulDayAggr'] = profJulDayAggr
    JGDict['profLatAggr'] = profLatAggr
    JGDict['profLongAggr'] = profLongAggr
    JGDict['profYearAggr'] = profYearAggr
    JGDict['profMonthAggr'] = profMonthAggr
    JGDict['profModeAggr'] = profModeAggr
    JGDict['profPresAggr'] = profPresAggr
    JGDict['profPsalAggr'] = profPsalAggr
    JGDict['profTempAggr'] = profTempAggr
    fileName = '/storage/forDonata/JG_{0}_{1}.mat'.format(year, month)
    savemat(fileName, JGDict)

year: 2004, month: 1, nProf: 1787
year: 2004, month: 2, nProf: 1703
year: 2004, month: 3, nProf: 1914
year: 2004, month: 4, nProf: 1875
year: 2004, month: 5, nProf: 2062
year: 2004, month: 6, nProf: 2008
year: 2004, month: 7, nProf: 2063
year: 2004, month: 8, nProf: 2177
year: 2004, month: 9, nProf: 2338
year: 2004, month: 10, nProf: 2525
year: 2004, month: 11, nProf: 2487
year: 2004, month: 12, nProf: 2699
year: 2005, month: 1, nProf: 2807
year: 2005, month: 2, nProf: 2658
year: 2005, month: 3, nProf: 3024
year: 2005, month: 4, nProf: 3040
year: 2005, month: 5, nProf: 3408
year: 2005, month: 6, nProf: 3550
year: 2005, month: 7, nProf: 3787
year: 2005, month: 8, nProf: 3798
year: 2005, month: 9, nProf: 3721
year: 2005, month: 10, nProf: 4149
year: 2005, month: 11, nProf: 4242
year: 2005, month: 12, nProf: 4558
year: 2006, month: 1, nProf: 4719
year: 2006, month: 2, nProf: 4551
year: 2006, month: 3, nProf: 5098
year: 2006, month: 4, nProf: 4930
year: 2006, month: 5, nProf: 5082
year: 20

In [118]:
# Copy the current accumulator lists into JGDict and write them to a test
# MAT file.
JGDict['profCycleNumberAggr'] = profCycleNumberAggr
JGDict['profFloatIDAggr'] = profFloatIDAggr
JGDict['profJulDayAggr'] = profJulDayAggr
JGDict['profLatAggr'] = profLatAggr
JGDict['profLongAggr'] = profLongAggr
JGDict['profYearAggr'] = profYearAggr
JGDict['profMonthAggr'] = profMonthAggr
JGDict['profModeAggr'] = profModeAggr
JGDict['profPresAggr'] = profPresAggr
JGDict['profPsalAggr'] = profPsalAggr
JGDict['profTempAggr'] = profTempAggr
# Open for writing rather than appending: savemat emits a complete MAT file
# (header included), so appending to an existing file would not give a
# readable result.
with open('/storage/JGtest.mat', 'wb') as f:
    savemat(f, JGDict)

  arr[empties] = ' '


In [83]:
# Inspect the accumulated per-profile date strings.
profJulDayAggr

[]

In [71]:
# Inspect the document currently bound to `doc`.
doc

{'_id': '1900774_0',
 'lat': 9.979,
 'lon': 337.884,
 'date': datetime.datetime(2007, 7, 6, 0, 0),
 'measurements': [{'pres': 5.0, 'temp': 27.562, 'psal': 36.13},
  {'pres': 10.0, 'temp': 27.461, 'psal': 36.126},
  {'pres': 15.0, 'temp': 27.435, 'psal': 36.126},
  {'pres': 20.0, 'temp': 27.417, 'psal': 36.127},
  {'pres': 25.0, 'temp': 27.09, 'psal': 36.127},
  {'pres': 30.0, 'temp': 26.32, 'psal': 36.108},
  {'pres': 35.0, 'temp': 25.022, 'psal': 36.043},
  {'pres': 40.0, 'temp': 23.416, 'psal': 36.021},
  {'pres': 45.0, 'temp': 20.803, 'psal': 35.955},
  {'pres': 50.0, 'temp': 19.427, 'psal': 35.923},
  {'pres': 55.0, 'temp': 18.081, 'psal': 35.842},
  {'pres': 60.0, 'temp': 17.02, 'psal': 35.755},
  {'pres': 65.0, 'temp': 16.7, 'psal': 35.722},
  {'pres': 70.0, 'temp': 16.353, 'psal': 35.688},
  {'pres': 75.0, 'temp': 15.818, 'psal': 35.631},
  {'pres': 80.0, 'temp': 15.188, 'psal': 35.564},
  {'pres': 85.0, 'temp': 14.648, 'psal': 35.499},
  {'pres': 90.0, 'temp': 14.418, 'psal': 3

In [17]:
# Fetch one document from an unfiltered scan of the collection.
cursor = coll.find()
doc = next(cursor)