- A sample notebook to convert Jim Thomson's matlab structures into netcdf format
- I tried to make it as general as possible, but can't guarantee it will work for all the different structure types!
- It should be easy enough to convert this into a loop if you want to run it on lots of different files
- Author: Lettie Roach, lroach@uw.edu - last updated Apr 2020

In [60]:
import numpy as np
import matplotlib as mpl
import glob
import os
import matplotlib.pyplot as plt
from scipy.io import savemat,loadmat,whosmat
import pandas as pd
import datetime, time
import xarray as xr

A function to convert matlab datenum to python datetime

In [61]:
def matlab2datetime(matlab_datenum):
    
    if matlab_datenum!=matlab_datenum: #if there is NaN in the time dimension
        mytime = 0 # this sets time to 1970, which we will then screen out below
    else:
        day = datetime.datetime.fromordinal(int(matlab_datenum))
        dayfrac = datetime.timedelta(days=matlab_datenum%1) - datetime.timedelta(days=366)
        mytime = day + dayfrac
    return mytime

Edit these

In [62]:
datadir = '/Users/lettieroach/Google Drive/SODA_Signature500_Data/dataProducts/ice/'
os.listdir(datadir)
fname = 'SODA_C_sigIce.mat'

Read in SWIFT metadata

In [63]:
df = pd.read_csv('SWIFT_README.csv')
df = df.set_index('netcdf_var').transpose()
attr_dict = df.to_dict()

In [64]:
struct = loadmat(datadir+fname,squeeze_me=True)
[f for f in struct]

['__header__', '__version__', '__globals__', 'sigIce']

Read into a pandas dataframe with python datetime format times

In [65]:
identifier = 'sigIce' # check this makes sense with struct above, may need to edit
my_dict = struct[identifier]
struct

{'__header__': b'MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Thu Dec  5 14:12:29 2019',
 '__version__': '1.0',
 '__globals__': [],
 'sigIce': array(('SODA_C', 77.73676666666667, -139.14153333333334, array([737342.        , 737342.00694444, 737342.01388889, ...,
        737679.97916667, 737679.98611111, 737679.99305556]), array([37.35479984, 37.36276658, 37.36134977, ..., 36.85244967,
        36.84871632, 36.84704984]), array([27.23256532, 27.24052691, 27.23910495, ..., 26.69566294,
        26.69187964, 26.69016321]), array([27.1348463 , 27.1359896 , 27.13788465, ..., 26.5925713 ,
        26.58918286, 26.58843907]), array([26.90065421, 25.83514633, 26.88463611, ..., 26.58846693,
        26.26826893, 26.63813596]), array([        nan,         nan,         nan, ..., 25.38891909,
                nan,         nan]), array([0.23419209, 1.30084327, 0.25324855, ...,        nan, 0.32091393,
               nan]), array([nan, nan, nan, ..., nan, nan, nan]), array([ 0.06986713, -0.08021482,

In [66]:
myvars = ['mooringName','lat','lon','mattime','pAbsolute','pWater','waterDepth','altimeterDistAst',
         'altimeterDistLE','iceDraftAST','iceDraftLE','velEast','velNorth','velUp1','velUp2']

In [67]:
mydict = {}
for var in myvars[:]:
    mydict[var] = my_dict[var].flatten()[0]
df = pd.DataFrame(mydict)
df = df.rename(columns={'mattime':'time'})
print(df)

      mooringName        lat         lon           time  pAbsolute     pWater  \
0          SODA_C  77.736767 -139.141533  737342.000000  37.354800  27.232565   
1          SODA_C  77.736767 -139.141533  737342.006944  37.362767  27.240527   
2          SODA_C  77.736767 -139.141533  737342.013889  37.361350  27.239105   
3          SODA_C  77.736767 -139.141533  737342.020833  37.352483  27.230233   
4          SODA_C  77.736767 -139.141533  737342.027778  37.348050  27.225795   
...           ...        ...         ...            ...        ...        ...   
48667      SODA_C  77.736767 -139.141533  737679.965278  36.857733  26.701046   
48668      SODA_C  77.736767 -139.141533  737679.972222  36.855350  26.698613   
48669      SODA_C  77.736767 -139.141533  737679.979167  36.852450  26.695663   
48670      SODA_C  77.736767 -139.141533  737679.986111  36.848716  26.691880   
48671      SODA_C  77.736767 -139.141533  737679.993056  36.847050  26.690163   

       waterDepth  altimete

In [68]:
df['time'] = [matlab2datetime(tval) for tval in df['time']]
df['time'] = np.array(df['time'],dtype='datetime64[s]')
df = df.set_index('time')
print(df)

                    mooringName        lat         lon  pAbsolute     pWater  \
time                                                                           
2018-10-09 00:00:00      SODA_C  77.736767 -139.141533  37.354800  27.232565   
2018-10-09 00:10:00      SODA_C  77.736767 -139.141533  37.362767  27.240527   
2018-10-09 00:19:59      SODA_C  77.736767 -139.141533  37.361350  27.239105   
2018-10-09 00:30:00      SODA_C  77.736767 -139.141533  37.352483  27.230233   
2018-10-09 00:39:59      SODA_C  77.736767 -139.141533  37.348050  27.225795   
...                         ...        ...         ...        ...        ...   
2019-09-11 23:09:59      SODA_C  77.736767 -139.141533  36.857733  26.701046   
2019-09-11 23:19:59      SODA_C  77.736767 -139.141533  36.855350  26.698613   
2019-09-11 23:29:59      SODA_C  77.736767 -139.141533  36.852450  26.695663   
2019-09-11 23:39:59      SODA_C  77.736767 -139.141533  36.848716  26.691880   
2019-09-11 23:49:59      SODA_C  77.7367

Any NaNs in the time index were set to 1970-01-01, so here we screen them out

In [69]:
df = df.loc[df.index > '2000-01-01']
ntime = len(df.index)
print(ntime)

48672


In [70]:
ds = df.to_xarray()
ds.attrs= {'description' : identifier,
              'data' : 'SODA cruise, Sam Brenner, APL',
              'netcdf_created_by': 'Lettie Roach, 2020, UW'}

# for variables - add description and units
for var in attr_dict:
        if var in ds:
            ds[var].attrs['units'] = attr_dict[var]['units']
            ds[var].attrs['description'] = attr_dict[var]['description']
   
ds.to_netcdf(fname+'_converted.nc')