# O2 gapfill projection
    - Requires a six-field version string (single digits separated by dots, e.g. '1.2.1.1.1.1'), defined as follows:
    - First digit = Algorithm type (1=RF, 2=NN)
    - Second digit = Data Source (1=Ship only, 2=Ship+Argo)
    - Third digit = Ocean basin (1=Atlantic, 2=Pacific, 3=Indian, 4=Southern, 5=Arctic)
    - Fourth digit = T/S data source (1=EN4)
    - Fifth digit = predictor variable set (1=default)
    - Sixth digit = hyperparameter set (1=default)

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import xarray as xr
import sklearn as skl
import gsw
import cartopy.crs as ccrs
from scipy.interpolate import interp1d
import os
import warnings
warnings.filterwarnings('ignore')
import joblib
from multiprocessing import Pool

In [5]:
#
# Version string: six dot-separated single-digit fields.
# It determines which basin / algorithm is used to calculate the O2 maps.
#
ver = '.'.join([
    '1',  # algorithm type (1=RF, 2=NN)
    '2',  # data source (1=Ship only, 2=Ship+Argo)
    '1',  # ocean basin (1=Atlantic, 2=Pacific, 3=Indian, 4=Southern, 5=Arctic)
    '1',  # T/S data source (1=EN4)
    '1',  # predictor variable set (1=default)
    '1',  # hyperparameter set (1=default)
])

In [7]:
# Decode the six dot-separated fields of `ver`.
# Invalid input raises ValueError immediately: merely printing a message would
# leave `alg` / `endyear` undefined and surface later as an unrelated NameError.
selection = ver.split('.')
basin = ['Atlantic','Pacific','Indian','Southern','Arctic']
if len(selection) != 6:
    raise ValueError('error - version string must have 6 dot-separated fields, got '+repr(ver))
#
# field 1: algorithm type
if selection[0] == '1':
    print('Random Forest algorithm will be used.')
    alg = 'RF'
elif selection[0] == '2':
    print('Neural Network algorithm will be used.')
    alg = 'NN'
else:
    raise ValueError('error - incorrect algorithm type')
#
# field 2: O2 data source, which also fixes the end year of the analysis
if selection[1] == '1':
    print('Ship-based O2 data will be used. end_year = 2011')
    endyear=2011
elif selection[1] == '2':
    print('Ship-based and Argo-O2 data will be used. end_year = 2021')
    endyear=2021
else:
    raise ValueError('error - incorrect input data type')
#
# field 3: ocean basin; codes '1'..'5' index into `basin` (1-based)
if selection[2] in ('1','2','3','4','5'):
    print(basin[int(selection[2])-1]+' Ocean will be mapped')
else:
    raise ValueError('error - incorrect ocean basin')
#
# field 4: T/S data source
if selection[3] == '1':
    print('EN4 dataset will be used for T/S input. ')
else:
    raise ValueError('error - incorrect T/S data type')
#
# field 5: predictor variable set
if selection[4] == '1':
    print('Predictor variables include T, S, lon, lat, depth (pressure), year, month')
else:
    raise ValueError('error - incorrect predictor variable type')
#
# field 6: hyperparameter set
if selection[5] == '1':
    print('Hyperparameter set is optimized via K-fold CV')
else:
    raise ValueError('error - incorrect hyperparameter type')

Random Forst algorithm will be used.
Ship-based and Argo-O2 data will be used. end_year = 2021
Atlantic Ocean will be mapped
EN4 dataset will be used for T/S input. 
Predictor variables include T, S, lon, lat, depth (pressure), year, month
Hyperparameter set is optimized via K-fold CV


In [8]:
# Define the input and output folders
#
# The login name is resolved in-process. getpass.getuser() consults the
# LOGNAME / USER / LNAME / USERNAME environment variables and then the
# password database, so no scratch file has to be written to (and removed
# from) the current working directory.
import getpass
usrid=getpass.getuser()
diro = '/glade/derecho/scratch/'+str(usrid)+'/WOD18_OSDCTD/'
dirf = '/glade/campaign/univ/ugit0034/EN4/L09_20x180x360/'
dirin = '/glade/campaign/univ/ugit0034/WOD18_OSDCTD/'
# file-name infixes of the binned products
fosd='_1x1bin_osd_'
fctd='_1x1bin_ctd_'
fmer='_1x1bin_osdctd_'
# variable-name prefixes used when building input file names
var=['o2','TSN2']

In [9]:
# Read the vertical (depth) axis from the 1965 merged OSD+CTD O2 file.
ds = xr.open_dataset(f'{dirin}{var[0]}{fmer}1965.nc')
Z = ds['depth'].to_numpy()
Nz = Z.size

In [10]:
# Analysis period.
# Keep the start year at 1965 (this is when Carpenter 1965 established the
# modern Winkler method). Both ranges below stop before `endyear`.
yrs = np.arange(1965, endyear)
t = np.arange('1965-01', f'{endyear}-01', dtype='datetime64[M]')

In [11]:
# Load the trained model for this version string together with the
# statistics stored alongside it in the .npz file.
dirout = '/glade/campaign/univ/ugit0034/ML4O2_results/'
MLmodel = joblib.load(dirout + 'algorithm_v' + ver + '.sav')
# read in additional parameters
params = np.load(dirout + 'ML_params_v' + ver + '.npz')
# Xm / Xstd are applied to the predictor matrix and ym / ystd to the model
# output elsewhere in this notebook.
Xm, Xstd, ym, ystd = (params[key] for key in ('Xm', 'Xstd', 'ym', 'ystd'))

FileNotFoundError: [Errno 2] No such file or directory: '/glade/campaign/univ/ugit0034/ML4O2_results/algorithm_v1.2.1.1.1.1.sav'

In [12]:
# Basin mask, sampled at the depth levels in Z.
dsm = xr.open_dataset(
    '/glade/campaign/univ/ugit0034/wod18/basin_mask_01.nc'
)
ma = dsm['basin_mask'].sel(depth=Z).to_numpy()

In [9]:
# Initial depth level and the matching horizontal slice of the basin mask.
zlev = 300
kind = [k for k, depth in enumerate(Z) if depth == zlev]
maz = np.squeeze(ma[kind])
#
# two-digit month strings '01' .. '12' used in the EN4 file names
mon = [f'{m:02d}' for m in range(1, 13)]
#
# horizontal / vertical grid taken from the January 1965 EN4 file
dc = xr.open_dataset(f'{dirf}EN4_TSN2_L09_180x360_1965{mon[0]}.nc')
y = dc['lat'].to_numpy()
x = dc['lon'].to_numpy()
# use alternative x coordinate: longitude - 20, wrapped back to be non-negative
xa0 = x - 20
xalt = np.where(xa0 < 0, xa0 + 360, xa0)
#
Ny = y.size
Nx = x.size
Nt = 12 * yrs.size
xx, yy = np.meshgrid(xalt, y)
#
depth1 = dc['depth'].to_numpy()
Nz1 = depth1.size
#

In [10]:
# apply basin mask
def apply_basinmask(datain):
    """Return `datain` with every point outside the selected basin set to NaN.

    The basin is chosen by the third field of the version string
    (`selection[2]`) and compared against the global 2-D mask `maz`.

    Parameters
    ----------
    datain : array broadcastable against `maz`.

    Returns
    -------
    numpy.ndarray with `datain` where `maz` matches one of the basin's mask
    codes and NaN elsewhere.

    Raises
    ------
    ValueError
        If `selection[2]` is not one of '1'..'5'.
    """
    # Mask codes per basin selection. The Indian Ocean is the UNION of codes
    # 3 and 56; a single cell can never equal both, so combining them with
    # a logical AND would mask out every point.
    # NOTE(review): 56 is presumably a marginal sea of the Indian Ocean in
    # the mask file's coding - confirm against the mask documentation.
    basin_codes = {
        '1': (1,),
        '2': (2,),
        '3': (3, 56),
        '4': (10,),
        '5': (11,),
    }
    codes = basin_codes.get(selection[2])
    if codes is None:
        raise ValueError('error - incorrect ocean basin')
    return np.where(np.isin(maz, codes), datain, np.nan)

In [11]:
# get input data from full model
def get_inputdata(zlev,it,year,mn):
    """Read the EN4 `SA` and `CT` fields for one month, interpolated to `zlev`.

    Parameters
    ----------
    zlev : depth passed to `interp(depth=...)`.
    it   : unused; kept so existing callers do not need to change.
    year : year used in the file name.
    mn   : index into the global `mon` list of two-digit month strings.

    Returns
    -------
    (soa, toa) : squeezed numpy arrays of `SA` and `CT` at `zlev`.
        NOTE(review): presumably Absolute Salinity and Conservative
        Temperature (TEOS-10 naming) - confirm against the EN4 files.
    """
    fname = dirf+'EN4_TSN2_L09_180x360_'+str(year)+mon[mn]+'.nc'
    # Close the file as soon as the two fields are materialised; this function
    # runs once per month, year and depth level, so unclosed handles add up.
    with xr.open_dataset(fname) as dc:
        soa=dc.SA.interp(depth=zlev).to_numpy().squeeze()
        toa=dc.CT.interp(depth=zlev).to_numpy().squeeze()
    return soa,toa

In [12]:
# generate data matrix
def gen_datamatrix(xi,yi,it,x1,x2,x3,x4):
    """Build the standardized predictor matrix for one 2-D field.

    Parameters
    ----------
    xi, yi : 2-D arrays carried along with the predictors (the caller passes
        column / row index grids); returned flattened and NaN-filtered.
    it : scalar replicated over the (Ny, Nx) grid as one predictor; `it % 12`
        is added as a further predictor.
    x1, x2, x3, x4 : 2-D predictor fields of Ny*Nx points each.

    Returns
    -------
    Xa : (n_valid, 7) array with columns [x1, x2, x3, x4, zlev, it, it % 12],
        standardized as (X - Xm) / Xstd.
    Xi, Yi : flattened `xi` / `yi` restricted to the valid points.

    A point is valid when none of x1..x4 is NaN there. Reads the globals
    Ny, Nx, zlev, Xm and Xstd.
    """
    columns = [np.asarray(f).flatten() for f in (x1, x2, x3, x4)]
    time_idx = (np.ones((Ny, Nx)) * it).flatten()
    month_idx = time_idx % 12
    columns.append(time_idx)

    # NaN in any summed predictor propagates into the sum, so one isnan
    # test on the sum identifies every incomplete sample.
    total = columns[0] + columns[1] + columns[2] + columns[3] + columns[4]
    valid = ~np.isnan(total)

    p1, p2, p3, p4, p5 = (col[valid] for col in columns)
    p6 = month_idx[valid]
    Xi = xi.flatten()[valid]
    Yi = yi.flatten()[valid]

    # constant depth column for the level currently being mapped
    zin = np.ones(np.size(p1)) * zlev

    # stack predictors as rows, then transpose to (samples, features)
    # before standardizing
    X = np.vstack((p1, p2, p3, p4, zin, p5, p6))
    Xa = (X.T - Xm) / Xstd
    return Xa, Xi, Yi

In [13]:
def map_yearly(year):
    """Project O2 for the 12 months of `year` and write them to a temp file.

    Works at the depth level held in the global `zlev`, using the global
    model `reg`, the coordinate grids `xx` / `yy`, and the output scaling
    `ystd` / `ym`. The result is written to
    `diro + 'temp/o2est_<year>.nc'` as variable `o2est` with dimensions
    (time, depth, lat, lon).

    Parameters
    ----------
    year : calendar year to process.

    Returns
    -------
    0 (placeholder so the function can be used with `Pool.map`).
    """
    Nx=np.size(x)
    Ny=np.size(y)
    zlev_arr=np.array([zlev])
    o2est2=np.zeros((12,1,Ny,Nx))
    # column / row index of every grid point, used to scatter predictions back
    xxi,yyi=np.meshgrid(np.arange(0,Nx,1),np.arange(0,Ny,1))
    if year%10 == 5:
        print('year = '+str(year))
    t=np.arange(str(year)+'-01',str(year+1)+'-01',dtype='datetime64[M]')
    for month in range(12):
        # month counter since January 1965
        it = month+(year-1965)*12
        soa,toa = get_inputdata(zlev,it,year,month)
        # apply mask
        soa=apply_basinmask(soa)
        toa=apply_basinmask(toa)
        # generate data matrix
        Xa,xi,yi=gen_datamatrix(xxi,yyi,it,soa,toa,xx,yy)
        # map predictions back to the lon-lat grid; unpredicted points stay NaN
        field = np.full((Ny,Nx), np.nan)
        # Skip the projection when the basin has no valid point at this level:
        # the model cannot be asked to predict on an empty sample set.
        if np.shape(Xa)[0] > 0:
            out = np.asarray(reg.predict(Xa)).ravel()
            # one indexed assignment instead of a per-sample Python loop
            field[yi,xi] = out
        # undo the output standardization
        o2est2[month,0,:,:] = field*ystd + ym
    da1=xr.DataArray(data=o2est2,name='o2est',dims=['time','depth','lat','lon'],
                 coords={'time':t,'depth':zlev_arr,'lat':yout,'lon':xout})
    ds=da1.to_dataset()
    ds.to_netcdf(diro+'temp/o2est_'+str(year)+'.nc')
    ds.close()
    return 0

In [14]:
zlevels = depth1
#
# reconstruction in parallel mode
#
reg=MLmodel
xout=dc.lon
yout=dc.lat
#
# Scratch directory for the per-year files; create it up front so the first
# worker write cannot fail on a missing directory.
tmpdir = diro+'temp/'
os.makedirs(tmpdir, exist_ok=True)
#
for zlev_cnt,zlev in enumerate(zlevels):
    print(f'calculating {zlev}m')
    # `zlev` and `maz` are module globals read by the worker functions, so
    # they must be set before the pool is created for this level.
    # NOTE(review): interp() on the basin mask can yield non-integer codes
    # when adjacent depth levels differ; such points match no basin - confirm
    # this is intended.
    maz = dsm.basin_mask.interp(depth=zlev).to_numpy()
    #kind=[idx for idx,elem in enumerate(Z) if elem==zlev]
    #maz=np.squeeze(ma[kind,:,:])
    # remove the previous level's per-year files without spawning a shell
    for fname in os.listdir(tmpdir):
        if fname.endswith('.nc'):
            os.remove(os.path.join(tmpdir, fname))
    #
    if __name__ == '__main__':
        with Pool(10) as p:
            print(p.map(map_yearly, yrs))
    #
    # save the result as a netCDF file
    #
    # Close the merged dataset before the next iteration deletes its sources.
    with xr.open_mfdataset(diro+'temp/o2est*.nc') as dtemp:
        dtemp.to_netcdf(diro+'/O2map_v'+ver+'_z'+str(int(zlev))+'.nc')

calculating 6.0m
year = 1965
year = 1975
year = 1985
year = 1995
year = 2005
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
calculating 10.0m
year = 1965
year = 1975
year = 1985
year = 1995
year = 2005
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
calculating 20.0m
year = 1965
year = 1975
year = 1985
year = 1995
year = 2005
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
calculating 30.0m
year = 1965
year = 1975
year = 1985
year = 1995
year = 2005
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
calculating 40.0m
year = 1965year = 1975

year = 1985
year = 1995
year = 2005
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [15]:
# Open all per-level output files of this version as one dataset.
ds = xr.open_mfdataset(diro + 'O2map_v' + ver + '*')

In [16]:
# Write the combined dataset to the results directory.
# (An earlier, disabled step removed an existing O2map_v<ver>.nc first.)
ds.to_netcdf(dirout + 'O2map_v' + ver + '.nc')