# Reading WOD2018 netCDF
    - read in the data
    - display location of data
    - save data in numpy array

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
from tqdm import tqdm

In [None]:
# names of the downloaded netCDF files (looked up under 'WOD18/' by later cells)
fn = [
    'ocldb1679866445.14641_OSD.nc',
    'ocldb1679866445.14641_OSD2.nc',
    'ocldb1679866445.14641_OSD3.nc',
    'ocldb1679866445.14641_OSD4.nc',
    'ocldb1679866445.14641_OSD5.nc',
    'ocldb1679866445.14641_OSD6.nc',
]

In [None]:
# names of the data variables to extract from every file
vname = ['Temperature', 'Salinity', 'Nitrate', 'Phosphate', 'Oxygen']
# how many variables are extracted
Nvars = len(vname)
# number of depth levels (second dimension of the profile arrays)
Ndepth = 137

In [None]:
# first pass over the files: count the profiles and the samples in each one
Nf=len(fn)
Nprof=np.zeros(Nf)  # per-file number of profiles (length of ds.time)
Nobs=np.zeros(Nf)   # per-file number of depth samples (length of ds.z)
N=np.zeros(Nf)      # allocated here but not filled in this cell
#
for n, fname in enumerate(fn):
    # the context manager closes the file as soon as the sizes are read,
    # so handles do not pile up while looping over the files
    with xr.open_dataset('WOD18/'+fname) as ds:
        # get the dimensions
        Nprof[n] = ds.time.size
        Nobs[n]  = ds.z.size
# display the totals as whole numbers (the arrays hold floats)
print('total # of profiles = ',str(int(np.sum(Nprof))))
print('total # of samples = ',str(int(np.sum(Nobs))))

In [None]:
# get the lon, lat, time, cast size, bottom depth and QC flag of each profile
#
Ntot=int(np.sum(Nprof))  # total number of profiles over all files
lon=np.zeros(Ntot)
lat=np.zeros(Ntot)
year=np.zeros(Ntot,dtype=int)
month=np.zeros(Ntot,dtype=int)
NZcast=np.zeros(Ntot)    # number of depth samples in each cast (z_row_size)
Zbot=np.zeros(Ntot)
QC=np.zeros((Nvars,Ntot),dtype=int)  # per-variable profile flag
cnt0=0
for n, fname in enumerate(fn):
    # profiles of file n occupy the slice [cnt0:cnt1] of the global arrays
    cnt1=cnt0+int(Nprof[n])
    # the context manager closes the file once its metadata is copied
    with xr.open_dataset('WOD18/'+fname) as ds:
        lon[cnt0:cnt1] = ds.lon.to_numpy()
        lat[cnt0:cnt1] = ds.lat.to_numpy()
        year[cnt0:cnt1] = ds.time.dt.year.to_numpy()
        month[cnt0:cnt1] = ds.time.dt.month.to_numpy()
        NZcast[cnt0:cnt1] = ds.z_row_size.to_numpy()
        Zbot[cnt0:cnt1] = ds.Bottom_Depth.to_numpy()
        for m in range(Nvars):
            QC[m,cnt0:cnt1]=ds[vname[m]+'_WODprofileflag'].to_numpy()
    cnt0=cnt1

In [None]:
# map of the profile positions: one small dot per profile, no connecting line
plt.plot(lon, lat, linestyle='none', marker='.', markersize=0.2)
# show the full longitude / latitude range
plt.xlim((-180, 180))
plt.ylim((-90, 90))
plt.xlabel('longitude')
plt.ylabel('latitude')

In [None]:
# display the distribution of data in time (one histogram bin per calendar year)
yr0=np.min(year)
yr1=np.max(year)
# The bin edges must run up to yr1+1: the last histogram bin is closed on
# both sides, so edges that stop at yr1 would merge the profiles of years
# yr1-1 and yr1 into a single bar.
b=np.arange(yr0,yr1+2,1)
profcount = plt.hist(year,b)
plt.xlim(1950,2020)
plt.xlabel('year')
plt.ylabel('number of profiles')

### read in data as (profile) x (depth) while applying Quality Control

In [None]:
# prepare the depth array: one row per profile, Ndepth columns, NaN where a
# cast has fewer samples than Ndepth
z=np.full((int(np.sum(Nprof)),Ndepth),np.nan)
cnt=0  # running profile index over all files
# read in depth
for m in range(len(fn)):
    ind0=0  # read-in position inside the current file's flat z array
    # read the file's whole z array once and close the file; slicing the
    # in-memory array is equivalent to slicing the lazy variable per profile
    with xr.open_dataset('WOD18/'+fn[m]) as ds:
        zfile=ds['z'].to_numpy()
    for n in range(int(Nprof[m])):
        M=int(NZcast[cnt])  # number of samples in this cast
        ind1=ind0+M
        z[cnt,0:M]=zfile[ind0:ind1]
        cnt=cnt+1
        ind0=ind1

In [None]:
# standard depth: mean depth of each level over all profiles, skipping NaN
zstd = np.nanmean(z, 0)
# number of depth levels, taken from the length of the standard depth axis
Ndepth = zstd.size

In [None]:
#---------------------
# QC profile function
#---------------------
def qc_profile(vindex):
    """Return one variable as a (profile, depth) array, keeping QC-passed profiles.

    Reads the variable ``vname[vindex]`` from every file listed in ``fn`` and
    copies each cast into one row of the output. A profile is stored only when
    its flag ``QC[vindex, profile]`` equals 0; all other rows stay NaN.

    Relies on the module-level ``fn``, ``vname``, ``Nprof``, ``NZcast``,
    ``QC`` and ``Ndepth`` set up by the earlier cells.

    Parameters
    ----------
    vindex : int
        Index into ``vname`` (and into the first axis of ``QC``).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(int(sum(Nprof)), Ndepth)``, NaN where no
        value was stored.
    """
    # set up empty (all-NaN) array
    data=np.full((int(np.sum(Nprof)),Ndepth),np.nan,dtype=float)
    cnt=0 # counter for profiles, running over all files
    for m in range(len(fn)):
        ind0=0 # read-in position inside the current file's flat array
        # read the whole variable once and close the file; slicing the
        # in-memory array replaces one lazy file read per profile
        with xr.open_dataset('WOD18/'+fn[m]) as ds:
            values=ds[vname[vindex]].to_numpy()
        # NOTE(review): offsets are derived from z_row_size (NZcast); this
        # assumes the variable has exactly one value per depth sample of every
        # cast - confirm against the per-variable row sizes in the files.
        for _ in range(int(Nprof[m])):
            M=int(NZcast[cnt])
            ind1=ind0+M
            # store the cast only if the QC flag shows the acceptable value (0)
            if QC[vindex,cnt]==0:
                data[cnt,0:M]=values[ind0:ind1]
            # update the counters
            cnt+=1    # next profile
            ind0=ind1 # move the read-in point past this cast
    return data

In [None]:
# helper function to save one variable's profiles to a netCDF file
def generate_netcdf(dirname,vname,data,lon,lat,year,month,zstd,Nprof):
    """Write ``data`` and its per-profile metadata to a netCDF file.

    The file ``<dirname>/<vname>_WOD18_OSD_qc0_profiles.nc`` holds the
    variable ``vname`` on dimensions (profile, depth) plus ``year``,
    ``month``, ``lat`` and ``lon`` on dimension (profile).

    Parameters
    ----------
    dirname : str
        Output directory.
    vname : str
        Name given to the data variable; also the file-name prefix.
    data : array
        Values on (profile, depth).
    lon, lat, year, month : array
        One value per profile.
    zstd : array
        Depth coordinate.
    Nprof : array
        Profile counts; their sum sets the length of the profile index.
    """
    # profile index 0 .. total-1, shared by every variable in the file
    profile = np.arange(int(np.sum(Nprof)))
    profile_coord = {'profile': profile}

    # main variable on (profile, depth)
    main = xr.DataArray(
        data=data,
        name=vname,
        dims=['profile', 'depth'],
        coords={'profile': profile, 'depth': zstd},
    )
    ds = main.to_dataset()

    # per-profile metadata, added in the same order as before
    for key, values in (('year', year), ('month', month),
                        ('lat', lat), ('lon', lon)):
        ds[key] = xr.DataArray(data=values, dims=['profile'],
                               coords=profile_coord)

    outfile = dirname+'/'+vname+'_WOD18_OSD_qc0_profiles.nc'
    ds.to_netcdf(outfile)

In [None]:
dirname='WOD_QC_profiles'
# create the output directory from Python instead of a shell escape: works
# without IPython or a POSIX shell, and the directory name is written once
os.makedirs(dirname,exist_ok=True)
#
# extract every variable with QC applied and write one netCDF file per variable
for m, name in enumerate(vname):
    data=qc_profile(m)
    generate_netcdf(dirname,name,data,lon,lat,year,month,zstd,Nprof)