Inputs

In [1]:
# -----------------------------------------
# Directory configuration
# -----------------------------------------
# Workspace root; every other path in this notebook is built from it.
root_dir = "/local/data/artemis/workspace/vbennington/LDEO_HPD/2021models"
# The cleaned data is written to (and re-read from) the "data" subfolder.
data_output_dir = root_dir + "/data"


Modules


In [3]:
# standard imports
import os
import datetime
from pathlib import Path
from collections import defaultdict
import scipy
import random
import numpy as np
import xarray as xr
import pandas as pd
import joblib
import pickle

# machine learning libraries
from sklearn.model_selection import train_test_split

# Python file with supporting functions
import pre_HPD

Predefined Values

In [5]:
# Loading references


In [4]:
# -----------------------------------------
# Common monthly time axis (one stamp per month, on the 15th)
# -----------------------------------------
# First and last instants of the period covered
date_range_start = '1982-01-01T00:00:00.000000000'
# (alternative start: '1990-01-01T00:00:00.000000000')
date_range_end = '2020-12-31T00:00:00.000000000'

# Month-start stamps across the whole period ...
month_starts = pd.date_range(start=date_range_start, end=date_range_end, freq='MS')
# ... shifted by 14 days so that each stamp falls on the 15th of its month
dates = month_starts + np.timedelta64(14, 'D')

Load the data, clean it, and save it

In [5]:

# Print a timestamp before and after the step so its duration can be read off the cell output.
print(datetime.datetime.now())

# Build the input data frame for the monthly time axis `dates` using the project helper.
# NOTE(review): the second argument (12) is passed straight to pre_HPD.create_inputs;
# presumably the number of months per year -- confirm against the helper.
df = pre_HPD.create_inputs(dates, 12)

# NOTE: a bare `df.head()` at this point would not be rendered (a notebook only displays
# the last expression of a cell); preview the frame in a cell of its own instead.

# Save the pandas data frame to the workspace data directory
pre_HPD.save_clean_data(df, data_output_dir)

print(datetime.datetime.now())

2022-03-01 18:32:54.757180
468
<xarray.DataArray 'time' ()>
array('2020-12-15T00:00:00.000000000', dtype='datetime64[ns]')
Coordinates:
    time     datetime64[ns] 2020-12-15
<xarray.DataArray 'time' (time: 468)>
array(['1982-01-15T00:00:00.000000000', '1982-02-15T00:00:00.000000000',
       '1982-03-15T00:00:00.000000000', ..., '2020-10-15T00:00:00.000000000',
       '2020-11-15T00:00:00.000000000', '2020-12-15T00:00:00.000000000'],
      dtype='datetime64[ns]')
Coordinates:
  * time     (time) datetime64[ns] 1982-01-15 1982-02-15 ... 2020-12-15
<xarray.Dataset>
Dimensions:                (time: 468, xlon: 360, ylat: 180)
Coordinates:
  * time                   (time) datetime64[ns] 1982-01-15 ... 2020-12-15
  * xlon                   (xlon) float64 -179.5 -178.5 -177.5 ... 178.5 179.5
  * ylat                   (ylat) float64 -89.5 -88.5 -87.5 ... 87.5 88.5 89.5
Data variables:
    SSS                    (time, ylat, xlon) float64 ...
    sst                    (time, ylat, xlon) flo

In [7]:
# Sanity check: reload the pickle from the data directory and inspect it.
clean_data_path = f'{data_output_dir}/data_clean_2D_mon_1x1_198201-202012.pkl'
df = pd.read_pickle(clean_data_path)

In [9]:
# Show the last five rows of the reloaded frame.
df.tail(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SSS,sst,chl,pCO2,XCO2,socat_mask,mld,cesm_sfco2_1x1_A,fesom2_sfco2_1x1_A,mpi_sfco2_1x1_A,...,chl_log,chl_anom,sss_anom,sst_anom,T0,T1,A,B,C,net_mask
time,xlon,ylat,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2020-12-15,179.5,85.5,31.680887,-1.79,,,419.006388,,74.964204,385.598667,283.037109,366.973877,...,,,0.178074,0.0,0.966848,-0.255353,0.996917,0.000685,0.078456,
2020-12-15,179.5,86.5,31.724632,-1.79,,,419.004079,,74.175315,386.120408,277.0896,366.156189,...,,,0.253362,0.0,0.966848,-0.255353,0.998135,0.000533,0.061046,
2020-12-15,179.5,87.5,31.649817,-1.79,,,419.001771,,71.900835,386.35733,272.581818,353.915161,...,,,0.294222,0.0,0.966848,-0.255353,0.999048,0.000381,0.043618,
2020-12-15,179.5,88.5,31.661079,-1.79,,,418.999463,,0.0,386.701442,268.717682,325.212097,...,,,0.324067,0.0,0.966848,-0.255353,0.999657,0.000228,0.026176,
2020-12-15,179.5,89.5,31.754524,-1.79,,,418.997154,,0.0,387.368222,267.311676,316.2724,...,,,0.323254,0.0,0.966848,-0.255353,0.999962,7.6e-05,0.008726,
