In [1]:
# basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.cluster import KMeans

#others
from xgboost import XGBRegressor
import cartopy.crs as ccrs
import cartopy.mpl.ticker as cticker
import time
import xarray as xr

# Variables from config file
from config import BASE_DIR, FILE_NAMES, LABELS, ATTRIBUTES


In [2]:
# Read the three CSV datasets stored under BASE_DIR, in this order:
# dataset.csv, dataset_interp_50.csv, dataset_interp_100.csv.
df_data_1grid, df_data_interp_50, df_data_interp_100 = (
    pd.read_csv(f"{BASE_DIR}/{csv_name}")
    for csv_name in (
        "dataset.csv",
        "dataset_interp_50.csv",
        "dataset_interp_100.csv",
    )
)

In [3]:
# Preview the first rows of df_data_1grid (the frame read from dataset.csv).
df_data_1grid.head()

Unnamed: 0,skn,year,month,data_in,Lat_DD,Lon_DD,Lon_DD_updated,lat,lon,air,...,pottmp.1,pr_wtr,shum,shum.1,shum.2,shum.3,shum.4,shum.5,skt,slp
0,1.0,1948,1,3.2,18.916176,-155.674994,204.325006,20.0,205.0,295.39603,...,-3.926514,29.034512,2.592493,-25.859348,0.589191,7.106412,2.945999,9.869999,23.385218,1014.0849
1,2.0,1948,1,5.95,19.10866,-155.825545,204.174455,20.0,205.0,295.39603,...,-3.926514,29.034512,2.592493,-25.859348,0.589191,7.106412,2.945999,9.869999,23.385218,1014.0849
2,2.2,1948,1,11.5,19.16474,-155.68228,204.31772,20.0,205.0,295.39603,...,-3.926514,29.034512,2.592493,-25.859348,0.589191,7.106412,2.945999,9.869999,23.385218,1014.0849
3,2.25,1948,1,5.515941,19.160603,-155.822488,204.177512,20.0,205.0,295.39603,...,-3.926514,29.034512,2.592493,-25.859348,0.589191,7.106412,2.945999,9.869999,23.385218,1014.0849
4,2.26,1948,1,4.310617,19.225323,-155.778876,204.221124,20.0,205.0,295.39603,...,-3.926514,29.034512,2.592493,-25.859348,0.589191,7.106412,2.945999,9.869999,23.385218,1014.0849


In [109]:
# Load only the relevant portion of each netCDF file and merge the pieces into
# a single dataset that holds one variable per label.
#
# Grid nodes to keep. The longitudes are shifted by +360, presumably to match a
# 0-360 longitude convention in the files (the selection below is an exact
# label match on the files' lat/lon coordinates).
lat_hawaii = [15, 17.5, 20, 22.5, 25]
lon_hawaii = np.array([-162.5, -160, -157.5, -155, -152.5]) + 360
# Not used in this cell (nothing is interpolated here); kept because it is a
# notebook-level name that other cells may read.
dim_interpolation = 100

# zip() silently truncates to the shortest input, which would drop variables
# without any error if the config lists ever got out of sync.
assert len(FILE_NAMES) == len(LABELS) == len(ATTRIBUTES), (
    "FILE_NAMES, LABELS and ATTRIBUTES must have the same length"
)

datasets = []
for file_name, label, attribute in zip(FILE_NAMES, LABELS, ATTRIBUTES):
    print(file_name, label, attribute)
    # Keep only the selected grid nodes and rename the file's variable
    # (`attribute`) to its unique `label`, so that files sharing the same
    # variable name do not collide when merged.
    ds = (
        xr.open_dataset(f"{BASE_DIR}/{file_name}")
        .sel(lat=lat_hawaii, lon=lon_hawaii)
        .rename({attribute: label})
    )
    if "level" in ds.dims:
        # Collapse the length-1 "level" dimension and remove the leftover
        # scalar coordinate. `drop_vars` replaces the deprecated `Dataset.drop`.
        ds = ds.squeeze("level").drop_vars("level")
    datasets.append({
        "file": file_name,
        "dataset": ds,
        "label": label,
        "attribute": attribute
    })

# Combine all per-file datasets into one dataset.
ds_combined = xr.merge([item["dataset"] for item in datasets])

air.2m.mon.mean.regridded.nc air2m air
air.1000-500.mon.mean.nc air1000_500 air
hgt500.mon.mean.nc hgt500 hgt
hgt1000.mon.mean.nc hgt1000 hgt
omega500.mon.mean.nc omega500 omega
pottmp.1000-500.mon.mean.nc pottemp1000-500 pottmp
pottmp.1000-850.mon.mean.nc pottemp1000-850 pottmp
pwtr.mon.mean.nc pr_wtr pr_wtr
shum_x_uwnd.700.mon.mean.nc shum-uwnd-700 shum
shum_x_uwnd.925.mon.mean.nc shum-uwnd-925 shum
shum_x_vwnd.700.mon.mean.nc shum-vwnd-700 shum
shum_x_vwnd.925.mon.mean.nc shum-vwnd-950 shum
shum700.mon.mean.nc shum700 shum
shum925.mon.mean.nc shum925 shum
skt.mon.mean.regridded.nc skt skt
slp.mon.mean.nc slp slp


In [111]:
# Check df_data_1grid: its reanalysis columns should match the values stored in the netCDF files

In [80]:
# Relabel df_data_1grid: the first nine column names are kept as they are and
# the remaining columns are named after the labels, in the order of `datasets`.
new_columns = [
    *df_data_1grid.columns[:9],
    *(entry['label'] for entry in datasets),
]
df_data_1grid.columns = new_columns

In [110]:
# Spot-check df_data_1grid: pick random rows and, for each one, make sure the
# value in every label column matches (within `epsilon`) what the merged
# netCDF dataset holds at that row's year, month, lat and lon.
num_data = df_data_1grid.shape[0]
num_samples = 300
epsilon = 1e-3
# Seeded local generator: the same rows are checked on every run, and NumPy's
# global random state is left untouched.
samples = np.random.default_rng(0).integers(low=0, high=num_data, size=num_samples)
labels = [item['label'] for item in datasets]
for i in samples:
    row = df_data_1grid.iloc[i]
    year = int(row.year)
    month = int(row.month)
    lat = row.lat
    lon = row.lon
    # The selected grid point is the same for every label, so select it once
    # per row instead of once per (row, label) pair.
    grid_point = ds_combined.sel(time=f"{year}-{month:02d}-01", lat=lat, lon=lon)
    for label in labels:
        reanalysis_data = grid_point[label].to_numpy()
        assert abs(reanalysis_data - row[label]) < epsilon, (
            f"row {i}, column {label!r}: csv={row[label]} netcdf={reanalysis_data}"
        )

In [112]:
# Check df_data_interp_50: its reanalysis columns should match the netCDF values interpolated onto a 50 x 50 grid

In [114]:
# Preview the first rows of df_data_interp_50 (the frame read from dataset_interp_50.csv).
df_data_interp_50.head()

Unnamed: 0,skn,year,month,data_in,name,lat,lon,elevation,Observer,Status2010,...,pottemp1000-850,pr_wtr,shum-uwnd-700,shum-uwnd-925,shum-vwnd-700,shum-vwnd-950,shum700,shum925,skt,slp
0,1.0,1948,1,3.2,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,...,-3.960722,28.72755,-7.064851,-61.055853,-3.270553,-27.510409,2.313204,10.367102,24.926305,1014.17422
1,1.0,1948,2,1.33,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,...,-4.556628,25.03054,0.487241,-39.02811,-3.446772,-12.94747,2.178714,8.839245,23.464826,1015.279124
2,1.0,1948,3,2.31,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,...,-4.401711,31.932421,-4.474437,-71.785581,0.40506,-7.952975,3.359346,9.383184,23.786142,1015.81458
3,1.0,1948,4,3.23,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,...,-4.265197,33.586917,3.420109,-41.628213,-2.01182,-4.123647,3.621286,10.094225,24.766062,1014.821248
4,1.0,1948,5,1.14,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,...,-3.750196,35.868127,-13.244502,-92.765777,0.826419,-9.357693,3.55802,11.498285,25.877674,1015.903627


In [None]:
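# NOTE: disabled cell, kept for reference only. This variant of the loading cell
# interpolates each file onto a dim_interpolation x dim_interpolation grid while loading.
# # load only relevant portion of netCDF files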
# lat_hawaii = [15, 17.5, 20, 22.5, 25]
# lon_hawaii = np.array([-162.5, -160, -157.5, -155, -152.5]) + 360
# dim_interpolation = 100

# datasets = []
# for file_name, label, attribute in zip(FILE_NAMES, LABELS, ATTRIBUTES):
#     print(file_name, label, attribute)
#     # Linear Interpolation
#     ds = xr.open_dataset(f"{BASE_DIR}/{file_name}").loc[dict(lat=lat_hawaii, lon=lon_hawaii)].interp(
#             lat=np.linspace(
#                 lat_hawaii[0],
#                 lat_hawaii[-1],
#                 dim_interpolation
#             ),
#             lon=np.linspace(
#                 lon_hawaii[0],
#                 lon_hawaii[-1],
#                 dim_interpolation
#             ),
#             method="linear"
#         ).rename({f"{attribute}": f"{label}"})
#     if "level" in ds.dims:
#         ds = ds.squeeze("level").drop("level")
#     datasets.append({
#         "file": file_name,
#         "dataset": ds,
#         "label": label,
#         "attribute": attribute
#     })

In [146]:
# Spot-check df_data_interp_50: pick random rows and, for each one, make sure
# the value in every label column matches (within `epsilon`) the netCDF data
# linearly interpolated onto a dim_interpolation x dim_interpolation grid and
# then sampled at the grid node nearest to the row's lat/lon.
num_data = df_data_interp_50.shape[0]
num_samples = 300
epsilon = 1e-6
# Seeded local generator: the same rows are checked on every run, and NumPy's
# global random state is left untouched.
samples = np.random.default_rng(0).integers(low=0, high=num_data, size=num_samples)

lat_hawaii = [15, 17.5, 20, 22.5, 25]
lon_hawaii = np.array([-162.5, -160, -157.5, -155, -152.5]) + 360
dim_interpolation = 50

# The target grid does not depend on the row, so build it once.
lat_interp = np.linspace(lat_hawaii[0], lat_hawaii[-1], dim_interpolation)
lon_interp = np.linspace(lon_hawaii[0], lon_hawaii[-1], dim_interpolation)
labels = [item['label'] for item in datasets]

for i, (_, row) in enumerate(df_data_interp_50.iloc[samples].iterrows()):
    year, month = int(row.year), int(row.month)
    lat, lon = row.lat, row.lon
    # Interpolating the dataset handles all variables at once, so do it once
    # per row rather than once per (row, label) pair.
    # NOTE(review): the head() output of df_data_interp_50 shows `lon` as
    # negative degrees (e.g. -155.67), while the grid uses lon_hawaii + 360.
    # With method='nearest' such a value would snap to the westernmost grid
    # column rather than to the station's longitude. Confirm that this matches
    # how the CSV was built and that it is the intended behaviour.
    interpolated_point = (
        ds_combined.sel(time=f"{year}-{month:02d}-01", lat=lat_hawaii, lon=lon_hawaii)
        .interp(lat=lat_interp, lon=lon_interp, method='linear')
        .sel(lat=lat, lon=lon, method='nearest')
    )
    for label in labels:
        reanalysis_data = interpolated_point[label].to_numpy()
        assert abs(row[label] - reanalysis_data) < epsilon, (
            f"sample {i}, column {label!r}: csv={row[label]} netcdf={reanalysis_data}"
        )
    print(f"{i+1}/{num_samples}", end='\r')

300/300

In [140]:
# Confirm that positional indexing with the `samples` array returns a DataFrame.
type(df_data_interp_50.iloc[samples])

pandas.core.frame.DataFrame

In [132]:
# Display the rows at the first ten sampled positions.
df_data_interp_50.iloc[samples[:10]]

Unnamed: 0,skn,year,month,data_in,name,lat,lon,elevation,Observer,Status2010,...,pottemp1000-850,pr_wtr,shum-uwnd-700,shum-uwnd-925,shum-vwnd-700,shum-vwnd-950,shum700,shum925,skt,slp
504789,704.0,1980,5,3.12,HONO SUBSTATION,21.305725,-157.861146,50.0,NWS,Discontinued,...,-4.396795,32.194132,-9.989649,-68.924848,1.158798,-10.092972,3.022897,10.678082,25.261308,1017.370826
766374,990.3,1954,12,23.1,OLOKELE,22.033525,-159.552203,1310.0,HAWAIIAN SUGAR,Discontinued,...,-4.599219,28.257926,-13.305789,-79.057687,-7.513889,-30.972087,2.793265,9.570959,24.952726,1017.080367
128524,94.2,1955,8,11.12,KAUMANA (500),19.700573,-155.112231,500.0,MATSUNAGA J,Discontinued,...,-3.17518,34.309458,-10.239203,-79.252535,-5.061135,-14.12099,2.929264,11.989245,26.282629,1016.474586
134737,99.0,1994,2,0.328013,PTA West,19.771667,-155.702222,4290.0,RAWS,Current,...,-4.354822,32.741307,1.17404,-43.975875,8.156027,15.323181,3.551938,10.046877,24.513761,1014.235489
770820,996.0,2006,7,1.7,KAMOOLOA,21.96853,-159.474981,720.0,MCBRYDE SUGAR,Discontinued,...,-4.957861,33.472682,-20.220208,-103.217745,-1.958574,-16.516528,2.724101,11.483122,26.243814,1017.790945
349537,412.0,1952,6,0.0,GAGE 24,20.835129,-156.366628,850.0,LIBBY,Discontinued,...,-3.968268,34.814833,-17.064007,-103.27748,1.455855,-9.164549,2.467306,12.112,25.882569,1017.99079
447074,542.1,1968,8,2.53,PUUOHOKU RANCH,21.14624,-156.732462,710.0,PUUOHOKU RANCH,Current,...,-3.150795,38.011363,-14.633166,-96.041248,-1.139328,0.444127,2.812448,12.532898,27.872861,1015.749699
616960,804.2,1948,8,1.397158,YAMAGUCHI,21.435168,-158.163918,40.0,WAIANAE CO,Discontinued,...,-4.189324,40.156239,-8.017417,-77.906903,-2.336726,-17.688308,4.454122,12.116571,27.453036,1015.083029
341155,402.0,1971,10,0.02,"VIL 10 (Field 510, Camp 10)",20.85846,-156.403849,355.0,HC&S,Discontinued,...,-3.254418,32.143299,-7.727875,-73.136879,0.259766,6.678868,1.98604,11.615041,26.632915,1016.131701
49562,52.0,1994,3,6.39,HALEMAUMAU,19.401978,-155.280555,3650.0,HAW VOL NAT PARK,Current,...,-3.763002,30.724317,-5.162977,-81.038588,-1.120151,-18.655766,2.467959,9.968898,24.676621,1017.015625
