In [2]:
# basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.cluster import KMeans

#others
from xgboost import XGBRegressor
import cartopy.crs as ccrs
import cartopy.mpl.ticker as cticker
import time
import xarray as xr

# Variables from config file
from config import BASE_DIR, FILE_NAMES, LABELS, ATTRIBUTES


In [3]:
# Read the three pre-built station/reanalysis tables from BASE_DIR in one pass:
# the single-grid table and the two interpolated variants (50 and 100).
df_data_1grid, df_data_interp_50, df_data_interp_100 = map(
    pd.read_csv,
    (
        f"{BASE_DIR}/dataset.csv",
        f"{BASE_DIR}/dataset_interp_50.csv",
        f"{BASE_DIR}/dataset_interp_100.csv",
    ),
)

In [3]:
# Preview the first five rows of the single-grid table.
df_data_1grid.head(n=5)

Unnamed: 0,skn,year,month,data_in,Lat_DD,Lon_DD,Lon_DD_updated,lat,lon,air,...,pottmp.1,pr_wtr,shum,shum.1,shum.2,shum.3,shum.4,shum.5,skt,slp
0,1.0,1948,1,3.2,18.916176,-155.674994,204.325006,20.0,205.0,295.39603,...,-3.926514,29.034512,2.592493,-25.859348,0.589191,7.106412,2.945999,9.869999,23.385218,1014.0849
1,2.0,1948,1,5.95,19.10866,-155.825545,204.174455,20.0,205.0,295.39603,...,-3.926514,29.034512,2.592493,-25.859348,0.589191,7.106412,2.945999,9.869999,23.385218,1014.0849
2,2.2,1948,1,11.5,19.16474,-155.68228,204.31772,20.0,205.0,295.39603,...,-3.926514,29.034512,2.592493,-25.859348,0.589191,7.106412,2.945999,9.869999,23.385218,1014.0849
3,2.25,1948,1,5.515941,19.160603,-155.822488,204.177512,20.0,205.0,295.39603,...,-3.926514,29.034512,2.592493,-25.859348,0.589191,7.106412,2.945999,9.869999,23.385218,1014.0849
4,2.26,1948,1,4.310617,19.225323,-155.778876,204.221124,20.0,205.0,295.39603,...,-3.926514,29.034512,2.592493,-25.859348,0.589191,7.106412,2.945999,9.869999,23.385218,1014.0849


In [7]:
# Load only the relevant (Hawaii-region) portion of each netCDF file.
# FILE_NAMES, LABELS and ATTRIBUTES (from config) are walked in lockstep: the
# variable called `attribute` in each file is renamed to its unique `label`,
# so that files sharing an attribute name (e.g. several "shum" files) can be
# merged into one dataset without clashing.
lat_hawaii = [15, 17.5, 20, 22.5, 25]
# Longitudes are shifted by +360 before being matched against the files' lon
# coordinate (the tables loaded from CSV show values such as lon=205.0).
lon_hawaii = np.array([-162.5, -160, -157.5, -155, -152.5]) + 360
dim_interpolation = 100

datasets = []
for file_name, label, attribute in zip(FILE_NAMES, LABELS, ATTRIBUTES):
    print(file_name, label, attribute)
    # Open the file in a context manager so its handle is released, and
    # .load() the small selected subset into memory before the file closes.
    # No interpolation happens here; this is a plain coordinate selection.
    with xr.open_dataset(f"{BASE_DIR}/{file_name}") as source:
        ds = (
            source
            .sel(lat=lat_hawaii, lon=lon_hawaii)
            .rename({f"{attribute}": f"{label}"})
            .load()
        )
    if "level" in ds.dims:
        # Collapse the length-1 pressure-level dimension and remove its
        # leftover scalar coordinate (drop_vars replaces the deprecated
        # Dataset.drop).
        ds = ds.squeeze("level").drop_vars("level")
    datasets.append({
        "file": file_name,
        "dataset": ds,
        "label": label,
        "attribute": attribute
    })
# combine them into one dataset, one variable per label
ds_combined = xr.merge([item["dataset"] for item in datasets])

air.2m.mon.mean.regridded.nc air2m air
air.1000-500.mon.mean.nc air1000_500 air
hgt500.mon.mean.nc hgt500 hgt
hgt1000.mon.mean.nc hgt1000 hgt
omega500.mon.mean.nc omega500 omega
pottmp.1000-500.mon.mean.nc pottemp1000-500 pottmp
pottmp.1000-850.mon.mean.nc pottemp1000-850 pottmp
pwtr.mon.mean.nc pr_wtr pr_wtr
shum_x_uwnd.700.mon.mean.nc shum-uwnd-700 shum
shum_x_uwnd.925.mon.mean.nc shum-uwnd-925 shum
shum_x_vwnd.700.mon.mean.nc shum-vwnd-700 shum
shum_x_vwnd.925.mon.mean.nc shum-vwnd-950 shum
shum700.mon.mean.nc shum700 shum
shum925.mon.mean.nc shum925 shum
skt.mon.mean.regridded.nc skt skt
slp.mon.mean.nc slp slp


In [111]:
# Check df_data_1grid

In [80]:
# Rename the reanalysis columns of df_data_1grid to the dataset labels.
# The reanalysis values are the trailing columns of the table (one per entry
# in `datasets`); every column before them is station metadata whose name is
# kept. The number of metadata columns is derived from the table itself
# instead of being hard-coded, so an extra leading column (e.g. a saved index)
# does not cause a length mismatch.
reanalysis_labels = [item['label'] for item in datasets]
num_metadata_columns = len(df_data_1grid.columns) - len(reanalysis_labels)
if num_metadata_columns < 0:
    raise ValueError(
        f"df_data_1grid has {len(df_data_1grid.columns)} columns, "
        f"fewer than the {len(reanalysis_labels)} reanalysis labels"
    )
new_columns = list(df_data_1grid.columns[:num_metadata_columns])
new_columns.extend(reanalysis_labels)
df_data_1grid.columns = new_columns

In [110]:
# Spot-check df_data_1grid: pick random rows and, for each row, make sure the
# value stored for every label in `datasets` matches what is in the netCDF
# files at the row's (year, month, lat, lon).
num_data = df_data_1grid.shape[0]
num_samples = 300
epsilon = 1e-3
# Seeded generator: the same rows are checked on every run, so a failure can
# be reproduced after Restart & Run All.
rng = np.random.default_rng(0)
samples = rng.integers(low=0, high=num_data, size=num_samples)
for i in samples:
    row = df_data_1grid.iloc[i]
    year = int(row.year)
    month = int(row.month)
    lat = row.lat
    lon = row.lon
    # Select the grid cell once per row; each label is then read from this
    # selection instead of re-indexing ds_combined for every variable.
    grid_cell = ds_combined.loc[dict(time=f"{year}-{month}-01", lat=lat, lon=lon)]
    for item in datasets:
        label = item['label']
        reanalysis_data = grid_cell[label].to_numpy()
        assert abs(reanalysis_data - row[label]) < epsilon, (
            f"{label} mismatch at row {i} "
            f"(time={year}-{month}-01, lat={lat}, lon={lon}): "
            f"csv={row[label]}, netcdf={reanalysis_data}"
        )

In [112]:
# check df_data_interp_50 / df_data_interp_100

In [154]:
# Spot-check an interpolated table: for random rows, rebuild the linearly
# interpolated dim_interpolation x dim_interpolation grid from ds_combined,
# take the grid point nearest to the row's (lat, lon), and make sure every
# label's value matches the one stored in the table.
df, dim_interpolation = df_data_interp_50, 50

num_data = df.shape[0]
num_samples = 10
epsilon = 1e-6
# Seeded generator so the same rows are checked on every run.
rng = np.random.default_rng(0)
samples = rng.integers(low=0, high=num_data, size=num_samples)

# Re-declared here so this cell does not depend on whichever other cell last
# assigned lat_hawaii / lon_hawaii.
lat_hawaii = [15, 17.5, 20, 22.5, 25]
lon_hawaii = np.array([-162.5, -160, -157.5, -155, -152.5]) + 360

# The target grid is the same for every row, so build it once.
lat_grid = np.linspace(lat_hawaii[0], lat_hawaii[-1], dim_interpolation)
lon_grid = np.linspace(lon_hawaii[0], lon_hawaii[-1], dim_interpolation)

for i, (_, row) in enumerate(df.iloc[samples].iterrows()):
    year, month = int(row.year), int(row.month)
    lat, lon = row.lat, row.lon
    # Interpolate once per row (all variables together) and pick the nearest
    # grid point; each label is then read from this single result rather than
    # repeating the whole interpolation for every label.
    interpolated_point = (
        ds_combined.loc[dict(time=f"{year}-{month}-01", lat=lat_hawaii, lon=lon_hawaii)]
        .interp(lat=lat_grid, lon=lon_grid, method='linear')
        .sel(lat=lat, lon=lon, method='nearest')
    )
    for item in datasets:
        label = item['label']
        reanalysis_data = interpolated_point[label].to_numpy()
        assert abs(row[label] - reanalysis_data) < epsilon, (
            f"{label} mismatch for sample {i} "
            f"(time={year}-{month}-01, lat={lat}, lon={lon}): "
            f"csv={row[label]}, interpolated={reanalysis_data}"
        )
    print(f"{i+1}/{num_samples}", end='\r')

10/10

# Dataset with 5 grids

In [5]:
# Load the multi-grid table from BASE_DIR.
# NOTE(review): the file name is spelled "5girds" - confirm it matches the
# file on disk before "correcting" it.
df_data_5grid = pd.read_csv(
    filepath_or_buffer=f"{BASE_DIR}/dataset_5girds.csv",
)

In [9]:
# Smaller coordinate set: 2 latitudes x 3 longitudes (longitudes shifted by +360).
# NOTE: this rebinds the notebook-wide lat_hawaii / lon_hawaii names, so cells
# run afterwards see this reduced grid unless they assign their own.
lat_hawaii = [20, 22.5]
lon_hawaii = 360 + np.array([-160, -157.5, -155])
# Preview the first five rows of the multi-grid table.
df_data_5grid.head(n=5)

Unnamed: 0,skn,year,month,data_in,name,lat,lon,elevation,Observer,Status2010,...,skt_2,skt_3,skt_4,skt_5,slp_0,slp_1,slp_2,slp_3,slp_4,slp_5
0,1.0,1948,1,3.2,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,...,23.385218,23.6341,23.194658,23.052715,1014.1755,1013.94574,1014.0849,1015.1941,1014.69965,1014.4063
1,2.0,1948,1,5.95,MANUKA,19.10866,-155.825545,1750.0,STATE DIV OF FTRY,Current,...,23.385218,23.6341,23.194658,23.052715,1014.1755,1013.94574,1014.0849,1015.1941,1014.69965,1014.4063
2,2.2,1948,1,11.5,KAHUKU SHED 3,19.16474,-155.68228,4890.0,KAHUKU RANCH,Discontinued,...,23.385218,23.6341,23.194658,23.052715,1014.1755,1013.94574,1014.0849,1015.1941,1014.69965,1014.4063
3,2.25,1948,1,5.515941,RESERVOIR (2940),19.160603,-155.822488,2940.0,PUUWAAWAA RANCH,Discontinued,...,23.385218,23.6341,23.194658,23.052715,1014.1755,1013.94574,1014.0849,1015.1941,1014.69965,1014.4063
4,2.26,1948,1,4.310617,CASTLE,19.225323,-155.778876,5680.0,PUUWAAWAA RANCH,Discontinued,...,23.385218,23.6341,23.194658,23.052715,1014.1755,1013.94574,1014.0849,1015.1941,1014.69965,1014.4063
