In [8]:
# basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.cluster import KMeans

#others
from xgboost import XGBRegressor
import cartopy.crs as ccrs
import cartopy.mpl.ticker as cticker
import time
import xarray as xr
from multiprocessing import pool
import multiprocessing as mp

# Variables from config file
from config import BASE_DIR, FILE_NAMES, LABELS, ATTRIBUTES


In [9]:
df_data = pd.read_excel(f"{BASE_DIR}/FilledDataset2012.xlsx", sheet_name="Data_in")
df_metadata = pd.read_excel(f"{BASE_DIR}/FilledDataset2012.xlsx", sheet_name="Header")

## Conevrt df_data into cell-based matrix

In [10]:
X = []
for index, row in df_data.iterrows():
    if row.Year < 1948:
        # No need to keep data older than 1948 becase no data exists in netCDF files
        continue
    for i, cell in enumerate(row[2:]):
        X.append([row.SKN, row.Year, i + 1, cell])

In [11]:
df_data_by_cell = pd.DataFrame(X, columns = ["skn", "year", "month", "data_in"]).dropna()
df_data_by_cell = df_data_by_cell.replace(r'^\s*$', np.nan, regex=True).dropna()

In [12]:
df_data_by_cell.head(3)

Unnamed: 0,skn,year,month,data_in
0,1.0,1948,1,3.2
1,1.0,1948,2,1.33
2,1.0,1948,3,2.31


In [13]:
df_data_w_coord = (
    df_data_by_cell
    .merge(right=df_metadata, left_on="skn", right_on="SKN")
    .drop(columns="SKN", axis=1)
    .rename(columns={"Lat_DD": "lat", "Lon_DD": "lon", "ElevFT": "elevation", "Name": "name"})
)

In [14]:
df_data_w_coord.head()

Unnamed: 0,skn,year,month,data_in,name,lat,lon,elevation,Observer,Status2010,NumMos,MinYear,MaxYear
0,1.0,1948,1,3.2,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,277,1924,1949
1,1.0,1948,2,1.33,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,277,1924,1949
2,1.0,1948,3,2.31,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,277,1924,1949
3,1.0,1948,4,3.23,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,277,1924,1949
4,1.0,1948,5,1.14,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,277,1924,1949


## Load the cdf files

In [96]:
# load only relevant portion of netCDF files
lat_hawaii = [15, 17.5, 20, 22.5, 25]
lon_hawaii = np.array([-162.5, -160, -157.5, -155, -152.5]) + 360
dim_interpolation = 100

datasets = []
for file_name, label, attribute in zip(FILE_NAMES, LABELS, ATTRIBUTES):
    print(file_name, label, attribute)
    # Linear Interpolation
    ds = xr.open_dataset(f"{BASE_DIR}/{file_name}").loc[dict(lat=lat_hawaii, lon=lon_hawaii)].interp(
            lat=np.linspace(
                lat_hawaii[0],
                lat_hawaii[-1],
                dim_interpolation
            ),
            lon=np.linspace(
                lon_hawaii[0],
                lon_hawaii[-1],
                dim_interpolation
            ),
            method="linear"
        ).rename({f"{attribute}": f"{label}"})
    if "level" in ds.dims:
        ds = ds.squeeze("level").drop("level")
    datasets.append({
        "file": file_name,
        "dataset": ds,
        "label": label,
        "attribute": attribute
    })

air.2m.mon.mean.regridded.nc air2m air
air.1000-500.mon.mean.nc air1000_500 air
hgt500.mon.mean.nc hgt500 hgt
hgt1000.mon.mean.nc hgt1000 hgt
omega500.mon.mean.nc omega500 omega
pottmp.1000-500.mon.mean.nc pottemp1000-500 pottmp
pottmp.1000-850.mon.mean.nc pottemp1000-850 pottmp
pwtr.mon.mean.nc pr_wtr pr_wtr
shum_x_uwnd.700.mon.mean.nc shum-uwnd-700 shum
shum_x_uwnd.925.mon.mean.nc shum-uwnd-925 shum
shum_x_vwnd.700.mon.mean.nc shum-vwnd-700 shum
shum_x_vwnd.925.mon.mean.nc shum-vwnd-950 shum
shum700.mon.mean.nc shum700 shum
shum925.mon.mean.nc shum925 shum
skt.mon.mean.regridded.nc skt skt
slp.mon.mean.nc slp slp


In [97]:
ds_combined = xr.merge([datasets[i]["dataset"] for i in range(len(datasets))])

In [125]:
start = time.time()
df_data_w_coord["values"] = df_data_w_coord[:10000].apply(lambda x: ds_combined.sel(time=f"{x.year}-{x.month}-01", lat=x.lat, lon=x.lon, method="nearest").to_array().to_numpy(), axis=1)
end = time.time()

elapsed = end - start
print("{:.3f}".format(elapsed))
df_data_w_coord.shape[0]/10000 /60 / 60

31.714


0.024043361111111113

In [195]:
df_new = pd.DataFrame(np.stack(temp[:1000]), columns=[item["label"] for item in datasets])

In [197]:
df_complete = df_data_w_coord.merge(right=df_new, left_index=True, right_index=True)

In [201]:
df_complete.iloc[:, -16:]

Unnamed: 0,air2m,air1000_500,hgt500,hgt1000,omega500,pottemp1000-500,pottemp1000-850,pr_wtr,shum-uwnd-700,shum-uwnd-925,shum-vwnd-700,shum-vwnd-950,shum700,shum925,skt,slp
0,296.362251,30.593334,5806.633404,122.751710,0.042301,-27.330571,-3.976694,28.678739,-7.071470,-60.664267,-3.335292,-27.741849,2.318151,10.345333,24.909237,1014.198085
1,295.698619,30.687574,5804.879838,131.451407,0.061452,-27.079310,-4.568755,24.988922,0.450107,-38.805761,-3.496950,-13.066533,2.172999,8.830637,23.457119,1015.299784
2,296.247879,32.318486,5804.061553,135.965785,0.000998,-25.239094,-4.402203,31.862511,-4.668788,-71.785439,0.292838,-8.205906,3.347150,9.370667,23.775679,1015.848733
3,297.114453,30.351517,5833.912169,128.456565,0.021241,-27.828150,-4.269346,33.526352,3.466445,-41.395023,-1.941327,-4.107908,3.605182,10.080273,24.751422,1014.841114
4,297.999804,29.935155,5860.363696,137.629523,0.021391,-28.495221,-3.757156,35.827885,-13.206782,-92.122989,0.893711,-9.247211,3.550030,11.478545,25.866196,1015.936653
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,296.906681,28.491009,5862.321329,130.482239,0.045089,-30.014192,-4.576253,26.923517,-1.560929,-56.440910,-1.583419,-12.138320,1.818282,10.123859,24.904489,1014.810056
996,296.844133,30.173438,5828.628092,119.377709,0.038028,-27.978345,-4.406501,27.473520,14.866678,2.650107,-2.327225,-2.350704,2.063757,9.439475,24.927414,1013.573476
997,297.546112,31.795552,5851.889382,144.702178,0.019684,-26.114675,-4.306964,30.490829,-2.121278,-63.731989,-0.154896,-15.897088,2.440878,9.826243,25.398733,1016.515991
998,297.746299,31.571815,5831.390970,132.311443,-0.003214,-26.448509,-3.671636,35.186785,-16.463036,-104.415512,4.470596,3.396484,3.763373,11.349424,25.830715,1015.201865


In [220]:
sample = 104
lat = df_complete.iloc[sample]['lat']
lon = df_complete.iloc[sample]['lon']
year = df_complete.iloc[sample]['year']
month = df_complete.iloc[sample]['month']

for item in datasets:
    value_from_xarray = float(item["dataset"].sel(time=f"{year}-{month}-01", lat=lat, lon=lon, method="nearest")[item["label"]])
    value_in_df = df_complete.iloc[sample][item["label"]]
    assert abs(value_from_xarray - value_in_df) < 1e-6
    
    