In [1]:
# basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.cluster import KMeans

#others
from xgboost import XGBRegressor
import cartopy.crs as ccrs
import cartopy.mpl.ticker as cticker
import time
import xarray as xr
from multiprocessing import pool
import multiprocessing as mp
import dask.dataframe as dd
from dask.multiprocessing import get

# Variables from config file
from config import BASE_DIR, FILE_NAMES, LABELS, ATTRIBUTES


In [2]:
# Parse the workbook once and take both sheets from the returned dict, instead of
# opening and parsing the same .xlsx file twice.
# - "Data_in": wide table with SKN and Year columns followed by the value columns.
# - "Header":  per-station metadata, joined to the observations on SKN later on.
workbook_sheets = pd.read_excel(
    f"{BASE_DIR}/FilledDataset2012.xlsx",
    sheet_name=["Data_in", "Header"],
)
df_data = workbook_sheets["Data_in"]
df_metadata = workbook_sheets["Header"]

In [3]:
# Reshape the wide table (one row per station-year, values from the third column
# onward) into one row per (skn, year, month) observation.
# The frame is built from whole columns rather than df.iterrows(): iterrows()
# coerces every row to a single dtype, so integer SKN/Year values can come back
# as floats, and it is slow on large frames.

# No need to keep data older than 1948 because no data exists in netCDF files.
# Written as "not (Year < 1948)" so rows with a missing Year pass this filter and
# are removed by dropna() below.
df_recent = df_data.loc[~(df_data["Year"] < 1948)]

# Everything from the third column onward is treated as the month values, in
# column order; month numbers are assigned by position, starting at 1.
monthly_values = df_recent.iloc[:, 2:]
n_months = monthly_values.shape[1]

df_data_by_cell = pd.DataFrame({
    # Repeat each station-year key once per month column ...
    "skn": np.repeat(df_recent["SKN"].to_numpy(), n_months),
    "year": np.repeat(df_recent["Year"].to_numpy(), n_months),
    # ... cycle the month numbers for every station-year row ...
    "month": np.tile(np.arange(1, n_months + 1, dtype="int64"), len(df_recent)),
    # ... and flatten the values row by row so they line up with the keys above.
    "data_in": monthly_values.to_numpy().ravel(),
})

# Treat empty / whitespace-only cells as missing, then drop every incomplete row.
df_data_by_cell = df_data_by_cell.replace(r'^\s*$', np.nan, regex=True).dropna()

In [4]:
# Attach the station metadata to every observation by joining on the station key,
# drop the duplicated key column, and switch the metadata columns to the
# lower-case names used in the rest of the notebook.
metadata_column_names = {
    "Lat_DD": "lat",
    "Lon_DD": "lon",
    "ElevFT": "elevation",
    "Name": "name",
}
df_data_w_coord = pd.merge(df_data_by_cell, df_metadata, left_on="skn", right_on="SKN")
df_data_w_coord = df_data_w_coord.drop(columns=["SKN"]).rename(columns=metadata_column_names)

In [5]:
# Load only the relevant portion of each netCDF file: the grid points listed below.
lat_hawaii = [20, 22.5]
# Western longitudes shifted by +360 (presumably the files use a 0-360 degrees-east
# convention -- confirm against the netCDF coordinates).
lon_hawaii = np.array([-160, -157.5, -155]) + 360

# One entry per configured file: the source file name, the selected Dataset, the
# label its variable is renamed to, and the variable's original name in the file.
datasets = []
for file_name, label, attribute in zip(FILE_NAMES, LABELS, ATTRIBUTES):
    print(file_name, label, attribute)
    # Exact label-based selection of the listed lat/lon values (no interpolation
    # is performed here). The file's variable is then renamed to its label, which
    # gives variables that share a name across files (e.g. "air") distinct names.
    ds = (
        xr.open_dataset(f"{BASE_DIR}/{file_name}")
        .loc[dict(lat=lat_hawaii, lon=lon_hawaii)]
        .rename({attribute: label})
    )
    if "level" in ds.dims:
        # Remove the "level" dimension (squeeze() requires it to have length 1) and
        # its leftover coordinate. drop_vars() is the supported replacement for
        # Dataset.drop(), which is deprecated for dropping variables.
        ds = ds.squeeze("level").drop_vars("level")
    datasets.append({
        "file": file_name,
        "dataset": ds,
        "label": label,
        "attribute": attribute
    })

air.2m.mon.mean.regridded.nc air2m air
air.1000-500.mon.mean.nc air1000_500 air
hgt500.mon.mean.nc hgt500 hgt
hgt1000.mon.mean.nc hgt1000 hgt
omega500.mon.mean.nc omega500 omega
pottmp.1000-500.mon.mean.nc pottemp1000-500 pottmp
pottmp.1000-850.mon.mean.nc pottemp1000-850 pottmp
pwtr.mon.mean.nc pr_wtr pr_wtr
shum_x_uwnd.700.mon.mean.nc shum-uwnd-700 shum
shum_x_uwnd.925.mon.mean.nc shum-uwnd-925 shum
shum_x_vwnd.700.mon.mean.nc shum-vwnd-700 shum
shum_x_vwnd.925.mon.mean.nc shum-vwnd-950 shum
shum700.mon.mean.nc shum700 shum
shum925.mon.mean.nc shum925 shum
skt.mon.mean.regridded.nc skt skt
slp.mon.mean.nc slp slp


In [6]:
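# the reanalysis data will solely depend on (year, month)
# (Lon, Lat) = (-160, 22.5), (-157.5, 20), (-157.5, 22.5), (-155, 20), (-155, 22.5)
# NOTE: the lat/lon selection used when loading covers 2 latitudes x 3 longitudes = 6 grid points, so (-160, 20) is loaded as well.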

In [7]:
#datasets[0]["dataset"].loc[dict(time="1948-04-01")]

In [8]:
# Combine the per-file selections into a single Dataset that holds every labelled variable.
per_file_datasets = [entry["dataset"] for entry in datasets]
ds_combined = xr.merge(per_file_datasets)

In [9]:
# Range of years covered by the station observations.
observed_years = df_data_by_cell["year"]
min_year, max_year = observed_years.min(), observed_years.max()

# Cross join: one row for every (year, month) pair, years min_year..max_year
# inclusive and months 1..12.
df_year = pd.DataFrame({"year": range(min_year, max_year + 1)})
df_month = pd.DataFrame({"month": range(1, 13)})
df_year_month = pd.merge(df_year, df_month, how="cross")
df_year_month

Unnamed: 0,year,month
0,1948,1
1,1948,2
2,1948,3
3,1948,4
4,1948,5
...,...,...
775,2012,8
776,2012,9
777,2012,10
778,2012,11


In [10]:
# For every (year, month) row, select the time step labelled with the first day
# of that month from the merged reanalysis data and flatten it into a 2-D array:
# one row per variable, one column per selected grid point.
# The shape is derived from ds_combined instead of a hard-coded (16, 6), so this
# cell keeps working when files are added to or removed from the config, or when
# the lat/lon selection changes.
n_variables = len(ds_combined.data_vars)

df_year_month["matrix"] = df_year_month.apply(
    lambda row: (
        ds_combined.loc[dict(time=f"{row.year}-{row.month}-01")]
        .to_array()
        .to_numpy()
        .reshape(n_variables, -1)
    ),
    axis=1
)

In [11]:
# Preview the first rows; each `matrix` entry is the per-month array built in the previous cell.
df_year_month.head()

Unnamed: 0,year,month,matrix
0,1948,1,"[[295.72696, 295.30087, 295.39603, 295.29828, ..."
1,1948,2,"[[295.32834, 294.9482, 294.80408, 294.7269, 29..."
2,1948,3,"[[295.80078, 295.27454, 294.9782, 295.36456, 2..."
3,1948,4,"[[296.61438, 296.05194, 295.79654, 296.1045, 2..."
4,1948,5,"[[297.48294, 296.84482, 296.50006, 297.02432, ..."


In [12]:
# Split the per-month matrix into one column per reanalysis variable: the matrix
# row at index `position` is stored under the label of datasets[position].
for position, entry in enumerate(datasets):
    variable_label = entry['label']
    df_year_month[variable_label] = df_year_month.apply(
        lambda record: record['matrix'][position],
        axis=1,
    )

In [13]:
# The matrix has already been split into per-variable columns, so leave it out of the monthly table.
df_reanalysis_by_month = df_year_month.drop(columns=['matrix'])

In [14]:
# Give every station observation the reanalysis values of its (year, month).
df_complete = df_data_w_coord.merge(df_reanalysis_by_month, on=["year", "month"])

In [15]:
# Expand each variable's per-grid-point array into scalar columns named
# "<label>_<k>", one column per selected grid point.
#
# Changes from the previous version of this cell:
# - the inner loop no longer reuses (and shadows) the outer loop variable;
# - the number of grid points comes from the lat/lon selection (2 x 3 = 6)
#   instead of a bare 6;
# - values are extracted with Series.map on the one column that is needed, rather
#   than DataFrame.apply(axis=1), which builds a full row object for every row of
#   the merged frame on each of the label x grid-point passes.
n_grid_points = len(lat_hawaii) * len(lon_hawaii)

for entry in datasets:
    label = entry['label']
    print(label)
    for point in range(n_grid_points):
        df_complete[f"{label}_{point}"] = df_complete[label].map(
            # Bind `point` explicitly so the lambda does not depend on the loop variable.
            lambda values, point=point: values[point]
        )

air2m
air1000_500
hgt500
hgt1000
omega500
pottemp1000-500
pottemp1000-850
pr_wtr
shum-uwnd-700
shum-uwnd-925
shum-vwnd-700
shum-vwnd-950
shum700
shum925
skt
slp


In [16]:
# Persist the assembled table as CSV, without writing the DataFrame index.
# The file name is kept exactly as it was so anything reading this path keeps working.
output_csv_path = f"{BASE_DIR}/dataset_5girds.csv"
df_complete.to_csv(output_csv_path, index=False)

In [17]:
# Emits a single blank line in the cell output.
print()




In [18]:
# Completion marker: shows that a full run reached the last cell of the notebook.
print("DONE!!")

DONE!!
