In [5]:
# basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.cluster import KMeans

#others
from xgboost import XGBRegressor
import cartopy.crs as ccrs
import cartopy.mpl.ticker as cticker
import time
import xarray as xr
from multiprocessing import pool
import multiprocessing as mp
import dask.dataframe as dd
from dask.multiprocessing import get

# Variables from config file
from config import BASE_DIR, FILE_NAMES, LABELS, ATTRIBUTES


In [6]:
df_data = pd.read_excel(f"{BASE_DIR}/FilledDataset2012.xlsx", sheet_name="Data_in")
df_metadata = pd.read_excel(f"{BASE_DIR}/FilledDataset2012.xlsx", sheet_name="Header")

## Conevrt df_data into cell-based matrix

In [7]:
X = []
for index, row in df_data.iterrows():
    if row.Year < 1948:
        # No need to keep data older than 1948 becase no data exists in netCDF files
        continue
    for i, cell in enumerate(row[2:]):
        X.append([row.SKN, row.Year, i + 1, cell])

In [8]:
df_data_by_cell = pd.DataFrame(X, columns = ["skn", "year", "month", "data_in"]).dropna()
df_data_by_cell = df_data_by_cell.replace(r'^\s*$', np.nan, regex=True).dropna()

In [9]:
df_data_by_cell.head(3)

Unnamed: 0,skn,year,month,data_in
0,1.0,1948,1,3.2
1,1.0,1948,2,1.33
2,1.0,1948,3,2.31


In [10]:
df_data_w_coord = (
    df_data_by_cell
    .merge(right=df_metadata, left_on="skn", right_on="SKN")
    .drop(columns="SKN", axis=1)
    .rename(columns={"Lat_DD": "lat", "Lon_DD": "lon", "ElevFT": "elevation", "Name": "name"})
)

In [11]:
df_data_w_coord.head()

Unnamed: 0,skn,year,month,data_in,name,lat,lon,elevation,Observer,Status2010,NumMos,MinYear,MaxYear
0,1.0,1948,1,3.2,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,277,1924,1949
1,1.0,1948,2,1.33,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,277,1924,1949
2,1.0,1948,3,2.31,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,277,1924,1949
3,1.0,1948,4,3.23,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,277,1924,1949
4,1.0,1948,5,1.14,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,277,1924,1949


## Load the cdf files

In [12]:
# load only relevant portion of netCDF files
lat_hawaii = [15, 17.5, 20, 22.5, 25]
lon_hawaii = np.array([-162.5, -160, -157.5, -155, -152.5]) + 360
dim_interpolation = 100

datasets = []
for file_name, label, attribute in zip(FILE_NAMES, LABELS, ATTRIBUTES):
    print(file_name, label, attribute)
    # Linear Interpolation
    ds = xr.open_dataset(f"{BASE_DIR}/{file_name}").loc[dict(lat=lat_hawaii, lon=lon_hawaii)].interp(
            lat=np.linspace(
                lat_hawaii[0],
                lat_hawaii[-1],
                dim_interpolation
            ),
            lon=np.linspace(
                lon_hawaii[0],
                lon_hawaii[-1],
                dim_interpolation
            ),
            method="linear"
        ).rename({f"{attribute}": f"{label}"})
    if "level" in ds.dims:
        ds = ds.squeeze("level").drop("level")
    datasets.append({
        "file": file_name,
        "dataset": ds,
        "label": label,
        "attribute": attribute
    })

air.2m.mon.mean.regridded.nc air2m air
air.1000-500.mon.mean.nc air1000_500 air
hgt500.mon.mean.nc hgt500 hgt
hgt1000.mon.mean.nc hgt1000 hgt
omega500.mon.mean.nc omega500 omega
pottmp.1000-500.mon.mean.nc pottemp1000-500 pottmp
pottmp.1000-850.mon.mean.nc pottemp1000-850 pottmp
pwtr.mon.mean.nc pr_wtr pr_wtr
shum_x_uwnd.700.mon.mean.nc shum-uwnd-700 shum
shum_x_uwnd.925.mon.mean.nc shum-uwnd-925 shum
shum_x_vwnd.700.mon.mean.nc shum-vwnd-700 shum
shum_x_vwnd.925.mon.mean.nc shum-vwnd-950 shum
shum700.mon.mean.nc shum700 shum
shum925.mon.mean.nc shum925 shum
skt.mon.mean.regridded.nc skt skt
slp.mon.mean.nc slp slp


In [13]:
ds_combined = xr.merge([datasets[i]["dataset"] for i in range(len(datasets))])

In [10]:
start = time.time()
df_data_w_coord["values"] = df_data_w_coord[:].apply(lambda x: ds_combined.sel(time=f"{x.year}-{x.month}-01", lat=x.lat, lon=x.lon, method="nearest").to_array().to_numpy(), axis=1)
end = time.time()

elapsed = end - start
print("{:.3f}".format(elapsed))

2698.391


In [22]:
df_columns = pd.DataFrame(np.stack(df_data_w_coord["values"].to_numpy()), columns=[item["label"] for item in datasets])
df_complete = df_data_w_coord.merge(right=df_columns, left_index=True, right_index=True)

In [25]:
df_complete_cleaned = df_complete.drop(["values"], axis=1)

# SAVE

In [28]:
df_complete_cleaned.to_csv(f"{BASE_DIR}/dataset_interp_100.csv", index=False)

# Make sure the data contains appropriate information

In [29]:
df_complete_cleaned.iloc[:, -16:]

Unnamed: 0,air2m,air1000_500,hgt500,hgt1000,omega500,pottemp1000-500,pottemp1000-850,pr_wtr,shum-uwnd-700,shum-uwnd-925,shum-vwnd-700,shum-vwnd-950,shum700,shum925,skt,slp
0,296.362251,30.593334,5806.633404,122.751710,0.042301,-27.330571,-3.976694,28.678739,-7.071470,-60.664267,-3.335292,-27.741849,2.318151,10.345333,24.909237,1014.198085
1,295.698619,30.687574,5804.879838,131.451407,0.061452,-27.079310,-4.568755,24.988922,0.450107,-38.805761,-3.496950,-13.066533,2.172999,8.830637,23.457119,1015.299784
2,296.247879,32.318486,5804.061553,135.965785,0.000998,-25.239094,-4.402203,31.862511,-4.668788,-71.785439,0.292838,-8.205906,3.347150,9.370667,23.775679,1015.848733
3,297.114453,30.351517,5833.912169,128.456565,0.021241,-27.828150,-4.269346,33.526352,3.466445,-41.395023,-1.941327,-4.107908,3.605182,10.080273,24.751422,1014.841114
4,297.999804,29.935155,5860.363696,137.629523,0.021391,-28.495221,-3.757156,35.827885,-13.206782,-92.122989,0.893711,-9.247211,3.550030,11.478545,25.866196,1015.936653
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
865556,297.995419,31.703741,5831.560443,127.786192,-0.048245,-26.411901,-4.071232,37.199071,-10.971772,-58.268284,11.101084,34.798849,3.928131,11.046495,25.887307,1014.628779
865557,296.571468,31.182524,5828.009179,139.246010,0.021443,-26.660939,-5.140021,29.540491,-2.681146,-23.355273,-1.154544,-9.246627,2.959969,10.032223,24.957601,1016.030567
865558,296.814220,32.801624,5782.829683,97.455520,-0.029063,-24.851507,-4.379451,33.330581,30.969944,45.846457,1.263497,19.544940,3.676980,10.523040,24.344514,1011.158543
865559,296.108382,32.080295,5812.050135,147.444799,0.005625,-25.491792,-4.154294,28.700418,-8.066655,-57.231843,1.251480,1.193581,2.435232,9.481364,24.064122,1017.053687


In [31]:
sample = 104
lat = df_complete_cleaned.iloc[sample]['lat']
lon = df_complete_cleaned.iloc[sample]['lon']
year = df_complete_cleaned.iloc[sample]['year']
month = df_complete_cleaned.iloc[sample]['month']

for item in datasets:
    value_from_xarray = float(item["dataset"].sel(time=f"{year}-{month}-01", lat=lat, lon=lon, method="nearest")[item["label"]])
    value_in_df = df_complete_cleaned.iloc[sample][item["label"]]
    assert abs(value_from_xarray - value_in_df) < 1e-6
    
    

In [35]:
num_samples = df_complete_cleaned.shape[0]
np.random.randint(low=0, high=num_samples, size=100)

for sample in np.random.randint(low=0, high=num_samples, size=100):    
    lat = df_complete_cleaned.iloc[sample]['lat']
    lon = df_complete_cleaned.iloc[sample]['lon']
    year = df_complete_cleaned.iloc[sample]['year']
    month = df_complete_cleaned.iloc[sample]['month']

    for item in datasets:
        value_from_xarray = float(item["dataset"].sel(time=f"{year}-{month}-01", lat=lat, lon=lon, method="nearest")[item["label"]])
        value_in_df = df_complete_cleaned.iloc[sample][item["label"]]
        assert abs(value_from_xarray - value_in_df) < 1e-6
    

# Repeat the same but for dim_interp = 50

In [17]:
df_data = pd.read_excel(f"{BASE_DIR}/FilledDataset2012.xlsx", sheet_name="Data_in")
df_metadata = pd.read_excel(f"{BASE_DIR}/FilledDataset2012.xlsx", sheet_name="Header")

X = []
for index, row in df_data.iterrows():
    if row.Year < 1948:
        # No need to keep data older than 1948 becase no data exists in netCDF files
        continue
    for i, cell in enumerate(row[2:]):
        X.append([row.SKN, row.Year, i + 1, cell])
        
df_data_by_cell = pd.DataFrame(X, columns = ["skn", "year", "month", "data_in"]).dropna()
df_data_by_cell = df_data_by_cell.replace(r'^\s*$', np.nan, regex=True).dropna()

df_data_w_coord = (
    df_data_by_cell
    .merge(right=df_metadata, left_on="skn", right_on="SKN")
    .drop(columns="SKN", axis=1)
    .rename(columns={"Lat_DD": "lat", "Lon_DD": "lon", "ElevFT": "elevation", "Name": "name"})
)

# load only relevant portion of netCDF files
lat_hawaii = [15, 17.5, 20, 22.5, 25]
lon_hawaii = np.array([-162.5, -160, -157.5, -155, -152.5]) + 360
dim_interpolation = 50

datasets = []
for file_name, label, attribute in zip(FILE_NAMES, LABELS, ATTRIBUTES):
    print(file_name, label, attribute)
    # Linear Interpolation
    ds = xr.open_dataset(f"{BASE_DIR}/{file_name}").loc[dict(lat=lat_hawaii, lon=lon_hawaii)].interp(
            lat=np.linspace(
                lat_hawaii[0],
                lat_hawaii[-1],
                dim_interpolation
            ),
            lon=np.linspace(
                lon_hawaii[0],
                lon_hawaii[-1],
                dim_interpolation
            ),
            method="linear"
        ).rename({f"{attribute}": f"{label}"})
    if "level" in ds.dims:
        ds = ds.squeeze("level").drop("level")
    datasets.append({
        "file": file_name,
        "dataset": ds,
        "label": label,
        "attribute": attribute
    })
    
ds_combined = xr.merge([datasets[i]["dataset"] for i in range(len(datasets))])

start = time.time()
df_data_w_coord["values"] = df_data_w_coord[:].apply(lambda x: ds_combined.sel(time=f"{x.year}-{x.month}-01", lat=x.lat, lon=x.lon, method="nearest").to_array().to_numpy(), axis=1)
end = time.time()

elapsed = end - start
print("{:.3f}".format(elapsed))

df_columns = pd.DataFrame(np.stack(df_data_w_coord["values"].to_numpy()), columns=[item["label"] for item in datasets])
df_complete = df_data_w_coord.merge(right=df_columns, left_index=True, right_index=True)

air.2m.mon.mean.regridded.nc air2m air
air.1000-500.mon.mean.nc air1000_500 air
hgt500.mon.mean.nc hgt500 hgt
hgt1000.mon.mean.nc hgt1000 hgt
omega500.mon.mean.nc omega500 omega
pottmp.1000-500.mon.mean.nc pottemp1000-500 pottmp
pottmp.1000-850.mon.mean.nc pottemp1000-850 pottmp
pwtr.mon.mean.nc pr_wtr pr_wtr
shum_x_uwnd.700.mon.mean.nc shum-uwnd-700 shum
shum_x_uwnd.925.mon.mean.nc shum-uwnd-925 shum
shum_x_vwnd.700.mon.mean.nc shum-vwnd-700 shum
shum_x_vwnd.925.mon.mean.nc shum-vwnd-950 shum
shum700.mon.mean.nc shum700 shum
shum925.mon.mean.nc shum925 shum
skt.mon.mean.regridded.nc skt skt
slp.mon.mean.nc slp slp
2729.564


In [20]:
df_complete

Unnamed: 0,skn,year,month,data_in,name,lat,lon,elevation,Observer,Status2010,...,pottemp1000-850,pr_wtr,shum-uwnd-700,shum-uwnd-925,shum-vwnd-700,shum-vwnd-950,shum700,shum925,skt,slp
0,1.0,1948,1,3.2,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,...,-3.960722,28.727550,-7.064851,-61.055853,-3.270553,-27.510409,2.313204,10.367102,24.926305,1014.174220
1,1.0,1948,2,1.33,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,...,-4.556628,25.030540,0.487241,-39.028110,-3.446772,-12.947470,2.178714,8.839245,23.464826,1015.279124
2,1.0,1948,3,2.31,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,...,-4.401711,31.932421,-4.474437,-71.785581,0.405060,-7.952975,3.359346,9.383184,23.786142,1015.814580
3,1.0,1948,4,3.23,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,...,-4.265197,33.586917,3.420109,-41.628213,-2.011820,-4.123647,3.621286,10.094225,24.766062,1014.821248
4,1.0,1948,5,1.14,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,...,-3.750196,35.868127,-13.244502,-92.765777,0.826419,-9.357693,3.558020,11.498285,25.877674,1015.903627
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
865556,1147.0,1973,11,13.58,FLD 960-MOLOAA,22.174441,-159.336092,260.0,LIHUE PLANTATION,Discontinued,...,-4.069999,37.201568,-10.976623,-58.252345,11.079192,34.763361,3.927000,11.044428,25.895823,1014.619019
865557,1147.0,1973,12,8.23,FLD 960-MOLOAA,22.174441,-159.336092,260.0,LIHUE PLANTATION,Discontinued,...,-5.134090,29.555025,-2.770587,-23.641632,-1.141120,-9.206854,2.958999,10.035143,24.967075,1016.021519
865558,1147.0,1974,1,7.51,FLD 960-MOLOAA,22.174441,-159.336092,260.0,LIHUE PLANTATION,Discontinued,...,-4.379499,33.321245,30.775875,45.592352,1.233498,19.484145,3.672143,10.516714,24.356072,1011.164786
865559,1147.0,1974,2,2.31,FLD 960-MOLOAA,22.174441,-159.336092,260.0,LIHUE PLANTATION,Discontinued,...,-4.146092,28.712857,-8.104909,-57.448194,1.238933,1.215644,2.434285,9.486143,24.075650,1017.037990


In [21]:
df_complete_cleaned = df_complete.drop(["values"], axis=1)
df_complete_cleaned.to_csv(f"{BASE_DIR}/dataset_interp_50.csv", index=False)

In [22]:
df_new = pd.read_csv(f"{BASE_DIR}/dataset_interp_50.csv")

In [23]:
df_new

Unnamed: 0,skn,year,month,data_in,name,lat,lon,elevation,Observer,Status2010,...,pottemp1000-850,pr_wtr,shum-uwnd-700,shum-uwnd-925,shum-vwnd-700,shum-vwnd-950,shum700,shum925,skt,slp
0,1.0,1948,1,3.20,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,...,-3.960722,28.727550,-7.064851,-61.055853,-3.270553,-27.510409,2.313204,10.367102,24.926305,1014.174220
1,1.0,1948,2,1.33,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,...,-4.556628,25.030540,0.487241,-39.028110,-3.446772,-12.947470,2.178714,8.839245,23.464826,1015.279124
2,1.0,1948,3,2.31,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,...,-4.401711,31.932421,-4.474437,-71.785581,0.405060,-7.952975,3.359346,9.383184,23.786142,1015.814580
3,1.0,1948,4,3.23,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,...,-4.265197,33.586917,3.420109,-41.628213,-2.011820,-4.123647,3.621286,10.094225,24.766062,1014.821248
4,1.0,1948,5,1.14,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,...,-3.750196,35.868127,-13.244502,-92.765777,0.826419,-9.357693,3.558020,11.498285,25.877674,1015.903627
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
865556,1147.0,1973,11,13.58,FLD 960-MOLOAA,22.174441,-159.336092,260.0,LIHUE PLANTATION,Discontinued,...,-4.069999,37.201568,-10.976623,-58.252345,11.079192,34.763361,3.927000,11.044428,25.895823,1014.619019
865557,1147.0,1973,12,8.23,FLD 960-MOLOAA,22.174441,-159.336092,260.0,LIHUE PLANTATION,Discontinued,...,-5.134090,29.555025,-2.770587,-23.641632,-1.141120,-9.206854,2.958999,10.035143,24.967075,1016.021519
865558,1147.0,1974,1,7.51,FLD 960-MOLOAA,22.174441,-159.336092,260.0,LIHUE PLANTATION,Discontinued,...,-4.379499,33.321245,30.775875,45.592352,1.233498,19.484145,3.672143,10.516714,24.356072,1011.164786
865559,1147.0,1974,2,2.31,FLD 960-MOLOAA,22.174441,-159.336092,260.0,LIHUE PLANTATION,Discontinued,...,-4.146092,28.712857,-8.104909,-57.448194,1.238933,1.215644,2.434285,9.486143,24.075650,1017.037990
