In [2]:
print()




In [1]:
# basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor as GBR
from sklearn.svm import SVR
from sklearn.cluster import KMeans
#others
import cartopy.crs as ccrs

In [3]:
# Load the dataset
base_dir = "../dataset"
df_data_original = pd.read_csv(f"{base_dir}/dataset.csv")

# make sure there is no NaN value
assert df_data_original.isnull().values.any() == False

In [4]:
# convert (year, month) into (months) since 1948 Jan to create continuous index
base_year = 1948
df_data_original["cum_months"] = df_data_original.apply(
    lambda x: 
        int(12 * (x["year"] - base_year) + x["month"])
    ,
    axis=1
)

In [5]:
df_data_clean = df_data_original.drop(labels=["year", "lat", "lon"], axis=1)
df_data_clean.head(10)

Unnamed: 0,skn,month,data_in,Lat_DD,Lon_DD,Lon_DD_updated,air,air.1,hgt,hgt.1,...,pr_wtr,shum,shum.1,shum.2,shum.3,shum.4,shum.5,skt,slp,cum_months
0,1.0,1,3.2,18.916176,-155.674994,204.325006,295.39603,31.299995,5799.5483,121.48387,...,29.034512,2.592493,-25.859348,0.589191,7.106412,2.945999,9.869999,23.385218,1014.0849,1
1,2.0,1,5.95,19.10866,-155.825545,204.174455,295.39603,31.299995,5799.5483,121.48387,...,29.034512,2.592493,-25.859348,0.589191,7.106412,2.945999,9.869999,23.385218,1014.0849,1
2,2.2,1,11.5,19.16474,-155.68228,204.31772,295.39603,31.299995,5799.5483,121.48387,...,29.034512,2.592493,-25.859348,0.589191,7.106412,2.945999,9.869999,23.385218,1014.0849,1
3,2.25,1,5.515941,19.160603,-155.822488,204.177512,295.39603,31.299995,5799.5483,121.48387,...,29.034512,2.592493,-25.859348,0.589191,7.106412,2.945999,9.869999,23.385218,1014.0849,1
4,2.26,1,4.310617,19.225323,-155.778876,204.221124,295.39603,31.299995,5799.5483,121.48387,...,29.034512,2.592493,-25.859348,0.589191,7.106412,2.945999,9.869999,23.385218,1014.0849,1
5,2.34,1,3.0,19.186302,-155.886763,204.113237,295.39603,31.299995,5799.5483,121.48387,...,29.034512,2.592493,-25.859348,0.589191,7.106412,2.945999,9.869999,23.385218,1014.0849,1
6,3.2,1,10.11,19.114216,-155.697213,204.302787,295.39603,31.299995,5799.5483,121.48387,...,29.034512,2.592493,-25.859348,0.589191,7.106412,2.945999,9.869999,23.385218,1014.0849,1
7,3.4,1,6.694721,19.127549,-155.757767,204.242233,295.39603,31.299995,5799.5483,121.48387,...,29.034512,2.592493,-25.859348,0.589191,7.106412,2.945999,9.869999,23.385218,1014.0849,1
8,4.0,1,5.08,19.004216,-155.663882,204.336118,295.39603,31.299995,5799.5483,121.48387,...,29.034512,2.592493,-25.859348,0.589191,7.106412,2.945999,9.869999,23.385218,1014.0849,1
9,4.1,1,3.64,19.055327,-155.692214,204.307786,295.39603,31.299995,5799.5483,121.48387,...,29.034512,2.592493,-25.859348,0.589191,7.106412,2.945999,9.869999,23.385218,1014.0849,1


In [6]:
Y = np.array(df_data_clean["data_in"])
X = np.array(df_data_clean.drop(labels=["data_in", "skn"], axis=1))

In [7]:
model = LinearRegression()
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [8]:
# X = temp
# X.shape

In [9]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=None)

In [10]:
model.fit(Xtrain, Ytrain)
Yhat = model.predict(Xtest)

In [11]:
mse(Ytest, Yhat)

32.42063110620935

# What if we consider a sigle model per station?

In [12]:
df_station_one = df_data_clean[df_data_clean["skn"] == 1]
df_station_one

Unnamed: 0,skn,month,data_in,Lat_DD,Lon_DD,Lon_DD_updated,air,air.1,hgt,hgt.1,...,pr_wtr,shum,shum.1,shum.2,shum.3,shum.4,shum.5,skt,slp,cum_months
0,1.0,1,3.20,18.916176,-155.674994,204.325006,295.39603,31.299995,5799.5483,121.48387,...,29.034512,2.592493,-25.859348,0.589191,7.106412,2.945999,9.869999,23.385218,1014.08490,1
359,1.0,2,1.33,18.916176,-155.674994,204.325006,294.80408,32.330000,5788.7930,133.86208,...,27.199657,7.456773,-19.957973,5.228316,15.838112,2.857000,9.155001,22.271116,1015.47690,2
712,1.0,3,2.31,18.916176,-155.674994,204.325006,294.97820,32.189995,5801.8066,143.29033,...,31.619999,-0.286887,-75.183790,0.860660,8.627603,3.586000,10.271000,22.553074,1016.62714,3
1067,1.0,4,3.23,18.916176,-155.674994,204.325006,295.79654,29.920006,5829.8667,134.83333,...,32.350327,0.527679,-44.071280,-1.160946,8.856949,3.518000,10.671000,23.421500,1015.43164,4
1421,1.0,5,1.14,18.916176,-155.674994,204.325006,296.50006,29.710007,5857.9033,144.38710,...,31.281930,-11.553710,-103.509090,-3.594981,-6.201051,2.261000,11.924999,24.446610,1016.57830,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170420,1.0,4,0.23,18.916176,-155.674994,204.325006,295.88340,30.219994,5854.1000,164.33333,...,24.795328,-5.883221,-68.650500,-0.014962,-10.520789,1.497000,10.627001,23.959173,1018.88380,712
170797,1.0,5,0.23,18.916176,-155.674994,204.325006,296.38644,29.610008,5873.7417,157.67741,...,28.584837,-8.370892,-73.083950,-0.082898,3.040518,2.071999,11.261000,23.981080,1018.16650,713
171167,1.0,8,0.40,18.916176,-155.674994,204.325006,297.74350,28.770004,5890.2256,141.09677,...,32.350320,-16.198235,-76.977440,-2.513524,-6.196371,3.490999,11.916000,25.385939,1016.16724,716
171542,1.0,9,0.23,18.916176,-155.674994,204.325006,297.99707,29.659996,5878.0670,138.23334,...,34.170334,-12.184459,-85.939170,-1.133435,0.596836,4.048000,11.936001,25.671793,1015.80700,717


In [13]:
Y = np.array(df_station_one["data_in"])
X = np.array(df_station_one.drop(labels=["data_in", "skn"], axis=1))
X = scaler.fit_transform(X)

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=None)

In [14]:
model = LinearRegression()
model.fit(Xtrain, Ytrain)
Yhat = model.predict(Xtest)
mse(Ytest, Yhat)

2.665339813072339

In [15]:
model = SVR(C=0.1)
model.fit(Xtrain, Ytrain)
Yhat = model.predict(Xtest)
mse(Ytest, Yhat)

3.718690504531787

In [16]:
mse(Ytrain, model.predict(Xtrain))

4.465146882130877

## Get some help from clustering

In [17]:
model = LinearRegression()
encoder = OneHotEncoder(sparse=False)

In [18]:
df_station_label = pd.read_csv(f"{base_dir}/station_cluster.csv")
df_data_clean_w_label = df_data_clean.merge(right=df_station_label, left_on="skn", right_on="SKN")
X = df_data_clean_w_label.drop(labels=["skn", "Lat_DD", "Lon_DD", "Lon_DD_updated", "data_in"], axis=1).to_numpy()
Y = df_data_clean_w_label["data_in"].to_numpy()

In [19]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2)
model.fit(Xtrain, Ytrain)
yhat = model.predict(Xtest)
mse(Ytest, yhat)

34.11822579192755

## One Hot encode

In [20]:
X = df_data_clean_w_label.drop(labels=["skn", "Lat_DD", "Lon_DD", "Lon_DD_updated", "data_in", "label"], axis=1)
Y = df_data_clean_w_label["data_in"].to_numpy()

In [21]:
label = encoder.fit_transform(df_data_clean_w_label["label"].to_numpy().reshape(-1,1))
X = np.hstack((X, label))

In [22]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2)
model.fit(Xtrain, Ytrain)
yhat = model.predict(Xtest)
mse(Ytest, yhat)

19.186931714644846

## Gradient Boosting Regressor?

In [23]:
model = GBR()
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2)
model.fit(Xtrain, Ytrain)
yhat = model.predict(Xtest)
mse(Ytest, yhat)

15.809573718810393