In [1]:
# basic imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklearn
from sklearn.model_selection import train_test_split, KFold, cross_val_predict
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

#tensorflow
import tensorflow as tf
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras.layers import Dense, Input

# others
from copy import deepcopy
from xgboost import XGBRegressor

# Variables from config file
from config import BASE_DIR, FILE_NAMES, LABELS, ATTRIBUTES, BEST_MODEL_COLUMNS, ISLAND_RANGES

2022-04-22 19:50:45.238319: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [2]:
# Model inputs: the predictor names from config.LABELS followed by four
# extra per-row fields. A fresh list is built so LABELS itself is untouched.
columns = [*LABELS, "season_wet", "elevation", "lat", "lon"]
print(*columns, end=' ')

# Load the three pre-made splits, reading only the model inputs plus the
# date / station id ('skn') fields and the 'data_in' column used as target.
csv_columns = columns + ['year', 'month', 'skn', 'data_in']
df_train = pd.read_csv(f"{BASE_DIR}/train.csv", usecols=csv_columns)
df_valid = pd.read_csv(f"{BASE_DIR}/valid.csv", usecols=csv_columns)
df_test = pd.read_csv(f"{BASE_DIR}/test.csv", usecols=csv_columns)

# Stack all splits into one frame; each split keeps its own row labels.
df_combined = pd.concat([df_train, df_valid, df_test])

air2m air1000_500 hgt500 hgt1000 omega500 pottemp1000-500 pottemp1000-850 pr_wtr shum-uwnd-700 shum-uwnd-925 shum-vwnd-700 shum-vwnd-950 shum700 shum925 skt slp season_wet elevation lat lon 

In [19]:
# All-station baseline: fit a random forest on the train split and predict
# the test split. df_valid is not used in this cell.
Xtrain = df_train[columns].to_numpy()
Xtest = df_test[columns].to_numpy()

Ytrain = df_train['data_in'].to_numpy()
Ytest = df_test['data_in'].to_numpy()

# random_state is pinned so the forest (and therefore the reported RMSE) is
# reproducible after a kernel restart; n_jobs=-1 trains on all available cores.
model = RandomForestRegressor(n_jobs=-1, random_state=42)
model.fit(Xtrain, Ytrain)
yhat = model.predict(Xtest)

In [20]:
# RMSE of the current `yhat` against `Ytest`. The square root is taken
# explicitly instead of passing `squared=False`, which newer scikit-learn
# releases deprecate and then remove.
np.sqrt(mean_squared_error(Ytest, yhat))

4.190609621589842

In [21]:
# Ordinary least-squares baseline on the same Xtrain / Xtest arrays.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(Xtrain, Ytrain)
yhat = model.predict(Xtest)

In [22]:
# RMSE of the current `yhat` against `Ytest`. The square root is taken
# explicitly instead of passing `squared=False`, which newer scikit-learn
# releases deprecate and then remove.
np.sqrt(mean_squared_error(Ytest, yhat))

5.779447159157981

In [10]:
# Seed NumPy's global RNG, then draw one row at random from the combined
# frame and keep its station id. Because a row (not a unique id) is drawn,
# stations with more rows are more likely to be picked.
np.random.seed(42)
sampled_skn = df_combined['skn'].sample()
skn = sampled_skn.iloc[0]

In [21]:
# Per-station experiment: keep only the rows of the sampled station `skn`,
# make a fresh 80/20 split and compare a random forest with linear regression.
# NOTE: this rebinds Xtrain / Xtest / Ytrain / Ytest, which the all-station
# cells also use.
df_station = df_combined[df_combined['skn'] == skn]

train, test = train_test_split(df_station, test_size=0.2, random_state=42)
Xtrain, Ytrain = train[columns], train['data_in']
Xtest, Ytest = test[columns], test['data_in']

# random_state is pinned so the forest's score is reproducible on re-run.
random_forest = RandomForestRegressor(random_state=42)
random_forest.fit(Xtrain, Ytrain)
yhat_rf = random_forest.predict(Xtest)
# Explicit square root instead of `squared=False`, which newer scikit-learn
# releases deprecate and then remove.
rmse_rf = np.sqrt(mean_squared_error(Ytest, yhat_rf))

linear_regression = LinearRegression()
linear_regression.fit(Xtrain, Ytrain)
yhat_lr = linear_regression.predict(Xtest)
rmse_lr = np.sqrt(mean_squared_error(Ytest, yhat_lr))

In [22]:
# Report both per-station scores, rounded to three decimals.
rmse_report = "RMSE on Linear Regression: {:.3f}\nRMSE on Random Forest: {:.3f}"
print(rmse_report.format(rmse_lr, rmse_rf))

RMSE on Linear Regression: 3.602
RMSE on Random Forest: 3.717


In [24]:
# Display the per-station training split returned by train_test_split.
train

Unnamed: 0,skn,year,month,data_in,lat,lon,elevation,air2m,air1000_500,hgt500,...,pr_wtr,shum-uwnd-700,shum-uwnd-925,shum-vwnd-700,shum-vwnd-950,shum700,shum925,skt,slp,season_wet
405876,985.0,1993,3,0.00,21.925198,-159.533872,720.0,294.72772,30.360000,5813.3550,...,24.219034,1.943488,-38.082367,-4.309502,-22.954956,1.689999,8.795000,23.167370,1019.35390,1
120875,985.0,1996,2,6.48,21.925198,-159.533872,720.0,295.36447,30.479996,5796.4830,...,26.813107,15.460807,-16.310812,-3.257273,-18.091896,2.230999,9.374001,24.262577,1016.42650,1
349460,985.0,1986,8,4.68,21.925198,-159.533872,720.0,299.07065,31.179993,5878.3228,...,38.753227,-15.926044,-84.959625,1.241417,-5.664062,3.546999,12.313000,26.706755,1015.42960,0
78645,985.0,1980,6,9.49,21.925198,-159.533872,720.0,297.54855,30.609993,5859.4330,...,33.359005,-9.918607,-89.686590,-2.791296,-18.354948,2.709999,10.990999,25.379122,1018.13257,0
109590,985.0,1959,8,10.52,21.925198,-159.533872,720.0,299.00180,30.069992,5864.4840,...,39.018715,-15.940995,-85.660450,-0.604106,-11.602450,3.355999,12.343000,27.037159,1014.74560,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85849,985.0,1957,1,19.29,21.925198,-159.533872,720.0,296.35907,33.159996,5793.2256,...,32.236774,0.143909,-32.960712,13.929511,46.243275,2.878000,9.839001,24.161053,1015.31080,1
128540,985.0,1961,6,4.20,21.925198,-159.533872,720.0,298.14328,30.820000,5870.5000,...,35.745000,-25.710815,-95.920620,-5.886437,-23.639221,3.382999,11.365000,26.272293,1017.60700,0
327660,985.0,1983,1,3.50,21.925198,-159.533872,720.0,295.80615,27.750008,5848.1934,...,24.930970,11.188534,9.086721,-1.939131,-6.791214,1.539000,9.565001,23.959509,1016.11060,1
523938,985.0,2007,6,2.57,21.925198,-159.533872,720.0,298.33490,30.619995,5882.1333,...,34.138664,-13.645160,-99.152990,0.635250,-2.634515,2.541000,11.975000,26.166464,1017.72620,0
