In [36]:
import pandas as pd
import numpy as np
import os
from config import C_COMMON, C_SINGLE, C_GRID
from copy import deepcopy

# import models
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor

In [40]:
# Location of the dataset CSV. The default is the original absolute path;
# set the PI_CASC_DATASET environment variable to load a copy stored elsewhere
# without editing the notebook.
dataset_path = os.environ.get(
    'PI_CASC_DATASET',
    '/home/yusukemh/sadow_lts/personal/yusukemh/pi_casc/processed_datasets/dataset_6grid.csv',
)
df = pd.read_csv(dataset_path)

In [42]:
# Chronological split on the `year` column:
#   train: year < 1984, validation: 1984 <= year < 1997, test: year >= 1997.
year = df['year']
df_train = df[year < 1984]
df_valid = df[(1984 <= year) & (year < 1997)]
df_test = df[1997 <= year]

# The three splits must account for every row of df (no row dropped or duplicated).
assert len(df_train) + len(df_valid) + len(df_test) == len(df)

In [4]:
# Sanity check: the row counts of the three splits add up to the row count of df.
assert sum(len(part) for part in (df_train, df_valid, df_test)) == len(df)

In [49]:
# Feature set: lat, lon and 16 reanalysis variables.
# Start from every configured column, then drop the ones not used as features
# (identifiers, time fields, the target `data_in`, and elevation).
non_feature_columns = ('fold', 'skn', 'year', 'month', 'data_in', 'season_wet', 'elevation')

columns = deepcopy(C_SINGLE + C_COMMON)
for name in non_feature_columns:
    # list.remove raises ValueError if the column is not present in the config.
    columns.remove(name)

In [50]:
# Bare expression: the notebook renders the resulting feature list as cell output.
columns

['air2m',
 'air1000_500',
 'hgt500',
 'hgt1000',
 'omega500',
 'pottemp1000-500',
 'pottemp1000-850',
 'pr_wtr',
 'shum-uwnd-700',
 'shum-uwnd-925',
 'shum-vwnd-700',
 'shum-vwnd-950',
 'shum700',
 'shum925',
 'skt',
 'slp',
 'lat',
 'lon']

In [54]:
def _features_and_target(frame):
    """Return (features, target) for one split as NumPy arrays.

    Features are the columns listed in `columns`; the target is `data_in`.
    """
    features = np.array(frame[columns])
    target = np.array(frame['data_in'])
    return features, target


Xtrain, Ytrain = _features_and_target(df_train)
Xvalid, Yvalid = _features_and_target(df_valid)
Xtest, Ytest = _features_and_target(df_test)

In [None]:
# Hyperparameters were acquired in an external experiment.
# Every stochastic estimator receives a fixed seed so that the reported test
# MSEs are reproducible across kernel restarts.
RANDOM_SEED = 42

linear_regression = LinearRegression()

random_forest = RandomForestRegressor(
    n_estimators=270,
    max_depth=None,
    min_samples_split=3,
    n_jobs=-1,
    verbose=0,
    random_state=RANDOM_SEED,
)

gradient_boost = GradientBoostingRegressor(
    n_estimators=240,
    learning_rate=0.1,
    max_depth=7,
    min_samples_split=4,
    verbose=0,
    random_state=RANDOM_SEED,
)

xgboost = XGBRegressor(
    n_estimators=210,
    learning_rate=0.1,
    max_depth=9,
    verbosity=0,
    random_state=RANDOM_SEED,
)

# Order matters: mse_wout_elev[i] is the test MSE of models[i]
# (linear regression, random forest, gradient boosting, XGBoost).
models = [linear_regression, random_forest, gradient_boost, xgboost]

# Fit every model on the training split.
for model in models:
    model.fit(Xtrain, Ytrain)

# Test-set mean squared error per model; the feature set used here has
# `elevation` removed, hence "wout_elev".
mse_wout_elev = [
    mean_squared_error(Ytest, model.predict(Xtest))
    for model in models
]

print(f"{mse_wout_elev=}")