In [2]:
import pandas as pd
import numpy as np
import os
from config import C_COMMON, C_SINGLE, C_GRID
from copy import deepcopy
import time

# import models
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor

In [2]:
# Load the processed 6-grid dataset into `df`.
# NOTE(review): the following cell reads the same file again — confirm whether this cell is still needed.
df = pd.read_csv('/home/yusukemh/sadow_lts/personal/yusukemh/pi_casc/processed_datasets/dataset_6grid.csv')

# base model

In [3]:
# Load the processed 6-grid dataset.
df = pd.read_csv('/home/yusukemh/sadow_lts/personal/yusukemh/pi_casc/processed_datasets/dataset_6grid.csv')

# Chronological split on `year`: train (< 1984), validation (1984-1996), test (>= 1997).
df_train = df.query('year < 1984')
df_valid = df.query('1984 <= year < 1997')
df_test = df.query('1997 <= year')

# Sanity check: the three splits together account for every row of the dataset.
assert sum(len(part) for part in (df_train, df_valid, df_test)) == len(df)

# Feature columns for the base model: lat, lon and the 16 reanalysis variables.
# Start from all configured columns and drop identifiers, the target ('data_in')
# and the two optional features (season_wet, elevation).
columns = deepcopy(C_SINGLE + C_COMMON)
for dropped in ('fold', 'skn', 'year', 'month', 'data_in', 'season_wet', 'elevation'):
    columns.remove(dropped)

In [4]:
# Fraction of all rows that falls into the train, validation and test splits.
n_rows = df.shape[0]
df_train.shape[0]/n_rows, df_valid.shape[0]/n_rows, df_test.shape[0]/n_rows

(0.5938163244321155, 0.19930748194473488, 0.20687619362314955)

In [5]:
# Display the feature columns that remain after the exclusions (base model).
columns

['air2m',
 'air1000_500',
 'hgt500',
 'hgt1000',
 'omega500',
 'pottemp1000-500',
 'pottemp1000-850',
 'pr_wtr',
 'shum-uwnd-700',
 'shum-uwnd-925',
 'shum-vwnd-700',
 'shum-vwnd-950',
 'shum700',
 'shum925',
 'skt',
 'slp',
 'lat',
 'lon']

In [6]:
# Convert every split into a feature matrix X (selected columns) and a
# target vector Y (the 'data_in' column).
Xtrain = np.array(df_train[columns])
Ytrain = np.array(df_train['data_in'])

Xvalid = np.array(df_valid[columns])
Yvalid = np.array(df_valid['data_in'])

Xtest = np.array(df_test[columns])
Ytest = np.array(df_test['data_in'])

In [7]:
# Train the models on the base feature set and report the test-set MSE of
# five runs. Hyperparameters were acquired in an external experiment.
n_runs = 5

# Ordinary least squares is deterministic, so it is fitted and scored once
# instead of repeating identical work in every run.
linear_regression = LinearRegression()
linear_regression.fit(Xtrain, Ytrain)
mse_linear = mean_squared_error(Ytest, linear_regression.predict(Xtest))

# Scores of every run as [linear regression, random forest, xgboost].
# `mse_base` alone only ever holds the most recent run.
mse_base_runs = []
for run in range(n_runs):
    # The run index seeds both stochastic models: the runs differ from one
    # another, never share a seed, and the cell gives the same numbers after
    # a kernel restart.
    random_forest = RandomForestRegressor(
        n_estimators=270,
        max_depth=None,
        min_samples_split=3,
        n_jobs=-1,
        verbose=False,
        random_state=run,
    )

    # Disabled model, kept for reference:
    # GradientBoostingRegressor(n_estimators=240, learning_rate=0.1,
    #                           max_depth=7, min_samples_split=4, verbose=False)

    xgboost = XGBRegressor(
        n_estimators=210,
        learning_rate=0.1,
        max_depth=9,
        n_jobs=-1,
        subsample=0.9,
        seed=run,
        verbosity=0
    )
    xgboost.fit(Xtrain, Ytrain)
    random_forest.fit(Xtrain, Ytrain)

    mse_base = [
        mse_linear,
        mean_squared_error(Ytest, random_forest.predict(Xtest)),
        mean_squared_error(Ytest, xgboost.predict(Xtest)),
    ]
    mse_base_runs.append(mse_base)

    print(f"{mse_base=}")

mse_base=[28.55376711301076, 17.78627188163026, 16.257087519664214]
mse_base=[28.55376711301076, 17.77116719555559, 16.19685882533633]
mse_base=[28.55376711301076, 17.66887617269526, 16.11259673255068]
mse_base=[28.55376711301076, 17.814586917186457, 15.950886489985368]
mse_base=[28.55376711301076, 17.85522221089766, 16.605416872783817]


# include season_wet

In [8]:
# Load the processed 6-grid dataset.
df = pd.read_csv('/home/yusukemh/sadow_lts/personal/yusukemh/pi_casc/processed_datasets/dataset_6grid.csv')

# Chronological split on `year`: train (< 1984), validation (1984-1996), test (>= 1997).
df_train = df.query('year < 1984')
df_valid = df.query('1984 <= year < 1997')
df_test = df.query('1997 <= year')

# Sanity check: the three splits together account for every row of the dataset.
assert sum(len(part) for part in (df_train, df_valid, df_test)) == len(df)

# Feature columns: lat, lon, the 16 reanalysis variables and season_wet.
# Identifiers, the target ('data_in') and elevation are dropped.
columns = deepcopy(C_SINGLE + C_COMMON)
for dropped in ('fold', 'skn', 'year', 'month', 'data_in', 'elevation'):
    columns.remove(dropped)

In [9]:
# Display the feature columns that remain after the exclusions (season_wet kept).
columns

['air2m',
 'air1000_500',
 'hgt500',
 'hgt1000',
 'omega500',
 'pottemp1000-500',
 'pottemp1000-850',
 'pr_wtr',
 'shum-uwnd-700',
 'shum-uwnd-925',
 'shum-vwnd-700',
 'shum-vwnd-950',
 'shum700',
 'shum925',
 'skt',
 'slp',
 'season_wet',
 'lat',
 'lon']

In [10]:
# Convert every split into a feature matrix X (selected columns) and a
# target vector Y (the 'data_in' column).
Xtrain = np.array(df_train[columns])
Ytrain = np.array(df_train['data_in'])

Xvalid = np.array(df_valid[columns])
Yvalid = np.array(df_valid['data_in'])

Xtest = np.array(df_test[columns])
Ytest = np.array(df_test['data_in'])

In [11]:
# Train the models on the feature set that includes season_wet and report the
# test-set MSE of five runs. Hyperparameters were acquired in an external
# experiment.
n_runs = 5

# Ordinary least squares is deterministic, so it is fitted and scored once
# instead of repeating identical work in every run.
linear_regression = LinearRegression()
linear_regression.fit(Xtrain, Ytrain)
mse_linear = mean_squared_error(Ytest, linear_regression.predict(Xtest))

# Scores of every run as [linear regression, random forest, xgboost].
# `mse_season` alone only ever holds the most recent run.
mse_season_runs = []
for run in range(n_runs):
    # The run index seeds both stochastic models: the runs differ from one
    # another, never share a seed, and the cell gives the same numbers after
    # a kernel restart.
    random_forest = RandomForestRegressor(
        n_estimators=270,
        max_depth=None,
        min_samples_split=3,
        n_jobs=-1,
        verbose=False,
        random_state=run,
    )

    # Disabled model, kept for reference:
    # GradientBoostingRegressor(n_estimators=240, learning_rate=0.1,
    #                           max_depth=7, min_samples_split=4, verbose=False)

    xgboost = XGBRegressor(
        n_estimators=210,
        learning_rate=0.1,
        max_depth=9,
        n_jobs=-1,
        subsample=0.9,
        seed=run,
        verbosity=0
    )
    xgboost.fit(Xtrain, Ytrain)
    random_forest.fit(Xtrain, Ytrain)

    mse_season = [
        mse_linear,
        mean_squared_error(Ytest, random_forest.predict(Xtest)),
        mean_squared_error(Ytest, xgboost.predict(Xtest)),
    ]
    mse_season_runs.append(mse_season)

    print(f"{mse_season=}")

mse_season=[28.53627284656808, 17.58145521752626, 15.629585733934086]
mse_season=[28.53627284656808, 17.588620027244907, 15.599226688490853]
mse_season=[28.53627284656808, 17.51413244474182, 15.873650094247918]
mse_season=[28.53627284656808, 17.640903030139693, 15.631470012113601]
mse_season=[28.53627284656808, 17.51830491316549, 15.687771754921602]


In [12]:
# Prints one empty line and nothing else.
# NOTE(review): looks like a leftover spacer cell — confirm it can be deleted.
print()




# include elevation

In [3]:
# Load the processed 6-grid dataset.
df = pd.read_csv('/home/yusukemh/sadow_lts/personal/yusukemh/pi_casc/processed_datasets/dataset_6grid.csv')

# Chronological split on `year`: train (< 1984), validation (1984-1996), test (>= 1997).
df_train = df.query('year < 1984')
df_valid = df.query('1984 <= year < 1997')
df_test = df.query('1997 <= year')

# Sanity check: the three splits together account for every row of the dataset.
assert sum(len(part) for part in (df_train, df_valid, df_test)) == len(df)

# Feature columns: lat, lon, the 16 reanalysis variables and elevation.
# Identifiers, the target ('data_in') and season_wet are dropped.
columns = deepcopy(C_SINGLE + C_COMMON)
for dropped in ('fold', 'skn', 'year', 'month', 'data_in', 'season_wet'):
    columns.remove(dropped)

In [4]:
# Display the feature columns that remain after the exclusions (elevation kept).
columns

['air2m',
 'air1000_500',
 'hgt500',
 'hgt1000',
 'omega500',
 'pottemp1000-500',
 'pottemp1000-850',
 'pr_wtr',
 'shum-uwnd-700',
 'shum-uwnd-925',
 'shum-vwnd-700',
 'shum-vwnd-950',
 'shum700',
 'shum925',
 'skt',
 'slp',
 'elevation',
 'lat',
 'lon']

In [5]:
# Convert every split into a feature matrix X (selected columns) and a
# target vector Y (the 'data_in' column).
Xtrain = np.array(df_train[columns])
Ytrain = np.array(df_train['data_in'])

Xvalid = np.array(df_valid[columns])
Yvalid = np.array(df_valid['data_in'])

Xtest = np.array(df_test[columns])
Ytest = np.array(df_test['data_in'])

In [6]:
# Train the models on the feature set that includes elevation and report the
# test-set MSE of five runs. Hyperparameters were acquired in an external
# experiment.
n_runs = 5

# Ordinary least squares is deterministic, so it is fitted and scored once
# instead of repeating identical work in every run.
linear_regression = LinearRegression()
linear_regression.fit(Xtrain, Ytrain)
mse_linear = mean_squared_error(Ytest, linear_regression.predict(Xtest))

# Scores of every run as [linear regression, random forest, xgboost].
# `mse_elevation` alone only ever holds the most recent run.
mse_elevation_runs = []
for run in range(n_runs):
    # The run index seeds both stochastic models: the runs differ from one
    # another, never share a seed, and the cell gives the same numbers after
    # a kernel restart.
    random_forest = RandomForestRegressor(
        n_estimators=270,
        max_depth=None,
        min_samples_split=3,
        n_jobs=-1,
        verbose=False,
        random_state=run,
    )

    # Disabled model, kept for reference:
    # GradientBoostingRegressor(n_estimators=240, learning_rate=0.1,
    #                           max_depth=7, min_samples_split=4, verbose=False)

    xgboost = XGBRegressor(
        n_estimators=210,
        learning_rate=0.1,
        max_depth=9,
        n_jobs=-1,
        subsample=0.9,
        seed=run,
        verbosity=0
    )
    xgboost.fit(Xtrain, Ytrain)
    random_forest.fit(Xtrain, Ytrain)

    mse_elevation = [
        mse_linear,
        mean_squared_error(Ytest, random_forest.predict(Xtest)),
        mean_squared_error(Ytest, xgboost.predict(Xtest)),
    ]
    mse_elevation_runs.append(mse_elevation)

    print(f"{mse_elevation=}")

mse_elevation=[28.357063071091094, 17.87955954825587, 16.18592539724243]
mse_elevation=[28.357063071091094, 17.89045772774496, 16.038790956249198]
mse_elevation=[28.357063071091094, 17.857733681439473, 16.151139385692876]
mse_elevation=[28.357063071091094, 17.989950574056042, 16.116957669566933]
mse_elevation=[28.357063071091094, 17.892353331108275, 16.052188333607656]


# include season and elevation

In [17]:
# Load the processed 6-grid dataset.
df = pd.read_csv('/home/yusukemh/sadow_lts/personal/yusukemh/pi_casc/processed_datasets/dataset_6grid.csv')

# Chronological split on `year`: train (< 1984), validation (1984-1996), test (>= 1997).
df_train = df.query('year < 1984')
df_valid = df.query('1984 <= year < 1997')
df_test = df.query('1997 <= year')

# Sanity check: the three splits together account for every row of the dataset.
assert sum(len(part) for part in (df_train, df_valid, df_test)) == len(df)

# Feature columns: lat, lon, the 16 reanalysis variables, season_wet and elevation.
# Only identifiers and the target ('data_in') are dropped.
columns = deepcopy(C_SINGLE + C_COMMON)
for dropped in ('fold', 'skn', 'year', 'month', 'data_in'):
    columns.remove(dropped)

In [18]:
# Display the feature columns that remain after the exclusions (season_wet and elevation kept).
columns

['air2m',
 'air1000_500',
 'hgt500',
 'hgt1000',
 'omega500',
 'pottemp1000-500',
 'pottemp1000-850',
 'pr_wtr',
 'shum-uwnd-700',
 'shum-uwnd-925',
 'shum-vwnd-700',
 'shum-vwnd-950',
 'shum700',
 'shum925',
 'skt',
 'slp',
 'season_wet',
 'elevation',
 'lat',
 'lon']

In [19]:
# Convert every split into a feature matrix X (selected columns) and a
# target vector Y (the 'data_in' column).
Xtrain = np.array(df_train[columns])
Ytrain = np.array(df_train['data_in'])

Xvalid = np.array(df_valid[columns])
Yvalid = np.array(df_valid['data_in'])

Xtest = np.array(df_test[columns])
Ytest = np.array(df_test['data_in'])

In [21]:
# Train the models on the feature set that includes both season_wet and
# elevation and report the test-set MSE of five runs. Hyperparameters were
# acquired in an external experiment.
n_runs = 5

# Ordinary least squares is deterministic, so it is fitted and scored once
# instead of repeating identical work in every run.
linear_regression = LinearRegression()
linear_regression.fit(Xtrain, Ytrain)
mse_linear = mean_squared_error(Ytest, linear_regression.predict(Xtest))

# Scores of every run as [linear regression, random forest, xgboost].
# Stored under `mse_both` (not `mse_elevation`) so this experiment neither
# overwrites the elevation-only result nor prints a misleading label.
mse_both_runs = []
for run in range(n_runs):
    # The run index seeds both stochastic models: the runs differ from one
    # another, never share a seed, and the cell gives the same numbers after
    # a kernel restart.
    random_forest = RandomForestRegressor(
        n_estimators=270,
        max_depth=None,
        min_samples_split=3,
        n_jobs=-1,
        verbose=False,
        random_state=run,
    )

    # Disabled model, kept for reference:
    # GradientBoostingRegressor(n_estimators=240, learning_rate=0.1,
    #                           max_depth=7, min_samples_split=4, verbose=False)

    xgboost = XGBRegressor(
        n_estimators=210,
        learning_rate=0.1,
        max_depth=9,
        n_jobs=-1,
        subsample=0.9,
        seed=run,
        verbosity=0
    )
    xgboost.fit(Xtrain, Ytrain)
    random_forest.fit(Xtrain, Ytrain)

    mse_both = [
        mse_linear,
        mean_squared_error(Ytest, random_forest.predict(Xtest)),
        mean_squared_error(Ytest, xgboost.predict(Xtest)),
    ]
    mse_both_runs.append(mse_both)

    print(f"{mse_both=}")

mse_elevation=[28.361727224643428, 17.726565081322807, 15.549596008556053]
mse_elevation=[28.361727224643428, 17.572321477732814, 15.692316187129334]
mse_elevation=[28.361727224643428, 17.645544978550003, 15.50373426037234]
mse_elevation=[28.361727224643428, 17.662670631672064, 15.58376857974831]
mse_elevation=[28.361727224643428, 17.642892294225827, 15.605503052755706]


# aggregate the result

In [None]:
# Test-set MSE of the five runs of each experiment, transcribed from the
# outputs recorded in this notebook. Rows are runs; columns are
# [linear regression, random forest, xgboost].
# The previous version repeated one identical matrix for all four experiments,
# which matched none of the recorded outputs.

# Base features only (lat, lon, 16 reanalysis variables).
mse_base = np.array([
    [28.55376711301076, 17.78627188163026, 16.257087519664214],
    [28.55376711301076, 17.77116719555559, 16.19685882533633],
    [28.55376711301076, 17.66887617269526, 16.11259673255068],
    [28.55376711301076, 17.814586917186457, 15.950886489985368],
    [28.55376711301076, 17.85522221089766, 16.605416872783817]
])

# Base features + season_wet.
mse_season = np.array([
    [28.53627284656808, 17.58145521752626, 15.629585733934086],
    [28.53627284656808, 17.588620027244907, 15.599226688490853],
    [28.53627284656808, 17.51413244474182, 15.873650094247918],
    [28.53627284656808, 17.640903030139693, 15.631470012113601],
    [28.53627284656808, 17.51830491316549, 15.687771754921602]
])

# Base features + elevation.
mse_elevation = np.array([
    [28.357063071091094, 17.87955954825587, 16.18592539724243],
    [28.357063071091094, 17.89045772774496, 16.038790956249198],
    [28.357063071091094, 17.857733681439473, 16.151139385692876],
    [28.357063071091094, 17.989950574056042, 16.116957669566933],
    [28.357063071091094, 17.892353331108275, 16.052188333607656]
])

# Base features + season_wet + elevation.
mse_both = np.array([
    [28.361727224643428, 17.726565081322807, 15.549596008556053],
    [28.361727224643428, 17.572321477732814, 15.692316187129334],
    [28.361727224643428, 17.645544978550003, 15.50373426037234],
    [28.361727224643428, 17.662670631672064, 15.58376857974831],
    [28.361727224643428, 17.642892294225827, 15.605503052755706]
])


In [19]:
# Single XGBoost fit with a fixed seed (42), scored by test-set MSE.
# NOTE(review): Xtrain/Ytrain/Xtest/Ytest are whatever the most recently
# executed array-building cell produced — confirm which feature set this
# cell is meant to evaluate.
xgboost = XGBRegressor(
        n_estimators=210,
        learning_rate=0.1,
        max_depth=9,
        n_jobs=-1,
        subsample=0.9,
        seed=42,
        verbosity=0
)

xgboost.fit(Xtrain, Ytrain)
# No trailing comma here: the cell displays the MSE as a scalar, not a 1-tuple.
mean_squared_error(Ytest, xgboost.predict(Xtest))

(16.12653608279494,)

In [18]:
# Single XGBoost fit with a fixed seed (12), scored by test-set MSE.
# NOTE(review): Xtrain/Ytrain/Xtest/Ytest are whatever the most recently
# executed array-building cell produced — confirm which feature set this
# cell is meant to evaluate.
xgboost = XGBRegressor(
        n_estimators=210,
        learning_rate=0.1,
        max_depth=9,
        n_jobs=-1,
        subsample=0.9,
        verbosity=0,
        seed=12
)

xgboost.fit(Xtrain, Ytrain)
# No trailing comma here: the cell displays the MSE as a scalar, not a 1-tuple.
mean_squared_error(Ytest, xgboost.predict(Xtest))

(16.036249367238465,)