In [41]:
# basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.cluster import KMeans

#others
from xgboost import XGBRegressor
import cartopy.crs as ccrs
import cartopy.mpl.ticker as cticker
import time
import xarray as xr
import sherpa

# Variables from config file
from config import BASE_DIR, FILE_NAMES, LABELS, ATTRIBUTES


In [42]:
# Load the datasets
# Load the dataset
# df_metadata = pd.read_excel(f"{BASE_DIR}/FilledDataset2012.xlsx", sheet_name="Header")
# df_data_original = pd.read_csv(f"{BASE_DIR}/dataset.csv")


# load datasets
df_train = pd.read_csv(f"{BASE_DIR}/train.csv")
df_valid = pd.read_csv(f"{BASE_DIR}/valid.csv")
df_test = pd.read_csv(f"{BASE_DIR}/test.csv")

# Nov-Apr = "wet", May-Oct = "dry"
wet = [11, 12, 1, 2, 3, 4]
dry = [5, 6, 7, 8, 9, 10]
df_train['season_dry'] = df_train.apply(lambda row: 1 if row.month in dry else 0, axis=1)
df_train['season_wet'] = df_train.apply(lambda row: 1 if row.month in wet else 0, axis=1)

df_valid['season_dry'] = df_valid.apply(lambda row: 1 if row.month in dry else 0, axis=1)
df_valid['season_wet'] = df_valid.apply(lambda row: 1 if row.month in wet else 0, axis=1)

df_test['season_dry'] = df_test.apply(lambda row: 1 if row.month in dry else 0, axis=1)
df_test['season_wet'] = df_test.apply(lambda row: 1 if row.month in wet else 0, axis=1)

reanalysis_data = [
    'air2m', 'air1000_500', 'hgt500', 'hgt1000', 'omega500',
    'pottemp1000-500', 'pottemp1000-850', 'pr_wtr', 'shum-uwnd-700',
    'shum-uwnd-925', 'shum-vwnd-700', 'shum-vwnd-950', 'shum700', 'shum925', 
    'skt', 'slp'
]

columns = []
for i in range(6):
    for item in reanalysis_data:
        columns.append(f"{item}_{i}")

columns.extend(['data_in', 'lat', 'lon', 'elevation', 'season_wet', 'season_dry'])
for item in columns:
    print(item, end=' ')

air2m_0 air1000_500_0 hgt500_0 hgt1000_0 omega500_0 pottemp1000-500_0 pottemp1000-850_0 pr_wtr_0 shum-uwnd-700_0 shum-uwnd-925_0 shum-vwnd-700_0 shum-vwnd-950_0 shum700_0 shum925_0 skt_0 slp_0 air2m_1 air1000_500_1 hgt500_1 hgt1000_1 omega500_1 pottemp1000-500_1 pottemp1000-850_1 pr_wtr_1 shum-uwnd-700_1 shum-uwnd-925_1 shum-vwnd-700_1 shum-vwnd-950_1 shum700_1 shum925_1 skt_1 slp_1 air2m_2 air1000_500_2 hgt500_2 hgt1000_2 omega500_2 pottemp1000-500_2 pottemp1000-850_2 pr_wtr_2 shum-uwnd-700_2 shum-uwnd-925_2 shum-vwnd-700_2 shum-vwnd-950_2 shum700_2 shum925_2 skt_2 slp_2 air2m_3 air1000_500_3 hgt500_3 hgt1000_3 omega500_3 pottemp1000-500_3 pottemp1000-850_3 pr_wtr_3 shum-uwnd-700_3 shum-uwnd-925_3 shum-vwnd-700_3 shum-vwnd-950_3 shum700_3 shum925_3 skt_3 slp_3 air2m_4 air1000_500_4 hgt500_4 hgt1000_4 omega500_4 pottemp1000-500_4 pottemp1000-850_4 pr_wtr_4 shum-uwnd-700_4 shum-uwnd-925_4 shum-vwnd-700_4 shum-vwnd-950_4 shum700_4 shum925_4 skt_4 slp_4 air2m_5 air1000_500_5 hgt500_5 hgt1

In [43]:
reanalysis_data = [
    'air2m', 'air1000_500', 'hgt500', 'hgt1000', 'omega500',
    'pottemp1000-500', 'pottemp1000-850', 'pr_wtr', 'shum-uwnd-700',
    'shum-uwnd-925', 'shum-vwnd-700', 'shum-vwnd-950', 'shum700', 'shum925', 
    'skt', 'slp'
]

columns = []
for i in range(6):
    for item in reanalysis_data:
        columns.append(f"{item}_{i}")

columns.extend(['data_in', 'lat', 'lon', 'elevation', 'season_wet', 'season_dry'])
for item in columns:
    print(item, end=' ')

air2m_0 air1000_500_0 hgt500_0 hgt1000_0 omega500_0 pottemp1000-500_0 pottemp1000-850_0 pr_wtr_0 shum-uwnd-700_0 shum-uwnd-925_0 shum-vwnd-700_0 shum-vwnd-950_0 shum700_0 shum925_0 skt_0 slp_0 air2m_1 air1000_500_1 hgt500_1 hgt1000_1 omega500_1 pottemp1000-500_1 pottemp1000-850_1 pr_wtr_1 shum-uwnd-700_1 shum-uwnd-925_1 shum-vwnd-700_1 shum-vwnd-950_1 shum700_1 shum925_1 skt_1 slp_1 air2m_2 air1000_500_2 hgt500_2 hgt1000_2 omega500_2 pottemp1000-500_2 pottemp1000-850_2 pr_wtr_2 shum-uwnd-700_2 shum-uwnd-925_2 shum-vwnd-700_2 shum-vwnd-950_2 shum700_2 shum925_2 skt_2 slp_2 air2m_3 air1000_500_3 hgt500_3 hgt1000_3 omega500_3 pottemp1000-500_3 pottemp1000-850_3 pr_wtr_3 shum-uwnd-700_3 shum-uwnd-925_3 shum-vwnd-700_3 shum-vwnd-950_3 shum700_3 shum925_3 skt_3 slp_3 air2m_4 air1000_500_4 hgt500_4 hgt1000_4 omega500_4 pottemp1000-500_4 pottemp1000-850_4 pr_wtr_4 shum-uwnd-700_4 shum-uwnd-925_4 shum-vwnd-700_4 shum-vwnd-950_4 shum700_4 shum925_4 skt_4 slp_4 air2m_5 air1000_500_5 hgt500_5 hgt1

In [31]:
# idx_bigbog = np.where(df_metadata["Name"] == "Big Bog")[0]
# idx_hilo = np.where(df_metadata["Name"] == "HILO")[0]
# idx_hono = np.where(df_metadata["Name"] == "HONOLULU")[0]
# idx_lihue = np.where(df_metadata["Name"] == "LIHUE")[0]

In [32]:
# skn_bigbog = float(df_metadata.iloc[idx_bigbog]['SKN'])
# skn_hilo = float(df_metadata.iloc[idx_hilo]['SKN'])
# skn_hono = float(df_metadata.iloc[idx_hono]['SKN'])
# skn_lihue = float(df_metadata.iloc[idx_lihue]['SKN'])

In [33]:
# df_test.head()

In [55]:
df_train_bigbog = df_train[df_train["name"] == "Big Bog"]
df_test_bigbog = df_test[df_test["name"]=="Big Bog"]

Xtrain = np.array(df_train[columns].drop(labels=["data_in"], axis=1))
Ytrain = np.array(df_train["data_in"])

# Xvalid = np.array(df_valid[columns].drop(labels=["data_in"], axis=1))
# Yvalid = np.array(df_valid["data_in"])

Xtest = np.array(df_test_bigbog[columns].drop(labels=["data_in"], axis=1))
Ytest = np.array(df_test_bigbog["data_in"])

In [53]:
# hyperparameters obtained by fine tuning
xgboost = XGBRegressor(
    n_estimators=170,
    learning_rate=0.1,
    max_depth=9,
    verbosity=0
)

xgboost.fit(Xtrain, Ytrain)
print("MSE on xgboost (test) : {:.5f}".format(mean_squared_error(Ytest, xgboost.predict(Xtest))))
print("MSE on xgboost (train): {:.5f}".format(mean_squared_error(Ytrain, xgboost.predict(Xtrain))))

MSE on xgboost (test) : 280.83985
MSE on xgboost (train): 4.65344


In [57]:
Xtrain = np.array(df_train_bigbog[columns].drop(labels=["data_in"], axis=1))
Ytrain = np.array(df_train_bigbog["data_in"])

Xtest = np.array(df_test_bigbog[columns].drop(labels=["data_in"], axis=1))
Ytest = np.array(df_test_bigbog["data_in"])

model = LinearRegression()
model.fit(Xtrain, Ytrain)
print("MSE on Linear Regression (test) : {:.5f}".format(mean_squared_error(Ytest, model.predict(Xtest))))
print("MSE on Linear Regression (train): {:.5f}".format(mean_squared_error(Ytrain, model.predict(Xtrain))))

MSE on Linear Regression (test) : 266.88864
MSE on Linear Regression (train): 155.14460


In [59]:
df_train_hilo = df_train[df_train["name"] == "HILO"]
df_test_hilo = df_test[df_test["name"]=="HILO"]

Xtrain = np.array(df_train[columns].drop(labels=["data_in"], axis=1))
Ytrain = np.array(df_train["data_in"])

# Xvalid = np.array(df_valid[columns].drop(labels=["data_in"], axis=1))
# Yvalid = np.array(df_valid["data_in"])

Xtest = np.array(df_test_hilo[columns].drop(labels=["data_in"], axis=1))
Ytest = np.array(df_test_hilo["data_in"])

In [60]:
# hyperparameters obtained by fine tuning
xgboost = XGBRegressor(
    n_estimators=170,
    learning_rate=0.1,
    max_depth=9,
    verbosity=0
)

xgboost.fit(Xtrain, Ytrain)
print("MSE on xgboost (test) : {:.5f}".format(mean_squared_error(Ytest, xgboost.predict(Xtest))))
print("MSE on xgboost (train): {:.5f}".format(mean_squared_error(Ytrain, xgboost.predict(Xtrain))))

MSE on xgboost (test) : 36.22984
MSE on xgboost (train): 4.65344


In [61]:
Xtrain = np.array(df_train_hilo[columns].drop(labels=["data_in"], axis=1))
Ytrain = np.array(df_train_hilo["data_in"])

Xtest = np.array(df_test_hilo[columns].drop(labels=["data_in"], axis=1))
Ytest = np.array(df_test_hilo["data_in"])

model = LinearRegression()
model.fit(Xtrain, Ytrain)
print("MSE on Linear Regression (test) : {:.5f}".format(mean_squared_error(Ytest, model.predict(Xtest))))
print("MSE on Linear Regression (train): {:.5f}".format(mean_squared_error(Ytrain, model.predict(Xtrain))))

MSE on Linear Regression (test) : 33.09867
MSE on Linear Regression (train): 24.18849


In [68]:
sum(df_test["name"] == "HONOLULU")

0

In [62]:
df_train_honolulu = df_train[df_train["name"] == "HONOLULU"]
df_test_honolulu = df_test[df_test["name"]=="HONOLULU"]

Xtrain = np.array(df_train[columns].drop(labels=["data_in"], axis=1))
Ytrain = np.array(df_train["data_in"])

# Xvalid = np.array(df_valid[columns].drop(labels=["data_in"], axis=1))
# Yvalid = np.array(df_valid["data_in"])

Xtest = np.array(df_test_honolulu[columns].drop(labels=["data_in"], axis=1))
Ytest = np.array(df_test_honolulu["data_in"])
# hyperparameters obtained by fine tuning
xgboost = XGBRegressor(
    n_estimators=170,
    learning_rate=0.1,
    max_depth=9,
    verbosity=0
)

xgboost.fit(Xtrain, Ytrain)
print("MSE on xgboost (test) : {:.5f}".format(mean_squared_error(Ytest, xgboost.predict(Xtest))))
print("MSE on xgboost (train): {:.5f}".format(mean_squared_error(Ytrain, xgboost.predict(Xtrain))))

Xtrain = np.array(df_train_honolulu[columns].drop(labels=["data_in"], axis=1))
Ytrain = np.array(df_train_honolulu["data_in"])

Xtest = np.array(df_test_honolulu[columns].drop(labels=["data_in"], axis=1))
Ytest = np.array(df_test_honolulu["data_in"])

model = LinearRegression()
model.fit(Xtrain, Ytrain)
print("MSE on Linear Regression (test) : {:.5f}".format(mean_squared_error(Ytest, model.predict(Xtest))))
print("MSE on Linear Regression (train): {:.5f}".format(mean_squared_error(Ytrain, model.predict(Xtrain))))

ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

In [None]:
df_train_lihue = df_train[df_train["name"] == "LIHUE"]
df_test_lihue = df_test[df_test["name"]=="LIHUE"]

Xtrain = np.array(df_train[columns].drop(labels=["data_in"], axis=1))
Ytrain = np.array(df_train["data_in"])

# Xvalid = np.array(df_valid[columns].drop(labels=["data_in"], axis=1))
# Yvalid = np.array(df_valid["data_in"])

Xtest = np.array(df_test_lihue[columns].drop(labels=["data_in"], axis=1))
Ytest = np.array(df_test_lihue["data_in"])
# hyperparameters obtained by fine tuning
xgboost = XGBRegressor(
    n_estimators=170,
    learning_rate=0.1,
    max_depth=9,
    verbosity=0
)

xgboost.fit(Xtrain, Ytrain)
print("MSE on xgboost (test) : {:.5f}".format(mean_squared_error(Ytest, xgboost.predict(Xtest))))
print("MSE on xgboost (train): {:.5f}".format(mean_squared_error(Ytrain, xgboost.predict(Xtrain))))

Xtrain = np.array(df_train_lihue[columns].drop(labels=["data_in"], axis=1))
Ytrain = np.array(df_train_lihue["data_in"])

Xtest = np.array(df_test_lihue[columns].drop(labels=["data_in"], axis=1))
Ytest = np.array(df_test_lihue["data_in"])

model = LinearRegression()
model.fit(Xtrain, Ytrain)
print("MSE on Linear Regression (test) : {:.5f}".format(mean_squared_error(Ytest, model.predict(Xtest))))
print("MSE on Linear Regression (train): {:.5f}".format(mean_squared_error(Ytrain, model.predict(Xtrain))))