In [1]:
# basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.cluster import KMeans

#others
from xgboost import XGBRegressor
import cartopy.crs as ccrs
import cartopy.mpl.ticker as cticker
import time
import xarray as xr
import sherpa

# Variables from config file
from config import BASE_DIR, FILE_NAMES, LABELS, ATTRIBUTES


In [2]:
# Load the pre-split train / validation / test datasets from BASE_DIR.
df_train = pd.read_csv(f"{BASE_DIR}/train.csv")
df_valid = pd.read_csv(f"{BASE_DIR}/valid.csv")
df_test = pd.read_csv(f"{BASE_DIR}/test.csv")

# Nov-Apr = "wet", May-Oct = "dry"
wet = [11, 12, 1, 2, 3, 4]
dry = [5, 6, 7, 8, 9, 10]

# Add the two 0/1 season indicator columns to every split.
# `isin` is vectorized over the whole `month` column, so it avoids building a
# Python-level row object per record (as `apply(axis=1)` does) and also works
# on an empty frame.
for frame in (df_train, df_valid, df_test):
    frame['season_dry'] = frame['month'].isin(dry).astype(int)
    frame['season_wet'] = frame['month'].isin(wet).astype(int)

# Base names of the reanalysis variables used as predictors.
reanalysis_data = [
    'air2m',
    'air1000_500',
    'hgt500',
    'hgt1000',
    'omega500',
    'pottemp1000-500',
    'pottemp1000-850',
    'pr_wtr',
    'shum-uwnd-700',
    'shum-uwnd-925',
    'shum-vwnd-700',
    'shum-vwnd-950',
    'shum700',
    'shum925',
    'skt',
    'slp',
]

# Every variable is repeated with a numeric suffix 0..5, suffix-major
# (all "_0" names first, then all "_1", ...).
# NOTE(review): presumably the suffix indexes a grid point or lag - confirm against the dataset.
columns = [f"{variable}_{suffix}" for suffix in range(6) for variable in reanalysis_data]

# Target ("data_in"), station geography and the season indicators go last.
columns += ['data_in', 'lat', 'lon', 'elevation', 'season_wet', 'season_dry']

# Show the resulting column names on a single space-separated line.
print(*columns, end=' ')

air2m_0 air1000_500_0 hgt500_0 hgt1000_0 omega500_0 pottemp1000-500_0 pottemp1000-850_0 pr_wtr_0 shum-uwnd-700_0 shum-uwnd-925_0 shum-vwnd-700_0 shum-vwnd-950_0 shum700_0 shum925_0 skt_0 slp_0 air2m_1 air1000_500_1 hgt500_1 hgt1000_1 omega500_1 pottemp1000-500_1 pottemp1000-850_1 pr_wtr_1 shum-uwnd-700_1 shum-uwnd-925_1 shum-vwnd-700_1 shum-vwnd-950_1 shum700_1 shum925_1 skt_1 slp_1 air2m_2 air1000_500_2 hgt500_2 hgt1000_2 omega500_2 pottemp1000-500_2 pottemp1000-850_2 pr_wtr_2 shum-uwnd-700_2 shum-uwnd-925_2 shum-vwnd-700_2 shum-vwnd-950_2 shum700_2 shum925_2 skt_2 slp_2 air2m_3 air1000_500_3 hgt500_3 hgt1000_3 omega500_3 pottemp1000-500_3 pottemp1000-850_3 pr_wtr_3 shum-uwnd-700_3 shum-uwnd-925_3 shum-vwnd-700_3 shum-vwnd-950_3 shum700_3 shum925_3 skt_3 slp_3 air2m_4 air1000_500_4 hgt500_4 hgt1000_4 omega500_4 pottemp1000-500_4 pottemp1000-850_4 pr_wtr_4 shum-uwnd-700_4 shum-uwnd-925_4 shum-vwnd-700_4 shum-vwnd-950_4 shum700_4 shum925_4 skt_4 slp_4 air2m_5 air1000_500_5 hgt500_5 hgt1

In [3]:
# Peek at ten station names (rows 20..29 by position) from the training split.
df_train['name'].iloc[20:30]

20         FIELD 435
21          FIELD 51
22           NAALEHU
23           HONUAPO
24     MOAULA TUNNEL
25        MOAULA RES
26      HIGASHI CAMP
27            MOAULA
28    ALILI TUNNEL 2
29             ALILI
Name: name, dtype: object

In [4]:
# Station whose test rows are used for evaluation.
station = "H 23 CAMP 9"

# Rows belonging to that station in the train and test splits.
df_train_station = df_train.loc[df_train['name'] == station]
df_test_station = df_test.loc[df_test['name'] == station]

# xgboost trains on the entire dataset: all feature columns of `columns`
# (i.e. everything except the target "data_in") for every training row.
Xtrain = df_train[columns].drop(columns=["data_in"]).to_numpy()
Ytrain = df_train["data_in"].to_numpy(copy=True)

# ... but it is evaluated on the selected station's test rows only.
Xtest = df_test_station[columns].drop(columns=["data_in"]).to_numpy()
Ytest = df_test_station["data_in"].to_numpy(copy=True)

In [5]:
# hyperparameters obtained by fine tuning
xgboost_params = dict(
    n_estimators=170,
    learning_rate=0.1,
    max_depth=9,
    verbosity=0,
)
xgboost = XGBRegressor(**xgboost_params)
xgboost.fit(Xtrain, Ytrain)

# Mean squared error on the evaluation arrays and on the arrays the model was fitted on.
xgboost_test_mse = mean_squared_error(Ytest, xgboost.predict(Xtest))
xgboost_train_mse = mean_squared_error(Ytrain, xgboost.predict(Xtrain))
print("MSE on xgboost (test) : {:.5f}".format(xgboost_test_mse))
print("MSE on xgboost (train): {:.5f}".format(xgboost_train_mse))

MSE on xgboost (test) : 7.95053
MSE on xgboost (train): 4.65344


In [7]:
# linear regression trains on the station dataset
# (features = every entry of `columns` except the target "data_in")
Xtrain = df_train_station[columns].drop(columns=["data_in"]).to_numpy()
Ytrain = df_train_station["data_in"].to_numpy(copy=True)
# test on the station data
Xtest = df_test_station[columns].drop(columns=["data_in"]).to_numpy()
Ytest = df_test_station["data_in"].to_numpy(copy=True)

# NOTE(review): the shapes recorded further down for this station are
# (79, 101) for Xtrain, i.e. fewer samples than features. That is consistent
# with the recorded train MSE of 0 next to a much larger test MSE;
# consider a regularized model or fewer features.
model = LinearRegression().fit(Xtrain, Ytrain)

linreg_test_mse = mean_squared_error(Ytest, model.predict(Xtest))
linreg_train_mse = mean_squared_error(Ytrain, model.predict(Xtrain))
print("MSE on Linear Regression (test) : {:.5f}".format(linreg_test_mse))
print("MSE on Linear Regression (train): {:.9f}".format(linreg_train_mse))

MSE on Linear Regression (test) : 98.38768
MSE on Linear Regression (train): 0.000000000


In [21]:
# (n_samples, n_features) of whatever feature matrix is currently bound to Xtrain.
Xtrain.shape

(79, 101)

In [22]:
# (n_samples, n_features) of whatever feature matrix is currently bound to Xtest.
Xtest.shape

(29, 101)

In [9]:
# Coefficients of the fitted linear model currently bound to `model`.
# NOTE(review): this displays the whole coefficient array; a slice such as
# model.coef_[:10] would keep the output short.
model.coef_

array([ -1.37472834,  -3.9388546 ,   1.66263802,   3.64402281,
         0.42597255,  -4.89964856,  -1.86897793,  -2.65666209,
        -3.61230509,   3.24624348,   1.03520429,   2.50901055,
        -0.24589212,   9.90739521,  -5.24803597, -31.29209095,
        29.36384864,  15.68540875,  -3.67418031,   4.5585109 ,
        10.48984597,  26.1794364 , -10.00933717,  -2.07021897,
         8.57326135,  -5.46548312,   5.00506894,  -4.28891818,
        26.28797123,   8.61259018,  20.50927849,   3.13582043,
       -14.76622433, -22.73499744,   1.60480643,  -3.19608718,
        11.62962897, -14.66302912,  12.91253628,   2.7244002 ,
        -6.02482954,   2.68608791,   0.28615481,   3.1327232 ,
       -16.39853266, -12.60992938, -20.1791427 , -23.60575842,
       -14.02929353, -12.34466543,  -4.79106681,   3.03305395,
        -3.82026822,   5.63260849,   6.70444031,  -2.40305905,
         8.42467118,  -3.53881079,   2.90015113,  -2.44406572,
         7.74875935, -15.65813399,  12.6652431 , -17.92

In [20]:
# Recompute each training prediction by hand (coefficients . features + intercept)
# and print it next to the corresponding target value.
for features, target in zip(Xtrain, Ytrain):
    manual_prediction = np.dot(features, model.coef_) + model.intercept_
    print(manual_prediction, target)

4.669999999998254 4.67
3.069999999999709 3.0700000000000003
4.000000000021828 4.0
4.189999999987776 4.19
3.390000000021246 3.39
1.8299999999726424 1.83
4.640000000021246 4.64
3.7699999999967986 3.77
7.990000000019791 7.99
1.139999999999418 1.14
0.750000000007276 0.75
0.9900000000052387 0.99
6.7000000000116415 6.7
5.25 5.25
3.8800000000192085 3.88
9.809999999975844 9.81
3.1200000000026193 3.12
0.8500000000058208 0.85
2.2400000000125146 2.24
3.1200000000026193 3.12
4.180000000014843 4.18
7.400000000001455 7.4
15.319999999999709 15.32
18.68000000000029 18.68
6.480000000010477 6.48
1.3100000000194996 1.31
4.07999999999447 4.08
2.5300000000133878 2.53
12.920000000020082 12.92
6.010000000016589 6.01
13.419999999998254 13.42
14.71999999997206 14.72
9.750000000007276 9.75
7.649999999986903 7.65
4.080000000001746 4.08
10.800000000010186 10.8
6.580000000023574 6.58
6.169999999998254 6.17
2.1300000000046566 2.13
3.580000000009022 3.58
5.810000000012224 5.8100000000000005
3.900000000001455 3.9
5.0

In [14]:
# First target value of the array currently bound to Ytrain.
Ytrain[0]

4.67

In [15]:
# Scratch check of the 1-D dot product: 1*2 + 2*3 + 3*4 = 20.
np.dot(np.array([1, 2, 3]), np.array([2, 3, 4]))

20

In [28]:
# Pick (up to) 10 distinct stations at random, reproducibly.
# The original line did not parse (`random_state=42?`), and np.random.choice
# has no `random_state` argument; a seeded Generator provides the fixed seed.
# Sampling from the unique names without replacement means no station is
# drawn twice (drawing from the raw column could repeat a station).
rng = np.random.default_rng(42)
station_names = df_train['name'].dropna().unique()
stations = rng.choice(station_names, size=min(10, len(station_names)), replace=False)

In [31]:
# For every sampled station, compare
#   * one XGBoost model fitted on the whole training set, and
#   * a linear regression fitted on that station's training rows only,
# both evaluated on the station's test rows.

# Features are every entry of `columns` except the target "data_in".
feature_columns = [name for name in columns if name != "data_in"]

# xgboost trains on the entire dataset. Neither that data nor the
# hyperparameters depend on the station, so the model is fitted once here
# instead of once per loop iteration; the training MSE is likewise computed once.
# (Unlike the per-iteration version, this fit also runs when every station ends up skipped.)
Xtrain_all = df_train[feature_columns].to_numpy()
Ytrain_all = df_train["data_in"].to_numpy()

# hyperparameters obtained by fine tuning
xgboost = XGBRegressor(
    n_estimators=170,
    learning_rate=0.1,
    max_depth=9,
    verbosity=0
)
xgboost.fit(Xtrain_all, Ytrain_all)
xgboost_train_mse = mean_squared_error(Ytrain_all, xgboost.predict(Xtrain_all))

for station in stations:
    df_train_station = df_train[df_train['name'] == station]
    df_test_station  = df_test[df_test['name'] == station]
    # A station needs rows in both splits to be fitted and evaluated.
    if len(df_train_station) == 0 or len(df_test_station) == 0:
        continue
    print("=========================================")
    print(f"Running experiment on {station} station.")
    print("There are:")
    print(f"{df_train_station.shape[0]} training data and")
    print(f"{df_test_station.shape[0]} test data")
    print("=========================================")

    # test on the station data (shared by both models)
    Xtest = df_test_station[feature_columns].to_numpy()
    Ytest = df_test_station["data_in"].to_numpy()

    print("MSE on xgboost (test) : {:.5f}".format(mean_squared_error(Ytest, xgboost.predict(Xtest))))
    print("MSE on xgboost (train): {:.5f}".format(xgboost_train_mse))

    # linear regression trains on the station dataset
    Xtrain = df_train_station[feature_columns].to_numpy()
    Ytrain = df_train_station["data_in"].to_numpy()

    model = LinearRegression()
    model.fit(Xtrain, Ytrain)
    print("MSE on Linear Regression (test) : {:.5f}".format(mean_squared_error(Ytest, model.predict(Xtest))))
    print("MSE on Linear Regression (train): {:.5f}".format(mean_squared_error(Ytrain, model.predict(Xtrain))))

Running experiment on H 23 CAMP 9 station.
There are:
79 training data and
29 test data
MSE on xgboost (test) : 7.95053
MSE on xgboost (train): 4.65344
MSE on Linear Regression (test) : 98.38768
MSE on Linear Regression (train): 0.00000
Running experiment on FIELD 271 station.
There are:
450 training data and
135 test data
MSE on xgboost (test) : 5.64846
MSE on xgboost (train): 4.65344
MSE on Linear Regression (test) : 6.40395
MSE on Linear Regression (train): 4.26141
Running experiment on Kahakuloa station.
There are:
483 training data and
142 test data
MSE on xgboost (test) : 7.78734
MSE on xgboost (train): 4.65344
MSE on Linear Regression (test) : 7.04884
MSE on Linear Regression (train): 4.99478
Running experiment on PAUWELA station.
There are:
450 training data and
135 test data
MSE on xgboost (test) : 7.40762
MSE on xgboost (train): 4.65344
MSE on Linear Regression (test) : 7.71454
MSE on Linear Regression (train): 4.55527
Running experiment on KAHEKA station.
There are:
450 trai

In [10]:
station = "HILO"

# Rows recorded for this station in the train and test splits.
df_train_station = df_train.loc[df_train['name'] == station]
df_test_station = df_test.loc[df_test['name'] == station]

# test on the station data (same evaluation set for both models)
Xtest = df_test_station[columns].drop(columns=["data_in"]).to_numpy()
Ytest = df_test_station["data_in"].to_numpy(copy=True)

# --- xgboost trains on the entire dataset ---
Xtrain = df_train[columns].drop(columns=["data_in"]).to_numpy()
Ytrain = df_train["data_in"].to_numpy(copy=True)

# hyperparameters obtained by fine tuning
xgboost = XGBRegressor(n_estimators=170, learning_rate=0.1, max_depth=9, verbosity=0)
xgboost.fit(Xtrain, Ytrain)

xgboost_test_mse = mean_squared_error(Ytest, xgboost.predict(Xtest))
xgboost_train_mse = mean_squared_error(Ytrain, xgboost.predict(Xtrain))
print("MSE on xgboost (test) : {:.5f}".format(xgboost_test_mse))
print("MSE on xgboost (train): {:.5f}".format(xgboost_train_mse))

# --- linear regression trains on the station dataset ---
Xtrain = df_train_station[columns].drop(columns=["data_in"]).to_numpy()
Ytrain = df_train_station["data_in"].to_numpy(copy=True)

model = LinearRegression().fit(Xtrain, Ytrain)

linreg_test_mse = mean_squared_error(Ytest, model.predict(Xtest))
linreg_train_mse = mean_squared_error(Ytrain, model.predict(Xtrain))
print("MSE on Linear Regression (test) : {:.5f}".format(linreg_test_mse))
print("MSE on Linear Regression (train): {:.5f}".format(linreg_train_mse))

MSE on xgboost (test) : 36.22984
MSE on xgboost (train): 4.65344
MSE on Linear Regression (test) : 33.09867
MSE on Linear Regression (train): 24.18849


In [11]:
station = "LIHUE"

# Rows recorded for this station in the train and test splits.
df_train_station = df_train.loc[df_train['name'] == station]
df_test_station = df_test.loc[df_test['name'] == station]

# test on the station data (same evaluation set for both models)
Xtest = df_test_station[columns].drop(columns=["data_in"]).to_numpy()
Ytest = df_test_station["data_in"].to_numpy(copy=True)

# --- xgboost trains on the entire dataset ---
Xtrain = df_train[columns].drop(columns=["data_in"]).to_numpy()
Ytrain = df_train["data_in"].to_numpy(copy=True)

# hyperparameters obtained by fine tuning
xgboost = XGBRegressor(n_estimators=170, learning_rate=0.1, max_depth=9, verbosity=0)
xgboost.fit(Xtrain, Ytrain)

xgboost_test_mse = mean_squared_error(Ytest, xgboost.predict(Xtest))
xgboost_train_mse = mean_squared_error(Ytrain, xgboost.predict(Xtrain))
print("MSE on xgboost (test) : {:.5f}".format(xgboost_test_mse))
print("MSE on xgboost (train): {:.5f}".format(xgboost_train_mse))

# --- linear regression trains on the station dataset ---
Xtrain = df_train_station[columns].drop(columns=["data_in"]).to_numpy()
Ytrain = df_train_station["data_in"].to_numpy(copy=True)

model = LinearRegression().fit(Xtrain, Ytrain)

linreg_test_mse = mean_squared_error(Ytest, model.predict(Xtest))
linreg_train_mse = mean_squared_error(Ytrain, model.predict(Xtrain))
print("MSE on Linear Regression (test) : {:.5f}".format(linreg_test_mse))
print("MSE on Linear Regression (train): {:.5f}".format(linreg_train_mse))

MSE on xgboost (test) : 5.79774
MSE on xgboost (train): 4.65344
MSE on Linear Regression (test) : 6.01450
MSE on Linear Regression (train): 5.17918


In [13]:
station = "KALAE"

# Rows recorded for this station in the train and test splits.
df_train_station = df_train.loc[df_train['name'] == station]
df_test_station = df_test.loc[df_test['name'] == station]

# test on the station data (same evaluation set for both models)
Xtest = df_test_station[columns].drop(columns=["data_in"]).to_numpy()
Ytest = df_test_station["data_in"].to_numpy(copy=True)

# --- xgboost trains on the entire dataset ---
Xtrain = df_train[columns].drop(columns=["data_in"]).to_numpy()
Ytrain = df_train["data_in"].to_numpy(copy=True)

# hyperparameters obtained by fine tuning
xgboost = XGBRegressor(n_estimators=170, learning_rate=0.1, max_depth=9, verbosity=0)
xgboost.fit(Xtrain, Ytrain)

xgboost_test_mse = mean_squared_error(Ytest, xgboost.predict(Xtest))
xgboost_train_mse = mean_squared_error(Ytrain, xgboost.predict(Xtrain))
print("MSE on xgboost (test) : {:.5f}".format(xgboost_test_mse))
print("MSE on xgboost (train): {:.5f}".format(xgboost_train_mse))

# --- linear regression trains on the station dataset ---
Xtrain = df_train_station[columns].drop(columns=["data_in"]).to_numpy()
Ytrain = df_train_station["data_in"].to_numpy(copy=True)

model = LinearRegression().fit(Xtrain, Ytrain)

linreg_test_mse = mean_squared_error(Ytest, model.predict(Xtest))
linreg_train_mse = mean_squared_error(Ytrain, model.predict(Xtrain))
print("MSE on Linear Regression (test) : {:.5f}".format(linreg_test_mse))
print("MSE on Linear Regression (train): {:.5f}".format(linreg_train_mse))

MSE on xgboost (test) : 3.94315
MSE on xgboost (train): 4.65344
MSE on Linear Regression (test) : 5.24540
MSE on Linear Regression (train): 2.00699


In [32]:
station = "KAUPAKULOA FT 10"

# Rows recorded for this station in the train and test splits.
df_train_station = df_train.loc[df_train['name'] == station]
df_test_station = df_test.loc[df_test['name'] == station]

# linear regression trains on the station dataset
# (features = every entry of `columns` except the target "data_in")
Xtrain = df_train_station[columns].drop(columns=["data_in"]).to_numpy()
Ytrain = df_train_station["data_in"].to_numpy(copy=True)
# test on the station data
Xtest = df_test_station[columns].drop(columns=["data_in"]).to_numpy()
Ytest = df_test_station["data_in"].to_numpy(copy=True)

model = LinearRegression().fit(Xtrain, Ytrain)

linreg_test_mse = mean_squared_error(Ytest, model.predict(Xtest))
linreg_train_mse = mean_squared_error(Ytrain, model.predict(Xtrain))
print("MSE on Linear Regression (test) : {:.5f}".format(linreg_test_mse))
print("MSE on Linear Regression (train): {:.5f}".format(linreg_train_mse))

MSE on Linear Regression (test) : 76.46426
MSE on Linear Regression (train): 0.00000


In [40]:
# Target values ("data_in") of the currently selected station's training rows,
# and how many such rows there are.
data = df_train_station['data_in'].to_numpy()
n_data = len(df_train_station)