In [88]:
# basic imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklearn
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

#tensorflow
#import tensorflow as tf
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras.layers import Dense, Input

# others
from copy import deepcopy
from xgboost import XGBRegressor

# Variables from config file
from config import BASE_DIR, FILE_NAMES, LABELS, ATTRIBUTES, BEST_MODEL_COLUMNS, ISLAND_RANGES

In [2]:
# Split the stations by the number of samples available
columns = deepcopy(LABELS)
columns.extend(["season_wet", "elevation", "lat", "lon"])
for item in columns:
    print(item, end=' ')

# # load datasets
# df_train = pd.read_csv(f"{BASE_DIR}/train.csv", usecols=columns + ['year', 'month', 'skn', 'data_in'])
# df_valid = pd.read_csv(f"{BASE_DIR}/valid.csv", usecols=columns + ['year', 'month', 'skn', 'data_in'])
# df_test = pd.read_csv(f"{BASE_DIR}/test.csv", usecols=columns + ['year', 'month', 'skn', 'data_in'])
# df_combined = pd.concat([df_train, df_valid, df_test])

air2m air1000_500 hgt500 hgt1000 omega500 pottemp1000-500 pottemp1000-850 pr_wtr shum-uwnd-700 shum-uwnd-925 shum-vwnd-700 shum-vwnd-950 shum700 shum925 skt slp season_wet elevation lat lon 

In [15]:
df_load = pd.read_csv(f"{BASE_DIR}/nonfilled_dataset.csv")

# inner join with valid station (more than 300 stations)
threshold = 300
df_skn = df_load.groupby('skn').size().reset_index().rename(columns={0: "n_samples"})
df_skn_valid = df_skn[df_skn['n_samples'] > threshold]

df_data = df_load.merge(right=df_skn_valid, left_on='skn', right_on='skn')

In [16]:
# skn = df_data['skn'].sample().values[0]

# df_temp = df_data[df_data['year'] <= 2007]
# df_test = df_data[df_data['year'] > 2007]

# df_temp.sort_values(['year', 'month'], inplace=True)
# df_test.sort_values(['year', 'month'], inplace=True)

# Xtemp = np.array(df_temp[df_temp['skn'] == skn][columns])
# Ytemp = np.array(df_temp[df_temp['skn'] == skn]['data_in'])

# Xtrain, Xvalid, Ytrain, Yvalid = train_test_split(Xtemp, Ytemp, test_size=0.2, shuffle=False)

# Xtest = np.array(df_test[df_test['skn'] == skn][columns])
# Ytest = np.array(df_test[df_test['skn'] == skn]['data_in'])

In [82]:
skn = df_data['skn'].sample().values[0]
print(f'{skn}')
df_station = df_data[df_data['skn'] == skn]

X = np.array(df_station[columns])
Y = np.array(df_station['data_in'])

Xtemp, Xtest, Ytemp, Ytest = train_test_split(X, Y, test_size=0.2, shuffle=False)
Xtrain, Xvalid, Ytrain, Yvalid = train_test_split(Xtemp, Ytemp, test_size=0.2, shuffle=False)

255.0


In [83]:
scaler = StandardScaler()

Xtrain = scaler.fit_transform(Xtrain)
Xvalid = scaler.transform(Xvalid)
Xtest = scaler.transform(Xtest)

In [84]:
def root_mean_squared_error(y_true, y_pred):
    return mean_squared_error(y_true, y_pred, squared=False)

def define_model(num_inputs=20, lr=0.01):
    inputs = Input(shape=(num_inputs,))
    x = Dense(units=20, activation='relu')(inputs)
    x = Dense(units=16, activation='relu')(x)
    x = Dense(units=8, activation='relu')(x)
    x = Dense(units=4, activation='relu')(x)
    outputs = Dense(units=1, kernel_initializer='normal')(x)
    
    model = Model(inputs=inputs, outputs=outputs)
    
    model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=lr),
        loss='mse',
        metrics=[RootMeanSquaredError()]
    )
    
    return model

In [85]:
model = define_model()
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience = 3, mode='min')
epochs=20

history = model.fit(
    Xtrain, Ytrain, 
    epochs=epochs, 
    validation_data = (Xvalid, Yvalid),
    callbacks=[callback],
    batch_size=256
)

yhat = model.predict(Xtest)
print(mean_squared_error(Ytest, yhat, squared=False))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
1.8651224779434123


In [86]:
linear_regression = LinearRegression()
linear_regression.fit(Xtrain, Ytrain)
yhat = linear_regression.predict(Xtest)

print(mean_squared_error(Ytest, yhat))

3.525728016000597


In [89]:
params = {'n_estimators': 260, 'learning_rate': 0.1, 'max_depth': 3, 'early_stopping_rounds': 8, 'verbosity': 0}
xgboost = XGBRegressor(**params)

xgboost.fit(Xtrain, Ytrain)
yhat = xgboost.predict(Xtest)
print(mean_squared_error(Ytest, yhat, squared=False))

1.8924042344701761


In [13]:
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience = 3, mode='min')
# construct a model
model = models.Sequential()
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(8, activation='relu'))
model.add(layers.Dense(4, activation='relu'))
model.add(layers.Dense(1, kernel_initializer='normal'))

epochs = 20
model.compile(
    optimizer = 'adam',
    loss = 'mean_squared_error'
)
history = model.fit(
    Xtrain, Ytrain, 
    epochs=epochs, 
    validation_data = (Xvalid, Yvalid),
    callbacks=[callback],
    batch_size=256
)

2022-04-21 20:18:35.701595: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-04-21 20:18:35.706247: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2022-04-21 20:18:35.727848: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-04-21 20:18:35.727891: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: gpu-0008
2022-04-21 20:18:35.727902: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: gpu-0008
2022-04-21 20:18:35.728030: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 495.29.5
2022-04-21 20:18:35.728075: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 495.29.5
2022-04-21 20:18:35.728084: I tensorflow/stream_executor/cuda

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20


In [49]:
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(19, input_dim=19, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model
# evaluate model
estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=5, verbose=0)
kfold = KFold(n_splits=10)
results = cross_val_score(estimator, X, Y, cv=kfold)
print("Baseline: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Baseline: nan (nan) MSE


10 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/home/yusukemh/.conda/envs/climate/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/yusukemh/.conda/envs/climate/lib/python3.9/site-packages/tensorflow/python/keras/wrappers/scikit_learn.py", line 157, in fit
    self.model = self.build_fn(**self.filter_sk_params(self.build_fn))
  File "/tmp/ipykernel_234848/1957097577.py", line 3, in baseline_model
    model = Sequential()
NameError: name 'Sequential' is not defined



In [13]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1
