In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.gaussian_process import GaussianProcessRegressor, GaussianProcessClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, auc, f1_score, balanced_accuracy_score, roc_curve
from sklearn.model_selection import train_test_split
from math import sqrt

from sklearn.datasets import load_boston, load_breast_cancer
from sklearn.gaussian_process.kernels import *

<h4>Regression (Boston housing dataset)</h4>

In [17]:
# loading the problem
X, y = load_boston(return_X_y=True)
X = pd.DataFrame(X)

scaler = MinMaxScaler()
y = scaler.fit_transform(y.reshape(-1, 1))

In [18]:
# creating the train and validation datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [31]:
#https://scikit-learn.org/stable/modules/classes.html#module-sklearn.gaussian_process
#https://scikit-learn.org/stable/modules/gaussian_process.html#gp-kernels
kernels = {'Dot-Product and White': DotProduct() + WhiteKernel(), \
           'RBF and White': RBF() + WhiteKernel(), \
           'RBF': RBF(), \
           'Rational Quadratic': RationalQuadratic(), \
           'Rational Quadratic and White': RationalQuadratic() + WhiteKernel(), \
           'Matern': Matern(), \
           'Matern and White': Matern() + WhiteKernel(),\
           'Constant': ConstantKernel(), \
           'Constant and White': ConstantKernel() + WhiteKernel()}

In [33]:
# calculating the metrics
df_results_regression = pd.DataFrame(columns=['Kernel', 'MAE', 'RMSE'])

for kernel in kernels:
    gpr = GaussianProcessRegressor(kernel=kernels[kernel], random_state=0, normalize_y=True).fit(X_train, y_train)
    predictions = gpr.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    rmse = sqrt(mean_squared_error(y_test, predictions))
    df_results_regression = df_results_regression.append({'Kernel': kernel, \
                                                          'MAE': mae, 'RMSE': rmse}, ignore_index=True)

df_results_regression

Unnamed: 0,Kernel,MAE,RMSE
0,Dot-Product and White,0.085055,0.128512
1,RBF and White,0.083953,0.122438
2,RBF,0.115605,0.179653
3,Rational Quadratic,0.094191,0.147125
4,Rational Quadratic and White,0.083953,0.122437
5,Matern,0.083188,0.124394
6,Matern and White,0.082919,0.122313
7,Constant,0.13889,0.200717
8,Constant and White,0.138891,0.200718


<h4>Classification (Breast cancer dataset)</h4>

In [36]:
# loading the problem
X, y = load_breast_cancer(return_X_y=True)
X = pd.DataFrame(X)

In [37]:
# creating the train and validation datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [40]:
# calculating the metrics
df_results_classification = pd.DataFrame(columns=['Kernel', 'AUC', 'F-score', 'Balanced Accuracy'])

for kernel in kernels:
    gpr = GaussianProcessClassifier(kernel=kernels[kernel], random_state=0, n_jobs=-1, max_iter_predict=1000).fit(X_train, y_train)
    predictions = gpr.predict(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, predictions)
    auc_pred = auc(fpr, tpr)

    f1_score_pred = f1_score(y_test, predictions)
    bal_accuracy_score = balanced_accuracy_score(y_test, predictions)
    
    df_results_classification = df_results_classification.append({'Kernel': kernel, \
                                                                  'AUC': auc_pred, \
                                                                  'F-score': f1_score_pred, \
                                                                  'Balanced Accur'}, ignore_index=True)

df_results_regression

In [48]:
gpr = GaussianProcessClassifier(kernel=kernels['RBF'], random_state=0, n_jobs=-1, max_iter_predict=1000).fit(X_train, y_train)
predictions = gpr.predict(X_test)



In [51]:
auc_pred

0.9318831375039694

In [None]:
# calculating the metrics
df_results_regression = pd.DataFrame(columns=['Kernel', 'MAE', 'RMSE'])

for kernel in kernels:
    gpr = GaussianProcessRegressor(kernel=kernels[kernel], random_state=0, normalize_y=True).fit(X_train, y_train)
    predictions = gpr.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    rmse = sqrt(mean_squared_error(y_test, predictions))
    df_results_regression = df_results.append({'Kernel': kernel, 'MAE': mae, 'RMSE': rmse}, ignore_index=True)

df_results_regression

In [None]:
# ensuring we are using the GPU
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [None]:
def create_model(kernel_size, lstm_dimensionality, dense_dimensionality, number_dense_layers):
    model = Sequential()
    model.add(Conv1D(filters=limit, kernel_size=kernel_size, strides=1, padding='causal', activation='relu', input_shape=[None, 1]))
    model.add(LSTM(lstm_dimensionality))

    for _ in range(number_dense_layers):
        model.add(Dense(dense_dimensionality))

    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error', 'mean_absolute_error'])

    #model.summary()
    return model

In [None]:
# hyperparameters
kernel_size = limit
lstm_dimensionality = [limit, limit*2]
dense_dimensionality = [10, 50, 100]
number_dense_layers = [1, 4, 8]

batch_size = 512
epochs = 100

In [None]:
models = {}
for lstm_dim in lstm_dimensionality:
    for dense_dim in dense_dimensionality:
        for num_layers in number_dense_layers:
            display(f'LSTM dimensionality: {lstm_dim}, dense layer dimensionality: {dense_dim} @ {num_layers} dense layers.')
            model = create_model(kernel_size, lstm_dim, dense_dim, num_layers)
            model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
            models[(lstm_dim, dense_dim, num_layers)] = model

In [None]:
# retrieving the predictions
df_comparison = df_validation[['Datetime', target]].copy()
df_comparison[target] = scaler.inverse_transform(y_test.reshape(-1, 1)).flatten()

for model in models:
    df_comparison[str(model)] = scaler.inverse_transform(models[model].predict(x_test)).flatten()

In [None]:
# calculating the metrics
df_results = pd.DataFrame(columns=['LSTM Dimensionality', 'Dense Dimensionality', 'Dense Layers', 'MAE', 'RMSE'])

for model in models:
    mae = mean_absolute_error(df_comparison[target], df_comparison[str(model)])
    rmse = sqrt(mean_squared_error(df_comparison[target], df_comparison[str(model)]))
    df_results = df_results.append({'LSTM Dimensionality': model[0], 'Dense Dimensionality': model[1], \
                                    'Dense Layers': model[2], 'MAE': mae, 'RMSE': rmse}, ignore_index=True)

for col in ['LSTM Dimensionality', 'Dense Dimensionality', 'Dense Layers']:
    df_results[col] = df_results[col].astype(int)

df_results

In [None]:
df_plot = pd.melt(df_comparison, id_vars=['Datetime'], value_vars=list(set(df_comparison.columns) - set(['Datetime'])))

for model in models:    
    plt.figure(figsize=(19, 6))
    sns.lineplot(x='Datetime', y='value', hue='variable', data=df_plot[(df_plot['variable']==target) | (df_plot['variable']==str(model))])
    plt.show()

In [None]:
df_results.to_csv('results.csv', encoding='utf-8', index=False)