In [1]:
# Import helpers
from src.helpers import *

# Import base
import os
import sys
import time
from contextlib import redirect_stdout
from datetime import datetime

import numpy as np
import pandas as pd

# Preprocessing
from sklearn.pipeline import Pipeline

# Feature selection
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

# Dimensionality reduction
from umap import UMAP

# tensorflow
import tensorflow as tf

# Keras tuner
import keras_tuner

# Warnings
import warnings
warnings.filterwarnings("ignore")

2024-01-06 16:05:52.846070: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-06 16:05:52.874869: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-06 16:05:52.874889: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-06 16:05:52.875713: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-06 16:05:52.881184: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Set seeds for reproducibility.
# set_seeds comes from the star import of src.helpers; which random number
# generators it seeds is not visible in this notebook — TODO confirm it covers
# Python, NumPy and TensorFlow.
set_seeds()

In [3]:
# Reset the global Keras state so that anything created earlier in this kernel
# does not carry over into the models built below.
tf.keras.backend.clear_session()

In [4]:
# Reload the saved scaled data
def _read_split(path):
    """Read one saved split from CSV, using its 'unix' column as the index."""
    return pd.read_csv(path, parse_dates=True, index_col='unix')


# Features and labels for each split, loaded in the same order as before
X_train = _read_split('data/train/scaled_X_train.csv')
y_train = _read_split('data/train/y_train.csv')
X_test = _read_split('data/test/scaled_X_test.csv')
y_test = _read_split('data/test/y_test.csv')
X_val = _read_split('data/val/scaled_X_val.csv')
y_val = _read_split('data/val/y_val.csv')

In [5]:
# Calculate the weights for the imbalanced classes.
# The labels of all three splits are pooled before the weights are computed.
# NOTE(review): pooling lets the validation/test label frequencies influence
# weights that are later used when fitting on the training split — consider
# computing them from y_train only.
y = pd.concat([y_train, y_val, y_test])
# cwts is provided by src.helpers (star import); its return format is not
# visible in this notebook.
weights = cwts(y.values.flatten())

In [6]:
# Metrics
# Created once at module level and passed to model.compile() inside `baseline`,
# so every model built during the search shares these same metric objects.
# The early-stopping callback and the tuner objective further down both
# monitor 'val_precision'.
binary_accuracy = tf.keras.metrics.BinaryAccuracy()
precision = tf.keras.metrics.Precision()
recall = tf.keras.metrics.Recall()

2024-01-06 16:05:58.585030: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-06 16:05:58.607907: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-06 16:05:58.608042: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [7]:
# Run the pipeline: the VIFTransform step followed by Boruta feature selection
vif = VIFTransform(threshold=5)
rf = RandomForestClassifier(class_weight=weights, n_jobs=-1)
boruta = BorutaPy(estimator=rf, n_estimators='auto', perc=90, verbose=2)
pipe = Pipeline(steps=[('vif', vif), ('boruta', boruta)], verbose=True)

# Fit both steps on the training split only, then reuse the fitted pipeline
# to transform the validation split
train_labels = y_train.values.ravel()
X_train_pipe = pipe.fit_transform(X_train, train_labels)
X_val_pipe = pipe.transform(X_val)

Calculating VIF Factors
Calculating VIF Factors - Complete
[Pipeline] ............... (step 1 of 2) Processing vif, total= 5.7min
Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	41
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	41
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	41
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	41
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	41
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	41
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	41
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	7
Tentative: 	6
Rejected: 	28
Iteration: 	9 / 100
Confirmed: 	7
Tentative: 	6
Rejected: 	28
Iteration: 	10 / 100
Confirmed: 	7
Tentative: 	6
Rejected: 	28
Iteration: 	11 / 100
Confirmed: 	7
Tentative: 	6
Rejected: 	28
Iteration: 	12 / 100
Confirmed: 	7
Tentative: 	5
Rejected: 	29
Iteration: 	13 / 100
Confirmed: 	7
Tentative: 	5
Rejected: 	29
Iteration: 	14 / 100
Confirmed: 	7
Tentative: 	5
Rejected: 	

In [42]:
# Calculate feature length: the size of the last axis of the selected training
# features, used below as the feature dimension of the model input.
featurelen = X_train_pipe.shape[-1]

In [50]:
# Hypermodel that defines the search space and delegates model construction to a builder function
class IterableHyperModel(keras_tuner.HyperModel):
    """Hypermodel with a fixed search space and a pluggable model builder.

    The search space (layer sizes, dropout rate, learning rate, optimizer and
    activations) is declared in `build`; the actual network is produced by
    `model_func`, so several architectures can be tuned with the same class.

    Args:
        inputs: Input tensor handed to `model_func` unchanged.
        model_func: Callable invoked as
            `model_func(inputs, units_1, units_2, units_3, dr, lr, optimizer,
            activation_1, activation_2, activation_3)`; its return value is
            returned from `build`.
        name: Optional hypermodel name, forwarded to `keras_tuner.HyperModel`.
        tunable: Forwarded to `keras_tuner.HyperModel`.
    """

    def __init__(self, inputs, model_func, name=None, tunable=True):
        # Forward name/tunable to the base class; previously both arguments
        # were accepted but silently discarded and the base initialiser was
        # never run.
        super().__init__(name=name, tunable=tunable)
        self.inputs = inputs
        self.model_func = model_func

    def build(self, hp):
        """Sample one hyperparameter configuration and build the model.

        Args:
            hp: Hyperparameter container supplied by the tuner.

        Returns:
            Whatever `model_func` returns for the sampled configuration.
        """
        # Hyperparameters are registered in a fixed order:
        # units, dropout, learning rate, optimizer, activations.

        # Units
        units_1 = hp.Int('units_1', min_value=16, max_value=512, step=16)
        units_2 = hp.Int('units_2', min_value=16, max_value=512, step=16)
        units_3 = hp.Int('units_3', min_value=16, max_value=512, step=16)

        # Dropout rate
        dr = hp.Float('dropout_rate', min_value=0.0, max_value=0.9)

        # Learning rate
        # NOTE(review): 0.05-0.5 looks high for these optimizers — confirm the
        # range is intended.
        lr = hp.Float('learning_rate', min_value=0.05, max_value=0.5)

        # Optimizer: map the sampled name to its class, then instantiate it
        # with the sampled learning rate
        optimizer_classes = {
            'adam': tf.keras.optimizers.Adam,
            'rmsprop': tf.keras.optimizers.RMSprop,
            'adagrad': tf.keras.optimizers.Adagrad,
        }
        hp_optimizer = hp.Choice('optimizer', ['adam', 'rmsprop', 'adagrad'])
        optimizer = optimizer_classes[hp_optimizer](learning_rate=lr)

        # Activations: the same candidates are offered for every layer
        activation_choices = ['relu', 'elu', 'tanh', 'sigmoid', 'selu']
        activation_1 = hp.Choice('activation_1', activation_choices)
        activation_2 = hp.Choice('activation_2', activation_choices)
        activation_3 = hp.Choice('activation_3', activation_choices)

        return self.model_func(self.inputs, units_1, units_2, units_3,
                               dr, lr, optimizer,
                               activation_1, activation_2, activation_3)

In [51]:
# Baseline model
def baseline(inputs, units_1, units_2, units_3, dr, lr, optimizer, activation_1, activation_2, activation_3):
    """Build and compile the baseline network: one LSTM layer and a sigmoid output unit.

    Only `inputs`, `units_1`, `activation_1` and `optimizer` are used. The
    remaining parameters (`units_2`, `units_3`, `dr`, `lr`, `activation_2`,
    `activation_3`) are accepted so the signature matches the call made by
    `IterableHyperModel.build`, but this model ignores them.

    Returns:
        The compiled `tf.keras.Model`. Its summary is printed as a side effect.
    """
    # Recurrent layer feeding a single sigmoid unit
    hidden = tf.keras.layers.LSTM(
        units_1,
        activation=activation_1,
        name='lstm-baseline',
    )(inputs)
    outputs = tf.keras.layers.Dense(
        units=1,
        activation='sigmoid',
        name='dense-baseline',
    )(hidden)
    model = tf.keras.Model(inputs, outputs)

    # Print the architecture
    model.summary()

    # Compile with the module-level metric objects
    model.compile(
        loss='binary_crossentropy',
        optimizer=optimizer,
        weighted_metrics=[binary_accuracy, precision, recall],
    )
    return model
        

In [52]:
# Model builders to tune, as [builder_function, name] pairs; the name is used
# to build the TensorBoard, tuner-project and log paths in the search loop.
models = [[baseline, 'baseline']]

In [53]:
# Number of past time steps in each input window
seqlen = 6

# The windowed datasets do not depend on the model being tuned, so they are
# built once instead of on every loop iteration. Targets start at row `seqlen`,
# so the window that begins at row i is paired with the label at row i + seqlen.
# NOTE(review): confirm that this one-step-after-the-window alignment is the
# intended prediction horizon.
train_tensors = tf.keras.utils.timeseries_dataset_from_array(
    X_train_pipe, y_train.iloc[seqlen:], seqlen)
val_tensors = tf.keras.utils.timeseries_dataset_from_array(
    X_val_pipe, y_val.iloc[seqlen:], seqlen)

for model_func, name in models:

    # Define the input
    inputs = tf.keras.Input(shape=(seqlen, featurelen))

    # Define the file paths
    filepath = f'./tensorboard/model_testing/{name}'
    modelpath = f'./models/model_testing/{name}'
    logpath = f'./logs/model_testing/{name}'

    # open() does not create missing parent directories, so make sure the log
    # directory exists before the search starts
    os.makedirs(os.path.dirname(logpath), exist_ok=True)

    # Define the callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            patience=5, monitor='val_precision', mode='max'),
        tf.keras.callbacks.TensorBoard(log_dir=filepath, histogram_freq=1)]

    # Initialise the tuner for this model builder
    tuner = keras_tuner.RandomSearch(
        IterableHyperModel(inputs, model_func),
        objective=keras_tuner.Objective('val_precision', direction='max'),
        max_trials=3,
        executions_per_trial=2,
        overwrite=True,
        project_name=modelpath,
        seed=42)

    # NOTE(review): the class weights computed earlier (`weights`) are not
    # passed to the search, although the model is compiled with
    # weighted_metrics — confirm whether class_weight should be supplied.

    # Send everything printed during the search to the log file.
    # redirect_stdout restores sys.stdout on exit, even if the search raises;
    # assigning sys.stdout directly left it pointing at a closed file.
    with open(logpath, 'w') as log_file, redirect_stdout(log_file):
        tuner.search(train_tensors, epochs=1000,
                     validation_data=val_tensors, callbacks=callbacks)