# Subject: Machine Learning, 1st March 2024 to 3rd May 2024.
## Topic: Group Project (Task B)
## Learning Outcomes:
- MO2: Select and apply machine learning algorithms to formulate 
solutions to different types of machine learning problems, taking 
into account criteria such data availability and characteristics, and 
problem-specific requirements for balancing speed, accuracy, and 
explainability.

- MO3: Implement and evaluate contemporary machine learning 
solutions to application problems using a range of contemporary 
frameworks.

- MO4: Demonstrate an awareness of the ethical and societal 
implications of machine learning solutions. 

In [None]:
import subprocess
import os


current_directory = os.path.dirname(os.path.abspath("main.ipynb"))
current_directory += "/"

def install(requirements):
    """
    Install all the relevent project dependencies.
    """

    try:
        if os.path.isdir('.venv'):
            activate_script = os.path.join('.venv', 'bin', 'activate')
            subprocess.check_call(['source', activate_script], shell=True)

        with open(requirements, 'r') as f:
            requirements = f.read().splitlines()
            subprocess.check_call(['pip', 'install'] + requirements)
        print("Installed dependencies.")

    except FileNotFoundError:
        print(f"File '{requirements}' not found.")
    except subprocess.CalledProcessError:
        pass

if __name__ == "__main__":
    install(current_directory + "project/requirements.txt")



# Preprocessing our data

In [None]:
import pandas as pd
import datetime as dt
import warnings
from pandas.errors import SettingWithCopyWarning

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

raw_data = pd.read_csv(current_directory + "project/raw_data/BTC-USD.csv")
df = raw_data[["Date", "Close"]]

def to_datetime(s: str):
    year, month, day = s.split("-")
    return dt.datetime(
        int(year),
        int(month),
        int(day)
    )

close = df["Close"]
dates = df["Date"].apply(to_datetime)
print(df)

### Partitioning and Normalisation
Get the training set, validation set and test set for the model to work.

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Prepare X and y
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_close_price = 'ScaledClosePrice'
df[scaled_close_price] = scaler.fit_transform(df[["Close"]])

lookback = 10  # The number of previous steps to include in our next prediction.
X = []
y = []
for i in range(len(df) - lookback):
    # Historical close prices
    X.append(df[scaled_close_price].values[i:i+lookback])

    # Future close price from dataset
    y.append(df[scaled_close_price].values[i+lookback])

X = np.array(X)
y = np.array(y)

In [127]:
# Split the data into Samples, steps, then features
X = np.reshape(X, (X.shape[0], lookback, 1))

train_size = 0.7
test_size = 0.15

# Split the data into train, test and validation
# First, split into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, shuffle=False
)

# Calculate remaining size for validation after allocating test set
validation_size = (1.0 - train_size - test_size) / (1.0 - test_size)

# Split the remaining data into validation and new train sets
X_train, X_validation, y_train, y_validation = train_test_split(
    X_train, y_train, test_size=validation_size, shuffle=False
)

print("Training set:", len(X_train)/len(close), "%")
print("Validation set:", len(X_validation)/len(close), "%")
print("Test set:", len(X_test)/len(close), "%")

Training set: 0.6951469583048531 %
Validation set: 0.14900888585099112 %
Test set: 0.14900888585099112 %
<class 'numpy.ndarray'>


In [None]:
import matplotlib.pyplot as plt

# plt.figure(figsize=(10, 6))

# # # Plot the training, validation, and test sets
# plt.plot(dates[:len(X_train)], X_train, label='Training Set', color='blue')
# plt.plot(dates[len(X_train):len(X_train) + len(X_validation)], X_validation, label='Validation Set', color='orange')
# plt.plot(dates[len(X_train) + len(X_validation):], X_test, label='Test Set', color='green')

# plt.xlabel('Date')
# plt.ylabel('Close Price')
# plt.title("Cryptocurrency Market Prices")
# plt.legend()


train_indices = range(len(X_train))
test_indices = range(len(X_train), len(X_train) + len(X_test))
validation_indices = range(len(X_train) + len(X_test), len(X_train) + len(X_test) + len(X_validation))

# Plot train set
plt.plot(train_indices, y_train, label='Train', color='blue')

# Plot validation set
plt.plot(validation_indices, y_validation, label='Validation', color='orange')


# Plot test set
plt.plot(test_indices, y_test, label='Test', color='green')

# Add labels and title
plt.xlabel('Index')
plt.ylabel('Close Price')
plt.title('Train, Test, and Validation Sets')
plt.legend()

In [None]:
# # Visualise the data:
# target_variable = 'Next Close'
# X_reshaped = np.reshape(X, (X.shape[0], -1))
# X_df = pd.DataFrame(X_reshaped, columns=[f"Close_{i}" for i in range(X.shape[1])])
# y_df = pd.DataFrame(y, columns=[target_variable])
# data_df = pd.concat([X_df, y_df], axis=1)

# # Display the DataFrame
# print(data_df)

## Creating the Models

In [None]:
import tensorflow as tf

try:
    from keras.api.models import Sequential
    from keras.api.layers import LSTM
except ImportError:
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import LSTM

# Our LSTM units (neurons)
n_samples = len(dates)-1
timestep_delta = abs(dates.iloc[0].year - dates.iloc[-1].year)

# Sequence length
n_timesteps = int(n_samples / timestep_delta)


def create_model(
    units: int,
    activation: str = "relu",
    recurrent_activation: str = "sigmoid",
    optimizer: str = "adam"
) -> Sequential:

    model = Sequential()
    model.add(LSTM(
        units,
        activation=activation,
        recurrent_activation=recurrent_activation,
        return_sequences=True,
        input_shape=(n_timesteps, 1)
    ))
    model.compile(optimizer=optimizer, loss='mse')
    return model

lstm_model = create_model(
    units=128
)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# We use this object to monitor the loss of the model.
# After 'patient' consecutive attempts if no improvement we stop running the model.
# We use val_loss since we are fine-tuning with validation data.
loss_callback = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

# Fit the model.
# Since we dont have explicit labels for validation monitor performance
# during training instead.
model_data = lstm_model.fit(
    X_train, ,
    epochs=100,
    batch_size=32,
    validation_data=(X_validation, None),
    callbacks=[loss_callback]
)

## Grid Search the Models

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, RegressorMixin

# Create a keras wrapper for sklearn grid search
class KerasRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, func, **kwargs):
        self.func = func
        self.kwargs = kwargs

    def fit(self, X, y, **kwargs):
        self.model = self.func(**kwargs)
        self.model.fit(X, y, **kwargs)
        return self

    def predict(self, X):
        return self.model.predict(X)

regressor_model = KerasRegressor(lstm_model, verbose=0)

grid_search_params = {
    "units": [32, 64, 128]
}

def create_grid_search():
    model = GridSearchCV()