In [1]:
import pandas as pd
import numpy as np
import math
from collections import defaultdict

# Finance
import mplfinance as mpf
import ta
import yfinance as yf

# ML
import torch
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from torch.optim import Adam

# Analysis
import matplotlib.pyplot as plt

# System
from dotenv import load_dotenv
import os
from pathlib import Path
import requests
import sys
import time

sys.path.append('../') # Change the python path at runtime
from src.utils import path as path_yq

In [2]:
# Load environment variables (python-dotenv) and remember where the notebook runs.
load_dotenv()
cur_dir = Path.cwd()

# Pick the best available accelerator instead of hard-coding 'mps': a tensor
# cannot be moved to an MPS device on machines without that backend, so fall
# back to CUDA and finally to the CPU.
if torch.backends.mps.is_available():
    DEVICE = torch.device('mps')
elif torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')


In [4]:
datasets = ["capitaland_basic.csv", "capitaland_basic+fundamentals.csv", "capitaland+frasers+keppel_basic+fundamentals.csv"]

# One DataFrame per entry of `datasets`, in the same order. Each CSV is read
# from <root dir>/data with its first column as the index, and every row that
# contains a missing value is dropped.
data_dfs = [
    pd.read_csv(
        Path.joinpath(path_yq.get_root_dir(cur_dir=cur_dir), 'data', file_name),
        index_col=0,
    ).dropna()
    for file_name in datasets
]



In [10]:
def convert_numpy_torch(arr, device=None) -> torch.Tensor:
    """Convert tabular/array data into a float tensor placed on a device.

    Args:
        arr: A ``pd.DataFrame``, ``pd.Series`` or ``np.ndarray``. pandas
            objects are first converted with ``to_numpy()``.
        device: Device the tensor is moved to. When ``None`` (the default)
            the notebook-wide ``DEVICE`` is used.

    Returns:
        The data as a ``torch.Tensor`` cast with ``.float()`` and moved to
        the selected device.

    Raises:
        ValueError: If ``arr`` is not one of the supported types.
    """
    if isinstance(arr, (pd.DataFrame, pd.Series)):
        arr = arr.to_numpy()
    if not isinstance(arr, np.ndarray):
        raise ValueError(f"The input\n{arr}\nis not an ndarray, it is a {type(arr)}.")
    # Resolve the global lazily so an explicit `device` never needs DEVICE.
    target_device = DEVICE if device is None else device
    return torch.from_numpy(arr).float().to(target_device)

# Only X needs to be turned into a sequence
def load_sequence(X, seq_len):
    """Stack overlapping windows of ``seq_len`` consecutive rows of ``X``.

    Window ``i`` is ``X[i: i + seq_len]`` for every ``i`` in
    ``range(len(X) - seq_len)``; the window that would end on the very last
    row is therefore not produced.

    Returns:
        ``np.array`` built from the list of windows. When
        ``len(X) <= seq_len`` no window is produced and the array is empty.
    """
    n_windows = len(X) - seq_len
    windows = [X[start: start + seq_len] for start in range(n_windows)]
    return np.array(windows)

def load_data(X, y, batch_size, shuffle):
    """Wrap features and targets in a ``DataLoader``.

    Both ``X`` and ``y`` are passed through ``convert_numpy_torch`` and then
    paired element-wise in a ``TensorDataset``.

    Args:
        X: Feature data accepted by ``convert_numpy_torch``.
        y: Target data accepted by ``convert_numpy_torch``.
        batch_size: Forwarded to ``DataLoader``.
        shuffle: Forwarded to ``DataLoader``.

    Returns:
        The ``DataLoader`` over the paired tensors.
    """
    features = convert_numpy_torch(X)
    targets = convert_numpy_torch(y)
    paired = TensorDataset(features, targets)
    return DataLoader(paired, batch_size=batch_size, shuffle=shuffle)

class EarlyStopper:
    """Signal when a monitored loss has stopped improving.

    A call to ``stop`` counts as an improvement only when the loss is lower
    than the best loss seen so far by more than ``delta``. After ``patience``
    consecutive calls without improvement, ``stop`` returns ``True``.

    Args:
        patience: Number of consecutive non-improving calls tolerated.
        delta: Minimum decrease of the loss that counts as an improvement.
            Defaults to 0, i.e. any strict decrease resets the counter.
    """

    def __init__(self, patience=5, delta=0):
        self.min_loss = np.inf  # Best (lowest) loss observed so far
        self.patience = patience
        self.counter = 0  # Consecutive calls without improvement
        self.delta = delta

    def stop(self, loss: float) -> bool:
        """Record ``loss`` and report whether training should stop.

        Returns:
            ``True`` once the non-improvement counter reaches ``patience``,
            otherwise ``False``.
        """
        if loss < self.min_loss - self.delta:
            self.min_loss = loss
            # Reset counter
            self.counter = 0
            return False
        self.counter += 1
        return self.counter >= self.patience



In [11]:
# Define the LSTM-based neural network model
class Model1(nn.Module):
    """Unidirectional LSTM followed by a linear layer with a single output.

    The LSTM is batch-first and uses a dropout of 0.2; the linear layer maps
    ``hidden_size`` features to one value.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        lstm_config = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.2,
            bidirectional=False,
        )
        self.lstm = nn.LSTM(**lstm_config)
        # Linear head applied to the LSTM output of the last time step
        self.out = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x, h_state, c_state):
        """Run the LSTM from the given states and project the last step.

        Returns:
            A pair ``(prediction, (h_state, c_state))`` where ``prediction``
            is the linear layer applied to the final time step of the LSTM
            output and the tuple holds the states returned by the LSTM.
        """
        initial_state = (h_state, c_state)
        r_out, (next_h, next_c) = self.lstm(x, initial_state)
        last_step = r_out[:, -1, :]
        return self.out(last_step), (next_h, next_c)

def plot_loss(model_dict, suptitle, title):
    """Plot per-epoch train/validation loss and save the figure as a PNG.

    Args:
        model_dict: Mapping with ``'train_loss'`` and ``'val_loss'`` entries
            holding one value per epoch (anything ``pd.DataFrame`` accepts).
        suptitle: Figure-level title.
        title: Axes-level title.

    The file is written to ``<root dir>/plots/<name>.png`` where ``<name>``
    is ``suptitle + '_' + title`` lower-cased with spaces replaced by
    underscores. The figure is closed after saving so that repeated calls in
    a tuning loop do not accumulate open figures; as a consequence nothing is
    left open for the notebook to render inline.
    """
    model_df = pd.DataFrame(model_dict)
    epochs = np.arange(1, len(model_df) + 1)
    final_train_loss = model_df['train_loss'].iloc[-1].item()
    final_val_loss = model_df['val_loss'].iloc[-1].item()

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.plot(epochs, model_df['train_loss'], label='Train Loss')
        ax.plot(epochs, model_df['val_loss'], label='Validation Loss')
        # Horizontal guides at the last epoch's losses, labelled with their values
        ax.axhline(final_train_loss, color='g', linestyle='--', label=f"{final_train_loss:.5f}")
        ax.axhline(final_val_loss, color='r', linestyle='--', label=f"{final_val_loss:.5f}")
        fig.suptitle(suptitle)
        ax.set_title(title)
        ax.set_xlabel("Epoch")
        ax.set_ylabel("RMSE Loss")
        ax.legend()

        combined_title = (suptitle + '_' + title).lower()
        final_title = combined_title.replace(' ', '_')
        target_path = Path.joinpath(path_yq.get_root_dir(cur_dir=cur_dir), 'plots', f"{final_title}.png")
        # Make sure the output directory exists before writing
        target_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(target_path)
    finally:
        # Release the figure even if plotting or saving failed
        plt.close(fig)

def plot_pred_actual(actual_list, pred_list, suptitle, title):
    """Plot predicted against actual values and save the figure as a PNG.

    Args:
        actual_list: Sequence of actual target values.
        pred_list: Sequence of predicted values; must have the same length
            as ``actual_list``.
        suptitle: Figure-level title.
        title: Axes-level title.

    Raises:
        AssertionError: If the two sequences differ in length.

    The file is written to ``<root dir>/plots/<name>.png`` where ``<name>``
    is ``suptitle + '_' + title`` lower-cased with spaces replaced by
    underscores. The figure is closed after saving so that repeated calls in
    a tuning loop do not accumulate open figures; as a consequence nothing is
    left open for the notebook to render inline.
    """
    assert len(actual_list) == len(pred_list), "Actual and predicted length different."

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.plot(pred_list, label='Predict', c='r', alpha=0.5)
        ax.plot(actual_list, label='Actual', c='b', alpha=0.5)
        fig.suptitle(f"{suptitle}")
        ax.set_title(f"{title}")
        ax.set_xlabel("Time Steps")
        ax.set_ylabel("Price")
        ax.legend()
        fig.tight_layout()

        combined_title = (suptitle + '_' + title).lower()
        final_title = combined_title.replace(' ', '_')
        target_path = Path.joinpath(path_yq.get_root_dir(cur_dir=cur_dir), 'plots', f"{final_title}.png")
        # Make sure the output directory exists before writing
        target_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(target_path)
    finally:
        # Release the figure even if plotting or saving failed
        plt.close(fig)

In [12]:
def _make_scaler(scaler_type):
    """Return a new, unfitted feature scaler for the given scaler name."""
    if scaler_type == 'standard':
        return StandardScaler()
    if scaler_type == 'minmax':
        # The class must be instantiated: calling fit_transform on the bare
        # class fails because no instance is bound.
        return MinMaxScaler()
    raise ValueError(f"Unsupported scaler type: {scaler_type}.")


def hp_tuning(**kwargs):
    """Train and validate ``Model1`` for one hyper-parameter configuration.

    For each of the ``TimeSeriesSplit`` folds a fresh model is trained with
    Adam and MSE loss for at most 50 epochs (early stopping with patience 10
    on the validation RMSE). Prediction-vs-actual plots are saved every 10th
    epoch and at the stopping epoch, and a loss-curve plot is saved per fold.

    Keyword Args:
        df: DataFrame containing the feature columns and the target column
            ``'A17U.SI_Next_Close'``.
        df_id: Identifier of the dataset, used only in log lines and titles.
        seq_len: Number of consecutive rows per input window.
        scaler_type: ``'standard'`` or ``'minmax'``.
        lr: Learning rate passed to Adam.
        hidden_size: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.

    Raises:
        ValueError: If ``scaler_type`` is not supported.

    Returns:
        None. Results are emitted as printed log lines and saved plots.
    """
    df = kwargs.get('df')
    df_id = kwargs.get('df_id')
    seq_len = kwargs.get('seq_len')
    scaler_type = kwargs.get('scaler_type')
    lr = kwargs.get('lr')
    hidden_size = kwargs.get('hidden_size')
    num_layers = kwargs.get('num_layers')

    # Fixed params
    batch_size = 1
    patience = 10
    n_epoch = 50
    n_splits = 2

    target_col_name = 'A17U.SI_Next_Close'
    X = df.drop(columns=[target_col_name])
    y = df[target_col_name]

    tscv = TimeSeriesSplit(n_splits=n_splits, gap=0)

    for train_idx, val_idx in tscv.split(X):
        # Description shared by the log line and every plot title of this fold
        config_desc = (
            f"data_{df_id}, seq_len={seq_len}, scaler={scaler_type}, "
            f"(train_len,val_len)={(len(train_idx), len(val_idx))}, lr={lr}, "
            f"hidden_size={hidden_size}, num_layers={num_layers}"
        )
        print(f"-----Training for {config_desc}-----")
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Fit the scaler on the training fold only, then reuse it for validation
        scaler = _make_scaler(scaler_type)
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)

        # Window i covers rows [i, i + seq_len) and is paired with the target
        # at position i + seq_len, hence the first seq_len targets are dropped.
        # NOTE(review): the target row is one past the window's last row;
        # given the column is named *_Next_Close, confirm this offset is intended.
        # Not using drop last since the batch size is 1 so all batches will have equal size
        train_dataloader = load_data(load_sequence(X_train_scaled, seq_len), y_train.iloc[seq_len:], batch_size, shuffle=False)
        first_batch = next(iter(train_dataloader), None)
        if first_batch is not None:
            sample_inputs = first_batch[0]
            print("Train dataloader: batch size: {}, sequence length: {}, n_features: {}."
                .format(sample_inputs.shape[0], sample_inputs.shape[1], sample_inputs.shape[2]))

        # After scaling it returns numpy array.
        # Prepend the last seq_len training rows so that every validation
        # target has a full window of preceding rows.
        last_n_rows = X_train_scaled[-seq_len:]
        X_val_extended = np.concatenate([last_n_rows, X_val_scaled])
        assert len(X_val_extended) - seq_len == len(y_val), \
            "Length of X_val_extended and y_val are diff: {}, {}".format(len(X_val_extended), len(y_val))
        val_dataloader = load_data(load_sequence(X_val_extended, seq_len), y_val, batch_size, shuffle=False)

        # Initialize the neural network and move it to the selected device
        model = Model1(input_size=X.shape[-1], hidden_size=hidden_size, num_layers=num_layers).to(DEVICE)

        # Define a loss function and the optimizer
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        earlyStopper = EarlyStopper(patience=patience)
        model_dict = defaultdict(list)

        start_time = time.time()
        # Training loop
        for epoch in range(n_epoch):
            epoch_train_loss = 0
            epoch_val_loss = 0
            epoch_train_samples = 0
            epoch_val_samples = 0
            pred_list = []
            actual_list = []

            model.train()
            for inputs, labels in train_dataloader:
                inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
                n_samples = inputs.shape[0]
                # Fresh zero hidden/cell states for every batch, sized from
                # the actual batch so a short final batch cannot mismatch.
                h_state = torch.zeros(num_layers, n_samples, hidden_size, device=DEVICE)
                c_state = torch.zeros(num_layers, n_samples, hidden_size, device=DEVICE)
                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward pass, backward pass, and optimize
                prediction, _ = model(inputs, h_state, c_state)
                # (batch, 1) -> (batch,) so the shape matches `labels` exactly
                prediction = prediction.squeeze(-1)
                loss = criterion(prediction, labels)

                # Weight the batch-mean loss by batch size to sum squared errors
                epoch_train_loss += loss.item() * n_samples
                epoch_train_samples += n_samples

                loss.backward()
                nn.utils.clip_grad_value_(model.parameters(), clip_value=1.0)
                for param in model.parameters():
                    if param.grad is None:
                        print(f"Gradient is None")

                optimizer.step()

            model.eval()
            with torch.no_grad():
                for inputs, labels in val_dataloader:
                    inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
                    n_samples = inputs.shape[0]

                    h_state = torch.zeros(num_layers, n_samples, hidden_size, device=DEVICE)
                    c_state = torch.zeros(num_layers, n_samples, hidden_size, device=DEVICE)
                    prediction, _ = model(inputs, h_state, c_state)
                    prediction = prediction.squeeze(-1)
                    loss = criterion(prediction, labels)

                    epoch_val_loss += loss.item() * n_samples
                    epoch_val_samples += n_samples

                    pred_list.extend(prediction.cpu().tolist())
                    actual_list.extend(labels.cpu().tolist())

            # RMSE over all samples of the epoch
            avg_train_loss = math.sqrt(epoch_train_loss / epoch_train_samples)
            avg_val_loss = math.sqrt(epoch_val_loss / epoch_val_samples)

            # Add train, val loss to dict for each epoch
            model_dict['train_loss'].append(avg_train_loss)
            model_dict['val_loss'].append(avg_val_loss)

            if (epoch + 1) % 10 == 0:
                # Plot predicted vs actual for certain epochs
                suptitle = f"Actual vs Predicted for Epoch {epoch + 1}"
                plot_pred_actual(actual_list=actual_list, pred_list=pred_list, suptitle=suptitle, title=config_desc)

            if earlyStopper.stop(loss=avg_val_loss) or (epoch == n_epoch - 1):
                # Plot the predicted vs actual for final epoch
                print(f"Stopping at epoch {epoch + 1}.")
                suptitle = f"Actual vs Predicted for Epoch {epoch + 1}"
                title = f"stopping_epoch={epoch + 1}, {config_desc}"
                plot_pred_actual(actual_list=actual_list, pred_list=pred_list, suptitle=suptitle, title=title)
                break

        # Compute duration used (local names avoid shadowing the builtin `min`)
        elapsed_time = time.time() - start_time
        minutes, seconds = divmod(elapsed_time, 60)
        minutes, seconds = int(minutes), round(seconds, 0)
        print(f"Elapsed time is {minutes}m{seconds}s.")
        # After all epoch ends for one config
        plot_loss(model_dict=model_dict, suptitle="Train vs Validation RMSE Loss",
                title=f"{config_desc}, duration={minutes}m{seconds}s")
        print('Finished Training')

In [9]:
import itertools

# Hyper-parameter grid explored for every dataset in `data_dfs`
seq_len_list = [10, 20, 30]
scaler_type_list = ['standard', 'minmax']
lr_list = [0.0001, 0.00001, 0.000001]
hidden_size_list = [64, 128, 256]
num_layers_list = [2, 4, 8]

# Materialise the grid: itertools.product returns a one-shot iterator, which
# would be exhausted after the first dataset and leave the others untuned.
combinations = list(itertools.product(seq_len_list, scaler_type_list, lr_list, hidden_size_list, num_layers_list))

# Manual gate before the plots directory is filled by the runs below
input("Created new folder for plots?")
for df_id, dataset_df in enumerate(data_dfs):
    for seq_len, scaler_type, lr, hidden_size, num_layers in combinations:
        kwargs = {
            'df': dataset_df,
            'df_id': df_id,
            'seq_len': seq_len,
            'scaler_type': scaler_type,
            'lr': lr,
            'hidden_size': hidden_size,
            'num_layers': num_layers
        }

        hp_tuning(**kwargs)


<itertools.product object at 0x17ff21b00>
(     A17U.SI_Open  A17U.SI_High  A17U.SI_Low  A17U.SI_Close  A17U.SI_Volume  \
0        2.521650      2.538574     2.504726       2.521650        10916700   
1        2.530112      2.538574     2.504726       2.513188        10010400   
2        2.530112      2.597807     2.530112       2.563960        22244500   
3        2.572422      2.580884     2.538574       2.538574        16010000   
4        2.547036      2.572422     2.538574       2.572422        10073900   
..            ...           ...          ...            ...             ...   
815      2.780000      2.790000     2.750000       2.760000        11393700   
816      2.750000      2.750000     2.720000       2.730000        12905200   
817      2.740000      2.770000     2.740000       2.740000        10948700   
818      2.760000      2.760000     2.730000       2.760000         8951100   
819      2.760000      2.770000     2.740000       2.750000         6052200   

     A17