In [None]:
# System
import os
import sys
import time
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import torch
import torch.nn as nn
from dotenv import load_dotenv
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

# Finance
import mplfinance as mpf
import ta
import yfinance as yf

sys.path.append('../') # Change the python path at runtime
from src.utils import path as path_yq


In [None]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Polygon credential; None when the variable is not set.
POLYGON_API_KEY = os.getenv("POLYGON_API_KEY")

# Directory the notebook is running from; used later to locate the project root.
cur_dir = Path.cwd()

# Fetch Tick Data
- TODO: Add an option to load previously saved data from disk instead of fetching it again

## Polygon API

In [None]:
# TODO: Scale this up to n years
# Query parameters for the Polygon daily-aggregates endpoint.
ticker = "NVDA"
start_date = "2000-01-01"
end_date = "2024-12-31"
max_limit = 50000

# NOTE: the API key is embedded in the URL, so avoid printing or committing api_url.
api_url = (
    f"https://api.polygon.io/v2/aggs/ticker/{ticker}/range/1/day/{start_date}/{end_date}"
    f"?adjusted=true&sort=asc&limit={max_limit}&apiKey={POLYGON_API_KEY}"
)



In [None]:
# Ask before spending an API call. The answer is kept in `confirm` rather than
# `str`, so the built-in str type is not rebound for the rest of the kernel session.
confirm = input("Confirm?")

if confirm.strip().upper() == "Y":
    # A timeout stops the cell from hanging forever if the API never answers.
    resp = requests.get(api_url, timeout=30)
    print("Request made.")

In [None]:
# Display the response object; `resp` only exists if the request in the previous cell was confirmed.
resp

In [None]:
# Columns kept in the final frame; 'Date' becomes the index.
cols = ["Date", "Open", "High", "Low", "Close", "Volume", "VWAP", "Transactions"]
if resp.status_code == 200:
    dict_list = resp.json().get('results')
    if not dict_list:
        # 'results' is missing or empty: building the frame would fail on the
        # 'Timestamp' lookup below, so report it and leave df as it was.
        print(f"No results returned for {ticker} ({start_date} to {end_date}).")
    else:
        # Map Polygon's short field names to readable column names.
        column_map = {
            't': 'Timestamp',
            'o': 'Open',
            'h': 'High',
            'l': 'Low',
            'c': 'Close',
            'n': 'Transactions', # Number of trades (market activity)
            'v': 'Volume', # Number of shares traded (intensity of the activity)
            'vw': 'VWAP'
        }

        df = pd.DataFrame(dict_list).rename(columns=column_map)

        # Timestamps are converted from epoch milliseconds, then truncated to midnight.
        df['Datetime'] = pd.to_datetime(df['Timestamp'], unit='ms')
        df['Date'] = df['Datetime'].dt.normalize()

        df = df[cols].set_index(keys="Date")
else:
    print(f"Error fetching data: {resp.status_code}, {resp.text}")

## Yfinance

In [None]:
historical_start_date = '2022-08-09'
# Define the ticker list
# Capitaland A17U, SUNT.SI cannot download
ticker_list = ['S51.SI']

# Fetch the data. auto_adjust is passed explicitly because the 'Adj Close' column
# selected below is only returned when auto-adjustment is off; newer yfinance
# releases turn it on by default.
data = yf.download(ticker_list, start=historical_start_date, auto_adjust=False)['Adj Close']
data.index = pd.to_datetime(data.index)
display(data.tail(20))
data.plot()

# Correlation heatmap of the downloaded adjusted-close series.
plt.figure(figsize=(3,2))
sns.heatmap(data.corr(), cmap="Reds", annot=True)
plt.show()

In [None]:
# Display the OHLCV frame built from the Polygon response (undefined if the request was skipped).
df


In [None]:
root_dir = path_yq.get_root_dir(cur_dir=cur_dir)
# Cache file is keyed by ticker and date range, under <root>/data.
csv_path = Path(root_dir) / "data" / f"{ticker}_{start_date}_{end_date}.csv"

# Get df: save it if it was fetched in this session, otherwise fall back to the cached CSV.
try:
    df.to_csv(csv_path)
except NameError:
    print("df not defined, trying to fetch from csv")
    if not csv_path.exists():
        # Fail here with a clear message instead of a NameError in a later cell.
        raise FileNotFoundError(
            f"No fetched data and no cached file at {csv_path}; run the fetch cells first."
        ) from None
    df = pd.read_csv(csv_path)

In [None]:
# Make sure df ends up indexed by a datetime 'Date', whichever way it was obtained:
# - loaded from the CSV: 'Date' is still an ordinary column;
# - built from the API response: 'Date' has already been set as the index.
# Checking first also makes this cell safe to run more than once.
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.set_index(keys="Date")
elif df.index.name == 'Date':
    df.index = pd.to_datetime(df.index)
else:
    raise KeyError("df has neither a 'Date' column nor a 'Date' index.")

mpf plot: https://github.com/matplotlib/mplfinance?tab=readme-ov-file

In [None]:
# Candlestick chart with volume and 3/6/9-period moving averages.
# The title follows `ticker` so it stays correct when another symbol is fetched.
mpf.plot(
    df,
    type='candle',
    style='charles',
    figsize=(20, 10),
    title=f"OHLC Bars for {ticker}",
    volume=True,
    show_nontrading=True,
    mav=(3, 6, 9),
)

In [None]:
# Count missing values per column.
df.isna().sum()


# Features
- Technical indicators
- Fundamental indicators
- Date features
- Holiday indicators etc.

# Preprocessing
- Train-test split
- Scale data (fit the scaler on the training set only, then apply that fitted scaler to the validation and test sets)
- Decide which to predict. Have open and predict the close for the same day? Have the close for the previous day and predict next open?

In [None]:
def create_technical_indicators(tmp: pd.DataFrame, ma_windows: tuple = (50, 20, 5)) -> pd.DataFrame:
    """Return a copy of ``tmp`` with date, Bollinger, moving-average and target columns added.

    Args:
        tmp: Price frame with a 'Close' column and an index exposing
            ``.year``, ``.month`` and ``.day`` (e.g. a DatetimeIndex).
        ma_windows: Window lengths for the simple moving averages of 'Close'.
            One ``MA_<window>`` column is added per entry, in the given order.

    Returns:
        A new DataFrame; ``tmp`` itself is not modified. The added columns are
        'Year', 'Month', 'Day', 'BB High', 'BB Low', the ``MA_*`` columns and
        'Future Close' (the next row's 'Close').
    """
    df = tmp.copy(deep=True)

    # Date features
    df['Year'], df['Month'], df['Day'] = df.index.year, df.index.month, df.index.day

    # Use the close price to create the indicators
    close = df['Close']
    df['BB High'] = ta.volatility.bollinger_hband(close)
    df['BB Low'] = ta.volatility.bollinger_lband(close)

    # Each rolling mean covers the current row plus the preceding (window - 1) rows,
    # so the first (window - 1) rows of every MA column are NaN.
    for window in ma_windows:
        df[f'MA_{window}'] = close.rolling(window=window).mean()

    # Target: the following row's close; the last row has no successor and is NaN.
    df['Future Close'] = close.shift(-1)

    return df

In [None]:
# Build the feature frame; the function works on a copy, so df itself is left unchanged.
df2 = create_technical_indicators(tmp=df)

In [None]:
# Inspect the feature frame before rows with missing values are dropped.
df2

In [None]:
# Drop rows with missing values: the rolling means are NaN for the first rows
# (the widest window is 50) and 'Future Close' (a shift by -1) is NaN on the last row.
df2.dropna(inplace=True)
df2

## Split data

- Need to have train, val, test
- General requirements:
    - Ideally no NA values, otherwise model performance might be affected
    - Number of rows TBC: it cannot be too small (not enough data to train on) or too
    large (computationally expensive)
- Specific requirements:
    - df2 (not sure if need date as index yet)
    - Features (should be able to have both categorical and numerical)
    - Target variable: Future Close

In [None]:
# Target is the next close; the current 'Close' stays among the features
# because it is still needed to predict the next one.
y = df2.loc[:, 'Future Close']
X = df2.drop('Future Close', axis=1)

In [None]:
# Inspect the feature matrix (every column of df2 except 'Future Close').
X

In [None]:
n_splits = 5
# The first 80% of rows feed cross-validation; the remaining 20% are the test set.
split_idx = int(len(df2) * 0.8)

# FIXME: Assess the model on different sets and use the best one
tscv = TimeSeriesSplit(n_splits=n_splits)

# Scale TODO: Make into a function
scaler_choice = 'standard'

# TODO: Assess the models
# Now do CV for assessing robustness, but later on, the train and validation should
# Choose the best model/just go according to the sequential split
# NOTE: load_data, batch_size and shuffle are defined in the "Loading dataset" cell
# further down, so that cell has to be executed before this one.
# NOTE: every fold overwrites the loaders, so only the last fold's pair remains afterwards.
for train_idx, val_idx in tscv.split(X.iloc[:split_idx]):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Fit the scaler on the training fold only, then reuse it on the validation fold.
    scaler = StandardScaler() if scaler_choice == 'standard' else MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    train_dataloader = load_data(X=X_train_scaled, y=y_train, batch_size=batch_size, shuffle=shuffle)
    val_dataloader = load_data(X=X_val_scaled, y=y_val, batch_size=batch_size, shuffle=shuffle)

X_test, y_test = X.iloc[split_idx:], y.iloc[split_idx:]



## Loading dataset

In [None]:
# Timing template: put the code to be measured between the two time.time() calls.
start_time = time.time()
end_time = time.time()
elapsed_time = end_time - start_time
# '.3f' prints three decimals; the previous '3f' only set a minimum field width of 3.
print(f"Elapsed time is {elapsed_time:.3f}s.")

In [None]:
# Pick the best available backend instead of assuming Apple-silicon MPS,
# so the notebook also runs on CUDA or CPU-only machines.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

# DataLoader settings shared by the train and validation loaders.
batch_size = 64
shuffle = False

def convert_numpy_torch(arr, target_device=None) -> torch.Tensor:
    """Convert an ndarray, DataFrame or Series into a float32 tensor on a device.

    Args:
        arr: ``np.ndarray``, ``pd.DataFrame`` or ``pd.Series`` holding numeric data.
        target_device: Device to place the tensor on. When omitted, the
            notebook-level ``device`` variable is used.

    Returns:
        A ``torch.float32`` tensor with the same shape as the input.

    Raises:
        ValueError: If ``arr`` is none of the supported types.
    """
    if isinstance(arr, (pd.DataFrame, pd.Series)):
        arr = arr.to_numpy()
    if not isinstance(arr, np.ndarray):
        raise ValueError(f"The input\n{arr}\nis not an ndarray, it is a {type(arr)}.")

    if target_device is None:
        target_device = device  # notebook-level default

    # from_numpy shares memory with `arr`; .float() casts to float32
    # (copying when the dtype differs) and .to() moves it to the device.
    return torch.from_numpy(arr).float().to(target_device)

def load_data(X, y, batch_size, shuffle):
    """Pair features with targets and wrap them in a DataLoader.

    Both ``X`` and ``y`` are passed through ``convert_numpy_torch`` before
    being combined into a ``TensorDataset``.
    """
    features = convert_numpy_torch(X)
    targets = convert_numpy_torch(y)
    return DataLoader(
        TensorDataset(features, targets),
        batch_size=batch_size,
        shuffle=shuffle,
    )


# Model

## Fitting

In [None]:
# Peek at one batch to sanity-check shapes before training.
# The batch is unpacked into sample_* names so this cell does not rebind the
# built-in input(), which the fetch-confirmation cell calls.
sample_inputs, sample_labels = next(iter(train_dataloader))
print(len(train_dataloader), sample_inputs.shape)
# Each batch is 2-D, (batch_size, n_features): the loader slices rows of the
# scaled feature matrix, so there is no separate sequence axis.

In [None]:
class EarlyStopper:
    """Signal when the monitored loss has stopped improving.

    Tracks the lowest loss seen so far and how many consecutive calls
    have failed to beat it.
    """

    def __init__(self, patience=3):
        self.min_loss = np.inf
        self.patience = patience
        self.counter = 0

    def stop(self, loss: float) -> bool:
        """Record ``loss`` and return True once ``patience`` non-improving calls accumulate."""
        if loss < self.min_loss:
            # New best: remember it and start counting from zero again.
            self.min_loss, self.counter = loss, 0
            return False

        self.counter += 1
        return self.counter >= self.patience


        



In [None]:
class RNN(nn.Module):
    """Single-direction Elman RNN followed by a linear layer producing one value per step.

    Keyword Args:
        hidden_size: Width of the recurrent hidden state (default 1).
        input_size: Number of input features. When omitted, the width of the
            notebook-level feature matrix ``X`` is used.
        num_layers: Number of stacked recurrent layers (default 1).
        dropout: Dropout between recurrent layers (default 0.2). Only applied
            when ``num_layers > 1``; a single layer has nothing to drop between.
    """

    def __init__(self, **kwargs):
        super().__init__() # Need bracket for super
        self.hidden_size = kwargs.get('hidden_size', 1)
        self.output_size = 1
        self.bidir = False # Hardcoded since bidir won't be used for stock price pred
        self.num_layers = kwargs.get('num_layers', 1)

        input_size = kwargs.get('input_size')
        if input_size is None:
            input_size = X.shape[-1]  # fall back to the notebook's feature matrix

        # Passing a non-zero dropout with a single layer makes nn.RNN warn and has no effect.
        dropout = kwargs.get('dropout', 0.2) if self.num_layers > 1 else 0.0

        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            nonlinearity='tanh', # or relu
            bias=True,
            batch_first=False,
            dropout=dropout,
            bidirectional=self.bidir, # Can be True for NLP, but will introduce lookahead bias for stock
        )

        total_features = self.hidden_size * 2 if self.bidir else self.hidden_size
        # Use linear layer for fully connected layer to map to 1 column of output
        self.fc = nn.Linear(
            in_features=total_features,
            out_features=self.output_size
        )

    # Must override the parent class's forward method
    def forward(self, x):
        """Run the recurrent layer and map each step's hidden state to one output.

        NOTE(review): a 2-D ``x`` is treated by nn.RNN as one unbatched sequence,
        so the rows of a batch act as consecutive time steps. Confirm this is intended.
        """
        rnn_out, _ = self.rnn(x)
        return self.fc(rnn_out)
        

In [None]:
# TODO: Tweak dropout, bidirectional, etc.
lr = 0.001
n_epoch = 50

model = RNN(
    hidden_size=4
    ).to(device)
optimiser = Adam(params=model.parameters(), lr=lr)
loss_fn = nn.MSELoss()

# Per-epoch average losses, keyed by 'train_loss' / 'val_loss'.
loss_dict = defaultdict(list)
early_stopper = EarlyStopper()

for epoch in range(n_epoch):
    total_train_loss = 0
    total_val_loss = 0

    model.train()
    for inputs, target in train_dataloader:
        optimiser.zero_grad() # Resets gradient of the optimised Tensors to None
        # The model emits one value per row with a trailing axis of size 1, while
        # target is 1-D. Squeeze before the loss, otherwise MSELoss broadcasts the
        # pair to (batch, batch) and reports a wrong value.
        output = model(inputs).squeeze(-1)
        assert output.shape == target.shape
        loss = loss_fn(output, target)
        total_train_loss += loss.item() # Tensor operation: get the scalar in a tensor with 1 element

        loss.backward() # Compute the gradient of the loss wrt weights, backpropagate
        optimiser.step() # Takes a step in the direction that reduces the loss, updates params

    model.eval() # Disable dropout
    with torch.no_grad():
        for inputs, target in val_dataloader:
            output = model(inputs).squeeze(-1)
            loss = loss_fn(output, target)
            total_val_loss += loss.item()

    # Add train, val loss to dict for each epoch
    avg_train_loss = total_train_loss / len(train_dataloader) # Take the mean of MSE for all batches
    avg_val_loss = total_val_loss / len(val_dataloader)

    loss_dict['train_loss'].append(avg_train_loss)
    loss_dict['val_loss'].append(avg_val_loss)

    if early_stopper.stop(loss=avg_val_loss):
        print(f"Early stopping at epoch {epoch + 1}.")
        break

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1} has completed.")

# After the whole training is completed, we can plot the losses, time and analyse which model is the best 
# Train with full set and save the model for testing

Afterwards, evaluate the final model against the held-out test set.

In [None]:
#
# X_train, X_val, y_train, y_val = train_test_split(X.iloc[:split_idx], y.iloc[:split_idx], shuffle=False, test_size=0.2)