## Library Import

In [101]:
import pymongo

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.gaussian_process import kernels,GaussianProcessRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

import math,os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import TimeSeriesSplit

import plotly.graph_objects as go

import warnings
# Notebook-wide settings: silence every warning and lift pandas' display limits
# so wide frames and long cell values are printed in full.
warnings.filterwarnings(action='ignore')
for _display_option in ('display.max_columns', 'display.max_colwidth', 'display.width'):
    pd.set_option(_display_option, None)


## Data Fetching

In [102]:
class fetch_and_split_data:
    """Load one symbol's documents from MongoDB and split them into train/test frames.

    Intended call order: ``fetch_data()`` -> ``split_data()`` ->
    ``get_train_data()`` / ``get_test_data()``.
    """

    def __init__(self,
                symbol,
                db_name="local",
                collection_name="technical_stock_data",
                mongo_uri="mongodb://localhost:27017/"):
        """
        Initializes the MongoDB connection and prepares the collection for the stock data.

        Args:
            symbol (str): The stock symbol to filter the data (e.g., 'NVDA').
            db_name (str): Name of the MongoDB database.
            collection_name (str): Name of the collection inside the database.
            mongo_uri (str): MongoDB connection URI (default is localhost).
        """
        self.mongo_uri = mongo_uri
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[db_name]
        self.collection = self.db[collection_name]
        self.symbol = symbol
        # All three frames start as None so the getters can report a clear
        # ValueError (rather than an AttributeError) when called too early.
        self.df = None
        self.df_train = None
        self.df_test = None

    def fetch_data(self):
        """Fetches the symbol's documents from MongoDB into ``self.df``.

        Every field of the matched documents becomes a DataFrame column.

        Raises:
            ValueError: If no document matches the symbol.
        """
        # NOTE(review): the query is issued without an explicit sort, while
        # split_data() treats the last rows as the most recent ones -- confirm
        # the collection is stored in chronological order.
        fetched_data_lst = list(self.collection.find({"symbol": self.symbol}))

        if not fetched_data_lst:
            raise ValueError(f"No data found for symbol: {self.symbol}")

        self.df = pd.DataFrame(fetched_data_lst)

    def split_data(self, test_size=100, train_window=252 * 3):
        """Splits the loaded data into training and testing datasets by position.

        The last ``test_size`` rows become the test set; the rows before them,
        going back to ``train_window`` rows from the end, become the training
        set. With the defaults this is ``df[-756:-100]`` and ``df[-100:]``.

        Args:
            test_size (int): Number of trailing rows held out for testing.
            train_window (int): Number of trailing rows considered in total
                (training rows plus test rows).

        Raises:
            ValueError: If no data is loaded, or unless
                ``0 < test_size < train_window``.
        """
        if self.df is None:
            raise ValueError("Data not loaded. Call fetch_data() first.")
        # A non-positive test_size would make the negative slices below select
        # the wrong rows (e.g. [-0:] is the whole frame), so reject it up front.
        if test_size <= 0 or train_window <= test_size:
            raise ValueError("Expected 0 < test_size < train_window.")

        self.df_train = self.df.iloc[-train_window:-test_size]
        self.df_test = self.df.iloc[-test_size:]

    def get_train_data(self):
        """Returns the training data.

        Raises:
            ValueError: If ``split_data()`` has not produced a training set yet.
        """
        if self.df_train is None:
            raise ValueError("Data not loaded or split. Call fetch_data() and split_data() first.")
        return self.df_train

    def get_test_data(self):
        """Returns the test data.

        Raises:
            ValueError: If ``split_data()`` has not produced a test set yet.
        """
        if self.df_test is None:
            raise ValueError("Test data not available. Call split_data() first.")
        return self.df_test


## Check Data Integrity

In [103]:
#Compute the missing value ratio 
def missing_values(df):
    """Show, per column, the fraction of rows that are null.

    Columns without any null are left out. When nothing is missing a short
    message is printed; otherwise the ratios are drawn as a bar chart in
    ascending order. Nothing is returned.
    """
    null_ratio = df.isna().sum() / len(df)
    null_ratio = null_ratio[null_ratio.values > 0].sort_values()

    if not null_ratio.empty:
        # One bar per affected column, smallest ratio first.
        plt.figure(figsize=(5, 5), dpi=100)
        sns.barplot(x=null_ratio.index, y=null_ratio.values)
        plt.xticks(rotation=90)
        plt.title('Features Missing Ratio')
        plt.show()
        return

    print('No missing values')
    

## Data Processing

In [104]:
class prepare_data:
    """Turns raw indicator rows into a numeric feature table and saves it as parquet.

    One instance is meant to be used first on the training frame and then on
    the test frame, so that both are one-hot encoded with the same encoder.
    """

    # Directory used when no ``save_dir`` is passed to the constructor.
    DEFAULT_SAVE_DIR = '/Users/yiukitcheung/Documents/Projects/Stocks/train_data_repository'

    def __init__(self, exclude_columns=None, save_dir=None):
        """
        Initializes the preprocessor.

        Args:
            exclude_columns (list): Column names stored on the instance
                (defaults to the MACD columns). They are not read by any
                method of this class.
            save_dir (str): Directory the prepared parquet files are written
                to. Defaults to ``DEFAULT_SAVE_DIR``.
        """
        self.exclude_columns = exclude_columns if exclude_columns else ['MACD', 'MACD_SIGNAL', 'MACD_HIST']
        self.ohe = OneHotEncoder(drop='first')  # OneHotEncoder for categorical columns
        self.save_dir = save_dir if save_dir is not None else self.DEFAULT_SAVE_DIR
        # Overwritten by preprocess(); set here so save_data() also works on its own.
        self.filename = 'train_data.parquet'

    def preprocess(self, df, train=True):
        """
        Preprocess the dataframe by performing the following steps:
        - Drop the 'date' column (if present) and rows with missing values
        - Convert the candlestick / MACD-cross / alert columns to categorical types
        - One-hot encode the categorical columns (first level dropped)
        - Keep the float64/int64 columns and append the encoded columns
        - Add a 'timestamp' column holding the row position
        - Add the 'log_daily_return' column
        - Save the result as a parquet file in ``self.save_dir``

        Args:
            df (pd.DataFrame): The input dataframe to preprocess.
            train (bool): True fits the one-hot encoder on ``df`` and saves to
                'train_data.parquet'. False reuses the already fitted encoder
                and saves to 'test_data.parquet'.

        Returns:
            pd.DataFrame: The preprocessed dataframe.
        """
        self.filename = 'train_data.parquet' if train else 'test_data.parquet'

        # Step 1: Drop 'date' column and handle missing values
        df = df.drop(columns=['date'], errors='ignore')  # Avoids error if 'date' is missing
        df = df.dropna()

        # Step 2: Convert relevant columns to category
        category_columns = ["CandleStickType", "Incremental_High", "MACD_GOLDEN_CROSS"]
        category_columns.extend(df.columns[df.columns.str.contains('Alert')])
        df = df.astype({column: 'category' for column in category_columns})

        # Step 3: One-hot encode categorical columns
        encoded_df = self._encode_categoricals(df.select_dtypes(include=['category']), fit=train)

        # Step 4: Concatenate numeric and encoded categorical data.
        # dropna() may have left gaps in the index; reset it so the rows line
        # up with the freshly built encoded frame.
        numeric_df = df.select_dtypes(include=['float64', 'int64']).reset_index(drop=True)
        df = pd.concat([numeric_df, encoded_df], axis=1)

        # Step 5: Add timestamp column
        df['timestamp'] = df.index

        # Create target variable
        df = self.create_target(df)

        # Save the prepared data for model training
        self.save_data(df, self.save_dir)

        return df

    def _encode_categoricals(self, categorical_df, fit):
        """One-hot encodes ``categorical_df`` and returns the dense result as a DataFrame."""
        # Fit only on training data. Re-fitting on the test frame would derive
        # the output columns from whichever categories happen to occur there,
        # so train and test could end up with different feature columns.
        # An encoder that was never fitted is still fitted here, so a lone
        # preprocess(..., train=False) call keeps working.
        if fit or not hasattr(self.ohe, 'categories_'):
            encoded = self.ohe.fit_transform(categorical_df)
        else:
            # With the encoder's default settings scikit-learn raises here if
            # the frame contains a category that was not seen during fitting.
            encoded = self.ohe.transform(categorical_df)

        return pd.DataFrame(encoded.toarray(),
                            columns=self.ohe.get_feature_names_out(categorical_df.columns))

    def create_target(self, df):
        """Adds 'log_daily_return' to ``df`` in place and returns ``df``.

        The value is the log return of 'close' relative to the previous row,
        in percent; the first row has no previous close and is NaN.
        """
        df['log_daily_return'] = np.log(df.close.pct_change() + 1) * 100

        return df

    def save_data(self, df, file_path):
        """Saves the preprocessed dataframe as a parquet file.

        Args:
            df (pd.DataFrame): Frame to write (without its index).
            file_path (str): Target directory; it is created if missing. The
                file name comes from ``self.filename``.
        """
        os.makedirs(file_path, exist_ok=True)
        df.to_parquet(os.path.join(file_path, self.filename), index=False)

In [105]:
# Smoke-test fetch_and_split_data: load the NVDA documents from the default
# local MongoDB collection, split them, and keep both frames for the next step.
StockDataPreprocessor = fetch_and_split_data('NVDA')
StockDataPreprocessor.fetch_data()
StockDataPreprocessor.split_data()
df = StockDataPreprocessor.get_train_data()
test_df = StockDataPreprocessor.get_test_data()

# Smoke-test prepare_data: the same instance processes both frames.
DataPreprocessor = prepare_data()

# Each preprocess() call also writes its result to a parquet file;
# train=False switches the file name to 'test_data.parquet'.
df = DataPreprocessor.preprocess(df)
test_df = DataPreprocessor.preprocess(test_df,train=False)

In [106]:
df

Unnamed: 0,high,close_t-1,low,MACD_HIST,open,close_t-2,BodyDiff,volume,MACD,169EMA,144EMA,close,MACD_SIGNAL,169EMA_Lower,13EMA,8EMA,169EMA_Upper,close_t-3,atr,MACD_GOLDEN_CROSS_1.0,382_Alert_1,Incremental_High_1.0,dual_channel_Alert_0,dual_channel_Alert_1,CandleStickType_red,Engulf_Alert_0,Engulf_Alert_1,MACD_Alert_0,MACD_Alert_1,timestamp,log_daily_return
0,21.388277,21.076815,20.914093,-0.230128,21.380292,21.862467,0.170706,204688000,0.256492,17.608463,18.147775,21.209585,0.486620,16.728040,21.852762,21.765684,18.488886,22.203880,0.635147,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0,
1,21.922363,21.209585,21.159673,-0.195934,21.329381,21.076815,0.574015,268726000,0.241703,17.658991,18.199577,21.903397,0.437637,16.776042,21.859996,21.796287,18.541941,21.862467,0.636173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,3.218851
2,22.496379,21.903397,21.852484,-0.133754,22.102055,21.209585,0.341415,248555000,0.270444,17.715279,18.258113,22.443470,0.404198,16.829515,21.943349,21.940105,18.601043,21.076815,0.637189,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2,2.435798
3,22.111038,22.443470,21.823532,-0.117002,22.032173,21.903397,0.010981,217655000,0.257946,17.766195,18.310321,22.043154,0.374948,16.877886,21.957607,21.963005,18.654505,21.209585,0.638636,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3,-1.799764
4,21.761641,22.043154,21.288454,-0.130608,21.672793,22.443470,0.049914,245215000,0.211688,17.811568,18.356011,21.622879,0.342296,16.920990,21.909789,21.887421,18.702147,21.903397,0.640914,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,4,-1.925010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
651,84.067903,82.409180,79.169722,-1.257190,83.935921,79.504669,4.272279,512208000,-1.361822,64.130016,67.018560,79.663643,-0.104632,60.923515,83.651273,82.212787,67.336517,76.187225,3.744350,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,651,-3.388353
652,83.309027,79.663643,78.209884,-0.975734,78.854772,82.409180,3.763376,424641000,-1.324299,64.347523,67.233726,82.618149,-0.348565,61.130147,83.503684,82.302867,67.564899,79.504669,3.748665,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,652,3.641607
653,88.316192,82.618149,83.373021,-0.419388,83.803948,79.663643,3.916343,551011000,-0.872800,64.622497,67.516300,87.720291,-0.453412,61.391372,84.106056,83.506739,67.853622,82.409180,3.756031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,653,5.992387
654,87.977240,87.720291,85.251699,-0.043159,87.580311,82.618149,0.161975,388971000,-0.507361,64.894494,67.795279,87.742287,-0.464202,61.649770,84.625518,84.447972,68.139219,79.663643,3.761530,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,654,0.025071
