# Imports

In [None]:
import os
import pickle
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf
import plotly.express as px
import scipy
from dash_bootstrap_templates import load_figure_template
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
from gensim import corpora, models
import ast
from gensim.models import LdaModel
from gensim.models import CoherenceModel
import spacy
from collections import Counter
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [None]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)
# Load the 'minty' figure template (presumably this also makes it the default plotly theme — confirm)
load_figure_template('minty')

In [None]:
def save_pkl_file(object, filepath):
    """Pickle ``object`` to ``filepath``, creating the parent folder if needed.

    Args:
        object: Any picklable Python object.
        filepath (str): Destination path of the pickle file. A bare file name
            (no directory part) is written to the current working directory.
    """
    # Retrieve directory; it is '' for a bare file name, and os.makedirs('')
    # raises, so the folder is only created when there is one to create
    directory = os.path.dirname(filepath)
    if directory and not os.path.exists(directory):
        os.makedirs(directory, exist_ok=True)
        print(f"Folder created at {directory}")
    with open(filepath, 'wb') as file:
        pickle.dump(object, file)

def load_pkl_file(filepath):
    """Read and return the object pickled at ``filepath``.

    NOTE: unpickling can execute arbitrary code, so only load files that this
    project wrote itself.
    """
    with open(filepath, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

# 1. Data Exploration

In [None]:
# Load the headline dataset; the first CSV column is used as the index
data_1 = pd.read_csv("data/daily_financial_news/analyst_ratings_processed.csv", index_col=0)
data_1.head()

In [None]:
# Keep only the part of 'date' before the first space
# (presumably the calendar date of a "date time" string — confirm against the CSV)
data_1['date'] = data_1['date'].str.split(' ', expand=True).iloc[:, 0]

In [None]:
data_1.head()

In [None]:
# Number of distinct tickers in the dataset
data_1['stock'].nunique()

In [None]:
data_1.info()

## Drop Null Values

In [None]:
print(data_1.isna().sum())

In [None]:
print(data_1.shape)

In [None]:
data_1[data_1['date'].isna()]

In [None]:
data_1.dropna(subset=['date'],inplace=True)
data_1.dropna(subset=['stock'],inplace=True)

In [None]:
print(data_1.shape)

In [None]:
print(data_1.isna().sum())

In [None]:
data_1.info()

## Filter stocks

In [None]:
# Filter dataset down to stocks with top 100 number of headlines
top_100_stocks_by_headlines = data_1.groupby('stock').size().reset_index(name='size').sort_values('size', ascending=False).reset_index(drop=True).iloc[:100]
top_100_stocks_by_headlines.head()

In [None]:
data_1 = data_1[data_1['stock'].isin(top_100_stocks_by_headlines.stock)]

In [None]:
data_1.shape

## Clean Text Data

In [None]:
# Lower-case every headline
data_1['title'] = data_1['title'].str.lower()
data_1

### Remove Punctuation and Digits

In [None]:
import string

def remove_punctuation(text):
    """Return ``text`` without ASCII punctuation and without digit characters."""
    kept = []
    for ch in text:
        # Skip anything str.isdigit() treats as a digit and any ASCII punctuation mark
        if ch.isdigit() or ch in string.punctuation:
            continue
        kept.append(ch)
    return ''.join(kept)

# Strip punctuation and digits from every headline
data_1['title'] = data_1['title'].apply(remove_punctuation)

In [None]:
data_1.head()

In [None]:
# Split each headline on whitespace into a list of words
data_1['tokens'] = data_1['title'].apply(lambda x: x.split())
data_1.head()

### Remove stop words

In [None]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

stop = stopwords.words('english')

data_1['tokens'] = data_1['tokens'].apply(lambda x: [word for word in x if word not in stop])

In [None]:
data_1.shape

### Lemmatize tokens

In [None]:
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

data_1['tokens'] = data_1['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])


In [None]:
data_1.head()

In [None]:
data_1['preprocessed_text'] = data_1['tokens'].apply(' '.join)
data_1

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
# NLTK Sentiment Intensity Analyzer uses a 'Bag of Words' approach
# it removes stop words and scores each word individually before compounding

In [None]:
data_duplicate = data_1.copy()
data_duplicate['sentiment_score'] = data_duplicate['preprocessed_text'].apply(lambda x: sia.polarity_scores(x)['compound'])
data_duplicate

In [None]:
data_duplicate['sentiment_score'].describe()

In [None]:
data_duplicate.info()

In [None]:
signals_df = data_duplicate.copy()

In [None]:
signals_df.dtypes

In [None]:
# There are some stocks that have multiple news articles on the same day
# Have to handle these cases
# non_dup_signals_df = signals_df.groupby(['date', 'stock'])['sentiment_score'].mean().reset_index(name='sentiment_score')

# 2. Pull yfinance Data

In [None]:
tickers = data_duplicate.stock.unique()

In [None]:
start_date, end_date = data_duplicate.date.sort_values().iloc[0], data_duplicate.date.sort_values().iloc[-1]

In [None]:
data = yf.download(list(tickers), start=start_date, end=end_date)

In [None]:
adj_close_data = data['Adj Close']
adj_close_data

In [None]:
# Tickers that don't have data
missing_data_tickers = adj_close_data.columns[adj_close_data.isna().sum()/adj_close_data.shape[0] == 1]

# Drop missing tickers
adj_close_data = adj_close_data.drop(columns=missing_data_tickers)

In [None]:
# Drop other tickers
adj_close_data = adj_close_data.dropna(axis=1)

In [None]:
any(adj_close_data.isna().sum() > 0)

In [None]:
returns_df = adj_close_data.pct_change().dropna().reset_index().rename(columns={'Date': 'date'})

In [None]:
returns_df_melt = returns_df.melt(id_vars='date', var_name='stock', value_name='daily_returns')
returns_df_melt['date'] = pd.to_datetime(returns_df_melt['date'])
returns_df_melt

In [None]:
returns_df_melt.dtypes

In [None]:
signals_df['date'] = pd.to_datetime(signals_df['date'])

In [None]:
merged_df = pd.merge(returns_df_melt, signals_df, on=['date', 'stock'], how='left').dropna()
merged_df

In [None]:
# Count number of stocks per date
merged_df['num_stocks_by_date'] = merged_df.groupby('date').transform('size')

# Select data where there were at least 10 stocks for each date
merged_df_filtered = merged_df[merged_df['num_stocks_by_date'] >= 10]

# Drop num_stocks_by_date column
ml_df = merged_df_filtered.drop(columns='num_stocks_by_date').reset_index(drop=True)

# 3. Build Machine Learning Dataframes and Features

## Check Dataframes

In [None]:
# Display the merged returns/headlines frame
ml_df

In [None]:
# # Convert the 'date' column in both DataFrames to datetime format
# ml_df['date'] = pd.to_datetime(ml_df['date'])
# data_duplicate['date'] = pd.to_datetime(data_duplicate['date'])

# # Merge ml_df with data_duplicate on the columns date and stock
# data_additional_features = pd.merge(ml_df, data_duplicate[['date', 'stock', 'title', 'preprocessed_text', 'tokens']], on=['date', 'stock'], how='left')
# data_additional_features

## Additional Feature Engineering

### Create Day of Week Feature

In [None]:
ml_df['date'] = pd.to_datetime(ml_df['date'])
ml_df['day_of_week'] = ml_df['date'].dt.dayofweek

In [None]:
ml_df

### Create Topic Feature using Latent Dirichlet Allocation (LDA)

In [None]:
def safe_literal_eval(s):
    """Parse ``s`` as a Python literal, returning ``s`` unchanged on failure.

    Args:
        s: Usually the string form of a literal (e.g. ``"['a', 'b']"``).
            Values that are neither a string nor an AST node are returned
            as they are.

    Returns:
        The evaluated literal, or ``s`` itself when it cannot be evaluated.
    """
    # Already-parsed values (e.g. a list of tokens) need no evaluation
    if not isinstance(s, (str, ast.AST)):
        return s
    try:
        return ast.literal_eval(s)
    # literal_eval is documented to raise any of these on malformed input;
    # e.g. "{[1]: 2}" raises TypeError (unhashable key)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return s

In [None]:
# Token values stored as strings are parsed back into Python objects;
# values that cannot be parsed are kept as they are
ml_df['tokens'] = ml_df['tokens'].apply(safe_literal_eval)

# Create Dictionary (takes 5 minutes to run)
id2word = corpora.Dictionary(ml_df['tokens'])

# Term Document Frequency (Corpus)
# One bag-of-words representation per headline
corpus = [id2word.doc2bow(text) for text in ml_df['tokens']]

# Fit a 10-topic LDA model; random_state=42 fixes the seed
lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=10, random_state=42, passes=10)

In [None]:
# Inspect Topics
for idx, topic in lda_model.print_topics(-1):
    print(f'Topic: {idx} \nWords: {topic}')

# Inspect Coherence Score
# The texts are the tokens the model was trained on (ml_df['tokens']);
# `data_additional_features` only exists in commented-out code, so
# referencing it raised a NameError
coherence_model_lda = CoherenceModel(model=lda_model, texts=ml_df['tokens'].tolist(), dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Function to get the dominant topic
def get_dominant_topic(lda_model, corpus):
    """Return, for each document in ``corpus``, the id of its highest-weighted topic.

    ``lda_model[corpus]`` is iterated and every item is treated as a sequence
    of ``(topic_id, weight)`` pairs. On equal weights the pair that comes
    first in that sequence wins.
    """
    def _weight(pair):
        return pair[1]

    return [
        sorted(doc_topics, key=_weight, reverse=True)[0][0]
        for doc_topics in lda_model[corpus]
    ]

In [None]:
# Get the dominant topic on the data
# `corpus` was built from ml_df['tokens'] row by row, so the returned list lines up with ml_df's rows
ml_df['dominant_topic'] = get_dominant_topic(lda_model, corpus)

### Create Entities Count Feature using Named Entity Recognizer (NER)

In [None]:
# Load the spaCy model
spacy_nlp_model = spacy.load('en_core_web_sm')

def extract_entities(text):
    """Return the text of every PERSON or ORG entity spaCy finds in ``text``."""
    # Entity labels to keep; adjust this list as needed
    wanted_labels = ['PERSON', 'ORG']

    found = []
    for ent in spacy_nlp_model(text).ents:
        if ent.label_ in wanted_labels:
            found.append(ent.text)
    return found

In [None]:
# Extract entities from data (takes 10 - 20 minutes to run)
ml_df['entities'] = ml_df['title'].apply(extract_entities)

In [None]:
ml_df['entities_count'] = ml_df['entities'].apply(len)
ml_df.drop('entities', axis=1, inplace=True)

In [None]:
ml_df_final = ml_df[['date', 'stock', 'daily_returns', 'sentiment_score', 'day_of_week', 'dominant_topic', 'entities_count']].copy()

In [None]:
# Save to pkl file
save_pkl_file(ml_df_final, 'cache/dataframes/ml_df.pkl')

## Feature Engineering Evaluation

### Correlation Matrix

## Create Functions for Train Test Split, One Hot Encoding, Scoring


In [None]:
# Define function for train test split
def ts_train_test_split(data, test_size):
    """Split ordered data into a leading train part and a trailing test part.

    No shuffling takes place: the first rows form the train set and the
    remaining rows form the test set.

    Args:
        data (pd.DataFrame or pd.Series): Data to split into train and test
        test_size (float): Fraction of rows (0 to 1) that goes to the test set

    Returns:
        tuple: train set, test set

    Raises:
        ValueError: If ``test_size`` is not between 0 and 1.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")
    train_size = 1-test_size
    # The cut-off is derived from `data` itself rather than from a global `X`,
    # so the function works for any input
    train_idx = round(data.shape[0] * train_size)
    return data.iloc[:train_idx], data.iloc[train_idx:]

In [None]:
#define function for one hot encoding
def one_hot_encode(X_train, X_valid):
    """One-hot encode the 'stock' column of the training and validation features.

    The encoder is fitted on the training frame only and then applied to both
    frames. A 'date' column, if present, is removed from both.

    Args:
        X_train (pd.DataFrame): Training features containing a 'stock' column
        X_valid (pd.DataFrame): Validation features containing a 'stock' column

    Returns:
        tuple: encoded training frame, encoded validation frame; both keep the
        row index of their input and have string column names
    """
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    def _without_date(frame):
        return frame.drop(columns=['date'], errors='ignore')

    X_train, X_valid = _without_date(X_train), _without_date(X_valid)

    # Fit on the training stocks, then encode both splits
    train_dummies = encoder.fit_transform(X_train[['stock']])
    valid_dummies = encoder.transform(X_valid[['stock']])

    def _assemble(frame, dummies):
        # Name the indicator columns and align them with the source rows
        dummy_df = pd.DataFrame(dummies)
        dummy_df.columns = encoder.get_feature_names_out(['stock'])
        dummy_df.index = frame.index

        # Swap the raw 'stock' column for its indicator columns
        combined = pd.concat([frame.drop('stock', axis=1), dummy_df], axis=1)
        combined.columns = combined.columns.astype(str)
        return combined

    return _assemble(X_train, train_dummies), _assemble(X_valid, valid_dummies)

In [None]:
def score_model(model, X_t, X_v, y_t, y_v):
    """Fit ``model`` on the training data and score it on the validation data.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.
        X_t: Training features
        X_v: Validation features
        y_t: Training target
        y_v: Validation target

    Returns:
        tuple: validation predictions, mean absolute error on the validation set
    """
    # Fit Model
    model.fit(X_t, y_t)

    # Predict
    preds = model.predict(X_v)

    # Check MAE. Arguments follow scikit-learn's (y_true, y_pred) order, like
    # the baseline cells in this file; the MAE value itself is symmetric
    mae = mean_absolute_error(y_v, preds)
    return preds, mae

# 4. Machine Learning Model Creation

## Train-Test Split

In [None]:
# Split Data into Features and Target
# ml_df_final holds only the modelling columns; ml_df still carries the raw
# text columns (title, tokens, preprocessed_text), which are not numeric features.
# Sort values by date once (stable sort) so X and y share exactly the same row order
ml_sorted = ml_df_final.sort_values("date", kind="stable").reset_index(drop=True)
X = ml_sorted.drop('daily_returns', axis=1)
y = ml_sorted['daily_returns']

# Split Data into Training and Validation Sets
X_train, X_valid = ts_train_test_split(X, test_size=0.2)
y_train, y_valid = ts_train_test_split(y, test_size=0.2)

# Use one_hot_encode function to get One-Hot Encoded Training and Validation Data
OH_X_train, OH_X_valid = one_hot_encode(X_train, X_valid)

## Linear Regression Model

In [None]:
#Fit linear model
linear_model = LinearRegression()
# score_model fits on the training split and returns (validation predictions, validation MAE)
preds_linear, mae_linear = score_model(linear_model, OH_X_train, OH_X_valid, y_train, y_valid)
print(f"Mean Absolute Error with Linear Regression: {mae_linear}")

## XGBoost Model

In [None]:
#Fit XGboost model
# Squared-error objective with 5 boosting rounds (n_estimators=5)
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=5)
preds_xgboost, mae_xgboost = score_model(xgb_model, OH_X_train, OH_X_valid, y_train, y_valid)
print(f"Mean Absolute Error with XGBoost: {mae_xgboost}")

## K-Nearest Neighbours (KNN) Model

In [None]:
#Fit K-Nearest Neighbours model
# n_neighbors=2000: each prediction is based on the 2000 nearest training rows
knn_regressor = KNeighborsRegressor(n_neighbors=2000)
preds_knn, mae_knn= score_model(knn_regressor, OH_X_train, OH_X_valid, y_train, y_valid)
print(f"Mean Absolute Error with KNN: {mae_knn}")

In [None]:
# Cache KNN Model
save_pkl_file(knn_regressor, filepath='cache/ml_models/knn_regressor.pkl')

In [None]:
# Read in KNN Model
knn_regressor = load_pkl_file('cache/ml_models/knn_regressor.pkl')

## Multi-layer Perceptron (MLP) Neural Network Model

In [None]:
# Initialize the MLPRegressor model
# Three hidden layers of 128, 64 and 32 units; random_state=42 fixes the seed
nn_model = MLPRegressor(hidden_layer_sizes=(128, 64, 32),
                        activation='relu',
                        solver='adam',
                        max_iter=500,
                        early_stopping=True, # To use early stopping based on validation score
                        validation_fraction=0.1, # Fraction of training data to set aside as validation set for early stopping
                        verbose=True,
                        random_state=42)

# Use the score_model function to fit and predict
preds_nn, mae_nn = score_model(nn_model, OH_X_train, OH_X_valid, y_train, y_valid)

# Print the MAE for the neural network model
print(f"Mean Absolute Error with Neural Network (MLPRegressor): {mae_nn}")

In [None]:
# Cache NN Model
save_pkl_file(nn_model, filepath='cache/ml_models/nn_model.pkl')

In [None]:
# Read in NN Model
nn_model = load_pkl_file('cache/ml_models/nn_model.pkl')

## Support Vector Regression (SVR) Model

In [None]:
# Create SVR pipeline
def create_svr_pipeline():
    """Return a pipeline that standardises the features and then fits an RBF-kernel SVR."""
    return make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.1, kernel='rbf'))

# Use your score_model function
pipeline = create_svr_pipeline()
preds_svr, mae_svr = score_model(pipeline, OH_X_train, OH_X_valid, y_train, y_valid)

# Print MAE
print(f"Mean Absolute Error with SVR: {mae_svr}")

In [None]:
# Cache the fitted SVR pipeline
save_pkl_file(pipeline, 'cache/ml_models/svr_pipeline.pkl')

In [None]:
# Read the SVR pipeline back from the cache
pipeline = load_pkl_file('cache/ml_models/svr_pipeline.pkl')

# 5. Baseline Model Creation

## Mean Model

In [None]:
# Baseline: predict the training-set mean return for every validation row
mean_train = y_train.mean()
mean_preds = np.full(len(y_valid), mean_train)

# Mean Absolute Error (MAE) of that constant prediction
mae_mean = mean_absolute_error(y_valid, mean_preds)

print(f"Mean Absolute Error with Mean Model: {mae_mean}")

## ARIMAX Model

In [None]:
# Define external regressors. Ensure there are no non-numeric or NaN values.
exog_train = OH_X_train
exog_valid = OH_X_valid

# Fit ARIMA-X model (ARIMA with external regressors, without the seasonal component) on the training split
# NOTE(review): y_train pools the rows of many stocks ordered by date, so it is
# not a single time series — confirm that ARIMA is meaningful on this input
model = ARIMA(y_train, exog=exog_train, order=(1,1,1))
results = model.fit()

# Forecast
# One prediction per validation row, starting right after the last training position
forecast = results.predict(start=len(y_train), end=len(y_train) + len(y_valid) - 1, exog=exog_valid, dynamic=True)

# Calculate MAE
mae_arimax = mean_absolute_error(y_valid, forecast)

print(f"Mean Absolute Error with ARIMA-X: {mae_arimax}")

# 6. Model Evaluation

## Compare with Baseline Models

In [None]:
# MAE of every model. The dict is named `model_maes` because `models` is
# already the module imported by `from gensim import corpora, models`
model_maes = {
    'Linear': mae_linear,
    'XGBoost': mae_xgboost,
    'KNN': mae_knn,
    'SVR' : mae_svr,
    'NN' : mae_nn,
    'Mean Model': mae_mean,
    'Arima X Model': mae_arimax
}

# Sorting dictionary by MAE
sorted_models = dict(sorted(model_maes.items(), key=lambda item: item[1]))

# Define colors for models: baselines in grey, machine learning models in pink
baseline_names = ('Mean Model', 'Arima X Model')
colors = ['lightgrey' if name in baseline_names else 'lightpink' for name in sorted_models]

# Plotting
plt.figure(figsize=(12,6))
bars = plt.barh(list(sorted_models.keys()), list(sorted_models.values()), color=colors)
plt.xlabel('Mean Absolute Error (MAE)')
plt.ylabel('Models')
plt.title('Model Comparison based on MAE')

# Annotate the bars with the actual MAE values, right-aligned just inside each bar's end
for bar in bars:
    plt.text(bar.get_width() - 0.005, bar.get_y() + bar.get_height()/2,
             f'{bar.get_width():.4f}',
             va='center', ha='right', color='black', fontsize=10)

plt.tight_layout()
plt.show()

# 7. Feature Determination based on Model Evaluation

# 8. Tuning Features from NLP Packages

# 9. Hyperparameter Tuning for Machine Learning Models

# 10. Model Interpretation

# Portfolio Analytics Functions

In [None]:
def calc_daily_ptf_rtn(ptf_wgt, returns_df, ls=False):
    """Calculate daily portfolio returns given long form portfolio weights and returns

    Weights are given on rebalance dates. Between rebalances each asset's
    weight drifts with its cumulative return, is renormalised per date, and is
    then lagged by one row per asset before being applied to that asset's return.

    Args:
        ptf_wgt (pd.DataFrame): Long form id level portfolio weights with
            columns DATE, ID and WGT. Not modified.
        returns_df (pd.DataFrame): Long form id level returns with columns
            DATE, ID and RTN. Not modified.
        ls (bool): If True, renormalise the long and the short side of each
            date separately by the side's gross weight.

    Returns:
        tuple: (ptf_rtn_df, final_df) where ptf_rtn_df has one PTF_RTN row per
        DATE and final_df is the asset level working frame.
    """
    # Work on copies so the caller's frames keep their original DATE columns
    ptf_wgt = ptf_wgt.copy()
    returns_df = returns_df.copy()
    ptf_wgt['DATE'] = pd.to_datetime(ptf_wgt['DATE'])
    returns_df['DATE'] = pd.to_datetime(returns_df['DATE'])

    start_date = ptf_wgt['DATE'].min()
    end_date = ptf_wgt['DATE'].max()

    # Returns inside the (inclusive) window covered by the weights
    in_window = returns_df['DATE'].between(start_date, end_date)
    filtered_returns = returns_df[in_window].reset_index(drop=True)

    # Add rebal date column to returns df: the latest rebalance date on or
    # before each return date, found with one binary search over the sorted
    # rebalance dates instead of one scan per row
    rebal_dates = pd.DatetimeIndex(ptf_wgt['DATE'].dropna().unique()).sort_values()
    pos = rebal_dates.searchsorted(pd.DatetimeIndex(filtered_returns['DATE']), side='right') - 1
    filtered_returns['REBAL_DATE'] = rebal_dates[pos]
    joint_df = pd.merge(ptf_wgt.rename(columns={'DATE': 'REBAL_DATE'}), filtered_returns, on=['REBAL_DATE', 'ID'], how='left')

    # Missing weights/returns are filled as 0. Only the numeric columns are
    # filled: a weight row without any return in its rebalance period has no
    # DATE and is dropped instead of receiving a 0 "date"
    joint_df[['WGT', 'RTN']] = joint_df[['WGT', 'RTN']].fillna(0)
    joint_df = joint_df.dropna(subset=['DATE'])

    final_df = joint_df[['DATE', 'REBAL_DATE', 'ID', "WGT", "RTN"]].copy()

    # Cumulate asset returns within each rebal date
    final_df['ASSET_CUM_RTN'] = final_df.groupby(['REBAL_DATE', 'ID'])['RTN'].transform(lambda x: (1+x).cumprod())

    # Calculate asset's MTM weight
    final_df['MTM_WGT'] = final_df['WGT'] * final_df['ASSET_CUM_RTN']
    final_df['DATE'] = pd.to_datetime(final_df['DATE'])

    if ls:
        # Each side is renormalised by its own gross (absolute) weight
        final_df['LONG/SHORT'] = np.where(final_df['MTM_WGT'] > 0, 'LONG', 'SHORT')
        final_df['PTF_MTM_BASE'] = final_df.groupby(["DATE", 'LONG/SHORT'])['MTM_WGT'].transform(lambda x: abs(x).sum())
        final_df['ASSET_WEIGHTS'] = final_df['MTM_WGT'] / final_df['PTF_MTM_BASE']

    else:
        # Calculate portfolio MTM base weight
        final_df['PTF_MTM_BASE'] = final_df.groupby("DATE")['MTM_WGT'].transform('sum')

        # Calculate renormed asset weights
        final_df['ASSET_WEIGHTS'] = final_df['MTM_WGT'] / final_df['PTF_MTM_BASE']

    # Shift asset weights down by 1 to represent implied lag of 1 day
    final_df['ASSET_WEIGHTS_SHIFTED'] = final_df.groupby('ID')['ASSET_WEIGHTS'].shift(1)

    # Drop NaNs introduced from shifting
    final_df = final_df.dropna(axis=0)

    # Portfolio return per date = sum over assets of return * lagged weight
    weighted_rtn = final_df['RTN'] * final_df['ASSET_WEIGHTS_SHIFTED']
    ptf_rtn_df = weighted_rtn.groupby(final_df['DATE']).sum().reset_index(name="PTF_RTN")
    return ptf_rtn_df, final_df

def calc_annualised_returns(cumulative_returns:float, n, frequency):
    """Annualise a cumulative return that was earned over ``n`` periods.

    Args:
        cumulative_returns: Cumulative return in base 0 (0.21 means +21%),
            either a scalar or a pandas object whose first value is used.
        n (int): Number of periods the cumulative return spans.
        frequency (str): "D" for daily periods (252 per year) or "M" for
            monthly periods (12 per year).

    Returns:
        float: The annualised return.

    Raises:
        ValueError: If ``frequency`` is neither "D" nor "M".
    """
    periods_per_year = {"D": 252, "M": 12}
    if frequency not in periods_per_year:
        raise ValueError(f"frequency must be 'D' or 'M', got {frequency!r}")
    t = periods_per_year[frequency]
    # Accept both a plain number and a one-element Series/array
    cum_rtn = np.asarray(cumulative_returns, dtype=float).ravel()[0]
    return (cum_rtn + 1)**(t/n) - 1

def calc_annualised_vol(ptf_rtn: pd.Series, frequency):
    """Annualise the sample standard deviation of periodic returns.

    Args:
        ptf_rtn: Periodic returns, either a Series or a DataFrame whose first
            column is used.
        frequency (str): "D" for daily returns (252 per year) or "M" for
            monthly returns (12 per year).

    Returns:
        float: Standard deviation (ddof=1) scaled by the square root of the
        number of periods per year.

    Raises:
        ValueError: If ``frequency`` is neither "D" nor "M".
    """
    periods_per_year = {"D": 252, "M": 12}
    if frequency not in periods_per_year:
        raise ValueError(f"frequency must be 'D' or 'M', got {frequency!r}")
    n = periods_per_year[frequency]
    # .std() is a scalar for a Series and a per-column Series for a DataFrame
    period_vol = np.asarray(ptf_rtn.std(ddof=1), dtype=float).ravel()[0]
    return period_vol * np.sqrt(n)

def calc_max_dd(ptf_rtn: pd.Series):
    """Return the maximum drawdown of a periodic return series.

    Args:
        ptf_rtn: Periodic returns, either a Series or a DataFrame whose first
            column is used.

    Returns:
        float: The most negative peak-to-trough decline of the cumulative
        return path (0 when the path never falls below its running maximum).
    """
    # Cumulative returns must be base 1
    ptf_cumulative_return = (1+ptf_rtn).cumprod()

    # Calculate running max
    running_max = ptf_cumulative_return.cummax()

    # Drawdown
    drawdown = (ptf_cumulative_return-running_max)/running_max

    # .min() is a scalar for a Series and a per-column Series for a DataFrame
    max_drawdown = np.asarray(drawdown.min(), dtype=float).ravel()[0]
    return max_drawdown

def calc_ptf_summary(ptf_rtn):
    """Build a table of performance metrics for daily portfolio returns.

    Args:
        ptf_rtn: Daily returns, either a Series or a DataFrame. For a
            DataFrame only the first column is evaluated.

    Returns:
        pd.DataFrame: Columns 'Metrics' and 'Values' with cumulative return,
        annualised return, annualised volatility, maximum drawdown, Sharpe
        ratio and Sortino ratio.
    """
    # The helpers called below read the first column of a DataFrame, so work
    # on a single-column frame whatever the input was
    if isinstance(ptf_rtn, pd.Series):
        ptf_rtn = ptf_rtn.to_frame()
    ptf_rtn = ptf_rtn.iloc[:, [0]]

    ptf_cum_rtn = (ptf_rtn+1).prod()-1
    ptf_ann_rtn = calc_annualised_returns(ptf_cum_rtn, len(ptf_rtn), 'D')
    ptf_ann_vol = calc_annualised_vol(ptf_rtn, "D")
    ptf_max_dd = calc_max_dd(ptf_rtn)
    sharpe_ratio = ptf_ann_rtn/ptf_ann_vol

    # Downside deviation uses negative returns only. It is annualised with
    # sqrt(252) so the Sortino ratio compares an annualised return with an
    # annualised deviation, as the Sharpe ratio above does
    downside_sd = ptf_rtn[ptf_rtn < 0].std().iloc[0] * np.sqrt(252)
    sortino_ratio = ptf_ann_rtn/downside_sd
    return pd.DataFrame({
        'Metrics': ['Cumulative Returns', 'Annualised Returns',
                    'Annualised Volatility', 'Maximum Drawdown',
                    'Sharpe Ratio', 'Sortino Ratio'],
        # .iloc[0] is explicit positional access; [0] on a label-indexed
        # Series relies on a deprecated positional fallback
        'Values': [ptf_cum_rtn.iloc[0], ptf_ann_rtn, ptf_ann_vol, ptf_max_dd,
                   sharpe_ratio, sortino_ratio]
    })


In [None]:
class Backtest():
    """Backtest three portfolios built from a fitted model's validation predictions.

    On construction the model predicts returns for the validation rows, the
    predictions become daily per-stock signals, and three weight schemes are
    derived from them: rank-based long/short, signal-weighted long only and
    equal weighted. Their daily returns, a cumulative return figure and a
    summary metrics table are stored on the instance.

    Attributes set by ``__init__``:
        sig_df: Long form signals (DATE, ID, SIGNAL).
        ls_ptf_wgt, lo_ptf_wgt, eq_ptf_wgt: Long form weights (DATE, ID, WGT).
        ptf_rtn_combined: Daily and cumulative returns of all three portfolios.
        cum_rtn_fig: Plotly line figure of the cumulative returns.
        summary_metrics: Metrics table with one column per portfolio.
    """

    def __init__(self, fitted_model, rtn_df, OH_X_train, y_train, OH_X_valid, X_valid):
        """Store the inputs and run the whole backtest.

        Args:
            fitted_model: Fitted estimator exposing ``predict``.
            rtn_df (pd.DataFrame): Long form returns with columns DATE, ID, RTN.
            OH_X_train: Encoded training features (stored, not used here).
            y_train: Training target (stored, not used here).
            OH_X_valid: Encoded validation features passed to ``predict``.
            X_valid (pd.DataFrame): Validation features with 'date' and
                'stock' columns, row-aligned with ``OH_X_valid``.
        """
        self.model = fitted_model
        self.rtn_df = rtn_df
        self.OH_X_train = OH_X_train
        self.OH_X_valid = OH_X_valid
        self.y_train = y_train
        self.X_valid = X_valid

        # Generate signal dataframe
        self.sig_df =  self.gen_signals_df()

        # Construct long short portfolio
        self.ls_ptf_wgt = self.constr_ls_ptf_wgt()

        # Construct long only portfolio
        self.lo_ptf_wgt = self.constr_lo_ptf_wgt()

        # Construct equal weighted portfolio
        self.eq_ptf_wgt = self.constr_eq_ptf_wgt()

        # Generate backtest analytics
        self.ptf_rtn_combined, self.cum_rtn_fig, self.summary_metrics = self.gen_backtest_analytics()

    def gen_signals_df(self):
        """Predict validation returns and expand them to daily per-stock signals.

        Returns:
            pd.DataFrame: Columns DATE, ID, SIGNAL on a business-day calendar,
            each stock's latest prediction carried forward.

        Raises:
            ValueError: If no validation date has at least 10 stocks.
        """
        preds = self.model.predict(self.OH_X_valid)
        trading_df = self.X_valid.copy()

        # Create column of predicted returns
        trading_df['predicted_rtn'] = preds

        # Count number of distinct stocks per date (a stock can appear several
        # times on one date, once per headline)
        trading_df['num_stocks_by_date'] = trading_df.groupby('date')['stock'].transform('nunique')

        # Select data where there were at least 10 stocks for each date
        enough_stocks = trading_df['num_stocks_by_date'] >= 10
        trading_df_filtered = trading_df.loc[enough_stocks, ['date', 'stock', 'predicted_rtn']].reset_index(drop=True)
        if trading_df_filtered.empty:
            raise ValueError("No validation date has at least 10 stocks; cannot build signals")

        # Handle duplicated stocks on a single date (stems from multiple headline for a stock on one day)
        trading_df_filtered_dd = trading_df_filtered.groupby(['date', 'stock'])['predicted_rtn'].mean().reset_index(name='predicted_rtn')

        # Create daily index and forward fill
        start_date = trading_df_filtered_dd['date'].min()
        end_date = trading_df_filtered_dd['date'].max()

        # Resample to daily (business days)
        daily_index = pd.date_range(start=start_date, end=end_date, freq='B')

        # Pivot dataframe to forward fill to daily index so that we have signals everyday
        trading_df_filtered_wide = trading_df_filtered_dd.pivot(index='date', values='predicted_rtn', columns='stock')
        # .ffill() replaces the deprecated fillna(method='ffill')
        trading_df_filtered_wide = trading_df_filtered_wide.reindex(daily_index).ffill()

        final_signal_df = (trading_df_filtered_wide
                        .reset_index(names='date')
                        .rename_axis(None, axis=1)
                        .melt(id_vars='date', var_name='stock', value_name='signal')
                        .dropna(axis=0).reset_index(drop=True))
        final_signal_df = final_signal_df.rename(columns={
            'date': 'DATE',
            'stock': 'ID',
            'signal': 'SIGNAL'
        })
        return final_signal_df

    def constr_ls_ptf_wgt(self):
        """Build dollar-neutral long/short weights from the ranked signals.

        Returns:
            pd.DataFrame: Columns DATE, ID, WGT, sorted by DATE then ID. Per
            date the long weights sum to 1 and the short weights to -1; a
            side whose raw weights sum to 0 gets weight 0.
        """
        # Create copy of sig_df
        sig_df = self.sig_df.copy()

        # Ranking the signals to reduce fat tails (ties get their average rank)
        sig_df['RANKED_SIGNAL'] = sig_df.groupby('DATE')['SIGNAL'].rank(method='average')
        sig_df = sig_df.sort_values(['DATE', 'ID']).reset_index(drop=True)

        # Long short weights calculated as the distance from the median rank for each date
        median_rank = sig_df.groupby('DATE')['RANKED_SIGNAL'].transform('median')
        sig_df['WGT'] = sig_df['RANKED_SIGNAL'] - median_rank

        # Renormalise weights to $1 long $1 short – dollar neutral strategy
        sig_df['DIRECTION'] = np.where(sig_df['WGT']>=0, 'LONG', 'SHORT')
        gross = sig_df.groupby(['DATE', 'DIRECTION'])['WGT'].transform('sum').abs()
        # A zero gross (e.g. a date with a single stock) would divide by zero
        sig_df['RENORM_WGT'] = (sig_df['WGT'] / gross.replace(0, np.nan)).fillna(0.0)
        return sig_df[['DATE', 'ID', 'RENORM_WGT']].rename(columns={'RENORM_WGT': 'WGT'})

    def constr_lo_ptf_wgt(self):
        """Build long only weights proportional to the positive signals.

        Returns:
            pd.DataFrame: Columns DATE, ID, WGT for stocks with SIGNAL > 0;
            the weights of each date sum to 1.
        """
        # Create copy of sig_df
        sig_df = self.sig_df.copy()

        # Create long only signal df
        sig_df_lo = sig_df[sig_df['SIGNAL']>0].copy()

        # Reset index
        sig_df_lo = sig_df_lo.reset_index(drop=True)

        # Use predicted returns as weights
        sig_df_lo['WGT'] = sig_df_lo['SIGNAL'] / sig_df_lo.groupby('DATE')['SIGNAL'].transform('sum')
        return sig_df_lo[['DATE', 'ID', 'WGT']]

    def constr_eq_ptf_wgt(self):
        """Build equal weights over every stock that has a signal on a date.

        Returns:
            pd.DataFrame: Columns DATE, ID, WGT with WGT = 1 / number of
            stocks on that date.
        """
        # Create copy of sig_df
        sig_df = self.sig_df.copy()

        # Every stock on a date gets the same weight
        sig_df['WGT'] = 1.0 / sig_df.groupby('DATE')['ID'].transform('count')
        return sig_df[['DATE', 'ID', 'WGT']]

    def calc_ptf_rtn(self, ptf_wgt):
        """Calculate daily portfolio returns from weights lagged by one business day.

        The weight a stock has on day t is applied to that stock's return on
        the next business day. (A plain ``shift(1)`` over the long form frame
        would instead pair a row with the previous row, which usually belongs
        to a different stock.)

        Args:
            ptf_wgt (pd.DataFrame): Long form weights with columns DATE, ID, WGT.

        Returns:
            pd.DataFrame: Columns DATE and PTF_RTN, from the business day
            after the first weight date up to the last weight date.
        """
        wgt = ptf_wgt.copy()
        wgt['DATE'] = pd.to_datetime(wgt['DATE'])
        last_wgt_date = wgt['DATE'].max()

        # Map every weight date to the business day on which it is applied
        wgt_dates = pd.DatetimeIndex(wgt['DATE'].dropna().unique())
        applied_on = pd.Series(wgt_dates + pd.offsets.BDay(1), index=wgt_dates)
        wgt['DATE'] = pd.to_datetime(wgt['DATE'].map(applied_on))

        # Stay inside the period covered by the weights
        wgt = wgt[wgt['DATE'] <= last_wgt_date]

        rtn = self.rtn_df[['DATE', 'ID', 'RTN']].copy()
        rtn['DATE'] = pd.to_datetime(rtn['DATE'])

        # Merge weights on returns – left join
        ptf_df = pd.merge(wgt, rtn, on=['DATE', 'ID'], how='left')

        # On dates without returns just set returns to 0
        ptf_df['WGT_RTN'] = ptf_df['WGT'].fillna(0) * ptf_df['RTN'].fillna(0)

        ptf_rtn = ptf_df.groupby('DATE')['WGT_RTN'].sum().reset_index(name='PTF_RTN')
        return ptf_rtn

    def gen_backtest_analytics(self):
        """Compute returns, the cumulative return figure and summary metrics.

        Returns:
            tuple: (ptf_rtn_combined, fig, summary_metrics)
        """
        ls_ptf_rtn = self.calc_ptf_rtn(self.ls_ptf_wgt)
        ls_ptf_rtn['PORT'] = "Long Short Portfolio"

        lo_ptf_rtn = self.calc_ptf_rtn(self.lo_ptf_wgt)
        lo_ptf_rtn['PORT'] = "Long Only Portfolio"

        eq_ptf_rtn = self.calc_ptf_rtn(self.eq_ptf_wgt)
        eq_ptf_rtn['PORT'] = "Equal Weighted Portfolio"

        ptf_rtn_combined = pd.concat([ls_ptf_rtn, lo_ptf_rtn, eq_ptf_rtn], axis=0)

        # Cumulate portfolio returns
        ptf_rtn_combined['CUM_RTN'] = ptf_rtn_combined.groupby(['PORT'])['PTF_RTN'].transform(lambda x: (1+x).cumprod())

        # Generate cumulative returns figure
        fig = px.line(ptf_rtn_combined, x='DATE', y='CUM_RTN', color='PORT')
        fig.update_layout(hovermode='x unified',
                          title="Portfolio Performance <br><sup></sup>",
                          yaxis_title='Cumulative Returns')
        fig.add_hline(y=1, line_dash='dash')

        # Create summary metrics table from this backtest's own returns (not
        # from a global Backtest instance). Only the PTF_RTN column is passed
        # on, because the summary expects a frame of returns and nothing else
        summaries = []
        for port, port_df in ptf_rtn_combined.groupby('PORT'):
            summary = calc_ptf_summary(port_df.set_index('DATE')[['PTF_RTN']])
            summary['PORT'] = port
            summaries.append(summary)
        summary_metrics = (pd.concat(summaries, ignore_index=True)
                            .pivot(index='Metrics', columns='PORT', values='Values')
                            .reset_index().rename_axis(None, axis=1)
                            )
        # Rename by label so a change in column order cannot mislabel a portfolio
        summary_metrics = summary_metrics.rename(columns={
            'Long Only Portfolio': 'Signals-Weighted Long Only Portfolio',
            'Long Short Portfolio': 'Signals-Weighted Long Short Portfolio',
        })
        return ptf_rtn_combined, fig, summary_metrics

    def display_results(self):
        """Show the summary metrics table and the cumulative return figure."""
        # `display` is provided by the notebook environment
        display(self.summary_metrics)
        self.cum_rtn_fig.show()

# Simple Long Short Strategy

In [None]:
# Rename the long form returns to the DATE / ID / RTN column names used by Backtest
rtn_df = returns_df_melt.rename(columns={'date': 'DATE', 'stock': 'ID', 'daily_returns': 'RTN'})
rtn_df.head()

### Linear Model Performance

In [None]:
# Constructing a Backtest runs the whole backtest for the fitted linear model
linear_model_backtest = Backtest(linear_model, rtn_df, OH_X_train, y_train, OH_X_valid, X_valid)

In [None]:
linear_model_backtest.display_results()

### Neural Network (MLP) Model Performance

In [None]:
# Load the cached neural network model
nn_model = load_pkl_file('cache/ml_models/nn_model.pkl')

In [None]:
# Constructing a Backtest runs the whole backtest for the neural network model
nn_model_backtest = Backtest(nn_model, rtn_df, OH_X_train, y_train, OH_X_valid, X_valid)

In [None]:
nn_model_backtest.display_results()

# Long only Strategy

In [None]:
# Sample
sig_df = nn_model_backtest.sig_df.copy()

In [None]:
# Use SIGNAL (i.e. predicted returns) as weights
# Create long only signal df
sig_df_lo = sig_df[sig_df['SIGNAL']>0].copy()

sig_df_lo = sig_df_lo.reset_index(drop=True)

In [None]:
# Use predicted returns as weights
sig_df_lo['WGT'] = sig_df_lo.groupby('DATE')['SIGNAL'].transform(lambda x: x/x.sum())
sig_df_lo
sig_df_lo[['DATE', 'ID', 'WGT']]

# MPT Optimised Strategy

# Evaluating all ML Models Trading Performance

In [None]:
# Backtest every cached model. Only *.pkl files are loaded, because the folder
# can contain other entries (e.g. a notebook checkpoint folder), and the names
# are sorted so the output order is the same on every run
for model in sorted(os.listdir('cache/ml_models/')):
    if not model.endswith('.pkl'):
        continue
    print(f"Model: {model}")
    ml_model = load_pkl_file(f'cache/ml_models/{model}')
    backtester = Backtest(ml_model, rtn_df, OH_X_train, y_train, OH_X_valid, X_valid)
    backtester.display_results()