In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf
import plotly.express as px
import scipy
from dash_bootstrap_templates import load_figure_template

In [None]:
pd.set_option('display.max_columns', None)
load_figure_template('minty')

# Data Exploration

In [None]:
data_1 = pd.read_csv("data/daily_financial_news/analyst_ratings_processed.csv", index_col=0)
data_1.head()

In [None]:
# Keep only the text before the first space of each 'date' value.
# Splitting once and taking element 0 avoids materialising a full wide frame
# (expand=True) just to read its first column.
data_1['date'] = data_1['date'].str.split(' ', n=1).str[0]

In [None]:
data_1.head()

In [None]:
data_1['stock'].nunique()

In [None]:
data_1.info()

## Drop Null Values

In [None]:
print(data_1.isna().sum())

In [None]:
print(data_1.shape)

In [None]:
data_1[data_1['date'].isna()]

In [None]:
# Drop rows missing either merge key ('date' or 'stock') in a single pass.
# Re-binding instead of inplace=True keeps the cell free of in-place mutation.
data_1 = data_1.dropna(subset=['date', 'stock'])

In [None]:
print(data_1.shape)

In [None]:
print(data_1.isna().sum())

In [None]:
data_1.info()

## Filter stocks

In [None]:
# Rank tickers by number of headlines and keep the 100 most-covered ones
headline_counts = data_1.groupby('stock').size().reset_index(name='size')
ranked_headline_counts = (
    headline_counts
    .sort_values('size', ascending=False)
    .reset_index(drop=True)
)
top_100_stocks_by_headlines = ranked_headline_counts.iloc[:100]
top_100_stocks_by_headlines.head()

In [None]:
# Keep only headlines for the selected tickers.
# .copy() makes the result an independent frame: later cells assign new columns
# to data_1, which would otherwise operate on a slice of the unfiltered data.
data_1 = data_1[data_1['stock'].isin(top_100_stocks_by_headlines['stock'])].copy()

In [None]:
data_1.shape

## Clean Text Data

In [None]:
data_1['title'] = data_1['title'].str.lower()

In [None]:
data_1.head()

### Remove Punctuation

In [None]:
import string

# Characters from string.punctuation, held in a set for membership tests
_PUNCTUATION_CHARS = frozenset(string.punctuation)


def remove_punctuation(text):
    """Return `text` without punctuation and digit characters.

    A character is removed when it appears in ``string.punctuation`` or when
    ``str.isdigit()`` is true for it; everything else (including whitespace)
    is kept in its original order.

    Args:
        text (str): Text to clean.

    Returns:
        str: The cleaned text.
    """
    kept_chars = []
    for symbol in text:
        if symbol in _PUNCTUATION_CHARS or symbol.isdigit():
            continue
        kept_chars.append(symbol)
    return ''.join(kept_chars)

data_1['title'] = data_1['title'].apply(remove_punctuation)

In [None]:
data_1.head()

In [None]:
data_1['tokens'] = data_1['title'].apply(lambda x: x.split())
data_1.head()

### Remove stop words

In [None]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

stop = stopwords.words('english')
# `stop` is a list; testing every token against it is a linear scan.
# A set gives the same membership answers with constant-time lookups.
stop_words = set(stop)

data_1['tokens'] = data_1['tokens'].apply(lambda x: [word for word in x if word not in stop_words])

In [None]:
data_1.shape

### Lemmatize tokens

In [None]:
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

data_1['tokens'] = data_1['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])


In [None]:
data_1.head()

In [None]:
data_1['preprocessed_text'] = data_1['tokens'].apply(' '.join)
data_1.head()

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
# NLTK's SentimentIntensityAnalyzer (VADER) is used below via polarity_scores(),
# from which only the 'compound' score is kept.
# NOTE(review): VADER is documented as lexicon- and rule-based and reportedly uses
# punctuation, capitalisation and negation words as cues. The titles scored below
# have already been lower-cased and stripped of punctuation and stop words, which
# may remove those cues — confirm whether the raw titles should be scored instead.

In [None]:
data_duplicate = data_1.copy()
data_duplicate['sentiment_score'] = data_duplicate['preprocessed_text'].apply(lambda x: sia.polarity_scores(x)['compound'])
data_duplicate

In [None]:
data_duplicate['sentiment_score'].describe()

In [None]:
data_duplicate.info()

In [None]:
data_duplicate.head()

In [None]:
signals_df = data_duplicate[['date', 'stock', 'sentiment_score']].copy()

In [None]:
signals_df.dtypes

In [None]:
# There are some stocks that have multiple news articles on the same day
# Have to handle these cases
non_dup_signals_df = signals_df.groupby(['date', 'stock'])['sentiment_score'].mean().reset_index(name='sentiment_score')

# Pull yfinance data

In [None]:
tickers = data_duplicate.stock.unique()

In [None]:
# min()/max() return the same values as the first/last element of the sorted
# column, without sorting the whole column twice.
start_date, end_date = data_duplicate['date'].min(), data_duplicate['date'].max()

In [None]:
# The next cell reads data['Adj Close']. Recent yfinance releases default to
# auto_adjust=True, which drops that column, so request it explicitly.
# NOTE(review): yfinance documents `end` as exclusive, so prices for end_date
# itself are not downloaded — confirm this is intended.
data = yf.download(list(tickers), start=start_date, end=end_date, auto_adjust=False)

In [None]:
adj_close_data = data['Adj Close']

In [None]:
# Tickers that don't have data
missing_data_tickers = adj_close_data.columns[adj_close_data.isna().sum()/adj_close_data.shape[0] == 1]

# Drop missing tickers
adj_close_data = adj_close_data.drop(columns=missing_data_tickers)

In [None]:
# Drop other tickers
adj_close_data = adj_close_data.dropna(axis=1)

In [None]:
any(adj_close_data.isna().sum() > 0)

In [None]:
returns_df = adj_close_data.pct_change().dropna().reset_index().rename(columns={'Date': 'date'})

In [None]:
returns_df_melt = returns_df.melt(id_vars='date', var_name='stock', value_name='daily_returns')
returns_df_melt['date'] = pd.to_datetime(returns_df_melt['date'])
returns_df_melt.head()

In [None]:
non_dup_signals_df.sort_values('date').head()

In [None]:
returns_df_melt.dtypes

In [None]:
non_dup_signals_df['date'] = pd.to_datetime(non_dup_signals_df['date'])

In [None]:
merged_df = pd.merge(returns_df_melt, non_dup_signals_df, on=['date', 'stock'], how='left').dropna()

In [None]:
# Count number of stocks per date
merged_df['num_stocks_by_date'] = merged_df.groupby('date')['stock'].transform('size')

# Select data where there were at least 10 stocks for each date
merged_df_filtered = merged_df[merged_df['num_stocks_by_date'] >= 10]

# Build the modelling frame from the FILTERED rows (previously the unfiltered
# merged_df was used here, so the >= 10 stocks filter was silently ignored)
# and drop the helper count column.
ml_df = merged_df_filtered.drop(columns='num_stocks_by_date').reset_index(drop=True)

# Build Machine Learning Models

In [None]:
ml_df

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Split Data into Features and Target
# Sort by date once, with a stable sort, and derive both X and y from the same
# sorted frame so features and target are guaranteed to stay row-aligned.
ml_df_sorted = ml_df.sort_values("date", kind="stable").reset_index(drop=True)
X = ml_df_sorted.drop('daily_returns', axis=1)
y = ml_df_sorted['daily_returns']

In [None]:
def ts_train_test_split(data, test_size):
    """Split ordered data into a leading train set and a trailing test set.

    No shuffling is done: the first ``1 - test_size`` share of the rows becomes
    the train set and the remaining rows become the test set.

    Args:
        data (pd.DataFrame or pd.Series): Data to split into train and test
        test_size (float): Fraction of rows (between 0 and 1) for the test set

    Returns:
        tuple: train set, test set

    Raises:
        ValueError: If ``test_size`` is outside the range [0, 1].
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    train_size = 1 - test_size
    # Size the split from the object being split, not from the global X,
    # so the function works for any input and before X exists.
    train_idx = round(data.shape[0] * train_size)
    return data.iloc[:train_idx], data.iloc[train_idx:]

In [None]:
# Split Data into Training and Validation Sets
X_train, X_valid = ts_train_test_split(X, test_size=0.2)
y_train, y_valid = ts_train_test_split(y, test_size=0.2)

In [None]:
# Initialize OneHotEncoder
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# One-hot encode the 'stock' column. The encoder is fitted on the training rows
# only; the validation rows are transformed with that fitted encoder.
# Column names and the original row index are attached at construction time.
OH_cols_train = pd.DataFrame(
    one_hot_encoder.fit_transform(X_train[['stock']]),
    columns=one_hot_encoder.get_feature_names_out(['stock']),
    index=X_train.index,
)
OH_cols_valid = pd.DataFrame(
    one_hot_encoder.transform(X_valid[['stock']]),
    columns=one_hot_encoder.get_feature_names_out(['stock']),
    index=X_valid.index,
)

# Remove Original 'stock' Column
num_X_train = X_train.drop('stock', axis=1)
num_X_valid = X_valid.drop('stock', axis=1)

# Concatenate Original Data with One-Hot Encoded Columns
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# Ensure all columns have string type
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

# 'date' is not used as a model feature, so it is dropped from the encoded
# matrices. X_train / X_valid still carry it for later use.
OH_X_train = OH_X_train.drop(columns=['date'])
OH_X_valid = OH_X_valid.drop(columns=['date'])

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [None]:
def score_model(model, X_t, X_v, y_t, y_v):
    """Fit ``model`` on the training data and return its MAE on the validation data.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_t: Training features.
        X_v: Validation features.
        y_t: Training target.
        y_v: Validation target.

    Returns:
        Mean absolute error between ``y_v`` and the model's predictions for ``X_v``.
    """
    # Fit Model (note: this fits the passed-in estimator in place)
    model.fit(X_t, y_t)

    # Predict
    preds = model.predict(X_v)

    # Check MAE — arguments in the conventional (y_true, y_pred) order
    mae = mean_absolute_error(y_v, preds)
    return mae

In [None]:
linear_model = LinearRegression()

In [None]:
score_model(linear_model, OH_X_train, OH_X_valid, y_train, y_valid)

# Portfolio Analytics Functions

In [None]:
def calc_daily_ptf_rtn(ptf_wgt, returns_df, ls=False):
    """Calculate daily portfolio returns given long form portfolio weights and returns

    Each date in ``ptf_wgt`` is treated as a rebalance date. Every return row is
    assigned to the latest rebalance date on or before it, the rebalance weights
    are drifted with the cumulative asset return since that rebalance, renormalised
    per day, lagged by one row per asset and finally applied to the returns.

    Args:
        ptf_wgt (pd.DataFrame): Long form id level portfolio weights with
            columns 'DATE', 'ID' and 'WGT'. The caller's frame is not modified.
        returns_df (pd.DataFrame): Long form id level returns with columns
            'DATE', 'ID' and 'RTN'. The caller's frame is not modified.
        ls (bool): If True, weights are renormalised separately for positive
            ('LONG') and non-positive ('SHORT') marked-to-market weights using the
            sum of absolute weights; otherwise by the plain daily sum.

    Returns:
        tuple: (ptf_rtn_df, final_df) where ptf_rtn_df has columns 'DATE' and
        'PTF_RTN', and final_df is the asset-level working frame.
    """
    # Convert dates on copies so the frames passed in by the caller stay untouched
    ptf_wgt = ptf_wgt.assign(DATE=pd.to_datetime(ptf_wgt['DATE']))
    returns_df = returns_df.assign(DATE=pd.to_datetime(returns_df['DATE']))

    start_date = ptf_wgt['DATE'].min()
    end_date = ptf_wgt['DATE'].max()

    # Only returns between the first and last rebalance date are used
    filtered_returns = returns_df[(returns_df['DATE'] <= end_date) & (returns_df['DATE'] >= start_date)].reset_index(drop=True)
    rebal_dates = ptf_wgt['DATE'].unique()

    # Add rebal date column to returns df (latest rebalance date on or before the row's date)
    filtered_returns['REBAL_DATE'] = filtered_returns['DATE'].apply(lambda x: rebal_dates[rebal_dates <= x].max())
    joint_df = pd.merge(ptf_wgt.rename(columns={'DATE': 'REBAL_DATE'}), filtered_returns, on=['REBAL_DATE', 'ID'], how='left')

    # Dates with no returns are filled as 0
    joint_df = joint_df.fillna(0)

    final_df = joint_df[['DATE', 'REBAL_DATE', 'ID', "WGT", "RTN"]].copy()

    # Cumulate asset returns within each rebal date
    final_df['ASSET_CUM_RTN'] = final_df.groupby(['REBAL_DATE', 'ID'])['RTN'].transform(lambda x: (1+x).cumprod())

    # Calculate asset's MTM weight
    final_df['MTM_WGT'] = final_df['WGT'] * final_df['ASSET_CUM_RTN']
    final_df['DATE'] = pd.to_datetime(final_df['DATE'])

    if ls:
        # Renormalise long and short legs separately by their absolute weight sum
        final_df['LONG/SHORT'] = np.where(final_df['MTM_WGT'] > 0, 'LONG', 'SHORT')
        final_df['PTF_MTM_BASE'] = final_df.groupby(["DATE", 'LONG/SHORT'])['MTM_WGT'].transform(lambda x: abs(x).sum())
        final_df['ASSET_WEIGHTS'] = final_df['MTM_WGT'] / final_df['PTF_MTM_BASE']

    else:
        # Calculate portfolio MTM base weight
        final_df['PTF_MTM_BASE'] = final_df.groupby("DATE")['MTM_WGT'].transform('sum')

        # Calculate renormed asset weights
        final_df['ASSET_WEIGHTS'] = final_df['MTM_WGT'] / final_df['PTF_MTM_BASE']

    # Shift asset weights down by 1 to represent implied lag of 1 day
    final_df['ASSET_WEIGHTS_SHIFTED'] = final_df.groupby('ID')['ASSET_WEIGHTS'].shift(1)

    # Drop NaNs introduced from shifting
    final_df = final_df.dropna(axis=0)

    # Portfolio return per date = sum over assets of return * lagged weight
    weighted_rtn = final_df['RTN'] * final_df['ASSET_WEIGHTS_SHIFTED']
    ptf_rtn_df = weighted_rtn.groupby(final_df['DATE']).sum().reset_index(name="PTF_RTN")
    return ptf_rtn_df, final_df

def calc_annualised_returns(cumulative_returns:float, n, frequency):
    """Annualise a cumulative return observed over ``n`` periods.

    Computes ``(1 + cumulative_returns) ** (t / n) - 1`` where ``t`` is the
    number of periods per year implied by ``frequency``.

    Args:
        cumulative_returns: Cumulative return as a plain number, or a one-element
            Series/array (the first element is used).
        n (int): Number of periods the cumulative return was observed over.
        frequency (str): "D" (252 periods per year) or "M" (12 periods per year).

    Returns:
        The annualised return as a float.

    Raises:
        ValueError: If ``frequency`` is neither "D" nor "M".
    """
    # NOTE(review): "D" uses 252 here while calc_annualised_vol uses 365 for "D";
    # confirm which day count matches the return series being summarised.
    periods_per_year = {"D": 252, "M": 12}
    if frequency not in periods_per_year:
        raise ValueError(f"Unsupported frequency {frequency!r}; expected 'D' or 'M'")
    t = periods_per_year[frequency]

    # Accept both a scalar and a one-element Series/array
    cum_rtn = np.asarray(cumulative_returns, dtype=float).ravel()[0]
    return (cum_rtn + 1) ** (t / n) - 1

def calc_annualised_vol(ptf_rtn: pd.Series, frequency):
    """Annualise the sample standard deviation of periodic returns.

    Args:
        ptf_rtn: Periodic returns as a Series or a single-column DataFrame
            (for a DataFrame the first column's volatility is returned).
        frequency (str): "D" (scaled by sqrt(365)) or "M" (scaled by sqrt(12)).

    Returns:
        The annualised volatility as a float.

    Raises:
        ValueError: If ``frequency`` is neither "D" nor "M".
    """
    # "D" scales with 365: the Backtest class builds its return series on a
    # calendar-day index (freq='D').
    # NOTE(review): calc_annualised_returns uses 252 for "D" — confirm which day
    # count is intended so return and volatility are annualised consistently.
    periods_per_year = {"D": 365, "M": 12}
    if frequency not in periods_per_year:
        raise ValueError(f"Unsupported frequency {frequency!r}; expected 'D' or 'M'")
    n = periods_per_year[frequency]

    # std() yields a scalar for a Series and a Series for a DataFrame; flatten both
    per_period_vol = np.asarray(ptf_rtn.std(ddof=1), dtype=float).ravel()[0]
    return per_period_vol * np.sqrt(n)

def calc_max_dd(ptf_rtn: pd.Series):
    """Return the maximum drawdown of a series of periodic returns.

    Args:
        ptf_rtn: Periodic returns as a Series or a single-column DataFrame
            (for a DataFrame the first column's drawdown is returned).

    Returns:
        The most negative peak-to-trough decline of the cumulative return path
        (0 if the path never falls below its running maximum).
    """
    # Cumulative returns must be base 1
    ptf_cumulative_return = (1+ptf_rtn).cumprod()

    # Calculate running max
    running_max = ptf_cumulative_return.cummax()

    # Drawdown relative to the running peak
    drawdown = (ptf_cumulative_return-running_max)/running_max

    # min() yields a scalar for a Series and a Series for a DataFrame; flatten both
    max_drawdown = np.asarray(drawdown.min(), dtype=float).ravel()[0]
    return max_drawdown

def calc_ptf_summary(ptf_rtn):
    """Build a table of summary performance metrics for a return series.

    Args:
        ptf_rtn (pd.DataFrame): Single-column frame of periodic returns; the
            annualisation helpers are called with frequency "D".

    Returns:
        pd.DataFrame: Columns 'Metrics' and 'Values' with cumulative return,
        annualised return, annualised volatility, maximum drawdown, Sharpe
        ratio and Sortino ratio.
    """
    ptf_cum_rtn = (ptf_rtn+1).prod()-1
    ptf_ann_rtn = calc_annualised_returns(ptf_cum_rtn, len(ptf_rtn), 'D')
    ptf_ann_vol = calc_annualised_vol(ptf_rtn, "D")
    ptf_max_dd = calc_max_dd(ptf_rtn)
    sharpe_ratio = ptf_ann_rtn/ptf_ann_vol

    # Downside deviation = volatility of the negative returns only. It is
    # annualised with the same helper as the total volatility so the Sortino
    # ratio divides an annualised return by an annualised risk figure
    # (previously the per-period deviation was used, inflating the ratio).
    downside_sd = calc_annualised_vol(ptf_rtn[ptf_rtn < 0], "D")
    sortino_ratio = ptf_ann_rtn/downside_sd
    return pd.DataFrame({
        'Metrics': ['Cumulative Returns', 'Annualised Returns',
                    'Annualised Volatility', 'Maximum Drawdown',
                    'Sharpe Ratio', 'Sortino Ratio'],
        # .iloc[0]: positional access; plain [0] on a label-indexed Series is deprecated
        'Values': [ptf_cum_rtn.iloc[0], ptf_ann_rtn, ptf_ann_vol, ptf_max_dd,
                   sharpe_ratio, sortino_ratio]
    })


In [None]:
class Backtest():
    """Backtest a return-prediction model as a long/short strategy.

    On construction the model is fitted on the training data, its validation
    predictions are turned into daily ranked signals, a dollar-neutral
    long/short portfolio is built from those ranks, and its performance is
    compared with an equal-weight portfolio of the same stocks.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        rtn_df (pd.DataFrame): Long form returns with columns 'DATE', 'ID', 'RTN'.
        OH_X_train: Encoded training features passed to ``model.fit``.
        y_train: Training target passed to ``model.fit``.
        OH_X_valid: Encoded validation features passed to ``model.predict``.
        X_valid (pd.DataFrame): Un-encoded validation rows (needs 'date' and
            'stock') in the same row order as ``OH_X_valid``.

    Attributes set by ``__init__``:
        sig_df: Output of ``gen_signals_df`` (extended by ``constr_ls_ptf_wgt``).
        ls_ptf_wgt: Output of ``constr_ls_ptf_wgt``.
        cum_rtn_fig, summary_metrics: Output of ``gen_backtest_analytics``.
    """

    def __init__(self, model, rtn_df, OH_X_train, y_train, OH_X_valid, X_valid):
        self.model = model
        self.rtn_df = rtn_df
        self.OH_X_train = OH_X_train
        self.OH_X_valid = OH_X_valid
        self.y_train = y_train
        self.X_valid = X_valid

        # Generate signal dataframe
        self.sig_df =  self.gen_signals_df()

        # Generate long short portfolio
        self.ls_ptf_wgt = self.constr_ls_ptf_wgt()

        # Generate backtest analytics
        self.cum_rtn_fig, self.summary_metrics = self.gen_backtest_analytics()

    def gen_signals_df(self):
        """Fit the model and return a long form frame of daily ranked signals.

        Returns:
            pd.DataFrame: Columns 'DATE', 'ID', 'SIGNAL' (predicted return,
            forward-filled onto a calendar-day index) and 'RANKED_SIGNAL'
            (average rank of the signal within each date), sorted by date and id.
        """
        self.model.fit(self.OH_X_train, self.y_train)
        preds = self.model.predict(self.OH_X_valid)
        trading_df = self.X_valid.copy()

        # Create column of predicted returns
        trading_df['predicted_rtn'] = preds

        # Count number of stocks per date
        trading_df['num_stocks_by_date'] = trading_df.groupby('date')['stock'].transform('size')

        # Select data where there were at least 10 stocks for each date
        trading_df_filtered = trading_df[trading_df['num_stocks_by_date'] >= 10]

        trading_df_filtered = trading_df_filtered[['date', 'stock', 'predicted_rtn']].reset_index(drop=True)

        # Create daily index and forward fill
        start_date = trading_df_filtered['date'].min()
        end_date = trading_df_filtered['date'].max()

        # Resample to daily
        daily_index = pd.date_range(start=start_date, end=end_date, freq='D')

        trading_df_filtered_wide = trading_df_filtered.pivot(index='date', values='predicted_rtn', columns='stock')
        # .ffill() replaces the deprecated fillna(method='ffill')
        trading_df_filtered_wide = trading_df_filtered_wide.reindex(daily_index).ffill()

        final_signal_df = (trading_df_filtered_wide
                        .reset_index(names='date')
                        .rename_axis(None, axis=1)
                        .melt(id_vars='date', var_name='stock', value_name='signal')
                        .dropna(axis=0).reset_index(drop=True))
        final_signal_df = final_signal_df.rename(columns={
            'date': 'DATE',
            'stock': 'ID',
            'signal': 'SIGNAL'
        })
        # Average rank within each date (same tie handling as scipy.stats.rankdata's
        # default), done with pandas so no scipy submodule needs to be loaded.
        final_signal_df['RANKED_SIGNAL'] = final_signal_df.groupby('DATE')['SIGNAL'].rank(method='average')
        final_signal_df = final_signal_df.sort_values(['DATE', 'ID']).reset_index(drop=True)
        return final_signal_df

    def constr_ls_ptf_wgt(self):
        """Turn ranked signals into dollar-neutral long/short weights.

        Adds 'WGT', 'DIRECTION' and 'RENORM_WGT' columns to ``self.sig_df``.

        Returns:
            pd.DataFrame: Columns 'DATE', 'ID', 'WGT', where the long weights and
            the short weights of each date are each scaled by the absolute sum
            of their leg.
        """
        sig = self.sig_df

        # Long short weights calculated as the distance from the median signal for each date
        date_median = sig.groupby(['DATE'])['RANKED_SIGNAL'].transform('median')
        sig['WGT'] = sig['RANKED_SIGNAL'] - date_median

        # Renormalise weights to $1 long $1 short – dollar neutral strategy
        sig['DIRECTION'] = np.where(sig['WGT']>=0, 'LONG', 'SHORT')
        leg_total = sig.groupby(['DATE', 'DIRECTION'])['WGT'].transform('sum')
        sig['RENORM_WGT'] = sig['WGT'] / leg_total.abs()
        return sig[['DATE', 'ID', 'RENORM_WGT']].rename(columns={'RENORM_WGT': 'WGT'})

    def _calc_ptf_returns(self):
        """Compute daily and cumulative returns of the long/short and equal-weight portfolios.

        Returns:
            pd.DataFrame: Long form frame with columns 'DATE', 'PORT'
            ('LS_WGT_PORT' or 'EQ_WGT_PORT'), 'PTF_RTN' and 'CUM_RTN'.
        """
        # Merge weights on returns – left join
        ptf_df = pd.merge(self.ls_ptf_wgt, self.rtn_df, on=['DATE', 'ID'], how='left')

        # On dates without returns just set returns to 0 (and undefined weights to 0)
        ptf_df[['WGT', 'RTN']] = ptf_df[['WGT', 'RTN']].fillna(0)

        # Rows must be in date order within each stock for the per-stock lag below
        ptf_df = ptf_df.sort_values(['DATE', 'ID']).reset_index(drop=True)

        # Shift weights by 1 to imply lag. The shift is done per stock: a plain
        # shift on this long form frame would hand each stock the weight of the
        # previous ROW, i.e. of a different stock.
        ptf_df['LS_WGT_SHIFTED'] = ptf_df.groupby('ID')['WGT'].shift(1)

        # Calculate long short weighted returns
        ptf_df['LS_WGT_RTN'] = ptf_df['LS_WGT_SHIFTED'] * ptf_df['RTN']

        # Derive equal weighted portfolio weights
        ptf_df['EQ_WGT'] = 1 / ptf_df.groupby('DATE')['ID'].transform('size')

        # Shift EQ_WGT (per stock, for the same reason as above)
        ptf_df['EQ_WGT_SHIFTED'] = ptf_df.groupby('ID')['EQ_WGT'].shift(1)
        ptf_df['EQ_WGT_RTN'] = ptf_df['EQ_WGT_SHIFTED'] * ptf_df['RTN']

        # Drop NaN values from shifting
        ptf_df = ptf_df.dropna()

        # Melt to long form
        ptf_df_port = (ptf_df.melt(id_vars='DATE',
                                   value_vars=['LS_WGT_RTN', 'EQ_WGT_RTN'],
                                   var_name='PORT', value_name='PTF_RTN'))

        # Group by portfolio and calculate portfolio returns
        ptf_rtn_combined = ptf_df_port.groupby(['DATE', 'PORT'])['PTF_RTN'].sum().reset_index()

        # Cumulate portfolio returns
        ptf_rtn_combined['CUM_RTN'] = ptf_rtn_combined.groupby(['PORT'])['PTF_RTN'].transform(lambda x: (1+x).cumprod())

        # Rename
        ptf_rtn_combined['PORT'] = (ptf_rtn_combined['PORT']
                                    .replace({'EQ_WGT_RTN': 'EQ_WGT_PORT',
                                              'LS_WGT_RTN': 'LS_WGT_PORT'}))
        return ptf_rtn_combined

    def gen_backtest_analytics(self):
        """Build the cumulative-return figure and the summary metrics table.

        Returns:
            tuple: (plotly figure of cumulative returns per portfolio,
            DataFrame of summary metrics with one column per portfolio).
        """
        ptf_rtn_combined = self._calc_ptf_returns()

        # Generate cumulative returns figure
        fig = px.line(ptf_rtn_combined, x='DATE', y='CUM_RTN', color='PORT')
        fig.update_layout(hovermode='x unified')

        # Create summary metrics table
        ls_ptf_summary = calc_ptf_summary(ptf_rtn_combined.loc[ptf_rtn_combined['PORT']=='LS_WGT_PORT', ['DATE', 'PTF_RTN']].set_index('DATE'))
        eq_ptf_summary = calc_ptf_summary(ptf_rtn_combined.loc[ptf_rtn_combined['PORT']=='EQ_WGT_PORT', ['DATE', 'PTF_RTN']].set_index('DATE'))
        summary_metrics = pd.concat([ls_ptf_summary.set_index('Metrics'), eq_ptf_summary.set_index('Metrics')], axis=1)
        summary_metrics.columns = ['Signals-Weighted Long Short Portfolio', 'Equal Weight Portfolio']
        return fig, summary_metrics

    def display_results(self):
        """Display the summary metrics table and show the cumulative-return figure."""
        display(self.summary_metrics)
        self.cum_rtn_fig.show()

# Simple Long Short Strategy

In [None]:
rtn_df = returns_df_melt.rename(columns={'date': 'DATE', 'stock': 'ID', 'daily_returns': 'RTN'})
rtn_df.head()

In [None]:
linear_model_backtest = Backtest(linear_model, rtn_df, OH_X_train, y_train, OH_X_valid, X_valid)

In [None]:
linear_model_backtest.display_results()

# Long only Strategy

# MPT Optimised Strategy