# Trade
- Swing trading/long-term trading
    - Exposed to overnight risk (close price previous day might not equal to open 
    price next day if there are major events happening between market closure and
    market open).
- Assume I already have which day to long, which day to short
- Conduct post-trade analysis
- Refine risk management techniques (Comparing starting on 2023-12-22)
    - Boeing: Main character in the events
        - Stock -18.61%

    - Direct competitors
        - Airbus (EPA: AIR): Boeing's primary competitor in commercial aircraft manufacturing
            - Stock +5.93%
        - Lockhead Martin (LMT): More focused on defense but also compete in aerospace
    - Suppliers
        - General Electric (GE): Supplies engines for Boeing aircraft
            - Have presence in aviation, healthcare, power, renewable energy
            - Doesn't seem to be affected
            - Can also supply engines to other aircraft manufacturers (effect on
            stock price is complicated)
    - Customers
        - Alaska Airlines (ALK): Main airline involved
            - Stock -11.73%
        - American Airlines (UAL - NasdaqGS)
            - Stock -4.91%
        - Delta Air Lines (DAL)
            - -11.73%
        - Southwest Airlines
- Trading timing (NYSE) vs news timing
    - The news was updated on January 18, 2024, at 4:36 AM GMT+8, which translates to January 17, 2024, at 3:36 PM Eastern Time (since GMT+8 is 13 hours ahead of Eastern Time). Since the NYSE closes at 4:00 PM ET, this news would have come out just before the market close.
    - Difference stock exchanges might operate at different timings also
- No training and validation - straight go to validation (backtesting)


# Set Up

In [1]:
import os
import ast
import requests
import logging

import yfinance as yf
import pandas as pd
import numpy as np

import finnhub
from dotenv import load_dotenv
from pathlib import Path    
import sys
import time
import itertools

import scipy.stats as stats
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

import matplotlib.pyplot as plt
import typing

sys.path.append('../') # Change the python path at runtime

# Self-created modules
from src.utils import path as path_yq
from src.backtesting import Backtest, Strategy


In [5]:
np.random.seed(7)
load_dotenv()
POLYGON_API_KEY = os.environ.get('POLYGON_API_KEY')

BT_START_DATE = '2023-11-01'
BT_START_STR = '20231101'
BT_END_DATE = '2024-01-31'
BT_END_STR = '20240131'

cur_dir = Path.cwd()
root_dir = path_yq.get_root_dir(cur_dir)

logging.basicConfig(filename=Path.joinpath(root_dir, 'logs', 'trading_system.log'),
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    level=logging.DEBUG)

stm_techs = ['stc', 'blob', 'sid', 'bert', 'finbert']
contents = ['cln_hdl', 'cln_smr', 'cln_news']
lemmas = ['', 'lemma']

# Fetch Tick Data

## Polygon

Polygon docs: https://polygon.io/docs/stocks/get_v2_aggs_ticker__stocksticker__range__multiplier___timespan___from___to

- FIXME: The timings include those in pre-market hours
- The timestamp is in ms, not sec

Similar to download data codes
TODO: Assumption: assume other stocks share the same timezone

- The data is incomplete (not every minute)
24265	47739.0	217.6996	217.6800	217.7000	217.7000	217.6800	1705096560000	21	2024-01-12 21:56:00
24266	171.0	217.5423	217.5000	217.5000	217.5000	217.5000	1705096800000	5	2024-01-12 22:00:00

In [None]:
url = f"https://api.polygon.io/v2/aggs/ticker/BA/range/1/minute/{BT_START_DATE}/{BT_END_DATE}?adjusted=true&sort=asc&limit=50000&apiKey={POLYGON_API_KEY}"

# Make the GET request
resp = requests.get(url)

In [None]:
# Check if the request was successful
if resp.status_code == 200:
    # Convert the 'results' list to a DataFrame
    df = pd.DataFrame(resp.json().get('results'))

    # Rename the columns to more descriptive names
    column_mapping = {
        "v": "Volume",
        "vw": "VWAP",
        "o": "Open",
        "c": "Close",
        "h": "High",
        "l": "Low",
        "t": "Timestamp",
        "n": "Transactions"
        # Add more mappings as necessary
    }

    df.rename(columns=column_mapping, inplace=True)

    # Optionally, convert the 'Timestamp' column from Unix milliseconds to a datetime format
    df['Datetime'] = pd.to_datetime(df['Timestamp'], unit='ms')

    # Display the updated DataFrame
    print(df)
else:
    # Handle errors (e.g., logging, raising an exception)
    print(f"Error fetching data: {resp.status_code}, {resp.text}")



In [None]:
# Boeing open high low close data
raw_path = Path.joinpath(root_dir, 'data', 'raw', f'BA_OHLC_{BT_START_STR}_{BT_END_STR}.csv')
df.to_csv(raw_path, index=False)

## Yahoo (Outdated)

In [None]:
# Define the ticker list
ticker_list = ['BA']

# Fetch the data
dl_data = yf.download(ticker_list, start=BT_START_DATE, end=BT_END_DATE) # Auto adjust is false

dl_data = pd.DataFrame(dl_data)
data = dl_data.drop(columns=['Close'], axis=1)
data = data.rename(columns={'Adj Close': 'Close'})
display(data.isna().sum(axis=0)) # Axis=0: along the indices, row-wise opertaion
# Gives the sum for rows in a column
data.index = pd.to_datetime(data.index)
data


In [None]:
dates = pd.DataFrame(data.index.strftime('%Y-%m-%d'))
# dates.to_csv("trading_dates.csv", index=False)

In [None]:
# After performing sentiment
stm_path = root_dir.joinpath('data', 'proc', 'boeing_stm_20231101_to_20240131.csv')
news = pd.read_csv(stm_path, index_col=False)
news2 = news[['datetime2', 'news_pol_blob']]
news2

news2.plot()
# # data['Sentiment'] = np.random.random(len(data)) * 2 - 1
# display(len(data))
# sentiment = np.array([0, -1, -0.8, 0, 0, 0]) # Put -1 on 01-05 (Before the whole thing Boeing case appeared after market closed on 01-05 to prepare to trade for 01-08)
# data['Sentiment'] = sentiment
# display(data.tail(20))

In [None]:
# Ensure datetime2 in news2 is in pandas datetime format
news2['datetime2'] = pd.to_datetime(news2['datetime2'])

# Assuming data.index is already a DatetimeIndex, no need to convert it again
# Just ensure it's sorted
data.sort_index(inplace=True)

# Function to find the closest previous date in data for each date in news2
def find_closest_previous_date(target_date, date_index):
    previous_dates = date_index[date_index <= target_date]
    if not previous_dates.empty:
        return previous_dates.max()
    else:
        return pd.NaT  # Return Not-A-Time (NaT) if no previous date is found

# Apply the function to each date in news2['datetime2']
closest_dates = news2['datetime2'].apply(lambda x: find_closest_previous_date(x, data.index))

# Add this closest date information to news2
news2['closest_date'] = closest_dates
news2

In [None]:
# TODO: Need to think of how to combine the data (might have many neutral etc.)
# as_index will retain closest_date
news3 = news2.groupby('closest_date', as_index=False)['news_pol_blob'].mean().reset_index(drop=True) 
news3

In [None]:
merged = pd.merge(data, news3, left_on='Date', right_on='closest_date', how='left')
merged

In [None]:
# Clean for 2 lines only
merged2 = merged.dropna().reset_index(drop=True)
merged2

# Merge data

In [6]:

def convert_data(row):
    """
    A function from sentiment.ipynb.
    """
    try:
        # First, try to evaluate the row as a list
        evaluated = ast.literal_eval(row)
        # If the result is a list, return it directly
        if isinstance(evaluated, list):
            return evaluated
        # If not, it's already the correct type (int, float, etc.)
        return evaluated
    except ValueError:
        # Handle the case where the row is not a valid Python literal
        # This could be a string that should not be converted
        return row
    except SyntaxError:
        # Handle syntax errors which might occur if ast.literal_eval can't parse the string
        return row
    except Exception as e:
        print(f'Exception: {e}')
        return row

score_path = root_dir.joinpath('data', 'proc', f'BA_score_{BT_START_STR}_{BT_END_STR}.csv') 
df9 = pd.read_csv(score_path, index_col=False)

# Apply the conversion function to each specified column
for col in df9.columns:
    df9[col] = df9[col].apply(convert_data)
df9['datetime2'] = pd.to_datetime(df9['datetime2'])

# print(df8.equals(df7))
# print(type(df8['datetime2'][0]))

In [7]:
# Fetch and sort tick data
# Boeing open high low close data
raw_path = Path.joinpath(root_dir, 'data', 'raw', f'BA_OHLC_{BT_START_STR}_{BT_END_STR}.csv')
tick = pd.read_csv(raw_path, index_col=False)
tick['Datetime'] = pd.to_datetime(tick['Datetime'])
tick = tick.sort_values(by='Datetime')

# Make sure the tick data is within backtest date range
tick = tick[(tick['Datetime'] >= BT_START_DATE) & (tick['Datetime'] <= BT_END_DATE)]
tick

Unnamed: 0,Volume,VWAP,Open,Close,High,Low,Timestamp,Transactions,Datetime
0,991.0,186.6991,186.6200,186.8000,186.8000,186.6200,1698829200000,31,2023-11-01 09:00:00
1,410.0,186.8187,186.8200,186.8200,186.8200,186.8200,1698829560000,5,2023-11-01 09:06:00
2,1289.0,187.6589,187.5900,187.7000,187.7000,187.5900,1698830040000,29,2023-11-01 09:14:00
3,535.0,188.1637,187.8400,187.9600,187.9600,187.8400,1698830100000,34,2023-11-01 09:15:00
4,442.0,188.8297,188.7900,188.7900,188.7900,188.7900,1698830160000,27,2023-11-01 09:16:00
...,...,...,...,...,...,...,...,...,...
31393,1009.0,199.7495,199.7500,199.7500,199.7500,199.7500,1706658600000,7,2024-01-30 23:50:00
31394,250.0,199.6644,199.6500,199.6500,199.6500,199.6500,1706658720000,4,2024-01-30 23:52:00
31395,315.0,199.7283,199.7369,199.7369,199.7369,199.7369,1706658960000,11,2024-01-30 23:56:00
31396,503.0,199.7896,199.7999,199.7999,199.7999,199.7999,1706659080000,6,2024-01-30 23:58:00


In [8]:
# Assuming data.index is already a DatetimeIndex, no need to convert it again
df9['datetime2'] = pd.to_datetime(df9['datetime2'])
tick['Datetime'] = pd.to_datetime(tick['Datetime'])

# Make sure to sort first
df9 = df9.sort_values(by='datetime2')
tick = tick.sort_values(by='Datetime')

# Function to find the closest previous date in tick for each date in news2
def find_closest_prev_date(target_date, date_col):
    # The information gotten at this time point can only be used in the next time point
    prev_dates = date_col[date_col <= target_date] 
    if not prev_dates.empty:
        return prev_dates.max()
    else:
        # Can happen when the news is earlier than all the tick data
        print(f"WARNING. Previous date not found for {target_date}")
        print(date_col)
        return pd.NaT  # Return Not-A-Time (NaT) if no previous date is found

# Apply the function to each date in news2['datetime2']
closest_dates = df9['datetime2'].apply(lambda x: find_closest_prev_date(x, tick['Datetime']))

# Add this closest date information to news2
df9['closest_date'] = closest_dates
df9.sort_values(by='datetime2')
df9.reset_index(inplace=True, drop=True)
df9

0       2023-11-01 09:00:00
1       2023-11-01 09:06:00
2       2023-11-01 09:14:00
3       2023-11-01 09:15:00
4       2023-11-01 09:16:00
                ...        
31393   2024-01-30 23:50:00
31394   2024-01-30 23:52:00
31395   2024-01-30 23:56:00
31396   2024-01-30 23:58:00
31397   2024-01-31 00:00:00
Name: Datetime, Length: 31398, dtype: datetime64[ns]


Unnamed: 0,id,datetime2,cln_hdl,cln_smr,cln_news,cln_hdl_lemma,cln_smr_lemma,cln_news_lemma,cln_hdl_pol_blob,cln_smr_pol_blob,...,cln_hdl_lemma_pol_bert_score,cln_smr_lemma_pol_bert_score,cln_news_lemma_pol_bert_score,cln_hdl_pol_finbert_score,cln_smr_pol_finbert_score,cln_news_pol_finbert_score,cln_hdl_lemma_pol_finbert_score,cln_smr_lemma_pol_finbert_score,cln_news_lemma_pol_finbert_score,closest_date
0,123559928,2023-11-01 05:39:51,"[Ford, GM bumped to buy Boeing gets 2 upgrades...",[Goldman Sachs upgraded Simon Property Group (...,[Investing.com — Here is your Pro Recap of the...,"[Ford , GM bumped buy Boeing get 2 upgrade : 4...",[Goldman Sachs upgraded Simon Property Group (...,[Investing.com — Pro Recap biggest analyst pic...,[0.0],[0.0],...,0.727060,-0.689266,-0.340141,0.894530,0.549459,0.360264,0.641842,0.836147,0.400344,NaT
1,123544219,2023-11-01 11:39:06,[UPDATE 2-Spirit Aero cuts 737 fuselage delive...,[Spirit AeroSystems on Wednesday announced $10...,"[(Adjusts shares in paragraph 5, adds Airbus c...",[UPDATE 2-Spirit Aero cut 737 fuselage deliver...,[Spirit AeroSystems Wednesday announced $ 101 ...,"[( Adjusts share paragraph 5 , add Airbus comm...",[0.0],"[0.0, 0.0625, 0.0]",...,0.142053,-0.793133,-0.597804,-0.943793,-0.335737,0.295100,-0.900221,-0.361670,0.448931,2023-11-01 11:39:00
2,123566505,2023-11-01 13:30:29,"[Compared to Estimates, Spirit Aerosystems (SP...",[Although the revenue and EPS for Spirit Aeros...,"[For the quarter ended September 2023, Spirit ...","[Compared Estimates , Spirit Aerosystems ( SPR...",[Although revenue EPS Spirit Aerosystems ( SPR...,"[quarter ended September 2023 , Spirit Aerosys...",[0.0],[0.15],...,0.174489,0.000000,-0.215470,0.000000,0.000000,0.354270,0.000000,0.000000,0.530006,2023-11-01 13:30:00
3,123545059,2023-11-01 14:21:57,[Morning Brew: AMDs Q4 Guidance Weighs on Stoc...,[Advanced Micro Devices (NASDAQ:AMD) stock was...,[Advanced Micro Devices (NASDAQ:AMD) stock was...,[Morning Brew : AMDs Q4 Guidance Weighs Stock ...,[Advanced Micro Devices ( NASDAQ : AMD ) stock...,[Advanced Micro Devices ( NASDAQ : AMD ) stock...,[-0.3],"[0.1527777777777778, 0.22727272727272727, -0.06]",...,-0.744470,-0.263412,-0.343342,-0.958961,-0.322292,-0.101700,-0.852977,-0.267872,0.130774,2023-11-01 14:21:00
4,123567205,2023-11-01 22:24:31,[UPDATE 1-US Air Force blows up Minuteman III ...,[The U.S. Air Force said on Wednesday it had b...,[Nov 1 (Reuters) - The U.S. Air Force said on ...,[UPDATE 1-US Air Force blow Minuteman III test...,[U.S. Air Force said Wednesday blown Minuteman...,[Nov 1 ( Reuters ) - U.S. Air Force said Wedne...,[0.0],"[-0.4, -0.25, 0.0]",...,-0.733265,-0.253360,0.317687,-0.892588,-0.872526,-0.066805,0.000000,0.000000,-0.047870,2023-11-01 22:01:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
546,125415682,2024-01-30 21:10:56,[Boeing Seen Narrowing Q4 Loss Amid 737 Max Gr...,[Dow Jones giant Boeing reports Q4 results ear...,[Dow Jones giant Boeing reports Q4 results ear...,[Boeing Seen Narrowing Q4 Loss Amid 737 Max Gr...,[Dow Jones giant Boeing report Q4 result early...,[Dow Jones giant Boeing report Q4 result early...,[0.0],"[0.05, 0.1]",...,-0.737527,-0.239781,-0.239781,-0.965217,-0.952749,-0.952749,-0.965217,-0.944420,-0.944420,2024-01-30 21:09:00
547,125415680,2024-01-30 22:23:48,"[Hawaiian Airlines ekes out Q4 revenue beat, e...",[Hawaiian Holdings (HA) — the parent company o...,[Hawaiian Holdings (HA) — the parent company o...,"[Hawaiian Airlines ekes Q4 revenue beat , earn...",[Hawaiian Holdings ( HA ) — parent company Haw...,[Hawaiian Holdings ( HA ) — parent company Haw...,[0.0],"[-0.06666666666666667, -0.15555555555555559, 0...",...,-0.776986,-0.258785,-0.224460,-0.381478,-0.922368,-0.055458,-0.667220,-0.905073,0.281039,2024-01-30 22:21:00
548,125415679,2024-01-30 22:39:00,"[Boeings Earnings Are Coming., Investors Are W...",[The list of points to watch when the jet make...,[The number of watch items in Boeings fourth-q...,"[Boeings Earnings Coming ., Investors Watching...",[list point watch jet maker report latest resu...,[number watch item Boeings fourth-quarter repo...,"[0.0, 0.0]",[0.225],...,0.698278,0.720234,0.151111,0.856519,0.000000,0.000000,0.730035,0.000000,0.000000,2024-01-30 22:38:00
549,125417521,2024-01-30 23:03:43,[Boeing was once known for safety and engineer...,[Part of the fuselage blowing off shortly afte...,[Part of the fuselage blowing off shortly afte...,"[Boeing known safety engineering ., critic say...","[Part fuselage blowing shortly takeoff , leavi...","[Part fuselage blowing shortly takeoff , leavi...","[0.0, 0.0]","[0.0, -0.1587179487179487, 0.0, -0.125]",...,-0.262539,-0.442463,-0.278902,0.000000,-0.928406,-0.508453,0.000000,-0.912366,-0.404513,2024-01-30 23:03:00


In [None]:
# Find the difference between datetime2 and closest_date to reduce overnight trading risks,
# or risks caused by lack of tick data
df9['datetime_diff'] = df9['datetime2'] - df9['closest_date']
# Check if there is any negative difference
print(np.sum(df9['datetime_diff'] < pd.Timedelta(0)))

mask = df9['datetime_diff'] >= pd.Timedelta(5, unit='m')
df9 = df9[~mask]

In [None]:
# Drop the NaT in find previous closest dates
def drop_na(df):
    # Drop all the news_content with na
    print(f"Before dropping na: {df.isna().sum().sum()}")
    df1 = df.dropna()
    df1.reset_index(inplace=True, drop=True)
    print(f"After dropping na: {df.isna().sum().sum()}")
    return df1



In [None]:
drop_na(df9).head()

Check whether there are 30 columns of scores

In [None]:
df9.columns

## Merge Scores between Trading Periods

In [None]:
df_list = []

for stm_tech in stm_techs:
    for lemma in lemmas:
        for content in contents:
            if lemma:
                col_name = f'{content}_{lemma}_pol_{stm_tech}_score'

            else:
                col_name = f'{content}_pol_{stm_tech}_score'
            tmp = df9.groupby('closest_date', as_index=False)[col_name].mean().reset_index(drop=True) 
            df_list.append(tmp)
            # display(tmp)
# print(df_list)

# # Assumes df_list has at least two elements
# merged = df_list[0]
# for i in range(1, len(df_list)):
#     merged = pd.merge(left=merged, right=df_list[i], on='closest_date', how='inner')
# merged

from functools import reduce
# A simpler implementation
merged = reduce(lambda left, right: pd.merge(left, right, on='closest_date', how='inner'), df_list)
merged

## Merge Tick Data and Scores

In [None]:
merged2 = pd.merge(left=tick, right=merged, left_on='Datetime', right_on='closest_date', how='left')
merged2.reset_index(inplace=True, drop=True)

In [None]:
merge_path = root_dir.joinpath('data', 'proc', f'BA_merged_{BT_START_STR}_{BT_END_STR}.csv') # TODO: Change dates
merged2.to_csv(merge_path, index=False)

## Simple Post-Trade Analysis

In [None]:
merged2[merged2.index == 1873]

In [None]:
# Choose col_name to describe
merged2[col_name].describe()

In [None]:
# Post-trade analysis
merged2[merged2['Datetime'] >= pd.to_datetime('2023-11-01 11:39:00')]



# Backtesting
- Pros
    - Test single strategy
    - Have optimizer, graphs
- Cons
    - Cannot trade multiple assets FIXME: not applicable to portfolio
    - Does not trade fractional shares
https://kernc.github.io/backtesting.py/#example


- Other backtesting framework: backtrader, zipline - both can do multi-asset trading
- Backtrader works with Pandas DataFrames, CSV, and real-time data feeds from Interactive Brokers, Oanda, and Visual Chart. 
- 2% rule: https://www.investopedia.com/terms/t/two-percent-rule.asp#:~:text=What%20Is%20the%202%25%20Rule,capital%20on%20any%20single%20trade.
- Try to have less than 10% of drawdown: https://www.quora.com/How-do-I-use-the-never-risk-more-than-2-rule-in-Forex-trading


Hypothesis
- Takes in a df from start to end, with all the ticker data (including those NA for sentiment)
- Enters trade at 549 (My information should backfill)
548	308.0	247.7006	247.7000	247.7000	247.7000	247.7000	1704291540000	8	2024-01-03 14:19:00	2024-01-03 14:19:00	0.156808
549	264.0	247.6105	247.6000	247.6000	247.6000	247.6000	1704291780000	9	2024-01-03 14:23:00	NaT	NaN
550	1157.0	247.5724	247.6000	247.5031	247.6001	247.5031	1704291840000	49	2024-01-03 14:24:00	NaT	NaN
- I can compare the results between lemmatization or not, and fix other variables constant
- I can compare the results between different content and fix others constant



## Strategy

In [13]:
TP_PCT = 0.02
SL_PCT = 0.006
RISK_PER_TRADE = 0.5


class SimpleStmStrat(Strategy):
    """
    Use a proportional amount of cash to trade with the sentiment score indicator.
    """
    # Strategy class should define parameters as class variables before they can be optimized or run with.
    col = None

    # Add the parameters in init
    def __init__(self, broker, data, **kwargs):
        super().__init__(broker, data, **kwargs)  # Make sure the parent class can handle **kwargs appropriately
        self.col = kwargs.get('col', self.col)

    # Initialize additional indicators here if needed
    def init(self):
        # self.trade_size = 40 # This times the next open price cannot exceed equity
        self.tp_pct = TP_PCT
        self.sl_pct = SL_PCT
        self.risk_per_trade = RISK_PER_TRADE # Maximum of the portfolio on one trade

    def next(self):
        cur_stm = self.data[self.col][-1]
        # print(self.data['closest_date'][-1])
        cur_price = self.data['Close'][-1]

        # print(f"-----{self.data['Datetime'][-1]}-----")
        # trade_size = (0.5 * (abs(cur_stm) ** 2) + 0.5) * self.risk_per_trade

        # Can be around 15

        # Decision is made on the time point before entry 
        # Entry Bar is the index, entry price is the open price of the next time point
        # Not sure why exit price is not the open price at the exitBar?? Should be because of tp and sl

        trade_size = self.risk_per_trade 
        if (cur_stm > 0.2): # Many losses if I don't take
            # print(f"Buy: {self.data['closest_date'][-1]}")
            self.buy(size=trade_size, sl=(1 - self.sl_pct) * cur_price, tp=(1 + self.tp_pct) * cur_price)
            # If size is a value between 0 and 1, it is interpreted as a fraction of current available liquidity (cash plus Position.pl minus used margin). A value greater than or equal to 1 indicates an absolute number of units.

        elif cur_stm < -0.2:
            # print(f"Sell: {self.data['closest_date'][-1]}")
            self.sell(size=trade_size, sl=(1 + self.sl_pct) * cur_price, tp=(1 - self.tp_pct) * cur_price)
        else:
            pass
        # print(cur_stm)

class RandomStrat(Strategy):
    """
    A strategy that randomly trades for ttl_trade times, and if it trades,
    the probability of buy and sell is 0.5.
    """
    ttl_trade = 10
    # Add the parameters in init
    def __init__(self, broker, data, **kwargs):
        super().__init__(broker, data, **kwargs)  # Make sure the parent class can handle **kwargs appropriately
        self.ttl_trade = kwargs.get('ttl_trade', self.ttl_trade)
        self.trade_prob = self.ttl_trade / len(self.data)
        # print(f"Total number of data:{len(self.data)}")
        
    # Initialize additional indicators here if needed
    def init(self):
        self.tp_pct = TP_PCT
        self.sl_pct = SL_PCT
        self.risk_per_trade = RISK_PER_TRADE # Maximum of the portfolio on one trade

    def next(self):
        trade_size = self.risk_per_trade 
        cur_price = self.data['Close'][-1]
        trade_flag = np.random.rand() < self.trade_prob
        buy_flag = np.random.rand() > 0.5
        if (trade_flag and buy_flag): # Many losses if I don't take
            # print(f"Buy: {self.data.index[-1]}")
            self.buy(size=trade_size, sl=(1 - self.sl_pct) * cur_price, tp=(1 + self.tp_pct) * cur_price)
            # If size is a value between 0 and 1, it is interpreted as a fraction of current available liquidity (cash plus Position.pl minus used margin). A value greater than or equal to 1 indicates an absolute number of units.

        elif (trade_flag):
            # print(f"Sell: {self.data.index[-1]}")
            self.sell(size=trade_size, sl=(1 + self.sl_pct) * cur_price, tp=(1 - self.tp_pct) * cur_price)
        else:
            pass

In [14]:

merge_path = root_dir.joinpath('data', 'proc', f'BA_merged_{BT_START_STR}_{BT_END_STR}.csv') 
merged2 = pd.read_csv(merge_path, index_col=False)


# TODO: Split into 3 months to analyse
convert_data(merged2)
merged2['Datetime'] = pd.to_datetime(merged2['Datetime'])

# merged2 = merged2[(merged2['Datetime'] >= pd.to_datetime('2023-11-01')) & (merged2['Datetime'] < pd.to_datetime('2023-12-01'))]
# merged2 = merged2[(merged2['Datetime'] >= pd.to_datetime('2023-12-01')) & (merged2['Datetime'] < pd.to_datetime('2024-01-01'))]
# merged2 = merged2[(merged2['Datetime'] >= pd.to_datetime('2024-01-01')) & (merged2['Datetime'] < pd.to_datetime('2024-02-01'))]
merged2 = merged2[(merged2['Datetime'] >= pd.to_datetime('2023-11-01')) & (merged2['Datetime'] < pd.to_datetime('2024-02-01'))]

# TODO: Adjust interest rate based on backtesting period

BACKTEST_PERIOD_ANN = 3 / 12 # 1 month
BACKTEST_FREQUENCY_ANN = 1 / BACKTEST_PERIOD_ANN


## Run Strat for Different Cases

In [15]:
tar_dir = root_dir.joinpath('outputs', 'trade-plots')
tar_dir.mkdir(parents=True, exist_ok=True)
df_list = []

for stm_tech in stm_techs:
    for lemma in lemmas:
        for content in contents:
            results_dict = {
                'stm_tech': stm_tech,
                'lemma': 'No',
                'content': content
            }
            if lemma:
                col_name = f'{content}_{lemma}_pol_{stm_tech}_score'
                filename = str(tar_dir.joinpath(f"{content}_lemma_{stm_tech}.html"))
                results_dict[lemma] = 'Yes'
            else:
                col_name = f'{content}_pol_{stm_tech}_score'
                filename = str(tar_dir.joinpath(f"{content}_no_lemma_{stm_tech}.html"))

            # Running the backtest
            bt = Backtest(
                data=merged2, 
                strategy=SimpleStmStrat, 
                        cash=10000, 
                        margin=1,
                        commission=.0,
                        trade_on_close=False,
                        hedging=True
                        )
            
            results = bt.run(col=col_name)

            # display(results)
            # print(type(returns))
            # display(returns)

            bt.plot(filename=filename,
                    results=results,
                    plot_return=True,
                    open_browser=False)
            
            results_dict.update(results.to_dict())
            df_list.append(results_dict)
            # results_dict['returns'] = list(returns)
            # results_dict.update(results)
            # df_list.append(results_dict)
            # These are the main results that we need
            # print(results.get('Return [%]'), results.get('Max. Drawdown [%]'), results.get('# Trades'), results.get('Win Rate [%]'))



  bt = Backtest(
INFO:bokeh.io.state:Session output file '/Users/tangyiqwan/dev/projects/quant/fyp/outputs/trade-plots/cln_hdl_no_lemma_stc.html' already exists, will be overwritten.
  fig = gridplot(
  fig = gridplot(
  bt = Backtest(
INFO:bokeh.io.state:Session output file '/Users/tangyiqwan/dev/projects/quant/fyp/outputs/trade-plots/cln_smr_no_lemma_stc.html' already exists, will be overwritten.
  fig = gridplot(
  fig = gridplot(
  bt = Backtest(
INFO:bokeh.io.state:Session output file '/Users/tangyiqwan/dev/projects/quant/fyp/outputs/trade-plots/cln_news_no_lemma_stc.html' already exists, will be overwritten.
  fig = gridplot(
  fig = gridplot(
  bt = Backtest(
INFO:bokeh.io.state:Session output file '/Users/tangyiqwan/dev/projects/quant/fyp/outputs/trade-plots/cln_hdl_lemma_stc.html' already exists, will be overwritten.
  fig = gridplot(
  fig = gridplot(
  bt = Backtest(
INFO:bokeh.io.state:Session output file '/Users/tangyiqwan/dev/projects/quant/fyp/outputs/trade-plots/cln_smr

In [16]:
np.random.seed(7)
random_df_list = []
for ttl_trade in range(60, 390, 30):
    # Running the backtest
    # Pass in tick data (more opportunities to trade compared to merged data)
    bt = Backtest(
        data=tick, 
        strategy=RandomStrat, 
                cash=10000, 
                margin=1,
                commission=.0,
                trade_on_close=False,
                hedging=True
                )

    # Estimate 10 trades
    results = bt.run(ttl_trade=ttl_trade)

    tar_dir = root_dir.joinpath('outputs', 'trade-plots')
    tar_dir.mkdir(parents=True, exist_ok=True)
    filename = str(tar_dir.joinpath(f"randomStrat_{ttl_trade}_trades.html"))

    bt.plot(filename=filename,
            results=results,
            plot_return=True,
            open_browser=False)
    
    random_df_list.append(results)


  (data.index.is_numeric() and
  bt = Backtest(
INFO:bokeh.io.state:Session output file '/Users/tangyiqwan/dev/projects/quant/fyp/outputs/trade-plots/randomStrat_60_trades.html' already exists, will be overwritten.
  fig = gridplot(
  fig = gridplot(
  (data.index.is_numeric() and
  bt = Backtest(
INFO:bokeh.io.state:Session output file '/Users/tangyiqwan/dev/projects/quant/fyp/outputs/trade-plots/randomStrat_90_trades.html' already exists, will be overwritten.
  fig = gridplot(
  fig = gridplot(
  (data.index.is_numeric() and
  bt = Backtest(
INFO:bokeh.io.state:Session output file '/Users/tangyiqwan/dev/projects/quant/fyp/outputs/trade-plots/randomStrat_120_trades.html' already exists, will be overwritten.
  fig = gridplot(
  fig = gridplot(
  (data.index.is_numeric() and
  bt = Backtest(
INFO:bokeh.io.state:Session output file '/Users/tangyiqwan/dev/projects/quant/fyp/outputs/trade-plots/randomStrat_150_trades.html' already exists, will be overwritten.
  fig = gridplot(
  fig = grid

## Feature Creation

In [17]:
pd.set_option('display.max_columns', None)  # Show all columns
# pd.set_option('display.max_colwidth', None)  # Show full width of each column

rdf = pd.DataFrame(df_list)
random_rdf = pd.DataFrame(random_df_list)

display(random_rdf.head())
pd.reset_option('display.max_columns')
pd.reset_option('display.max_colwidth')

Unnamed: 0,Start,End,Duration,Exposure Time [%],Equity Final [$],Equity Peak [$],Return [%],Buy & Hold Return [%],Return (Ann.) [%],Volatility (Ann.) [%],Sharpe Ratio,Sortino Ratio,Calmar Ratio,Max. Drawdown [%],Avg. Drawdown [%],Max. Drawdown Duration,Avg. Drawdown Duration,# Trades,Win Rate [%],Best Trade [%],Worst Trade [%],Avg. Trade [%],Max. Trade Duration,Avg. Trade Duration,Profit Factor,Expectancy [%],SQN,Kelly Criterion,_strategy,_equity_curve,_trades
0,0.0,31397.0,31397.0,31.460603,10193.186035,10670.183444,1.93186,6.905782,0.0,,,,0.0,-4.538604,-0.164078,24641.0,296.1875,56.0,26.785714,2.122613,-0.988631,0.083067,1149.0,212.464286,1.200639,0.089787,0.499444,0.040906,RandomStrat(ttl_trade=60),Equity DrawdownPct DrawdownDura...,Size EntryBar ExitBar EntryPrice Exit...
1,0.0,31397.0,31397.0,44.598382,9860.523833,10095.196224,-1.394762,6.905782,0.0,,,,0.0,-4.163824,-0.684126,22942.0,2072.4,103.0,26.213592,6.92509,-6.651644,-0.073473,1471.0,237.203883,0.908246,-0.058197,-0.265927,-0.020142,RandomStrat(ttl_trade=90),Equity DrawdownPct DrawdownDura...,Size EntryBar ExitBar EntryPrice Exit...
2,0.0,31397.0,31397.0,49.82483,10255.372933,10559.846397,2.553729,6.905782,0.0,,,,0.0,-3.948549,-0.239334,9390.0,373.7125,113.0,29.20354,2.196074,-1.448039,0.122533,1155.0,222.353982,1.290872,0.12967,0.527262,0.034283,RandomStrat(ttl_trade=120),Equity DrawdownPct DrawdownDura...,Size EntryBar ExitBar EntryPrice Exi...
3,0.0,31397.0,31397.0,66.931015,10274.145036,10976.81653,2.74145,6.905782,0.0,,,,0.0,-6.785679,-0.235653,16030.0,199.394737,167.0,25.748503,2.87662,-1.745474,0.027228,1883.0,239.047904,1.071923,0.033919,0.516716,0.026616,RandomStrat(ttl_trade=150),Equity DrawdownPct DrawdownDura...,Size EntryBar ExitBar EntryPrice Exit...
4,0.0,31397.0,31397.0,61.621759,9182.876018,10013.5174,-8.17124,6.905782,0.0,,,,0.0,-9.126044,-1.348363,31136.0,4457.0,176.0,20.454545,2.133418,-1.293021,-0.098815,1476.0,243.551136,0.815287,-0.093127,-2.099379,-0.107279,RandomStrat(ttl_trade=180),Equity DrawdownPct DrawdownDura...,Size EntryBar ExitBar EntryPrice Exi...


In [18]:
rdf['# Trades'].describe()

count     30.000000
mean     232.300000
std       89.463208
min       66.000000
25%      181.500000
50%      250.500000
75%      298.750000
max      357.000000
Name: # Trades, dtype: float64

In [19]:

# actual = rdf['actual_ls'][0]
# predicted = rdf['predicted_ls'][0]

# cm = confusion_matrix(actual, predicted)
# print(f"Confusion matrix:\n{cm}")

# # Extracting TP, TN, FP, FN
# # First row is actually negative, second row is actually positive
# TP = cm[1, 1]
# TN = cm[0, 0]
# FP = cm[0, 1]
# FN = cm[1, 0]

# accuracy = accuracy_score(actual, predicted)
# precision = precision_score(actual, predicted)
# recall = recall_score(actual, predicted)
# f1 = f1_score(actual, predicted)

# print(f"Accuracy: {accuracy:.2f}")
# print(f"Precision: {precision:.2f}")
# print(f"Recall: {recall:.2f}")
# print(f"F1 Score: {f1:.2f}")

# Append each dictionary as rows into a new df
# Temporarily adjust display settings to show the full content of one row
def calc_f1(df: pd.DataFrame) -> pd.DataFrame:
    tmp = df.copy(deep=True)
    tmp['f1_score'] = np.nan
    # Iterate through each row in rdf
    for idx, row in tmp.iterrows():
        actual = row['actual_ls']
        predicted = row['predicted_ls']
        
        tmp.at[idx, 'f1_score'] = f1_score(actual, predicted, zero_division=0)
    return tmp

def cacl_sharpe(df: pd.DataFrame) -> pd.DataFrame:
    tmp = df.copy(deep=True)
    # ADJ_INTEREST_RATE_NOV = 4.42 / 100 / BACKTEST_FREQUENCY_ANN # Assume the yield is annualised
    ADJ_INTEREST_RATE_NOV = (1 + 4.42 / 100) ** (1 / BACKTEST_FREQUENCY_ANN) - 1
    # print(ADJ_INTEREST_RATE_NOV)
    # More complicated than this because the portfolio uses varying fractions of the liquidity pool to invest
    # rdf['ReturnPctList'].apply(lambda returns: np.prod([(1 + r) for r in returns]) - 1)
    tmp['ExcessReturn'] = tmp['ReturnPctList'].apply(lambda returnsPct: (pd.Series(returnsPct) - ADJ_INTEREST_RATE_NOV).tolist())
    tmp['ExcessReturnMean'] = tmp['ExcessReturn'].apply(lambda excess_returns: np.mean(excess_returns))
    tmp['ExcessReturnStdDev'] = tmp['ExcessReturn'].apply(lambda excess_returns: np.std(excess_returns))
    tmp['AdjSharpeRatio'] = tmp['ExcessReturnMean'] / tmp['ExcessReturnStdDev'] * np.sqrt(BACKTEST_FREQUENCY_ANN)

    return tmp

In [20]:
def create_features(df):
    tmp = df.copy(deep=True)
    # Positions (positive for long, negative for short) * (diff in exit and entry price) is the profit and loss for each trade
    tmp['pl_list'] = tmp['_trades'].apply(lambda df: (df['Size'] * (df['ExitPrice'] - df['EntryPrice'])).tolist())
    # print(len(rdf['pl_list'][0]))

    # In general if the prediction is neutral it won't trade
    # The actual up or down depends on the entry and exit price
    tmp['actual_ls'] = tmp['_trades'].apply(lambda df: ((df['ExitPrice'] - df['EntryPrice'] >=0).astype(int)).tolist())
    # The predicted up or down depends on my position size (vector)
    tmp['predicted_ls'] = tmp['_trades'].apply(lambda df: ((df['Size'] >= 0).astype(int)).tolist())


    # The ReturnPct in the backtesting framework does not account for the size
    for idx, row in rdf.iterrows():
        df = row['_trades']
        df['ReturnPct'] = df['PnL'] / df['EntryPrice']
        rdf.at[idx, '_trades'] = df

    tmp['ReturnPctList'] = tmp['_trades'].apply(lambda df: df['ReturnPct'].tolist())

    # F1 score
    tmp = calc_f1(tmp)

    # Sharpe ratio
    tmp = cacl_sharpe(tmp)

    # Annualised return
    tmp['annualised_return'] = tmp['Return [%]'] * BACKTEST_FREQUENCY_ANN
    return tmp
# # Check that the sum of profit and loss is equal to the diff between equity final - start (no comms)
# print(rdf['pl_list'].apply(lambda aList: np.sum(aList)) - (rdf['Equity Final [$]'] - pd.Series([equity_start] * len(rdf))))

In [23]:
rdf2 = create_features(df=rdf)
random_rdf2 = create_features(df=random_rdf)

random_rdf2

Unnamed: 0,Start,End,Duration,Exposure Time [%],Equity Final [$],Equity Peak [$],Return [%],Buy & Hold Return [%],Return (Ann.) [%],Volatility (Ann.) [%],...,pl_list,actual_ls,predicted_ls,ReturnPctList,f1_score,ExcessReturn,ExcessReturnMean,ExcessReturnStdDev,AdjSharpeRatio,annualised_return
0,0.0,31397.0,31397.0,31.460603,10193.186035,10670.183444,1.93186,6.905782,0.0,,...,"[50.95999999999984, 100.87999999999988, 98.061...","[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ...","[1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, ...","[0.020792446825438793, 0.02057591345388987, 0....",0.327869,"[0.00992101716420013, 0.009704483792651208, 0....",-0.009974,0.011644,-1.713145,7.727441
1,0.0,31397.0,31397.0,44.598382,9860.523833,10095.196224,-1.394762,6.905782,0.0,,...,"[-28.582840000000317, -31.336500000000456, -30...","[0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, ...","[1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, ...","[-0.005790571503818898, -0.006467134454648749,...",0.22449,"[-0.01666200116505756, -0.01733856411588741, -...",-0.011453,0.017412,-1.31559,-5.579047
2,0.0,31397.0,31397.0,49.82483,10255.372933,10559.846397,2.553729,6.905782,0.0,,...,"[-30.326659999999606, -29.38286000000005, -14....","[0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...","[1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, ...","[-0.006130610743193432, -0.00597372872396662, ...",0.272727,"[-0.017002040404432095, -0.016845158385205283,...",-0.009575,0.011998,-1.596081,10.214917
3,0.0,31397.0,31397.0,66.931015,10274.145036,10976.81653,2.74145,6.905782,0.0,,...,"[-7.10904000000005, -4.5639999999999645, -14.7...","[0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, ...","[-0.006262698874147765, -0.006053371531646201,...",0.326087,"[-0.017134128535386428, -0.016924801192884864,...",-0.010532,0.011621,-1.812612,10.965801
4,0.0,31397.0,31397.0,61.621759,9182.876018,10013.5174,-8.17124,6.905782,0.0,,...,"[-28.132910000000493, -24.32000000000002, 52.2...","[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, ...","[1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, ...","[-0.005724444119964089, -0.006764612620230315,...",0.255319,"[-0.016595873781202752, -0.017636042281468978,...",-0.011803,0.010715,-2.203059,-32.684959
5,0.0,31397.0,31397.0,67.083891,9536.888398,10845.070951,-4.631116,6.905782,0.0,,...,"[-30.93999999999994, -28.822499999999707, -27....","[1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, ...","[-0.006275709313363631, -0.005999999999999894,...",0.281553,"[-0.017147138974602294, -0.016871429661238557,...",-0.011843,0.015006,-1.578479,-18.524464
6,0.0,31397.0,31397.0,73.300847,10728.298969,11069.304293,7.28299,6.905782,0.0,,...,"[-7.741999999999962, -14.634016800000069, 94.6...","[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...","[0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, ...","[-0.00586659594218264, -0.0059237678261327975,...",0.35,"[-0.016738025603421303, -0.01679519748737146, ...",-0.00972,0.013503,-1.439635,29.131959
7,0.0,31397.0,31397.0,80.658004,10344.255609,10820.201918,3.442556,6.905782,0.0,,...,"[-27.313519999999414, -11.464700000000022, -24...","[1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, ...","[0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, ...","[-0.005573050397877921, -0.004668607728957053,...",0.321918,"[-0.016444480059116584, -0.015540037390195716,...",-0.009867,0.011905,-1.657548,13.770224
8,0.0,31397.0,31397.0,83.884324,10558.285757,11126.06595,5.582858,6.905782,0.0,,...,"[-15.46999999999997, -3.974999999999966, 94.63...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-0.0063039677914922265, -0.007004096735826470...",0.341176,"[-0.01717539745273089, -0.017875526397065133, ...",-0.009568,0.012805,-1.494484,22.33143
9,0.0,31397.0,31397.0,85.820753,10944.334844,10961.040229,9.443348,6.905782,0.0,,...,"[-29.941599999999312, -28.444000000000585, -37...","[1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, ...","[-0.006106692120054991, -0.0057895850973752605...",0.362069,"[-0.016978121781293654, -0.016661014758613923,...",-0.010331,0.014186,-1.456448,37.773394


In [24]:
rdf2[rdf2['f1_score'] > 0.5]

Unnamed: 0,stm_tech,lemma,content,Start,End,Duration,Exposure Time [%],Equity Final [$],Equity Peak [$],Return [%],...,pl_list,actual_ls,predicted_ls,ReturnPctList,f1_score,ExcessReturn,ExcessReturnMean,ExcessReturnStdDev,AdjSharpeRatio,annualised_return
7,blob,No,cln_smr,0.0,31397.0,31397.0,36.113765,10672.988005,11202.229936,6.72988,...,"[49.869222000000036, 97.44800000000049, -38.84...","[1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, ...","[0.2606723224086563, 0.5089199916440386, -0.19...",0.510638,"[0.24980089274741762, 0.4980485619827999, -0.2...",0.017532,0.218893,0.160188,26.91952


### Sharpe Ratio
https://home.treasury.gov/resource-center/data-chart-center/interest-rates/TextView?type=daily_treasury_yield_curve&field_tdr_date_value=2023
- On 2023-11-01, the rate of the 1 month T-bill is 4.42% p.a.
- Scaling sharpe ratio: Scale the mean excess returns by the frequency, scale the standard deviation by the square root of frequency,
hence you get freq / sqrt(freq) = sqrt(freq)
- For using compound interest rate to find the monthly rate, need to use a fractional power, instead of discounting (negative exponent)

In [25]:
thresh = 1.0
print(f"Number of sharpe ratio greater than thresh: {np.sum(rdf2['AdjSharpeRatio'] > thresh)}")
print(np.max(rdf2['AdjSharpeRatio']))

Number of sharpe ratio greater than thresh: 0
0.20425817295049517


In [26]:
print(np.mean(rdf['Return [%]']))

print(len(rdf[rdf['Win Rate [%]'] > 50]))

7.7696867160000185
0


In [27]:
rdf2.describe(include='all')

Unnamed: 0,stm_tech,lemma,content,Start,End,Duration,Exposure Time [%],Equity Final [$],Equity Peak [$],Return [%],...,pl_list,actual_ls,predicted_ls,ReturnPctList,f1_score,ExcessReturn,ExcessReturnMean,ExcessReturnStdDev,AdjSharpeRatio,annualised_return
count,30,30,30,30.0,30.0,30.0,30.0,30.0,30.0,30.0,...,30,30,30,30,30.0,30,30.0,30.0,30.0,30.0
unique,5,2,3,,,,,,,,...,30,30,30,30,,30,,,,
top,stc,No,cln_hdl,,,,,,,,...,"[-14.081391999999852, -9.720000000000027, -15....","[0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, ...","[1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, ...","[-0.07432378638035053, -0.05146124523507003, -...",,"[-0.0851952160415892, -0.06233267489630869, -0...",,,,
freq,6,15,10,,,,,,,,...,1,1,1,1,,1,,,,
mean,,,,0.0,31397.0,31397.0,62.077627,10776.968672,11188.694023,7.769687,...,,,,,0.351614,,0.002761,0.180473,0.03147,31.078747
std,,,,0.0,0.0,0.0,16.012658,490.157994,437.839947,4.90158,...,,,,,0.084003,,0.010841,0.018547,0.111188,19.60632
min,,,,0.0,31397.0,31397.0,28.473151,9764.202526,10483.54567,-2.357975,...,,,,,0.191304,,-0.024931,0.155202,-0.248639,-9.431899
25%,,,,0.0,31397.0,31397.0,61.253902,10587.680763,10790.249869,5.876808,...,,,,,0.289039,,-5.6e-05,0.163816,-0.000656,23.507231
50%,,,,0.0,31397.0,31397.0,69.504427,10741.675323,11168.262196,7.416753,...,,,,,0.348055,,0.002814,0.17995,0.034169,29.667013
75%,,,,0.0,31397.0,31397.0,72.955284,10972.672271,11517.021581,9.726723,...,,,,,0.420504,,0.009829,0.191426,0.097503,38.906891


In [28]:
# pd.set_option('display.max_columns', None)  # Show all columns
# pd.set_option('display.max_colwidth', None)  # Show full width of each column

rdf_bottom = rdf2.sort_values(by='Return [%]', ascending=True).head()
rdf_top = rdf2.sort_values(by='Return [%]', ascending=False).head()


# Analysis

## Mann-Whitney U Test

In [29]:
from scipy.stats import mannwhitneyu
import itertools
import numpy as np

# Assuming rdf2 is your DataFrame and 'ReturnPctList' is the column with returns data
N_SAMPLES = len(rdf2)
N_RANDOM_SAMPLES = len(random_rdf2)
P_VALUE_BENCHMARK = 0.05
matrix = np.zeros((N_SAMPLES, N_SAMPLES), dtype=int)

for i in range(N_SAMPLES):
    for j in range(i):  # Lower triangular part
        group_i = rdf2.loc[i, 'ReturnPctList']
        group_j = rdf2.loc[j, 'ReturnPctList']
        stat, p_value = mannwhitneyu(group_i, group_j, alternative='greater')
        if p_value < P_VALUE_BENCHMARK:
            matrix[i, j] = 1  # Mark as 1 if group_i is statistically greater than group_j
            print(i, j, "Statistically greater!")
# Sum across rows to find the "best" one
row_sums = matrix.sum(axis=1)


12 6 Statistically greater!
12 8 Statistically greater!
12 9 Statistically greater!
12 11 Statistically greater!
13 6 Statistically greater!
13 8 Statistically greater!
13 9 Statistically greater!
13 11 Statistically greater!
14 6 Statistically greater!
14 8 Statistically greater!
14 9 Statistically greater!
14 11 Statistically greater!
15 6 Statistically greater!
15 8 Statistically greater!
15 9 Statistically greater!
15 11 Statistically greater!
16 6 Statistically greater!
16 8 Statistically greater!
16 9 Statistically greater!
16 11 Statistically greater!
17 6 Statistically greater!
17 8 Statistically greater!
17 9 Statistically greater!
17 11 Statistically greater!
18 6 Statistically greater!
18 8 Statistically greater!
18 9 Statistically greater!
18 10 Statistically greater!
18 11 Statistically greater!
19 6 Statistically greater!
19 7 Statistically greater!
19 8 Statistically greater!
19 9 Statistically greater!
19 10 Statistically greater!
19 11 Statistically greater!
20 6 Stati

In [32]:
def matrix_to_latex_with_indices(matrix, row_sums, name):
    num_elements = matrix.shape[0]
    latex_str = "\\begin{table}[H]\n\\centering\n"
    latex_str += "\\resizebox{\\textwidth}{!}{%\n"  # Resize table to fit within page width
    latex_str += "\\begin{tabular}{|" + " c |" * (num_elements + 2) + "}\n\\hline\n"
    
    # Header row with indices
    latex_str += " & " + " & ".join(str(i) for i in range(num_elements)) + " & Sum \\\\\n\\hline\n"
    
    for i, row in enumerate(matrix):
        # Row with index
        row_str = str(i) + " & " + ' & '.join('1' if val == 1 else ' ' for val in row) + f" & {row_sums[i]} \\\\\n\\hline\n"
        latex_str += row_str
    latex_str += "\\end{tabular}%\n}\n"
    latex_str += f"\\caption{{name}}\n"
    latex_str += "\\end{table}\n"
    return latex_str

# Assuming 'matrix' is your numpy array and 'row_sums' is the sum across rows
latex_table = matrix_to_latex_with_indices(matrix, row_sums, "test")
print(latex_table)


\begin{table}[H]
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{| c | c | c | c | c | c | c | c | c | c | c | c | c | c | c | c | c | c | c | c | c | c | c | c | c | c | c | c | c | c | c | c |}
\hline
 & 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10 & 11 & 12 & 13 & 14 & 15 & 16 & 17 & 18 & 19 & 20 & 21 & 22 & 23 & 24 & 25 & 26 & 27 & 28 & 29 & Sum \\
\hline
0 &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   & 0 \\
\hline
1 &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   & 0 \\
\hline
2 &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   & 0 \\
\hline
3 &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   & 0 \\
\hline
4 &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &   &  

In [None]:
# Assuming rdf2 is your DataFrame and 'ReturnPctList' is the column with returns data
matrix = np.zeros((N_SAMPLES, N_RANDOM_SAMPLES), dtype=int)

for i in range(N_SAMPLES):
    for j in range(N_RANDOM_SAMPLES):  # Lower triangular part
        group_i = rdf2.loc[i, 'ReturnPctList']
        group_j = random_rdf2.loc[j, 'ReturnPctList']
        stat, p_value = mannwhitneyu(group_i, group_j, alternative='greater')
        print(p_value)
        if p_value < P_VALUE_BENCHMARK:
            matrix[i, j] = 1  # Mark as 1 if group_i is statistically greater than group_j
            print(i, j, "Statistically greater!")
# Sum across rows to find the "best" one
row_sums = matrix.sum(axis=1)



In [None]:
latex_table = matrix_to_latex_with_indices(matrix, row_sums)
print(latex_table)

In [None]:
rdf2[:5].describe(include='all')

## Analysis

In [None]:
def print_hist(df):
    for i in range(len(df)):
        plt.figure(figsize=(5, 3))
        plt.hist(x=df.loc[i, 'ReturnPctList'], bins=10)
        plt.figure(figsize=(5, 3))
        plt.boxplot(x=df.loc[i, 'ReturnPctList'])
# TODO: Include title

In [None]:
print_hist(rdf2)

In [None]:
combos = list(itertools.combinations(range(0, N_SAMPLES), 2))
count = 0
for i, j in combos:
    ks_stat, ks_p_val = stats.ks_2samp(rdf2.ReturnPctList[i], rdf2.ReturnPctList[j])
    # ks_stat, p_val = stats.ks_2samp(random_rdf2.ReturnPctList[i], random_rdf2.ReturnPctList[j])
    
    # Different distributions
    if ks_p_val < P_VALUE_BENCHMARK:
        count += 1
print(f"Number of pairs with different distributions: {count} / {len(combos)}")

In [None]:
return1 = rdf2.ReturnPctList[0]
return2 = random_rdf2.ReturnPctList[5]
n_samples = len(rdf2)
count = 0
ttl_count = 0
for i in range(N_SAMPLES):
    for j in range(N_RANDOM_SAMPLES):
        ks_stat, ks_p_val = stats.ks_2samp(rdf2.ReturnPctList[i], random_rdf2.ReturnPctList[j])
        # ks_stat, p_val = stats.ks_2samp(random_rdf2.ReturnPctList[i], random_rdf2.ReturnPctList[j])
        
        # Different distributions
        if ks_p_val < P_VALUE_BENCHMARK:
            count += 1
        ttl_count += 1
print(f"Number of pairs with different distributions: {count} / {ttl_count}")

# All the distributions of the samples are different from the random samples

# Present

In [33]:
rdf2.columns

def present(df: pd.DataFrame) -> pd.DataFrame:
    tmp = df.copy(deep=True)
    present_cols = [ 
        'Return [%]', 'annualised_return', '# Trades', 'Win Rate [%]',
        'f1_score', 'AdjSharpeRatio']
    
    if 'stm_tech' in tmp.columns:
        present_cols = ['stm_tech', 'lemma', 'content'] + present_cols
    tmp = tmp[present_cols]
    # , 'Best Trade [%]', 'Worst Trade [%]', 'Avg. Trade [%]'
    # 'Max. Trade Duration', 'Avg. Trade Duration', 'Profit Factor',
    # 'Expectancy [%]', 'SQN', 'Kelly Criterion',
    # 'Avg. Drawdown [%]', 'Max. Drawdown Duration', 'Avg. Drawdown Duration'

    stm_tech_dict = {
        'stc': "Sentic API",
        'blob': "BlobText",
        'sid': "VADER",
        'bert': "BERT",
        'finbert': "FinBERT"
    }

    content_dict = {
        'cln_hdl': 'Headline',
        'cln_smr': 'Summary',
        'cln_news': 'Content'
    }
    if 'stm_tech' in tmp.columns:
        tmp['stm_tech'] = tmp['stm_tech'].apply(lambda name: stm_tech_dict.get(name, None))
        tmp['content'] = tmp['content'].apply(lambda name: content_dict.get(name, None))
    
    for col in ['Return [%]', 'annualised_return', 'Win Rate [%]',
                'f1_score', 'AdjSharpeRatio']:
        tmp[col] = tmp[col].apply(lambda num: round(num, 2))

    tmp['# Trades'] = tmp['# Trades'].apply(lambda num: round(num))
    rename_dict = {
        'stm_tech': 'Model',
        'lemma': 'Lemma',
        'content': 'Text',
        'annualised_return': 'Ann. Return [%]',
        'f1_score': 'F1 Score',
        'AdjSharpeRatio': 'Sharpe Ratio',
    }
    tmp = tmp.rename(columns=rename_dict)
    return tmp


In [52]:
rdf3 = present(rdf2)

In [62]:
combo_cols = ['Model', 'Lemma', 'Text']

rdf3.index.name = 'C'
other_cols = rdf3.columns.difference(combo_cols)
rdf4 = rdf3[combo_cols]
rdf4.reset_index(drop=False, inplace=True)
rdf5 = rdf3[other_cols]
rdf5.reset_index(drop=False, inplace=True)
rdf4

Unnamed: 0,C,Model,Lemma,Text
0,0,Sentic API,No,Headline
1,1,Sentic API,No,Summary
2,2,Sentic API,No,Content
3,3,Sentic API,Yes,Headline
4,4,Sentic API,Yes,Summary
5,5,Sentic API,Yes,Content
6,6,BlobText,No,Headline
7,7,BlobText,No,Summary
8,8,BlobText,No,Content
9,9,BlobText,Yes,Headline


In [77]:
rdf3.columns

Index(['Model', 'Lemma', 'Text', 'Return [%]', 'Ann. Return [%]', '# Trades',
       'Win Rate [%]', 'F1 Score', 'Sharpe Ratio'],
      dtype='object')

In [66]:
latex_str4 = rdf4.to_latex(index=False, float_format="%.2f", escape=True, header=True, longtable=False, caption="Combinations of Models, Lemmatisation, and News Text", label="tab:combinations-of-models-lemmatisation-news-text", position="ht!")
latex_str5 = rdf5.to_latex(index=False, float_format="%.2f", escape=True, header=True, longtable=False, caption="Trading Strategy Performance", label="tab:trading_strategy_performance", position="ht!")

print(latex_str4)


\begin{table}[ht!]
\caption{Combinations of Models, Lemmatisation, and News Text}
\label{tab:combinations-of-models-lemmatisation-news-text}
\begin{tabular}{rlll}
\toprule
C & Model & Lemma & Text \\
\midrule
0 & Sentic API & No & Headline \\
1 & Sentic API & No & Summary \\
2 & Sentic API & No & Content \\
3 & Sentic API & Yes & Headline \\
4 & Sentic API & Yes & Summary \\
5 & Sentic API & Yes & Content \\
6 & BlobText & No & Headline \\
7 & BlobText & No & Summary \\
8 & BlobText & No & Content \\
9 & BlobText & Yes & Headline \\
10 & BlobText & Yes & Summary \\
11 & BlobText & Yes & Content \\
12 & VADER & No & Headline \\
13 & VADER & No & Summary \\
14 & VADER & No & Content \\
15 & VADER & Yes & Headline \\
16 & VADER & Yes & Summary \\
17 & VADER & Yes & Content \\
18 & BERT & No & Headline \\
19 & BERT & No & Summary \\
20 & BERT & No & Content \\
21 & BERT & Yes & Headline \\
22 & BERT & Yes & Summary \\
23 & BERT & Yes & Content \\
24 & FinBERT & No & Headline \\
25 & FinBER

In [91]:
random_rdf3 = present(random_rdf2)
random_rdf3.reset_index(drop=False, names='R', inplace=True)

latex_str3 = random_rdf3.to_latex(index=False, float_format="%.2f", escape=True, header=True, longtable=False, caption="Trading Strategy Performance", label="tab:trading_strategy_performance", position="ht!")
print(latex_str3)

\begin{table}[ht!]
\caption{Trading Strategy Performance}
\label{tab:trading_strategy_performance}
\begin{tabular}{rrrrrrr}
\toprule
R & Return [\%] & Ann. Return [\%] & \# Trades & Win Rate [\%] & F1 Score & Sharpe Ratio \\
\midrule
0 & 1.93 & 7.73 & 56 & 26.79 & 0.33 & -1.71 \\
1 & -1.39 & -5.58 & 103 & 26.21 & 0.22 & -1.32 \\
2 & 2.55 & 10.21 & 113 & 29.20 & 0.27 & -1.60 \\
3 & 2.74 & 10.97 & 167 & 25.75 & 0.33 & -1.81 \\
4 & -8.17 & -32.68 & 176 & 20.45 & 0.26 & -2.20 \\
5 & -4.63 & -18.52 & 196 & 24.49 & 0.28 & -1.58 \\
6 & 7.28 & 29.13 & 218 & 28.44 & 0.35 & -1.44 \\
7 & 3.44 & 13.77 & 275 & 28.00 & 0.32 & -1.66 \\
8 & 5.58 & 22.33 & 321 & 30.22 & 0.34 & -1.49 \\
9 & 9.44 & 37.77 & 313 & 29.07 & 0.36 & -1.46 \\
10 & -6.48 & -25.93 & 369 & 26.83 & 0.32 & -1.67 \\
\bottomrule
\end{tabular}
\end{table}



### Ranking

In [106]:
metrics = ['Return [%]', 'Ann. Return [%]', 'Win Rate [%]', 'F1 Score', 'Sharpe Ratio']
def rank_df(df, idx) -> pd.DataFrame:
    top = df.sort_values(by=metrics[idx], ascending=False)
    top.reset_index(drop=False, names='C', inplace=True)
    top.reset_index(drop=False, names='Rank', inplace=True)
    top['Rank'] += 1
    top = top[['Rank', 'C', 'Model', 'Lemma', 'Text', metrics[idx]]]
    return top
top_return_rdf3 = rank_df(df=rdf3, idx=1)
top_return_rdf3

Unnamed: 0,Rank,C,Model,Lemma,Text,Ann. Return [%]
0,1,25,FinBERT,No,Summary,71.79
1,2,29,FinBERT,Yes,Content,67.37
2,3,26,FinBERT,No,Content,62.55
3,4,28,FinBERT,Yes,Summary,59.39
4,5,13,VADER,No,Summary,48.55
5,6,16,VADER,Yes,Summary,47.4
6,7,19,BERT,No,Summary,42.81
7,8,4,Sentic API,Yes,Summary,39.63
8,9,21,BERT,Yes,Headline,36.73
9,10,24,FinBERT,No,Headline,35.95


In [95]:

return_str = top_return_rdf3.to_latex(index=False, float_format="%.2f", escape=True, header=True, longtable=False, 
                                      caption="Trading Strategy Performance Ranked by Annual Returns", 
                                      label="tab:trading_strategy_performance_by_annual_returns", position="ht!")
print(return_str)

\begin{table}[ht!]
\caption{Trading Strategy Performance Ranked by Annual Returns}
\label{tab:trading_strategy_performance_by_annual_returns}
\begin{tabular}{rrlllr}
\toprule
Rank & C & Model & Lemma & Text & Ann. Return [\%] \\
\midrule
1 & 25 & FinBERT & No & Summary & 71.79 \\
2 & 29 & FinBERT & Yes & Content & 67.37 \\
3 & 26 & FinBERT & No & Content & 62.55 \\
4 & 28 & FinBERT & Yes & Summary & 59.39 \\
5 & 13 & VADER & No & Summary & 48.55 \\
6 & 16 & VADER & Yes & Summary & 47.40 \\
7 & 19 & BERT & No & Summary & 42.81 \\
8 & 4 & Sentic API & Yes & Summary & 39.63 \\
9 & 21 & BERT & Yes & Headline & 36.73 \\
10 & 24 & FinBERT & No & Headline & 35.95 \\
11 & 2 & Sentic API & No & Content & 35.73 \\
12 & 1 & Sentic API & No & Summary & 32.07 \\
13 & 3 & Sentic API & Yes & Headline & 31.69 \\
14 & 12 & VADER & No & Headline & 31.58 \\
15 & 14 & VADER & No & Content & 29.78 \\
16 & 5 & Sentic API & Yes & Content & 29.56 \\
17 & 22 & BERT & Yes & Summary & 28.73 \\
18 & 18 & BERT & N

In [99]:

metrics = ['Return [%]', 'Ann. Return [%]', 'Win Rate [%]', 'F1 Score', 'Sharpe Ratio']
top_f1_rdf3 = rank_df(df=rdf3, idx=3)
f1_str = top_f1_rdf3.to_latex(index=False, float_format="%.2f", escape=True, header=True, longtable=False, 
                                      caption="Trading Strategy Performance Ranked by F1 Score", 
                                      label="tab:trading_strategy_performance_by_F1_score", position="ht!")
print(f1_str)

\begin{table}[ht!]
\caption{Trading Strategy Performance Ranked by F1 Score}
\label{tab:trading_strategy_performance_by_F1_score}
\begin{tabular}{rrlllr}
\toprule
Rank & C & Model & Lemma & Text & F1 Score \\
\midrule
1 & 7 & BlobText & No & Summary & 0.51 \\
2 & 10 & BlobText & Yes & Summary & 0.49 \\
3 & 2 & Sentic API & No & Content & 0.46 \\
4 & 5 & Sentic API & Yes & Content & 0.45 \\
5 & 4 & Sentic API & Yes & Summary & 0.43 \\
6 & 8 & BlobText & No & Content & 0.43 \\
7 & 17 & VADER & Yes & Content & 0.42 \\
8 & 14 & VADER & No & Content & 0.42 \\
9 & 1 & Sentic API & No & Summary & 0.42 \\
10 & 3 & Sentic API & Yes & Headline & 0.40 \\
11 & 0 & Sentic API & No & Headline & 0.39 \\
12 & 16 & VADER & Yes & Summary & 0.39 \\
13 & 13 & VADER & No & Summary & 0.39 \\
14 & 11 & BlobText & Yes & Content & 0.38 \\
15 & 26 & FinBERT & No & Content & 0.35 \\
16 & 9 & BlobText & Yes & Headline & 0.35 \\
17 & 29 & FinBERT & Yes & Content & 0.34 \\
18 & 25 & FinBERT & No & Summary & 0.33 \\

In [98]:
metrics = ['Return [%]', 'Ann. Return [%]', 'Win Rate [%]', 'F1 Score', 'Sharpe Ratio']
top_sharpe_rdf3 = rank_df(df=rdf3, idx=4)
sharpe_str = top_sharpe_rdf3.to_latex(index=False, float_format="%.2f", escape=True, header=True, longtable=False, 
                                      caption="Trading Strategy Performance Ranked by Sharpe Ratio", 
                                      label="tab:trading_strategy_performance_by_sharpe", position="ht!")
print(sharpe_str)

\begin{table}[ht!]
\caption{Trading Strategy Performance Ranked by Sharpe Ratio}
\label{tab:trading_strategy_performance_by_sharpe}
\begin{tabular}{rrlllr}
\toprule
Rank & C & Model & Lemma & Text & Sharpe Ratio \\
\midrule
1 & 25 & FinBERT & No & Summary & 0.20 \\
2 & 29 & FinBERT & Yes & Content & 0.19 \\
3 & 26 & FinBERT & No & Content & 0.17 \\
4 & 7 & BlobText & No & Summary & 0.16 \\
5 & 28 & FinBERT & Yes & Summary & 0.15 \\
6 & 13 & VADER & No & Summary & 0.14 \\
7 & 16 & VADER & Yes & Summary & 0.14 \\
8 & 10 & BlobText & Yes & Summary & 0.10 \\
9 & 19 & BERT & No & Summary & 0.09 \\
10 & 12 & VADER & No & Headline & 0.08 \\
11 & 24 & FinBERT & No & Headline & 0.07 \\
12 & 14 & VADER & No & Content & 0.05 \\
13 & 21 & BERT & Yes & Headline & 0.04 \\
14 & 15 & VADER & Yes & Headline & 0.04 \\
15 & 4 & Sentic API & Yes & Summary & 0.04 \\
16 & 2 & Sentic API & No & Content & 0.03 \\
17 & 17 & VADER & Yes & Content & 0.02 \\
18 & 27 & FinBERT & Yes & Headline & 0.02 \\
19 & 20 & 

## Portfolio Construction
- We actually have quite a good number of "returns" based on different techniques.
- We can utilise the backtest data to choose the techniques that will be used in the future to obtain
the best risk-adjusted return.


## Archive

### ANOVA

- Before removing outliers, 19 significantly different pairs of groups were found, observed p value is about 0.04
- After removing, 15 were found, observed p value is about 0.03


In [None]:
from scipy.stats import f_oneway

def anova(*groups: typing.List, plot:bool=False) -> bool:
    # Calculates n F-statistic between bootstrap samples and return the list
    def bootstrap_f_stat(data_groups, n_bootstraps=1000):
        bs_f_stat_list = []
        # data_groups is all the groups that we want to compare
        
        for _ in range(n_bootstraps):
            # Get a list of randomly chosen samples for each group length
            # The length of each group might be different
            resampled_groups = [np.random.choice(group, size=len(group), replace=True) for group in data_groups]

            # Calculate the F-statistic for ith bootstrap
            # Unzip the resampled_groups to be parameters
            f_stat, p_val = f_oneway(*resampled_groups)
            bs_f_stat_list.append(f_stat)

        return bs_f_stat_list

    # Calculate the observed F-statistic
    obs_f_stat, obs_p_val = f_oneway(*groups)

    # Bootstrap the F-statistic
    bs_f_stat_list = bootstrap_f_stat(data_groups=groups)

    if plot:
        # Plotting the histogram of bootstrapped F-statistics
        plt.figure(figsize=(10, 6))
        plt.hist(bs_f_stat_list, bins=30, color='skyblue', alpha=0.7, label='Bootstrapped F-statistics')

        # Marking the observed F-statistic
        plt.axvline(obs_f_stat, color='red', linestyle='dashed', linewidth=2, label=f'Observed F-statistic ({obs_f_stat:.4f})')

        plt.title('Distribution of Bootstrapped F-statistics with Observed F-statistic')
        plt.xlabel('F-statistic')
        plt.ylabel('Frequency')
        plt.legend()
        plt.show()

    alpha = 0.05

    upper_quantile = np.quantile(bs_f_stat_list, 1 - alpha)

    if obs_f_stat >= upper_quantile:
        print("The difference between groups is statistically significant.")
        return True
    else:
        print("No significant difference between groups was found.")
        return False


In [None]:
rdf.columns

In [None]:

# Combination of 2 numbers from 0 to 29
# TODO: Change the numbers
all_combo = list(itertools.combinations(range(0, 30), 2))

count = 0

for idx in all_combo:
    # Select the 'adj_returns' for each index in idx, creating a list of pd.Series
    # Each pd.Series contains the 'adj_returns' values for one of the selected indices
    group_returns = [rdf2.loc[i, 'ReturnPctList'] for i in idx]

    # Convert the list of returns into a format suitable for the anova function
    # Assuming the anova function is designed to take multiple pd.Series as separate arguments
    stat_diff = anova(*group_returns, plot=False)

    if stat_diff: break

    count += 1

print(count / len(all_combo))

In [None]:
def normality_test(data: np.ndarray, plot=False):
    """
    Parameters:
    - data: a numpy array
    """
    data3 = data
    # data2 = np.log(data)
    # pl2 = pl
    # q1 = data2.quantile(0.25)
    # q3 = data2.quantile(0.75)
    # iqr = q3 - q1

    # # Define outliers
    # lower_bound = q1 - 1.5 * iqr
    # upper_bound = q3 + 1.5 * iqr

    # data3 = data2[(data2 >= lower_bound) & (data2 <= upper_bound)]

    # Normality Test
    _, p_value_normality_group1 = stats.shapiro(data3)

    print(f"Normality Test P-Values: Group1={p_value_normality_group1}")
    if plot:
        # Q-Q Plots for Visual Normality Check
        plt.figure(figsize=(5,3))
        sm.qqplot(data3, line ='45')
        plt.title('Group 1 Q-Q Plot')
        plt.show()

        plt.figure(figsize=(5,3))
        plt.hist(data3, bins=50, alpha=0.75, color='blue')
        plt.title('Returns Distribution')
        plt.xlabel('Returns')
        plt.ylabel('Frequency')
        plt.grid(True)
        plt.show()
    if p_value_normality_group1 < P_VALUE_BENCHMARK:
        return False
    return True

In [None]:
count = 0
for i in range(N_SAMPLES):
    count += normality_test(np.array(rdf2['ReturnPctList'][i]))
print(f"Number of normally distributed returns: {count} / {N_SAMPLES}")


In [None]:
count = 0
for i in range(N_RANDOM_SAMPLES):
    count += normality_test(np.array(random_rdf2['ReturnPctList'][i]))
print(f"Number of normally distributed returns: {count} / {N_SAMPLES}")

In [None]:
import numpy as np

def bootstrap_returns(returns, n_bootstraps=100):
    """Generate bootstrap samples for returns and calculate mean for each sample. Ustilising the Central Limit Theorem"""
    bootstrap_means = np.array([np.mean(np.random.choice(returns, size=len(returns), replace=True)) for _ in range(n_bootstraps)])
    return bootstrap_means

from scipy.stats import kstest, norm

def ks_test_with_theoretical_distribution(bootstrap_means):
    """Perform KS test comparing bootstrap means with a normal distribution."""
    # Assuming the theoretical normal distribution has the same mean and std as the bootstrap_means
    mean, std = np.mean(bootstrap_means), np.std(bootstrap_means)
    return kstest(bootstrap_means, 'norm', args=(mean, std))

def nested_ks_test_for_p_values(p_values):
    """Perform KS test to check if the given p-values are uniformly distributed."""
    return kstest(p_values, 'uniform')

# Mock data: returns for different sentiment analysis techniques
returns_data = {
    'Technique0': pl
    # 'Technique1': np.random.normal(0.05, 0.02, 1000),
    # 'Technique2': np.random.normal(0.04, 0.02, 1000),
    # Add more techniques as needed
}

n_bootstraps = 100
p_values_for_ks_tests = []

for technique, returns in returns_data.items():
    # Step 1: Bootstrap
    bootstrap_means = bootstrap_returns(returns, n_bootstraps)
    
    # Step 2: KS Test with Theoretical Distribution
    ks_stat, ks_p_value = ks_test_with_theoretical_distribution(bootstrap_means)
    print(f"KS test for {technique}: Stat={ks_stat}, P-Value={ks_p_value}")
    
    p_values_for_ks_tests.append(ks_p_value)

# Step 3: Nested-KS Test
nested_ks_stat, nested_ks_p_value = nested_ks_test_for_p_values(p_values_for_ks_tests)
print(f"Nested KS test: Stat={nested_ks_stat}, P-Value={nested_ks_p_value}")


In [None]:
rdf.to_latex(index=False, header=True)

### Next
