# Installation

- `pip install -r requirements.txt`

# Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import yfinance as yf
import pandas_ta as ta
#import plotly.graph_objects as go
#from plotly.subplots import make_subplots

import sys
import os

# Add the path to the folder containing metric_utils.py
sys.path.append(os.path.abspath('../app/helpers'))

import importlib
import metric_utils
importlib.reload(metric_utils)

# Data Retrieval

Possible user inputs are:

- `tickers`
- `start_date`
- `end_date`

[What are the stock sectors?](https://www.schwab.com/learn/story/what-are-stock-sectors)

| Ticker | Company | Sector |
|:---|:---| :---|
| KO | Coca-Cola | Consumer Staples |
| PEP | Pepsi | Consumer Staples |
| WMT | Walmart | Consumer Staples |
| SBUX | Starbucks | Consumer Discretionary |
| MCD | McDonald's | Consumer Discretionary |
| AAL | American Airlines | Industrials |
| DAL | Delta Airlines | Industrials |
| F | Ford Motors | Consumer Discretionary |
| VZ | Verizon | Communication Services |
| T | AT&T | Communication Services |
| DIS | Disney | Communication Services |
| BAC | Bank of America | Financials |
| JPM | JP Morgan | Financials |
| MA | Mastercard | Financials |
| V | Visa | Financials |
| ORCL | Oracle | Information Technology |
| AMD | AMD | Information Technology |
| NVDA | Nvidia | Information Technology |
| AAPL | Apple | Information Technology |
| MSFT | Microsoft | Information Technology |

In [2]:
# User-configurable parameters: the ticker universe and the download window.
TICKERS = [
    # consumer staples
    'KO', 'PEP', 'WMT',
    # consumer discretionary
    'SBUX', 'MCD',
    # industrials (airlines), followed by Ford (an automaker)
    'AAL', 'DAL', 'F',
    # communication services
    'VZ', 'T', 'DIS',
    # financials
    'BAC', 'JPM', 'MA', 'V',
    # information technology
    'ORCL', 'AMD', 'NVDA', 'AAPL', 'MSFT',
]
START_DATE, END_DATE = "2000-01-01", "2024-07-31"

Documentation: [yfinance](https://github.com/ranaroussi/yfinance/wiki/Tickers#parameters)

In [None]:
# One request for the daily bars of every ticker; group_by='ticker' asks for the
# columns to be grouped per ticker.
df = yf.download(
    tickers=TICKERS,
    interval="1d",
    start=START_DATE,
    end=END_DATE,
    auto_adjust=True,
    group_by='ticker',
)
df.head()  # peek at the raw (wide) layout

In [None]:
# Reshape from wide (one column block per ticker) to long (one row per
# date/ticker pair) with lower-case names, which is the layout written to the store.
long_df = df.stack(level='Ticker')
long_df.columns = [name.lower() for name in long_df.columns]
long_df.index.names = [name.lower() for name in long_df.index.names]
df = long_df.reset_index()
df.head()

In [None]:
# Download the benchmark (SPY) data for reference later and cache it as a CSV.
spy = yf.download(tickers='SPY', interval="1d", start=START_DATE, end=END_DATE, auto_adjust=True)

# Some yfinance releases return two-level (price field, ticker) columns even for a
# single ticker; flatten them so the lower-casing below sees plain strings.
if isinstance(spy.columns, pd.MultiIndex):
    ticker_level = 'Ticker' if 'Ticker' in spy.columns.names else -1
    spy.columns = spy.columns.droplevel(ticker_level)

spy = spy.reset_index()
spy.columns = [c.lower() for c in spy.columns]

# Only overwrite the stored benchmark when the download is complete; otherwise say
# so explicitly instead of silently leaving an older file in place.
null_counts = spy.isnull().sum()
if not null_counts.any():
    spy.to_csv('../app/static/data/spy.csv', index=False)
    print("Saved to data/spy.csv")
else:
    print("Benchmark NOT saved - missing values found in:")
    print(null_counts[null_counts > 0])

In [None]:
# Reload the benchmark from the store, indexed by its parsed 'date' column.
spy = pd.read_csv('../app/static/data/spy.csv', parse_dates=['date']).set_index('date')
spy

In [None]:
# Performance statistics of buying and holding the benchmark (SPY, used here as the
# S&P 500 reference) over the whole START_DATE..END_DATE window.
metric_utils.benchmark_performance(spy, START_DATE, END_DATE)

# Data Cleaning

Check for any missing values before storing for further processing.

In [None]:
# column dtypes and non-null counts of the long-format frame
df.info()

In [None]:
# number of missing values per column
df.isnull().sum()

No missing values to fill. Market holidays have been automatically excluded.

# Data Store

In [10]:
# The store is a plain CSV file whose first row holds the column names.
# index=False: the row index is not written, only the columns.
df.to_csv('../app/static/data/ohlcv.csv', index=False)

# Data Processing

Construct the indicators' values for each stock.

In [None]:
# Read the stored OHLCV table back and index it by (date, ticker) for processing.
df = (
    pd.read_csv('../app/static/data/ohlcv.csv', parse_dates=['date'])
    .set_index(['date', 'ticker'])
)
df.head()

Documentation: [pandas-ta](https://twopirllc.github.io/pandas-ta/)

In [12]:
def contruct_indicators(group):
    """Add the technical-indicator columns to one ticker's OHLCV frame.

    Intended to be used with ``groupby('ticker').apply(...)``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single ticker with at least the 'high', 'low', 'close' and
        'volume' columns (presumably in chronological order - the rolling
        indicators are only meaningful if so).

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with these columns appended, in this order:
        ``sma_{5,10,21,50,100,200}``, ``ema_{5,10,21,50,100,200}``,
        ``norm_volume_{3,5,10,21,50}`` (volume divided by its rolling median),
        ``rsi_{7,9,10,14}``, ``adx_N`` / ``+DI_N`` / ``-DI_N`` for N in
        {3, 5, 7, 14}, and the Bollinger columns ``bb_5_lb``, ``bb_5_mb``,
        ``bb_5_ub``, ``bb_5_bw``, ``bb_5_p``.
    """
    # Work on a copy: pandas does not support functions that mutate the object
    # handed to them by groupby().apply().
    group = group.copy()
    close = group['close']
    volume = group['volume']

    # indicator: Simple Moving Averages
    for length in (5, 10, 21, 50, 100, 200):
        group[f'sma_{length}'] = ta.sma(close, length=length)

    # indicator: Exponential Moving Averages
    for length in (5, 10, 21, 50, 100, 200):
        group[f'ema_{length}'] = ta.ema(close, length=length)

    # indicator: normalized volume (today's volume relative to its rolling median)
    for window in (3, 5, 10, 21, 50):
        group[f'norm_volume_{window}'] = volume / volume.rolling(window).median()

    # indicator: Relative Strength Index
    for length in (7, 9, 10, 14):
        group[f'rsi_{length}'] = ta.rsi(close, length=length)

    # indicator: Average Directional Index, with its +DI / -DI components
    for length in (3, 5, 7, 14):
        adx_result = ta.adx(group['high'], group['low'], close, length=length)
        group[f'adx_{length}'] = adx_result[f'ADX_{length}']
        group[f'+DI_{length}'] = adx_result[f'DMP_{length}']  # +DI
        group[f'-DI_{length}'] = adx_result[f'DMN_{length}']  # -DI

    # indicator: Bollinger Bands. length/std are passed explicitly so they always
    # agree with the '_5_2.0' suffix of the columns read below.
    bband_result = ta.bbands(close, length=5, std=2.0)
    group['bb_5_lb'] = bband_result['BBL_5_2.0']
    group['bb_5_mb'] = bband_result['BBM_5_2.0']
    group['bb_5_ub'] = bband_result['BBU_5_2.0']
    group['bb_5_bw'] = bband_result['BBB_5_2.0']
    group['bb_5_p'] = bband_result['BBP_5_2.0']

    return group

def log_returns(group, periods=(1,)):
    """Add log-return columns computed from the 'close' column.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single ticker with a 'close' column.
    periods : iterable of int, default (1,)
        Horizons, in rows, for which a return column is wanted. The daily
        'log_return' column is always produced; every other period ``p`` adds a
        ``'{p}d_log_return'`` column holding ``log(close) - log(close.shift(p))``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with the return column(s) appended. The input frame
        is left untouched, since pandas does not support mutating the object
        handed to a function by groupby().apply().
    """
    out = group.copy()
    log_close = np.log(out['close'])

    # daily log return (always present, whatever `periods` contains)
    out['log_return'] = log_close - log_close.shift(1)

    # optional multi-period log returns
    for p in periods:
        if p == 1:
            continue  # already covered by 'log_return'
        out[f'{p}d_log_return'] = log_close - log_close.shift(p)
    return out

In [None]:
def _apply_per_ticker(frame, func):
    """Run `func` on each ticker's rows and stitch the results back together."""
    return frame.groupby('ticker', group_keys=False).apply(func)

# returns first, then the indicators on top of the frame that already has them
df_returns = _apply_per_ticker(df, log_returns)
df_indicators = _apply_per_ticker(df_returns, contruct_indicators)
df_indicators

# Data Analysis and Visualization

In this section, we attempt to identify possible patterns from the technical indicators e.g. Moving Average Crossover. \
For it to be meaningful, we analyse the signals generated by the indicators with respect to future returns over some period.

Possible WebApp Framework for Interactive Visualisation: [Dash](https://dash.plotly.com/?_gl=1*ta35r5*_gcl_au*Mjk1MjQ3NTI2LjE3MjY5MjA3ODU.*_ga*Nzk0MzAyNTg4LjE3MjY5MjA3ODY.*_ga_6G7EE0JNSC*MTcyNjkyMDc4NS4xLjAuMTcyNjkyMDc4NS42MC4wLjA)

Difference between Dash and Plotly: [here](https://stackoverflow.com/questions/53146357/whats-the-difference-between-dash-and-plotly)

In [None]:
# Ticker to analyse - a candidate for a dropdown in the web app.
TICKER = 'KO'
# cross-section of the (date, ticker) index: keeps this ticker's rows, indexed by date
data = df_indicators.xs(TICKER, level='ticker')
data

### Strategy 1: EMA Crossover

EMA(21) crossing over the EMA(50) is generally seen as a bullish signal.

This strategy simply takes a long position when it occurs and holds it for K number of days (profiting/losing the K-day returns).

In [15]:
# price columns carried into every strategy frame
DEFAULT_COLUMNS = 'open high low close'.split()

In [16]:
# Columns this strategy needs, restricted to rows where all of them are available.
strategy_columns = DEFAULT_COLUMNS + ['rsi_14', 'log_return','ema_21', 'ema_50']
strategy = data[strategy_columns].dropna().copy()
# Signal: 1 on every day the fast EMA sits above the slow EMA, 0 otherwise.
strategy['signal'] = strategy['ema_21'].gt(strategy['ema_50']).astype(np.int32)

In [17]:
# N-day log return = sum of the last N daily log returns, for a few horizons.
for horizon in (2, 3, 4, 5, 7, 10):
    strategy[f'{horizon}d_log_return'] = strategy['log_return'].rolling(horizon).sum()
# the first rows lack a full window for the longest horizon - drop them
strategy = strategy.dropna()

In [18]:
K = 7  # holding period in trading days; any positive integer works
# Forward K-day log return: the sum of the K daily log returns that FOLLOW each row.
# It is derived from 'log_return' directly (as Strategies 2 and 3 do) rather than
# from the '{K}d_log_return' columns, which only exist for K in {2, 3, 4, 5, 7, 10}.
strategy['target'] = strategy['log_return'].rolling(K).sum().shift(-K)
strategy.dropna(inplace=True)  # the last K rows have no complete forward window
# Long-only: earn the forward return on days the signal is on, nothing otherwise.
strategy['log_returns'] = strategy['signal'] * strategy['target']
strategy['returns'] = np.exp(strategy['log_returns']) - 1
# NOTE(review): each row's 'returns' spans K days, so consecutive rows overlap.
# Compounding them row by row (the cumprod in the plotting cell) counts each daily
# move up to K times - confirm this is the intended way to read the curve.

In [None]:
# Performance statistics over the evaluation window: buy-and-hold first, then the strategy.
start_date, end_date = '2010-01-01', '2022-12-31'
metric_utils.benchmark_performance(data, start_date, end_date)
strategy_window = strategy.loc[start_date:end_date]
metric_utils.strategy_peformance(strategy_window)

In [None]:
# inspect the strategy frame (signal, target and per-row returns)
strategy

In [None]:
# Price chart over the evaluation window, requesting the EMA overlay and signal markers.
start_date = '2010-01-01'
end_date = '2022-12-31'
metric_utils.visualise_pricechart(strategy, start=start_date, end=end_date, indicators=['EMA'], signal_marker=True)

In [None]:
# Growth of one unit of capital: the strategy versus simply holding the stock.
window = strategy.loc[start_date:end_date]
strategy_growth = (window['returns'] + 1).cumprod()
buy_and_hold_growth = np.exp(window['log_return'].cumsum())

ax = strategy_growth.plot(kind='line', label='EMA Crossover', title='Strategy Performances', ylabel='Total Return (multiples)', figsize=(10,6))
buy_and_hold_growth.plot(kind='line', label='Buy and Hold', grid=True, ax=ax)
ax.xaxis.set_major_locator(mdates.YearLocator())  # one tick per year
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))  # label ticks with the year only
plt.legend(loc='upper left');

### Strategy 2: EMA Crossover + RSI

Additionally, an RSI value below 30 (above 70) is generally seen as oversold (overbought).

A move from below 30 to above it (from above 70 to below it) is seen as a bullish (bearish) indicator. We try to overlay these two indicators to improve performance.

In [25]:
# Required indicators. .copy() makes `strategy` an independent frame (as in
# Strategy 1), so adding columns below does not raise SettingWithCopyWarning.
strategy = data[DEFAULT_COLUMNS + 
    [
        'volume', 'log_return',
        'ema_5', 'ema_10','ema_21', 'ema_50',
        'rsi_7', 'rsi_9', 'rsi_10', 'rsi_14',
    ]
].dropna().copy()

K = 7 # using the same K=7 as before
RSI_K = 14

# Strategy idea:
# adding more indicators as filters should give fewer trades and perhaps less
# volatility than the previous strategy.
# Signal is on when EMA(21) is above EMA(50) AND the RSI is below 50.
strategy['signal'] = (
    (strategy['ema_21'] > strategy['ema_50']) & 
    (strategy[f'rsi_{RSI_K}'] < 50)
).astype(np.int32)

# forward K-day log return: sum of the K daily log returns that follow each row
strategy['target'] = strategy['log_return'].rolling(K).sum().shift(-K)
strategy.dropna(inplace=True)  # the last K rows have no complete forward window
strategy['log_returns'] = (strategy['signal'] * strategy['target'])
strategy['returns'] = np.exp(strategy.log_returns) - 1

In [None]:
# inspect the strategy frame (signal, target and per-row returns)
strategy

In [None]:
# Performance statistics over the evaluation window: buy-and-hold first, then the strategy.
start_date, end_date = '2010-01-01', '2022-12-31'
metric_utils.benchmark_performance(data, start_date, end_date)
strategy_window = strategy.loc[start_date:end_date]
metric_utils.strategy_peformance(strategy_window)

In [None]:
# Price chart for the window held in start_date / end_date, requesting the EMA
# overlay and signal markers.

metric_utils.visualise_pricechart(strategy, start=start_date, end=end_date, indicators=['EMA'], signal_marker=True)

<!-- Notice that the volatility and maximum drawdown are much lower now, which is desirable.

But as this signal occurs rarely, the number of trades decreases a lot as well. 

Nonetheless, this strategy can be a great addition to an arsenal of strategies. -->

In [None]:
# Growth of one unit of capital: the strategy versus simply holding the stock.
window = strategy.loc[start_date:end_date]
strategy_growth = (window['returns'] + 1).cumprod()
buy_and_hold_growth = np.exp(window['log_return'].cumsum())

ax = strategy_growth.plot(kind='line', label='EMA Crossover + RSI', title='Strategy Performances', ylabel='Total Return (multiples)', figsize=(10,6))
buy_and_hold_growth.plot(kind='line', label='Buy and Hold', grid=True, ax=ax)
ax.xaxis.set_major_locator(mdates.YearLocator())  # one tick per year
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))  # label ticks with the year only
plt.legend(loc='upper left');

### Strategy 3: RSI + ADX

In [32]:
# Required indicators. .copy() makes `strategy` an independent frame (as in
# Strategy 1), so adding columns below does not raise SettingWithCopyWarning.
strategy = data[DEFAULT_COLUMNS + 
    [
        'volume', 'log_return',
        'rsi_7', 'rsi_9', 'rsi_10', 'rsi_14',
        'adx_3', 'adx_5', 'adx_7', 'adx_14',
    ]
].dropna().copy()

K = 7 # using the same K=7 as before
RSI_K = 14
ADX_K = 14

# Strategy idea:
# combining indicators as filters should give fewer trades and perhaps less volatility.
# Signal is on when the RSI is below 45 AND the ADX is above 30.
strategy['signal'] = (
    (strategy[f'rsi_{RSI_K}'] < 45) &
    (strategy[f'adx_{ADX_K}'] > 30)
).astype(np.int32)

# forward K-day log return: sum of the K daily log returns that follow each row
strategy['target'] = strategy['log_return'].rolling(K).sum().shift(-K)
strategy.dropna(inplace=True)  # the last K rows have no complete forward window
strategy['log_returns'] = (strategy['signal'] * strategy['target'])
strategy['returns'] = np.exp(strategy.log_returns) - 1

In [None]:
# Performance statistics over the evaluation window: buy-and-hold first, then the strategy.
start_date, end_date = '2010-01-01', '2022-12-31'
metric_utils.benchmark_performance(data, start_date, end_date)
strategy_window = strategy.loc[start_date:end_date]
metric_utils.strategy_peformance(strategy_window)

In [None]:
# Price chart for the window held in start_date / end_date; no indicator overlay is
# requested here, only the signal markers.
metric_utils.visualise_pricechart(strategy, start=start_date, end=end_date, indicators=[], signal_marker=True)

In [None]:
# Visualize the performance of the strategy against buy-and-hold.
# The legend label names this section's strategy (RSI + ADX); it previously carried
# the 'EMA Crossover' label copied over from Strategy 1.
ax = (strategy.loc[start_date:end_date].returns + 1).cumprod().plot(kind='line', label='RSI + ADX', title='Strategy Performances', ylabel='Total Return (multiples)', figsize=(10,6))
(np.exp(strategy.loc[start_date:end_date].log_return.cumsum())).plot(kind='line', label='Buy and Hold', grid=True, ax=ax)
ax.xaxis.set_major_locator(mdates.YearLocator())  # set ticks for each year
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y')) # format of the year label
plt.legend(loc='upper left');

### Strategy 4: Machine Learning + Technical Indicators

Improvement of Strategy 1. The signal EMA(21) > EMA(50) will be the condition for the model to be applied as a filter.
The model learns the validity of the signal from other technical indicators and features such as normalised volumes, Bollinger Bands, the distance of the price from the Bollinger Band bounds, etc.

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Create the features (independent variables) and the target (dependent variable).
# .copy() makes `strategy` an independent frame (as in Strategy 1), so adding the
# feature columns below does not raise SettingWithCopyWarning.
strategy = data[DEFAULT_COLUMNS +
    [ 
        'log_return',
        'ema_5', 'ema_10', 'ema_21','ema_50',
        'rsi_14',
        'adx_14',
        'norm_volume_5','norm_volume_10','norm_volume_21',
        'bb_5_lb', 'bb_5_ub', 'bb_5_mb', 'bb_5_bw', 'bb_5_p'
    ]
].dropna().copy()

# distances of the close from the Bollinger bounds and between the moving averages
strategy['x1'] = strategy['bb_5_ub'] - strategy['close']
strategy['x2'] = strategy['close'] - strategy['bb_5_mb']
strategy['x3'] = strategy['close'] - strategy['bb_5_lb'] 
strategy['x4'] = strategy['close'] - strategy['ema_10']
strategy['x5'] = strategy['ema_10'] - strategy['ema_21']
strategy['x6'] = strategy['ema_21'] - strategy['ema_50']

# Create the target variable to predict. The default strategy is not 100% accurate,
# so the model learns from the other indicators which of its signals to keep.
K = 7 # using the same K=7 as before
RSI_K = 14

strategy['signal'] = (
    strategy['ema_21'] > strategy['ema_50']
).astype(np.int32)
# forward K-day simple return of the long position (0 when the signal is off)
strategy['returns'] = np.exp(strategy['signal'] * strategy['log_return'].rolling(K).sum().shift(-K)) - 1
strategy.dropna(inplace=True)  # the last K rows have no complete forward window

# the model is only trained and evaluated on days where the base signal is on
dataset = strategy[strategy.signal == 1].copy()
dataset['target'] = (dataset.returns > 0).astype(np.int32)  # 1 = the trade made money

# train-test split
TRAIN_END = '2022-12-31' # define last period of training date
TEST_PERIOD_WEEKS = 52
# NOTE(review): TEST_START is TRAIN_END plus TEST_PERIOD_WEEKS, so the 52 weeks after
# TRAIN_END end up in neither the training nor the test set - confirm the gap is intended.
TEST_START = str((pd.to_datetime(TRAIN_END) + pd.Timedelta(value=TEST_PERIOD_WEEKS, unit='W')).date())
train = dataset.loc[:TRAIN_END]
test = dataset.loc[TEST_START:]

# train the model 
FEATURES = ['rsi_14', 'adx_14', 'bb_5_bw', 'bb_5_p','norm_volume_5', 'norm_volume_10','x1', 'x2', 'x3', 'x4', 'x5', 'x6']
train_X, train_y = train[FEATURES], train['target']
RANDOM_STATE = 42  # fixed seed so a Restart & Run All reproduces the same forest and metrics
model = RandomForestClassifier(random_state=RANDOM_STATE) # instantiate the model 
model.fit(train_X, train_y) # this api call trains the model

In [None]:
# Score the classifier on the held-out test rows.
test_X = test[FEATURES]
test_y = test['target']
y_pred = model.predict(test_X)

acc = accuracy_score(test_y, y_pred)
f1 = f1_score(test_y, y_pred)  # computed for reference; its print is disabled below

print(f"Model Accuracy: {acc*100:.2f}%")
# print(f"Model F1-Score: {f1:.2f}")

In [None]:
# Baseline: every raw EMA signal in the test period, without the model filter.
signal_is_on = strategy['signal'] == 1
out_of_sample_without_model = strategy[signal_is_on].loc[TEST_START:]
metric_utils.strategy_peformance(out_of_sample_without_model)

In [None]:
# Cumulative growth of the unfiltered strategy over the test period.
baseline_growth = (out_of_sample_without_model['returns'] + 1).cumprod()
baseline_growth.plot(kind='line', grid=True, title='Strategy Performance', figsize=(10,6));

In [None]:
# Strategy performance using the model.
# Notice the large decrease in maximum drawdown: the model was able to filter out
# drastic false signals, and the winning rate has also improved a lot.
candidates = strategy[strategy.signal == 1].loc[TEST_START:].copy()
# replace the raw signal by the model's verdict, then keep only the accepted trades
candidates['signal'] = y_pred
out_of_sample_with_model = candidates[candidates.signal == 1]
metric_utils.strategy_peformance(out_of_sample_with_model)

In [86]:
# utils.benchmark_performance(spy, '2024-01-01', '2024-07-20') # and we kind of beat the index as well

In [49]:
# Rebuild the signal column so that only the trades accepted by the model are marked.
# This overwrites the raw EMA signal held in `strategy`: re-run the feature/training
# cell before re-running any cell that filters on strategy.signal == 1.
strategy['signal'] = 0
# one vectorised assignment instead of a row-by-row .at loop
strategy.loc[out_of_sample_with_model.index, 'signal'] = 1

In [None]:
# Price chart restricted to the span between the first and last row of `test`, with
# signal markers and no indicator overlay.
metric_utils.visualise_pricechart(strategy, start=str(test.index[0].date()), end=str(test.index[-1].date()), indicators=[], signal_marker=True)

In [None]:
# Cumulative growth of the model-filtered strategy - notice the fewer sharp drops
# throughout the period.
filtered_growth = (out_of_sample_with_model['returns'] + 1).cumprod()
filtered_growth.plot(kind='line', grid=True, title='Strategy Performance', figsize=(10,6));

In [None]:
# Side-by-side growth curves: raw EMA signals versus the model-filtered ones.
baseline_growth = (out_of_sample_without_model['returns'] + 1).cumprod()
filtered_growth = (out_of_sample_with_model['returns'] + 1).cumprod()

ax = baseline_growth.plot(kind='line', label='Strategy 1: EMA Crossover', title='Strategy Performances', ylabel='Total Return (multiples)', figsize=(10,6))
filtered_growth.plot(kind='line', label='Strategy 4: Strategy 1 + ML Filter', grid=True, ax=ax)
# yearly tick locator/formatter left disabled for this short test window:
# ax.xaxis.set_major_locator(mdates.YearLocator())  # set ticks for each year
# ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y')) # format of the year label
plt.legend(loc='upper left');

### Simple Price Chart Visualisation

In [175]:
# # TO BE REFINED LATER ON IF NEED BE:

# # Create subplots: 2 rows, 1 column with shared x-axis
# fig = make_subplots(
#     rows=2, cols=1,
#     shared_xaxes=True,
#     vertical_spacing=0.1,
#     row_heights=[0.7, 0.3],  # Relative heights of the subplots
#     subplot_titles=[f'Candlestick with SMA for {TICKER}', 'RSI and ADX']
# )

# # Add candlestick trace to the first row
# fig.add_trace(
#     go.Candlestick(
#         x=data.index,
#         open=data['open'],
#         high=data['high'],
#         low=data['low'],
#         close=data['close'],
#         name='Candlestick'
#     ),
#     row=1, col=1
# )

# # Add moving average trace to the first row
# fig.add_trace(
#     go.Scatter(
#         x=data.index,
#         y=data['sma_10'],
#         mode='lines',
#         name='SMA(10)',
#         line=dict(color='green', width=1)
#     ),
#     row=1, col=1
# )
# fig.add_trace(
#     go.Scatter(
#         x=data.index,
#         y=data['sma_21'],
#         mode='lines',
#         name='SMA(21)',
#         line=dict(color='blue', width=1)
#     ),
#     row=1, col=1
# )

# # Add RSI trace to the second row
# fig.add_trace(
#     go.Scatter(
#         x=data.index,
#         y=data['rsi_7'],
#         mode='lines',
#         name='RSI 7',
#         line=dict(color='purple', width=2)
#     ),
#     row=2, col=1
# )

# # # Add ADX trace to the second row
# fig.add_trace(
#     go.Scatter(
#         x=data.index,
#         y=data['adx_14'],
#         mode='lines',
#         name='ADX 14',
#         line=dict(color='green', width=2)
#     ),
#     row=2, col=1
# )

# # Update layout for the subplots
# fig.update_layout(
#     title=f'OHLC with SMA and Indicators for {TICKER}',
#     xaxis2_title='Date',  # Title for the second subplot's x-axis
#     yaxis=dict(title='OHLC'),
#     width=1200,
#     height=800,
#     yaxis2=dict(title='RSI & ADX (0-100)', range=[0, 100]),  # Set the y-axis range for RSI and ADX
#     xaxis_rangeslider_visible=False,  # Hide range slider
#     legend=dict(x=0.01, y=0.99)
# )
# fig.show()