# Evaluation of Classification Models (LR, KNN, SVM, DT, RF)

## Import libraries

In [158]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import yfinance as yf
import datetime
import time
import random

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings('ignore')

## Gathering and Manipulating Stock Data

In [51]:
start_date = datetime.datetime(2011,1,3)
end_date = datetime.datetime(2021,11,3)

# Stocks that were left over after VaR filtering
list_of_tickers = ['A','AAPL','ABC','ABT','ADBE','ADI','ADM','ADP','ADSK','AEE','AEP','AES','AKAM','AMAT','AMD','AMGN','AMZN','ANTM','APA','APD','APH','AVY','BA','BAX','BBWI','BBY','BDX','BF-B','BIIB','BKNG','BLL','BMY','BSX','CAG','CAH','CAT','CCL','CF','CHRW','CI','CL','CLX','CMI','CMS','CNP','COP','COST','CPB','CRM','CSCO','CTAS','CTRA','CTSH','CTXS','CVS','CVX','D','DE','DGX','DHR','DIS','DOV','DRI','DTE','DUK','DVA','DVN','EA','EBAY','ECL','ED','EFX','EIX','EL','EMN','EMR','EOG','ES','ETN','ETR','EXC','EXPD','EXPE','F','FAST','FCX','FDX','FE','FIS','FISV','GD','GE','GILD','GIS','GLW','GPC','GWW','HAL','HAS','HD','HES','HPQ','HRL','HSY','HUM','IBM','IFF','INTC','INTU','IP','IPG','IRM','ISRG','ITW','J','JNJ','JNPR','K','KLAC','KMB','KO','KR','LDOS','LEN','LH','LHX','LLY','LMT','LUMN','MAR','MAS','MCD','MCHP','MCK','MDLZ','MDT','MKC','MMM','MO','MRK','MRO','MSFT','MSI','MU','NEE','NEM','NI','NKE','NLOK','NOC','NTAP','NUE','NVDA','NWL','ORCL','ORLY','OXY','PARA','PAYX','PCAR','PEG','PEP','PFE','PG','PH','PKI','PM','PNW','PPG','PPL','PWR','PXD','QCOM','RL','ROK','ROP','ROST','RSG','RTX','SBUX','SEE','SHW','SJM','SLB','SNA','SO','SPGI','SRE','STZ','SWK','SYK','SYY','T','TAP','TER','TGT','TJX','TMO','TMUS','TPR','TSN','TXN','TXT','UNH','UPS','VFC','VLO','VRSN','VZ','WAT','WBA','WDC','WEC','WHR','WM','WMB','WMT','WY','WYNN','XEL','XOM','XRAY','YUM','ZBH']

all_stocks = pd.DataFrame()

# iterate over each symbol
for ticker in list_of_tickers:
    curr_stock = []
    curr_stock = yf.download(ticker, start=start_date, end=end_date, progress=False)
    curr_stock['Ticker'] = ticker
    all_stocks = all_stocks.append(curr_stock, sort=False)

In [52]:
all_stocks.reset_index(inplace=True)

In [53]:
all_stocks

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Ticker
0,2011-01-03,29.728184,30.143063,29.620888,29.957083,27.047125,4994075.0,A
1,2011-01-04,30.035765,30.114449,29.456366,29.678112,26.795246,5017282.0,A
2,2011-01-05,29.513592,29.849785,29.327612,29.613733,26.737131,4519035.0,A
3,2011-01-06,29.592276,29.928469,29.477825,29.670959,26.788790,4699097.0,A
4,2011-01-07,29.699572,29.899857,29.356222,29.771103,26.879206,3810948.0,A
...,...,...,...,...,...,...,...,...
611076,2021-10-27,145.611649,145.611649,142.009705,142.048538,141.500427,678564.0,ZBH
611077,2021-10-28,141.834946,142.718445,140.708740,141.281555,140.736404,952029.0,ZBH
611078,2021-10-29,138.844666,139.310684,135.184464,138.951462,138.415298,1681063.0,ZBH
611079,2021-11-01,139.359222,142.893204,138.961166,141.543686,140.997528,1176672.0,ZBH


In [5]:
# Restructure the dataframe
all_stocks = all_stocks[['Ticker','Date','Close','High','Low','Open','Volume']]

In [57]:
# Apply exponential smoothing to each stock, use 21 days (periods) as the predictions will be 21 day predictions
smoothed_values = all_stocks.groupby(['Ticker'])[['Close','Low','High','Open','Volume']].transform(lambda x: x.ewm(span = 21).mean())

# Concatenate the smoothed values to the date column to get a new dataframe with smoothed values
all_stocks = pd.concat([all_stocks[['Ticker','Date']], smoothed_values], axis=1, sort=False)

# Compute the daily change in price
all_stocks['Daily Change'] = all_stocks['Close'].diff()

# As all stocks are in same df, at some point the price difference will be calculated between two different stocks, identify these instances
mask = all_stocks['Ticker'] != all_stocks['Ticker'].shift(1)

# Once these rows are identified, make the daily change of them null
all_stocks['Daily Change'] = np.where(mask == True, np.nan, all_stocks['Daily Change'])

# Show these rows
all_stocks[all_stocks.isna().any(axis = 1)]

Unnamed: 0,Ticker,Date,Close,Low,High,Open,Volume,Daily Change
0,A,2011-01-03,29.957083,29.620888,30.143063,29.728184,4994075.0,
2728,AAPL,2011-01-03,11.770357,11.601429,11.795000,11.630000,445138400.0,
5456,ABC,2011-01-03,34.320000,34.250000,34.880001,34.259998,2607800.0,
8184,ABT,2011-01-03,22.944036,22.920046,23.169542,23.169542,19050229.0,
10912,ADBE,2011-01-03,31.290001,30.790001,31.480000,30.830000,6245500.0,
...,...,...,...,...,...,...,...,...
597441,XEL,2011-01-03,23.559999,23.500000,23.680000,23.670000,2199800.0,
600169,XOM,2011-01-03,74.550003,73.639999,74.790001,73.720001,23320700.0,
602897,XRAY,2011-01-03,35.099998,34.430000,35.250000,34.459999,1132900.0,
605625,YUM,2011-01-03,35.291157,35.212078,35.593098,35.442127,4734825.0,


## Compute Techincal Indicators (RSI, Stochastic Oscillator, Williams R, MACD, PROC)

In [58]:
# -----------------------------------------------------------------
# COMPUTE THE 14-DAY RSI (RELATIVE STRENGTH INDEX) 
# -----------------------------------------------------------------
    
# Duplicate the Daily Change column and assign the copy to two dataframes which will hold the positive and negative daily changes
positive_day_change, negative_day_change = all_stocks[['Ticker', 'Daily Change']].copy(), all_stocks[['Ticker', 'Daily Change']].copy()
    
# On positive days, if daily change <  0 then assign a value of 0.
positive_day_change.loc['Daily Change'] = positive_day_change.loc[(positive_day_change['Daily Change'] < 0), 'Daily Change'] = 0
    
# On negative days, if daily change >  0 then assign a value of 0.
negative_day_change.loc['Daily Change'] = negative_day_change.loc[(negative_day_change['Daily Change'] > 0), 'Daily Change'] = 0
    
# When calculating RSI, all daily changes have to be non-negative
negative_day_change['Daily Change'] = negative_day_change['Daily Change'].abs()
    
# Compute the 14 Day Exponential Weighted Moving Average
positive_ewma = positive_day_change.groupby('Ticker')['Daily Change'].transform(lambda x: x.ewm(span = 14).mean())
negative_ewma = negative_day_change.groupby('Ticker')['Daily Change'].transform(lambda x: x.ewm(span = 14).mean())
    
# Compute the RS (relative strength)
rs = positive_ewma / negative_ewma
    
# Calculate the RSI (Relative Strength Index)
rsi = 100.0 - (100.0 / (1.0 + rs))
    
# Append new positive and negative day columns to the df
all_stocks['positive_days'] = positive_day_change['Daily Change']
all_stocks['negative_days'] = negative_day_change['Daily Change']
    
# Append RSI column to the dataframe
all_stocks['RSI'] = rsi

In [59]:
# -----------------------------------------------------------------
# COMPUTE THE STOCHASTIC OSCILLATOR (%K) WITH A 14 DAY PERIOD
# -----------------------------------------------------------------
    
# Duplicate the Low and High columns and assign them to two dataframes 
high_14_day, low_14_day = all_stocks[['Ticker','High']].copy(), all_stocks[['Ticker','Low']].copy()
    
# Get the max and min of each of the 14 day periods
high_14_day = high_14_day.groupby('Ticker')['High'].transform(lambda x: x.rolling(window = 14).max())
low_14_day = low_14_day.groupby('Ticker')['Low'].transform(lambda x: x.rolling(window = 14).min())
    
# Compute %K
stch_osc_pct_k = 100 * ((all_stocks['Close'] - low_14_day) / (high_14_day - low_14_day))
    
# Append High and Low columns to the df
all_stocks['high_14_day'] = high_14_day
all_stocks['low_14_day'] = low_14_day
    
# Append %K column to the dataframe
all_stocks['stch_osc_pct_k'] = stch_osc_pct_k

In [60]:
# -----------------------------------------------------------------
# COMPUTE WILLIAMS %R
# -----------------------------------------------------------------
    
# Duplicate the Low and High columns and assign them to two dataframes 
high_14_day, low_14_day = all_stocks[['Ticker', 'High']].copy(), all_stocks[['Ticker','Low']].copy()
    
# Get the max and min of each of the 14 day periods
high_14_day = high_14_day.groupby('Ticker')['High'].transform(lambda x: x.rolling(window = 14).max())
low_14_day = low_14_day.groupby('Ticker')['Low'].transform(lambda x: x.rolling(window = 14).min())
    
# Calculate William %R indicator.
williams_pct_r = ((high_14_day - all_stocks['Close']) / (high_14_day - low_14_day)) * - 100

# Append Williams %R to the dataframe
all_stocks['williams_pct_r'] = williams_pct_r

In [61]:
# -----------------------------------------------------------------
# COMPUTE MACD (Moving Average Covergence Divergence)
# -----------------------------------------------------------------
    
# Calculate 26 day ema
ema_26 = all_stocks.groupby('Ticker')['Close'].transform(lambda x: x.ewm(span = 26).mean())
    
# Calculate 12 day ema
ema_12 = all_stocks.groupby('Ticker')['Close'].transform(lambda x: x.ewm(span = 12).mean())
    
# Compute MACD
macd = ema_12 - ema_26
    
# Apply 9 day ema to MACD value to get the signal, which is used to indicate if the price is going up or down
signal = macd.ewm(span = 9).mean()
    
# Append MACD and signal to dataframe
all_stocks['MACD'] = macd
all_stocks['signal'] = signal


In [62]:
# -----------------------------------------------------------------
# COMPUTE PROC (Price Rate of Change)
# -----------------------------------------------------------------
    
# Compute 21 day PROC (Price Rate of Change)
all_stocks['price_rate_of_change'] = all_stocks.groupby('Ticker')['Close'].transform(lambda x: x.pct_change(periods = 21))

In [63]:
# Extract the 'Close' column from the dataframe.
target_column = all_stocks.groupby('Ticker')['Close']
    
# Determine the direction of stock price after 21 days for each row (+1 if positive, and 0 if negative)
target_column = target_column.transform(lambda x: x <= x.shift(-21))
    
# Convert boolean values and create 'Target' column and append to the dataframe
all_stocks['Target'] = target_column * 1
    
# Drop rows with null values
all_stocks = all_stocks.dropna()

## Data Preparation

In [146]:
# Create pandas dataframe
df_samples = pd.DataFrame()

# Randomly select stocks
rndm_stocks = random.choices(list_of_tickers, k=5)

# Create new dataset using these stocks
df_samples = all_stocks.loc[all_stocks['Ticker'] == rndm_stocks[0]]
for i in range(1, 4):
    df_samples = df_samples.append(all_stocks.loc[all_stocks['Ticker'] == rndm_stocks[i]])

In [147]:
df_samples

Unnamed: 0,Ticker,Date,Close,Low,High,Open,Volume,Daily Change,positive_days,negative_days,RSI,high_14_day,low_14_day,stch_osc_pct_k,williams_pct_r,MACD,signal,price_rate_of_change,Target
482885,RSG,2011-02-02,30.403325,30.116687,30.607894,30.334213,2.382929e+06,0.011178,0.011178,0.000000,88.161735,30.607894,29.363189,83.564843,-16.435157,0.123159,0.060968,0.011758,0
482886,RSG,2011-02-03,30.397867,30.118049,30.595829,30.345039,2.388104e+06,-0.005457,0.000000,0.005457,86.838216,30.607894,29.363189,83.126419,-16.873581,0.128082,0.074391,0.014051,0
482887,RSG,2011-02-04,30.367729,30.101046,30.594227,30.344530,2.395136e+06,-0.030139,0.000000,0.030139,79.256045,30.607894,29.408931,79.968930,-20.031070,0.128322,0.085177,0.017720,0
482888,RSG,2011-02-07,30.385984,30.114963,30.599813,30.341071,2.364305e+06,0.018255,0.018255,0.000000,80.449080,30.607894,29.493129,80.093542,-19.906458,0.128268,0.093795,0.018812,0
482889,RSG,2011-02-08,30.394321,30.137294,30.605786,30.368751,2.366458e+06,0.008337,0.008337,0.000000,81.024199,30.607894,29.577196,79.278818,-20.721182,0.127309,0.100498,0.020691,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313720,J,2021-10-27,135.333616,134.221396,136.603837,135.488732,4.942449e+05,0.066638,0.066638,0.000000,90.067510,136.603837,132.279768,70.624393,-29.375607,0.284481,0.099154,0.010648,0
313721,J,2021-10-28,135.692378,134.441269,137.063489,135.593393,4.936863e+05,0.358762,0.358762,0.000000,92.416438,137.063489,132.372185,70.773349,-29.226651,0.360394,0.151402,0.011602,0
313722,J,2021-10-29,136.122162,134.851154,137.425899,135.903084,4.901421e+05,0.429784,0.429784,0.000000,94.284713,137.425899,132.372185,74.202386,-25.797614,0.450048,0.211131,0.015912,0
313723,J,2021-11-01,136.539238,135.331958,137.868090,136.397349,4.871019e+05,0.417077,0.417077,0.000000,95.520428,137.868090,132.372185,75.821064,-24.178936,0.548432,0.278591,0.018562,0


In [148]:
# Extract features from dataset and assign to X and extract target from dataset and assign to y
X = df_samples[['RSI','stch_osc_pct_k','williams_pct_r','price_rate_of_change','MACD']]
y = df_samples['Target']

# Split X and y into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify=y)

# Define a StandardScaler object
sc = StandardScaler()

# Preprocess the data before fitting it to ML model
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


## Hyperparameter Tuning and Model Evaluation

In [159]:
classif_models = {}
param_grid = {}
best_hyperparam = {}

classif_models['LR'] = LogisticRegression()
classif_models['KNN'] = KNeighborsClassifier()
classif_models['SVM'] = SVC()
classif_models['DT'] = DecisionTreeClassifier()
classif_models['RF'] = RandomForestClassifier()

accuracy = {
    'LR': 0,
    'KNN': 0,
    'SVM': 0,
    'DT': 0,
    'RF': 0
}

precision = {
    'LR': 0,
    'KNN': 0,
    'SVM': 0,
    'DT': 0,
    'RF': 0
}

recall = {
    'LR': 0,
    'KNN': 0,
    'SVM': 0,
    'DT': 0,
    'RF': 0
}

f1 = {
    'LR': 0,
    'KNN': 0,
    'SVM': 0,
    'DT': 0,
    'RF': 0
}


In [160]:
# Define the hyperparameters which will be tested for each of the clustering algorithms
param_grid['LR'] = {'penalty': ['l1','l2','none'], 'C': [0.1, 1, 10, 100, 1000], 'solver': ['newton-cg', 'lbfgs','liblinear', 'sag', 'saga']} 
param_grid['RF'] = {'n_estimators': [10,25,50,100,150,250], 'criterion': ['gini','entropy'], 'max_depth': [None,10,25,50,75,100,150]} 
param_grid['KNN'] = {'n_neighbors': [3,5,7,13,15,21], 'weights': ['uniform','distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}
param_grid['SVM'] = {'C': [0.1, 1, 10, 100, 1000], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
param_grid['DT'] = { 'criterion': ['gini','entropy'], 'max_depth': [None,10,25,50,75,100,150]} 


In [165]:
for name in classif_models.keys():
    clf = GridSearchCV(classif_models[name], param_grid[name], cv=5, scoring='accuracy')
    
    # Fit classification model to data
    clf.fit(X_train, y_train)
    
    classif_models[name].fit(X_train, y_train)
    y_pred = classif_models[name].predict(X_test)
    
    # Store best hyperparameters
    best_hyperparam[name] = clf.best_params_
    
    # Use model to make predictions
    y_pred = clf.predict(X_test)
    
    # Compute the accuracy, precision, recall and f1_score
    accuracy[name] = accuracy_score(y_pred, y_test)
    precision[name] = precision_score(y_pred, y_test)
    recall[name] = recall_score(y_pred, y_test)
    f1[name] = f1_score(y_pred, y_test)
    

In [166]:
df_model = pd.DataFrame(index=classif_models.keys(), columns=['Accuracy', 'Precision', 'Recall', 'F1 Score'])
df_model['Accuracy'] = accuracy.values()
df_model['Precision'] = precision.values()
df_model['Recall'] = recall.values()
df_model['F1 Score'] = f1.values()

df_model

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
LR,0.657433,0.757903,0.696669,0.725997
KNN,0.731764,0.801079,0.762849,0.781497
SVM,0.703139,0.795682,0.731915,0.762468
DT,0.707756,0.744796,0.76183,0.753216
RF,0.742382,0.818042,0.767173,0.791791


From the results, we can see that Random Forest is the best classification algorithm. Below the optimal hyperparameters for RF is displayed which will be used in the 'Stock Portfolio Construction using Machine Learning - FULL VERSION' jupyter notebook.

In [167]:
best_hyperparam['RF']

{'criterion': 'gini', 'max_depth': 50, 'n_estimators': 100}