# Decision Tree

## Setting Up

In [147]:
import pandas as pd 
import numpy as np

In [148]:
import sklearn
assert sklearn.__version__ >= "0.20"
import yfinance as yf

np.random.seed(42)

from sklearn.svm import LinearSVC 
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, f1_score, precision_score

from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import RandomOverSampler

In [149]:
metrics_df = pd.DataFrame(columns =['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'Confusion Matrix'])

## Import Data 

In [150]:
X_train = pd.read_excel("../data/model_inputs/X_train.xlsx")
y_train = pd.read_excel("../data/model_inputs/y_train.xlsx")

X_test = pd.read_excel("../data/model_inputs/X_test.xlsx")
y_test = pd.read_excel("../data/model_inputs/y_test.xlsx")

In [151]:
X_train = X_train.rename(columns ={'Unnamed: 0': 'Date'})
X_train = X_train.set_index('Date')

y_train = y_train.rename(columns ={'Unnamed: 0': 'Date'})
y_train = y_train.set_index('Date')

X_test = X_test.rename(columns ={'Unnamed: 0': 'Date'})
X_test = X_test.set_index('Date')

y_test = y_test.rename(columns ={'Unnamed: 0': 'Date'})
y_test = y_test.set_index('Date')

### RANDOM OVERSAMPLING RESAMPLE

In [152]:
def random_oversampler(X_train, y_train):
    oversample = RandomOverSampler(sampling_strategy='not majority')
    X_over, y_over = oversample.fit_resample(X_train, y_train)
    return X_over, y_over

### Random Resample

In [153]:
data_resample = random_oversampler(X_train, y_train)

In [154]:
X_train = data_resample[0]
y_train = data_resample[1]

In [155]:
y_train['decision'].value_counts()

 0    1383
 1    1383
-1    1383
Name: decision, dtype: int64

## Initializing Decision Tree

In [156]:
tree = DecisionTreeClassifier(max_depth=2, random_state=42)

### Fitting Data

In [157]:
tree.fit(X_train, y_train)

### Predicting on train set

In [158]:
model = 'Decision Tree TRAIN'
y_predict = tree.predict(X_train)
cm = confusion_matrix(y_train, y_predict)
accuracy = accuracy_score(y_train, y_predict)
precision = precision_score(y_train, y_predict, average = "weighted")
recall = recall_score(y_train, y_predict, average = "weighted")
f1 = f1_score(y_train, y_predict, average = "weighted")

In [159]:
int_metric = pd.DataFrame([[model, accuracy, precision, recall, f1, cm]], 
                          columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'Confusion Matrix'])

metrics_df = metrics_df.append(int_metric)

In [160]:
print("Confusion_Matrix:\n{}"
      .format(cm))
print("Accuracy_Score: {}\n"
      .format(accuracy))
print("Precision_Score: {}\n"
      .format(precision))
print("Recall Score: {}\n"
      .format(recall))
print("F1 Score: {}\n"
      .format(f1))

Confusion_Matrix:
[[ 876  507    0]
 [  19 1318   46]
 [ 110  688  585]]
Accuracy_Score: 0.6697999517956134

Precision_Score: 0.7744047914363692

Recall Score: 0.6697999517956134

F1 Score: 0.6637310610727949



### Predicting on test set

In [161]:
model = 'Decision Tree TEST'
y_predict = tree.predict(X_test)
cm = confusion_matrix(y_test, y_predict)
accuracy = accuracy_score(y_test, y_predict)
precision = precision_score(y_test, y_predict, average = "weighted")
recall = recall_score(y_test, y_predict, average = "weighted")
f1 = f1_score(y_test, y_predict, average = "weighted")

  _warn_prf(average, modifier, msg_start, len(result))


In [162]:
int_metric = pd.DataFrame([[model, accuracy, precision, recall, f1, cm]], 
                          columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'Confusion Matrix'])

metrics_df = metrics_df.append(int_metric)

In [163]:
print("Confusion_Matrix:\n{}"
      .format(cm))
print("Accuracy_Score: {}\n"
      .format(accuracy))
print("Precision_Score: {}\n"
      .format(precision))
print("Recall Score: {}\n"
      .format(recall))
print("F1 Score: {}\n"
      .format(f1))

Confusion_Matrix:
[[  0  21   0]
 [  0 137   0]
 [  0  10   0]]
Accuracy_Score: 0.8154761904761905

Precision_Score: 0.6650014172335601

Recall Score: 0.8154761904761905

F1 Score: 0.7325917252146761



## RandomizedSearchCV Tuning Decision Tree
### Parameters includes: n_estimators, max_features, max_depth, min_samples_split, min_samples_leaf, bootstrap

In [30]:
criterion = ['gini', 'entropy', 'log_loss']
splitter = ['best', 'random']
max_depth = [int(x) for x in np.linspace(10,110, num = 11)]
min_samples_split = [2,5,10]
min_samples_leaf = [1,2,4]
max_features = ['auto','sqrt', 'log2']

params = {'criterion': criterion, 'splitter': splitter, 
          'max_depth': max_depth, 'min_samples_split': min_samples_split, 
         'min_samples_leaf':min_samples_leaf, 'max_features': max_features}

rs = RandomizedSearchCV(tree, params, random_state = 42, n_iter = 200)
rs.fit(X_train, y_train)



















In [31]:
rs.best_params_

{'splitter': 'best',
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 50,
 'criterion': 'entropy'}

In [32]:
rs.best_score_

0.9549287136483207

## Initialising tuned Decision Tree

In [164]:
tree = DecisionTreeClassifier(splitter = 'best', min_samples_split = 5, 
                              min_samples_leaf = 4, max_features = 'auto', 
                              max_depth = 50, criterion = 'entropy')

### Fitting Data

In [165]:
tree.fit(X_train, y_train)



### Predicting on train set

In [166]:
model = 'Decision Tree TRAIN Tuned'
y_predict = tree.predict(X_train)
cm = confusion_matrix(y_train, y_predict)
accuracy = accuracy_score(y_train, y_predict)
precision = precision_score(y_train, y_predict, average = "weighted")
recall = recall_score(y_train, y_predict, average = "weighted")
f1 = f1_score(y_train, y_predict, average = "weighted")

In [167]:
int_metric = pd.DataFrame([[model, accuracy, precision, recall, f1, cm]], 
                          columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'Confusion Matrix'])

metrics_df = metrics_df.append(int_metric)

In [168]:
print("Confusion_Matrix:\n{}"
      .format(cm))
print("Accuracy_Score: {}\n"
      .format(accuracy))
print("Precision_Score: {}\n"
      .format(precision))
print("Recall Score: {}\n"
      .format(recall))
print("F1 Score: {}\n"
      .format(f1))

Confusion_Matrix:
[[1383    0    0]
 [  12 1358   13]
 [   0    0 1383]]
Accuracy_Score: 0.9939744516751025

Precision_Score: 0.9940285095151533

Recall Score: 0.9939744516751025

F1 Score: 0.9939605483398561



### Predicting on test set

In [169]:
model = 'Decision Tree TEST Tuned'
y_predict = tree.predict(X_test)
cm = confusion_matrix(y_test, y_predict)
accuracy = accuracy_score(y_test, y_predict)
precision = precision_score(y_test, y_predict, average = "weighted")
recall = recall_score(y_test, y_predict, average = "weighted")
f1 = f1_score(y_test, y_predict, average = "weighted")

  _warn_prf(average, modifier, msg_start, len(result))


In [170]:
int_metric = pd.DataFrame([[model, accuracy, precision, recall, f1, cm]], 
                          columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'Confusion Matrix'])

metrics_df = metrics_df.append(int_metric)

In [171]:
print("Confusion_Matrix:\n{}"
      .format(cm))
print("Accuracy_Score: {}\n"
      .format(accuracy))
print("Precision_Score: {}\n"
      .format(precision))
print("Recall Score: {}\n"
      .format(recall))
print("F1 Score: {}\n"
      .format(f1))

Confusion_Matrix:
[[  3  18   0]
 [  5 132   0]
 [  0  10   0]]
Accuracy_Score: 0.8035714285714286

Precision_Score: 0.7196428571428571

Recall Score: 0.8035714285714286

F1 Score: 0.750729793833242



In [172]:
metrics_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,Confusion Matrix
0,Decision Tree TRAIN,0.6698,0.774405,0.6698,0.663731,"[[876, 507, 0], [19, 1318, 46], [110, 688, 585]]"
0,Decision Tree TEST,0.815476,0.665001,0.815476,0.732592,"[[0, 21, 0], [0, 137, 0], [0, 10, 0]]"
0,Decision Tree TRAIN Tuned,0.993974,0.994029,0.993974,0.993961,"[[1383, 0, 0], [12, 1358, 13], [0, 0, 1383]]"
0,Decision Tree TEST Tuned,0.803571,0.719643,0.803571,0.75073,"[[3, 18, 0], [5, 132, 0], [0, 10, 0]]"


# Decision Tree Business Metrics

## Import Data 

In [173]:
X_train = pd.read_excel("../data/model_inputs/X_train.xlsx")
y_train = pd.read_excel("../data/model_inputs/y_train.xlsx")

X_test = pd.read_excel("../data/model_inputs/X_test.xlsx")
y_test = pd.read_excel("../data/model_inputs/y_test.xlsx")

In [174]:
X_train = X_train.rename(columns ={'Unnamed: 0': 'Date'})
X_train = X_train.set_index('Date')

y_train = y_train.rename(columns ={'Unnamed: 0': 'Date'})
y_train = y_train.set_index('Date')

X_test = X_test.rename(columns ={'Unnamed: 0': 'Date'})
X_test = X_test.set_index('Date')

y_test = y_test.rename(columns ={'Unnamed: 0': 'Date'})
y_test = y_test.set_index('Date')

In [175]:
whole_df_x = pd.concat([X_train, X_test])
whole_df_x = whole_df_x.reset_index()
whole_df_x['year'] = pd.DatetimeIndex(whole_df_x['Date']).year
tuned_model = tree

In [176]:
business_metric_results = pd.DataFrame(columns=['exp_strategy_annual_return', 'exp_benchmark_annual_return', 'strategy_over_benchmark'])

for year in [2016, 2017, 2018, 2019, 2020, 2021, 2022]:
    year_data = whole_df_x[whole_df_x['year'] == year]
    year_data = year_data.set_index('Date')
    year_data = year_data.drop(['year'], axis = 1)
    predict_x = tuned_model.predict(np.array(year_data)) 
    predictions = predict_x

    df_pred = pd.DataFrame({'prediction':predictions}, index=year_data.index)
    df_pred = df_pred.replace({2:1, 1:-1}) # convert classes to buy hold sell
    dates = df_pred.index

    if year == 2022:
        end_date = "2022-09-02"
    else:
        end_date = str(year+1) + "-01-01"
    df_prices = yf.download("^GSPC", start=dates[0], end=end_date)[['Adj Close']]

    # create positions column
    positions = []
    prev = 0
    for i in range(len(df_pred)):
        positions.append(df_pred.iloc[i]['prediction'])
        # if df_pred.iloc[i]['prediction'] == 0:
        #     positions.append(prev)
        # else:
        #     prev = df_pred.iloc[i]['prediction']
        #     positions.append(prev)

    df_business = pd.DataFrame()
    df_business['stock_daily_log_return'] = np.log(df_prices /df_prices.shift(1))['Adj Close']
    df_business['prediction'] = df_pred['prediction']
    df_business['position'] = positions
    df_business['benchmark'] = 1 # long and hold strategy
    df_business["strategy_Returns"] = df_business["stock_daily_log_return"] * df_business["position"].shift(1)
    df_business["benchmark_Returns"] = df_business["stock_daily_log_return"] * df_business["benchmark"].shift(1)

    # Annual Mean Returns or Expected returns
    expected_strategy_annual_return = np.exp(df_business['strategy_Returns'].mean() * 252) - 1 
    expected_benchmark_annual_return = np.exp(df_business['benchmark_Returns'].mean() * 252) - 1 
    strategy_over_benchmark = expected_strategy_annual_return-expected_benchmark_annual_return
    business_metric_results.loc[year] = [expected_strategy_annual_return, expected_benchmark_annual_return, strategy_over_benchmark]
    # print(f'Expected Annual Returns: Strategy: {round(expected_strategy_annual_return*100, 2)}%  |  Stock: {round(expected_benchmark_annual_return*100, 2)}%')

business_metric_results

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed




[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed




[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed




[*********************100%***********************]  1 of 1 completed




Unnamed: 0,exp_strategy_annual_return,exp_benchmark_annual_return,strategy_over_benchmark
2016,0.0,0.107616,-0.107616
2017,-0.000797,0.185753,-0.186549
2018,0.060847,-0.070634,0.131482
2019,-0.042127,0.288443,-0.33057
2020,0.236131,0.152929,0.083202
2021,0.003426,0.28923,-0.285804
2022,-0.069028,-0.249185,0.180157
