# DEMO 4

Cross Validation with Decision Trees


In [10]:
import sys
import os

# Manually set the path relative to the py file's location that you want to import
func_lib_path = os.path.abspath(os.path.join(os.getcwd(), '../'))# Add the path to sys.path
sys.path.append(func_lib_path)

# Now you can import func_lib
import func_lib
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
historical_prices = func_lib.createHistPrices()
list_of_momentums = [1, 5, 15, 20]
total_returns     = func_lib.computingReturns(historical_prices, list_of_momentums)
total_returns.dropna(inplace=True)

In [3]:
total_returns.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,F_1_d_returns,1_d_returns,5_d_returns,15_d_returns,20_d_returns
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,2000-02-01,0.077465,0.07271,0.048938,0.029918,-0.013889
A,2000-02-02,0.01634,0.077465,0.119854,0.125,0.150376
A,2000-02-03,-0.019293,0.01634,0.136073,0.166979,0.246493
A,2000-02-04,0.042623,-0.019293,0.120294,0.127542,0.270834
A,2000-02-07,-0.006289,0.042623,0.201133,0.162706,0.223077


In [4]:
# Converting the 'F_1_d_returns' to binary based on whether the value is positive or not
total_returns['F_1_d_returns_Ind'] = total_returns['F_1_d_returns'].apply(lambda x: 1 if x > 0 else 0)
total_returns.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,F_1_d_returns,1_d_returns,5_d_returns,15_d_returns,20_d_returns,F_1_d_returns_Ind
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A,2000-02-01,0.077465,0.07271,0.048938,0.029918,-0.013889,1
A,2000-02-02,0.01634,0.077465,0.119854,0.125,0.150376,1
A,2000-02-03,-0.019293,0.01634,0.136073,0.166979,0.246493,0
A,2000-02-04,0.042623,-0.019293,0.120294,0.127542,0.270834,1
A,2000-02-07,-0.006289,0.042623,0.201133,0.162706,0.223077,0


In [5]:
# Determine the split index for 70% of the dates
unique_dates = total_returns.index.get_level_values('Date').unique()
split_date = unique_dates[int(0.7 * len(unique_dates))]
split_date

Timestamp('2017-01-20 00:00:00')

In [6]:
# Create the training set: all data before the split date
train_data = total_returns.loc[total_returns.index.get_level_values('Date') < split_date]

# Create the testing set: all data from the split date onwards
test_data = total_returns.loc[total_returns.index.get_level_values('Date') >= split_date]

In [7]:
total_returns  = test_data['F_1_d_returns']

In [8]:
features = ['1_d_returns', '5_d_returns', '15_d_returns', '20_d_returns']
target   = ['F_1_d_returns_Ind']

# Split the data into training and testing sets
X_train = train_data[features]
X_test  = test_data[features]
y_train = train_data[target]
y_test  = test_data[target]

In [9]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

X_train_scaled = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
X_test_scaled  = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)

In [11]:
# Define the parameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 4, 6, 8],
    'max_features': [None, 'auto', 'sqrt', 'log2']
}


# Initialize Decision Tree classifier
dt = DecisionTreeClassifier()

# Initialize GridSearchCV with the model and parameter grid
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the GridSearchCV to the training data
grid_search.fit(X_train_scaled, y_train)

# Get the best model
best_model = grid_search.best_estimator_


Fitting 5 folds for each of 1000 candidates, totalling 5000 fits



KeyboardInterrupt



In [None]:
model_name = 'DecTree'
# Make predictions
y_pred = best_model.predict(X_test_scaled)
y_pred_df = pd.DataFrame(y_pred, index= X_test_scaled.index)

y_pred_df.rename(columns={0:model_name}, inplace=True)

y_test_df = pd.DataFrame(y_test)

In [None]:
y_test_and_pred = pd.merge(y_pred_df, total_returns, left_index=True, right_index=True)
#y_test_and_pred = pd.merge(y_test_and_pred, total_returns, left_index=True, right_index=True)

In [None]:
model_name = 'Decision Tree'
# Define trading strategy based on RSI
def trading_strategy(y_pred):
    if y_pred >0:
        return  1 # Go long
    else:
        return 0
        

In [None]:
cum_returns, calendar_returns = func_lib.compute_BM_Perf(y_test_and_pred)

In [None]:
cum_returns, calendar_returns = func_lib.compute_strat_perf(y_test_and_pred, cum_returns, calendar_returns, trading_strategy, model_name)