In [1]:
%load_ext watermark
%watermark

%load_ext autoreload
%autoreload 2


# import standard libs
from IPython.display import display
from IPython.core.debugger import set_trace as bp
from pathlib import PurePath, Path
import sys
import time
from collections import OrderedDict as od
import re
import os
import json
import datetime
import pickle


# import python scientific stack
import pandas as pd
import pandas_datareader.data as web
pd.set_option('display.max_rows', 10)
from dask import dataframe as dd
from dask.diagnostics import ProgressBar
from multiprocessing import cpu_count
pbar = ProgressBar()
pbar.register()
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
from numba import jit
import math
# import ffn


# import visual tools
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline
import seaborn as sns

# The bundled seaborn styles were renamed to 'seaborn-v0_8-*'; the old
# 'seaborn-talk' name triggers the deprecation warning seen in this cell's
# recorded output. Prefer the new name and fall back to the old one on
# Matplotlib versions that do not ship it yet.
talk_style = 'seaborn-v0_8-talk' if 'seaborn-v0_8-talk' in plt.style.available else 'seaborn-talk'
plt.style.use(talk_style)
# 'bmh' is applied second, so its settings win wherever the two styles overlap
plt.style.use('bmh')
#plt.rcParams['font.family'] = 'DejaVu Sans Mono'
plt.rcParams['font.size'] = 9.5
plt.rcParams['font.weight'] = 'medium'
plt.rcParams['figure.figsize'] = 10,7
# Named handles for the first six colours of seaborn's colour-blind-safe palette
blue, green, red, purple, gold, teal = sns.color_palette('colorblind', 6)

# Single seed shared by every stochastic step of the notebook
RANDOM_STATE = 777

print()

Last updated: 2024-09-07T12:30:22.261252-04:00

Python implementation: CPython
Python version       : 3.8.19
IPython version      : 8.12.2

Compiler    : Clang 16.0.6 
OS          : Darwin
Release     : 23.6.0
Machine     : arm64
Processor   : arm
CPU cores   : 8
Architecture: 64bit




  plt.style.use('seaborn-talk')


In [2]:
import os

# Run the setup script inside the notebook namespace; no import of
# setup_project_path is visible in this notebook, so the name used below is
# expected to be defined by this script
%run ../../config/setup_project.py

# Call the function to set up the project path
# NOTE(review): the recorded output lists the sys.path additions twice, which
# suggests the script already calls setup_project_path() when it is run —
# confirm whether this explicit call is redundant
setup_project_path()

# Now you can import your modules
from src.utils import helper as h_
import src.ch_02.code_ch_02 as f_ch2
import src.ch_03.code_ch_03 as f_ch3
import src.ch_04.code_ch_04 as f_ch4
import src.ch_05.code_ch_05 as f_ch5
import src.ch_06.code_ch_06 as f_ch6
import src.ch_07.code_ch_07 as f_ch7
import src.ch_08.code_ch_08 as f_ch8


Project root added to sys.path: /Users/paulkelendji/Desktop/GitHub_paul/ML-Asset_Management
Config path added to sys.path: /Users/paulkelendji/Desktop/GitHub_paul/ML-Asset_Management/config
Current sys.path: ['/Users/paulkelendji/miniconda3/envs/financial_math/lib/python38.zip', '/Users/paulkelendji/miniconda3/envs/financial_math/lib/python3.8', '/Users/paulkelendji/miniconda3/envs/financial_math/lib/python3.8/lib-dynload', '', '/Users/paulkelendji/miniconda3/envs/financial_math/lib/python3.8/site-packages', '/Users/paulkelendji/miniconda3/envs/financial_math/lib/python3.8/site-packages/setuptools/_vendor', '/Users/paulkelendji/Desktop/GitHub_paul/ML-Asset_Management', '/Users/paulkelendji/Desktop/GitHub_paul/ML-Asset_Management/config']
Project root added to sys.path: /Users/paulkelendji/Desktop/GitHub_paul/ML-Asset_Management
Config path added to sys.path: /Users/paulkelendji/Desktop/GitHub_paul/ML-Asset_Management/config
Current sys.path: ['/Users/paulkelendji/miniconda3/envs/financ

---

In [3]:
# Build the chapter-8 test data with the helper's default arguments; `cont` is
# the companion frame previewed in the next cell (columns bin, w, t1)
dataset, cont = f_ch8.getTestData()

In [4]:
# Preview the first rows of `cont` (columns bin, w, t1 in the recorded output)
cont.head()

Unnamed: 0,bin,w,t1
1986-05-12 12:30:25.451645,0,0.0001,1986-05-18 14:25:15.597123940
1986-05-13 12:30:25.451645,0,0.0001,1986-05-15 23:00:51.365790475
1986-05-14 12:30:25.451645,0,0.0001,1986-05-24 01:26:29.359801697
1986-05-15 12:30:25.451645,0,0.0001,1986-05-17 10:07:02.236412876
1986-05-16 12:30:25.451645,0,0.0001,1986-05-26 09:08:38.491527716


# 7.1 Why is shuffling a dataset before conducting k-fold CV generally a bad idea in finance? What is the purpose of shuffling? Why does shuffling defeat the purpose of k-fold CV in financial datasets?

### Answer:

Shuffling can place training observations that occur later in time than the test observations, which is not realistic in finance: a model can never be trained on the future. This causes data leakage, and the resulting performance estimates are biased.

## ChatGPT

Your answer is on the right track, but we can enhance and clarify it by adding more context and detail. Here’s an improved version of the answer:

---

### **Answer:**

In finance, time plays a critical role in how data should be treated. Shuffling the dataset before performing k-fold cross-validation (CV) can introduce issues because it disrupts the temporal ordering of the data. In financial datasets, the past is used to predict the future. If shuffling occurs, this temporal structure is broken, and training data could inadvertently include observations from the future (relative to the test data), leading to **data leakage**. 

The purpose of shuffling in machine learning is typically to ensure that each fold in k-fold CV contains a representative sample of the entire dataset, reducing biases caused by data ordering. However, in finance, **data dependency** and **autocorrelation** (e.g., stock prices often depend on previous values) mean that shuffling would violate the assumption that future prices are unknown at the time predictions are made. This would create **look-ahead bias**, where information from the future is used to predict the past, resulting in unrealistically high performance metrics and **biased results**.

Therefore, shuffling defeats the purpose of k-fold CV in financial datasets because it disrupts the temporal relationship between observations, making the model evaluation invalid for real-world financial predictions, where such time-based dependencies must be preserved.

---

This version provides more context on why shuffling is done in non-financial datasets and explicitly connects shuffling to the potential pitfalls (data leakage and look-ahead bias) when used improperly in finance.

---

**Data leakage** and **look-ahead bias** are closely related concepts, but they are not exactly the same. Both lead to overly optimistic model performance, but they differ in how they occur.

### **Data Leakage**:
Data leakage occurs when information from outside the training dataset leaks into the model during training, allowing the model to "cheat" by learning patterns it wouldn’t normally have access to in a real-world scenario. Leakage can happen in many forms:
1. **Feature leakage**: When the model is trained on features that contain future information or other data that should not be available at the time of prediction. For example, using a feature that contains future stock prices in a model designed to predict future returns would cause data leakage.
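2. **Target leakage**: This happens when a feature gives the model indirect access to the target variable, typically because the feature is derived from the target or only becomes known once the outcome has been observed. For example, using "total sales" to predict "profit" causes leakage if total sales for the period are only known after that period's profit has been realized. A feature that is merely highly correlated with the target is not leakage as long as it is genuinely available at prediction time.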

In summary, **data leakage** is a broader term that refers to any unintended inclusion of information that would not be available at the time of prediction, whether it’s about future events or about the target variable itself.

### **Look-ahead Bias**:
Look-ahead bias is a specific form of data leakage that happens in **time-series data**, like financial datasets. It occurs when the model is trained using information from the future that wouldn't be available at the time the predictions are made. For example, in a stock market dataset, if the model uses future stock prices or returns to predict current stock prices, it will result in look-ahead bias because the model is using future information to predict the past.

In summary, **look-ahead bias** specifically deals with improper handling of temporal sequences, where future data is mistakenly made available to the model during training, violating the natural time-ordering of events.

### Key Differences:
- **Scope**: Data leakage can occur in any type of data (not necessarily time-series), while look-ahead bias is specific to time-series or datasets where time-ordering is crucial (like in finance).
- **Nature of Information**: Look-ahead bias deals specifically with the use of future information to predict past or current outcomes, whereas data leakage can refer to any kind of improper data use, including correlations or other relationships that shouldn’t be accessible to the model.

Both are critical to avoid, especially in financial modeling, because they can lead to misleading performance estimates and overfitting, making models appear more accurate than they actually are in practice.



---

# 7.2 Take a pair of matrices (X, y), representing observed features and labels. These could be one of the datasets derived from the exercises in Chapter 3.

In [5]:
# Dataset: request 10,000 samples with 6 features (2 informative, 3 redundant).
# Y is the companion frame that a later cell splits into 'bin', 'w' and 't1'.
X, Y = f_ch8.getTestData(n_features=6, n_informative=2, n_redundant=3, n_samples=10000)

## (a) Derive the performance from a 10-fold CV of an RF classifier on (X, y), without shuffling.

In [6]:
# Libraries
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Every ensemble is seeded with RANDOM_STATE so that re-running the notebook
# grows the same trees and reports the same CV scores. For the bagging
# ensembles the seed is set on the BaggingClassifier, which derives the seeds of
# its sub-estimators from it.

# usual RF
clf0 = RandomForestClassifier(
    n_estimators=1_000,  # 1_000 trees
    class_weight="balanced_subsample",  # prevent minority class from being ignored
    criterion="entropy",  # information gain
    random_state=RANDOM_STATE,  # reproducible bootstrap samples and splits
)

# Ensemble of estimators with base estimator as a decision tree
tree_base = DecisionTreeClassifier(
    criterion="entropy",  # information gain
    max_features="sqrt",  # sqrt(n_features) to force diversity among trees
    class_weight="balanced",  # prevent minority class from being ignored
)
clf1 = BaggingClassifier(
    estimator=tree_base,  # base estimator
    n_estimators=1_000,  # 1_000 trees
    max_samples=0.94,  # average uniqueness (hard-coded, not computed in this cell)
    max_features=1.0,  # all features for bagging
    random_state=RANDOM_STATE,  # reproducible resampling
)

# Bagging classifier on RF where max_samples is set to average uniqueness
single_tree_rf = RandomForestClassifier(
    n_estimators=1,  # 1 tree
    criterion="entropy",  # information gain
    bootstrap=False,  # no bootstrap: the resampling is left to the bagging wrapper
    class_weight="balanced_subsample",  # prevent minority class from being ignored
)
clf2 = BaggingClassifier(
    estimator=single_tree_rf,  # base estimator
    n_estimators=1_000,  # 1_000 single-tree forests
    max_samples=0.94,  # average uniqueness (hard-coded, not computed in this cell)
    max_features=1.0,  # all features for bagging
    random_state=RANDOM_STATE,  # reproducible resampling
)

In [None]:
# Split the label frame into weights, 't1' values and class labels.
# `Y` is rebound to its own 'bin' column here, so without the guard a second
# run of this cell would fail on Y['w']. Once Y is already a Series the split
# has been done and the previously extracted pieces are kept.
if not isinstance(Y, pd.Series):
    sample_weights = Y['w']  # per-observation sample weights
    t1 = Y['t1']  # 't1' timestamps, later passed to the purged CV as `t1`
    Y = Y['bin']  # class labels used as the CV target
classifiers = [clf0, clf1, clf2]


In [None]:

from sklearn.model_selection import cross_val_score, KFold
import numpy as np

# Ten consecutive folds taken in the original row order
kf = KFold(n_splits=10, shuffle=False)

# Score each classifier on those folds and print per-fold and mean accuracy
for clf in classifiers:
    print(f"Classifier: {clf}")
    cv_scores = cross_val_score(estimator=clf, X=X, y=Y, scoring='accuracy', cv=kf, n_jobs=-1)
    mean_accuracy = cv_scores.mean()
    print(
        f"Accuracy for each fold: {cv_scores}",
        f"Mean accuracy: {mean_accuracy:.4f}\n",
        sep="\n",
    )


In [None]:
from joblib import Parallel, delayed
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

# Create a KFold cross-validator
kf = KFold(n_splits=10, shuffle=False)


# Function to perform cross-validation
def evaluate_classifier(clf, X, Y, cv):
    """Return the per-fold accuracy of ``clf`` on ``(X, Y)``.

    Parameters
    ----------
    clf
        Classifier handed to ``cross_val_score``.
    X, Y
        Features and labels, forwarded unchanged to ``cross_val_score``.
    cv
        Cross-validation splitter that defines the folds.

    Returns
    -------
    The array of fold accuracies produced by ``cross_val_score``.

    The function only computes: it is executed in joblib workers, so the
    reporting is left to the caller.
    """
    # NOTE(review): n_jobs=-1 is requested here as well as by the outer
    # Parallel call — confirm this nested parallelism does not oversubscribe
    # the CPU.
    return cross_val_score(clf, X, Y, cv=cv, scoring='accuracy', n_jobs=-1)


# Run the classifiers in parallel
results = Parallel(n_jobs=-1)(delayed(evaluate_classifier)(clf, X, Y, kf) for clf in classifiers)

# Report from the notebook process, in classifier order. Text printed inside
# joblib workers is not guaranteed to reach the cell output and may arrive
# interleaved; `results` keeps the order of `classifiers`.
for scored_clf, fold_scores in zip(classifiers, results):
    print(f"Classifier: {scored_clf}")
    print(f"Accuracy for each fold: {fold_scores}")
    print(f"Mean accuracy: {np.mean(fold_scores):.4f}\n")


## (b) Derive the performance from a 10-fold CV of an RF on (X, y), with shuffling

In [None]:
from joblib import Parallel, delayed
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

# Create a KFold cross-validator.
# The shuffle is seeded so the fold assignment, and with it the reported
# accuracy, is the same on every run of the notebook.
kf = KFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)


# Function to perform cross-validation (re-declared so this cell is self-contained)
def evaluate_classifier(clf, X, Y, cv):
    """Return the per-fold accuracy of ``clf`` on ``(X, Y)``.

    Parameters
    ----------
    clf
        Classifier handed to ``cross_val_score``.
    X, Y
        Features and labels, forwarded unchanged to ``cross_val_score``.
    cv
        Cross-validation splitter that defines the folds.

    Returns
    -------
    The array of fold accuracies produced by ``cross_val_score``.

    The function only computes: it is executed in joblib workers, so the
    reporting is left to the caller.
    """
    # NOTE(review): n_jobs=-1 is requested here as well as by the outer
    # Parallel call — confirm this nested parallelism does not oversubscribe
    # the CPU.
    return cross_val_score(clf, X, Y, cv=cv, scoring='accuracy', n_jobs=-1)


# Run the classifiers in parallel
results = Parallel(n_jobs=-1)(delayed(evaluate_classifier)(clf, X, Y, kf) for clf in classifiers)

# Report from the notebook process, in classifier order. Text printed inside
# joblib workers is not guaranteed to reach the cell output and may arrive
# interleaved; `results` keeps the order of `classifiers`.
for scored_clf, fold_scores in zip(classifiers, results):
    print(f"Classifier: {scored_clf}")
    print(f"Accuracy for each fold: {fold_scores}")
    print(f"Mean accuracy: {np.mean(fold_scores):.4f}\n")


## 7.3 Take the same pair of matrices (X, y) you used in exercise 2.
### (a) Derive the performance from a 10-fold purged CV of an RF on (X, y), with 1% embargo

In [None]:
# 10-fold purged CV of the bagged single-tree forest with a 1% embargo;
# the call is the last expression, so its result is displayed by the cell
clf = clf2
purged_cv_args = {
    'sample_weight': sample_weights,
    'scoring': 'accuracy',
    't1': t1,
    'cv': 10,
    'pctEmbargo': 0.01,
}
f_ch7.cvScore(clf, X, Y, **purged_cv_args)

In [None]:
# Display the 't1' column that was extracted from the label frame
t1

In [None]:
# Run the chapter-4 module's main() on t1 with sLength=250 and uniqueObj=0.5.
# NOTE(review): presumably this estimates label uniqueness (cf. the hard-coded
# max_samples=0.94 "average uniqueness" above) — confirm against f_ch4.main
f_ch4.main(t1, sLength=250, uniqueObj=0.5)