In [5]:
%load_ext watermark
%watermark

%load_ext autoreload
%autoreload 2


# import standard libs
from IPython.display import display
from IPython.core.debugger import set_trace as bp
from pathlib import PurePath, Path
import sys
import time
from collections import OrderedDict as od
import re
import os
import json
import datetime
import pickle


# import python scientific stack
import pandas as pd
import pandas_datareader.data as web
pd.set_option('display.max_rows', 10)
from dask import dataframe as dd
from dask.diagnostics import ProgressBar
from multiprocessing import cpu_count
pbar = ProgressBar()
pbar.register()
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
from numba import jit
import math
# import ffn


# import visual tools
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline
import seaborn as sns

# Plot styling.
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' (the old
# names emit a deprecation warning and were later removed), so use whichever
# spelling of the "talk" style this installation actually provides.
_talk_style = next(
    (name for name in ('seaborn-v0_8-talk', 'seaborn-talk')
     if name in plt.style.available),
    None,
)
if _talk_style is not None:
    plt.style.use(_talk_style)
# 'bmh' is applied second, so its settings win wherever the two styles overlap.
plt.style.use('bmh')
#plt.rcParams['font.family'] = 'DejaVu Sans Mono'
plt.rcParams['font.size'] = 9.5
plt.rcParams['font.weight'] = 'medium'
plt.rcParams['figure.figsize'] = (10, 7)  # width, height in inches
# Named colours taken from seaborn's colour-blind-safe palette.
blue, green, red, purple, gold, teal = sns.color_palette('colorblind', 6)

# Seed intended for the stochastic steps of the notebook.
RANDOM_STATE = 777

print()

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Last updated: 2024-09-06T14:14:06.635145-04:00

Python implementation: CPython
Python version       : 3.8.19
IPython version      : 8.12.2

Compiler    : Clang 16.0.6 
OS          : Darwin
Release     : 23.6.0
Machine     : arm64
Processor   : arm
CPU cores   : 8
Architecture: 64bit

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload



  plt.style.use('seaborn-talk')


In [6]:
# NOTE(review): os is already imported in the first cell; this re-import is
# redundant but harmless.
import os

# Execute the project's setup script inside the notebook namespace with %run,
# so that the setup_project_path() helper called below is available here.
%run ../../config/setup_project.py

# Register the project on sys.path (the cell output reports the project root
# and config directories being added) so that the `src` package is importable.
setup_project_path()

# Project modules: shared helpers plus the per-chapter code, aliased by chapter.
from src.utils import helper as h_
import src.ch_02.code_ch_02 as f_ch2
import src.ch_03.code_ch_03 as f_ch3
import src.ch_04.code_ch_04 as f_ch4
import src.ch_05.code_ch_05 as f_ch5
import src.ch_06.code_ch_06 as f_ch6
import src.ch_08.code_ch_08 as f_ch8


Project root added to sys.path: /Users/paulkelendji/Desktop/GitHub_paul/ML-Asset_Management
Config path added to sys.path: /Users/paulkelendji/Desktop/GitHub_paul/ML-Asset_Management/config
Current sys.path: ['/Users/paulkelendji/miniconda3/envs/financial_math/lib/python38.zip', '/Users/paulkelendji/miniconda3/envs/financial_math/lib/python3.8', '/Users/paulkelendji/miniconda3/envs/financial_math/lib/python3.8/lib-dynload', '', '/Users/paulkelendji/miniconda3/envs/financial_math/lib/python3.8/site-packages', '/Users/paulkelendji/miniconda3/envs/financial_math/lib/python3.8/site-packages/setuptools/_vendor', '/Users/paulkelendji/Desktop/GitHub_paul/ML-Asset_Management', '/Users/paulkelendji/Desktop/GitHub_paul/ML-Asset_Management/config', '/Users/paulkelendji/Desktop/GitHub_paul/ML-Asset_Management', '/Users/paulkelendji/Desktop/GitHub_paul/ML-Asset_Management/config', '/Users/paulkelendji/Desktop/GitHub_paul', '/Users/paulkelendji/Desktop/GitHub_paul/ML-Asset_Management', '/Users/paulk

In [7]:
# load ../data/variables_ch2.pkl
%run ../ch_02/code_ch_02.py

path = '../../data/variables_ch2.pkl'
import pickle
with open(path, 'rb') as f:
    bars = pickle.load(f)
    bar_time = pickle.load(f)
    
# df as bars['Dollar'].df_OLHC without 'cusum' column
df = bars['Dollar'].df_OLHC.drop(columns=['cusum'])
# For the purpose of this example, remove rows where time_close is duplicated
# (keep the first row)
# Remove rows where time_close is duplicated, keeping the first occurrence
df = df.drop_duplicates(subset='time_close', keep='first')
# set index as 'time_close'
df = df.set_index('time_close')


In [8]:

# CLOSE PRICE AND DAILY VOLATILITY
# Step 1: daily volatility computed from the close-price series (span0=100),
# with missing values dropped, stored as a DataFrame.
dailyVol = pd.DataFrame(
    f_ch3.getDailyVol(df['close'], span0=100).dropna()
)

# The close prices themselves, also kept as a one-column DataFrame.
close = pd.DataFrame(df['close'])

---

## 6.1 Why is bagging based on random sampling with replacement? Would bagging still reduce a forecast’s variance if sampling were without replacement?


Bagging is effective when models are trained on samples that are as independent\
as possible. If we sample without replacement, each sample will depend on\
the previous one, which will make the models more correlated and reduce the\
effectiveness of bagging.

## 6.2 Suppose that your training set is based on highly overlapping labels (i.e., with low uniqueness, as defined in Chapter 4).
- (a) Does this make bagging prone to overfitting, or just ineffective? Why?
- (b) Is out-of-bag accuracy generally reliable in financial applications? Why?

(a) If samples are not IID at all, the correlation between samples, and thus\
between the models, will be high, and bagging won't reduce the variance of the prediction;\
it is therefore ineffective.

(b) Due to redundancy, the sampling with replacement will create training set\
samples that are very similar to out-of-bag samples. Therefore the out-of-bag\
accuracy is inflated and not reliable.

### 6.3.3 Observation Redundancy

---

## 6.3 Build an ensemble of estimators, where the base estimator is a decision tree.
### (a) How is this ensemble different from an RF?
### (b) Using sklearn, produce a bagging classifier that behaves like an RF. What parameters did you have to set up, and how?

In [27]:
# Dataset
# Synthetic test data: 40 features, of which 3 are informative and 30 redundant,
# over 10,000 samples. The first return value is the feature frame previewed below.
data, cont = f_ch8.getTestData(
    n_samples=10000,
    n_features=40,
    n_informative=3,
    n_redundant=30,
)
data.head()

Unnamed: 0,I_0,I_1,I_2,R_0,R_1,R_2,R_3,R_4,R_5,R_6,...,R_27,R_28,R_29,N_0,N_1,N_2,N_3,N_4,N_5,N_6
1986-05-12 15:57:13.403201,1.748267,2.690969,-0.762359,-1.793042,-0.257715,-2.555273,-1.771813,0.354711,0.950059,0.420574,...,1.270633,-3.875453,-0.313406,-1.101311,0.633439,0.492835,0.71158,-0.752805,3.333879,-1.413558
1986-05-13 15:57:13.403201,2.089446,2.897283,-0.347177,-2.379418,0.023721,-3.089672,-2.315702,0.767668,0.702689,0.583657,...,1.694423,-4.426854,-0.595684,-0.733146,1.76019,-0.265049,0.039147,0.423577,1.603988,-0.85226
1986-05-14 15:57:13.403201,1.359057,1.124433,-0.449752,-0.871561,-0.48847,-1.522993,-0.588815,0.096156,-0.011085,0.490655,...,0.283697,-2.085859,0.180009,-0.42714,-1.157007,0.85371,0.540003,0.547669,0.944567,-0.732269
1986-05-15 15:57:13.403201,0.942504,-0.076773,0.246008,-0.483019,-0.237725,-0.803192,-0.092439,0.243977,-0.826416,0.507796,...,-0.086203,-0.709872,0.198837,1.181418,-0.596872,0.109188,0.28919,1.054555,-1.120703,-0.576455
1986-05-16 15:57:13.403201,1.076313,1.796803,-1.74134,-0.190357,-1.149907,-1.184029,-0.096258,-0.731054,1.143103,0.169718,...,-0.103153,-2.276741,0.619001,-0.012835,0.525344,-1.820744,0.484778,-0.175199,-1.932807,0.585696


SNIPPET 6.2 THREE WAYS OF SETTING UP AN RF
```python
clf0=RandomForestClassifier(n_estimators=1000,class_weight='balanced_subsample',
criterion='entropy')
clf1=DecisionTreeClassifier(criterion='entropy',max_features='auto',
class_weight='balanced')
clf1=BaggingClassifier(base_estimator=clf1,n_estimators=1000,max_samples=avgU)
clf2=RandomForestClassifier(n_estimators=1,criterion='entropy',bootstrap=False,
class_weight='balanced_subsample')
clf2=BaggingClassifier(base_estimator=clf2,n_estimators=1000,max_samples=avgU,
max_features=1.)
```

Let's break down how to approach each part of this question.

### (a) How is this ensemble different from an RF?

The main difference between a general ensemble of decision trees and a Random Forest (RF) lies in the **feature selection** process:

- **Ensemble of Decision Trees with Bagging**: In this case, bagging is applied to generate bootstrap samples (random subsets of the data with replacement). A decision tree is trained on each of these subsets, and the final prediction is based on the aggregation (e.g., majority vote for classification) of all the individual trees’ predictions. **However, each node of a decision tree in this ensemble will have access to all features** when deciding on splits, which can lead to more correlated trees.

- **Random Forest**: In addition to bagging, Random Forest adds a second level of randomness: at each split in a decision tree, only a random subset of the features is considered for finding the best split. This forces trees to be less correlated by ensuring that not all trees focus on the same strong predictors.

**Key Difference**: Random Forest reduces the correlation between trees by limiting the number of features available at each node split, making the ensemble less prone to overfitting and better at generalization than a basic ensemble of decision trees.

### (b) Using `sklearn`, produce a bagging classifier that behaves like an RF. What parameters did you have to set up, and how?

To make a bagging classifier behave like a Random Forest, you need to:

1. Use a **DecisionTreeClassifier** as the base estimator.
2. Add randomness by limiting the number of features considered at each node split (`max_features`).
3. Set up **bagging** using `BaggingClassifier`, which creates bootstrapped samples.

Here’s the code to set up a `BaggingClassifier` that mimics Random Forest behavior:

```python
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Generate synthetic dataset
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Decision tree with per-split feature subsampling to mimic RF
base_estimator = DecisionTreeClassifier(max_features='sqrt', criterion='entropy')

# Bagging classifier with the decision tree as base estimator.
# The keyword is `estimator` since scikit-learn 1.2; the older spelling
# `base_estimator` is deprecated and was removed in later releases.
bagging_clf = BaggingClassifier(
    estimator=base_estimator,
    n_estimators=100,  # Number of trees
    max_samples=1.0,   # Each bootstrap sample is as large as the training set
    bootstrap=True,    # Sampling with replacement
    n_jobs=-1,         # Parallel processing
    random_state=42
)

# Train the classifier
bagging_clf.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = bagging_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Bagging Classifier Accuracy: {accuracy:.4f}')
```

### Parameters to set up:
1. **`max_features='sqrt'`**: This is crucial because it limits the number of features considered at each split, which mimics Random Forest's behavior.
2. **`n_estimators`**: Number of trees in the ensemble, typically large (100+).
3. **`bootstrap=True`**: Ensures sampling with replacement, like in Random Forest.
4. **`max_samples=1.0`**: Use all available samples for bootstrapping (same as Random Forest).
5. **`criterion='entropy'`**: Using entropy as the criterion for node splits, though this is not mandatory, as `gini` is also common.

This setup ensures that the bagging classifier behaves like a Random Forest. Let me know if you need further clarification!

# Answering (b) myself

In [30]:
# Libraries
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Dataset: synthetic features X; the 'bin' column of the second return value
# is used as the classification target.
X, Y = f_ch8.getTestData(n_features=40, n_informative=3, n_redundant=30, n_samples=10000)
Y = Y["bin"]

# Bootstrap fraction standing in for the labels' average uniqueness
# (avgU in Snippet 6.2). Defined once so both bagging set-ups stay in sync.
AVG_UNIQUENESS = 0.6

# Every ensemble is seeded with the notebook-wide RANDOM_STATE so that the
# reported accuracies are reproducible from one run to the next.

# clf0: usual RF
clf0 = RandomForestClassifier(
    n_estimators=1_000,  # 1_000 trees
    class_weight="balanced_subsample",  # prevent minority class from being ignored
    criterion="entropy",  # information gain
    random_state=RANDOM_STATE,
)

# clf1: bagging ensemble whose base estimator is a single decision tree
tree_base = DecisionTreeClassifier(
    criterion="entropy",  # information gain
    max_features="sqrt",  # sqrt(n_features) per split to force diversity among trees
    class_weight="balanced",  # prevent minority class from being ignored
)
clf1 = BaggingClassifier(
    estimator=tree_base,  # base estimator
    n_estimators=1_000,  # 1_000 trees
    max_samples=AVG_UNIQUENESS,  # average uniqueness
    max_features=1.0,  # all features for bagging
    random_state=RANDOM_STATE,  # also seeds the cloned base estimators
)

# clf2: bagging ensemble of one-tree, non-bootstrapped random forests, with
# max_samples set to the average uniqueness
single_tree_rf = RandomForestClassifier(
    n_estimators=1,  # 1 tree
    criterion="entropy",  # information gain
    bootstrap=False,  # no bootstrap inside the base estimator
    class_weight="balanced_subsample",  # prevent minority class from being ignored
)
clf2 = BaggingClassifier(
    estimator=single_tree_rf,  # base estimator
    n_estimators=1_000,  # 1_000 trees
    max_samples=AVG_UNIQUENESS,  # average uniqueness
    max_features=1.0,  # all features for bagging
    random_state=RANDOM_STATE,  # also seeds the cloned base estimators
)

In [31]:
# The three ensembles are all evaluated on the same 50/50 hold-out split.
classifiers = [clf0, clf1, clf2]
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.5, random_state=42
)

# Fit each ensemble on the training half and print its class name followed by
# its accuracy on the test half.
for clf in classifiers:
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))
    
# How the three set-ups differ: clf0 is sklearn's RandomForestClassifier, a forest of decision trees built directly.
# clf1 is a bagging classifier whose base estimator is a single decision tree (with max_features="sqrt").
# clf2 is a bagging classifier whose base estimator is a one-tree, non-bootstrapped random forest.
# In both bagging set-ups each base estimator is therefore effectively a single decision tree, and
# max_samples limits the size of every bootstrap sample to the assumed average uniqueness.


RandomForestClassifier 0.9624
BaggingClassifier 0.9616
BaggingClassifier 0.9616
