In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the Smarket data from its CSV file
smarket = pd.read_csv("../../data/Smarket.csv")

# Convert the Direction labels to integer codes.
# LabelEncoder assigns codes in sorted label order, so Down -> 0 and Up -> 1.
le = LabelEncoder()
smarket['Direction_encoded'] = le.fit_transform(smarket['Direction'])

# Predictors (Lag1..Lag5 plus Volume) and the encoded response
feature_cols = [f'Lag{k}' for k in range(1, 6)] + ['Volume']
X = smarket[feature_cols]
y = smarket['Direction_encoded']

# Boolean row masks selecting the split by calendar year:
# fit on the 2001 rows, evaluate on the 2002 rows
train = smarket['Year'].eq(2001)
test = smarket['Year'].eq(2002)

X_train, y_train = X[train], y[train]
X_test, y_test = X[test], y[test]


In [2]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from ISLP.bart import BART
import numpy as np

def _fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` on the training data and score it on the evaluation data.

    Returns a ``(predictions, accuracy)`` tuple: ``predictions`` is
    ``model.predict(X_eval)`` and ``accuracy`` is ``accuracy_score`` of those
    predictions against ``y_eval``.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    return predictions, accuracy_score(y_eval, predictions)


# --- Boosting: 1000 boosting stages, learning rate 0.01 ---
boost = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.01, random_state=0)
y_pred_boost, acc_boost = _fit_and_score(boost, X_train, y_train, X_test, y_test)

# --- Bagging: 500 estimators, default base estimator ---
bag = BaggingClassifier(n_estimators=500, random_state=0)
y_pred_bag, acc_bag = _fit_and_score(bag, X_train, y_train, X_test, y_test)

# --- Random Forest: 500 trees ---
rf = RandomForestClassifier(n_estimators=500, random_state=0)
y_pred_rf, acc_rf = _fit_and_score(rf, X_train, y_train, X_test, y_test)

# --- Logistic Regression (iteration cap raised to 5000) ---
logreg = LogisticRegression(max_iter=5000)
y_pred_log, acc_log = _fit_and_score(logreg, X_train, y_train, X_test, y_test)

# --- BART ---
# Cast predictors and responses to float32 before handing them to BART.
X_train_bart, y_train_bart = X_train.astype(np.float32), y_train.astype(np.float32)
X_test_bart, y_test_bart = X_test.astype(np.float32), y_test.astype(np.float32)

bart_model = BART(random_state=0, burnin=50, ndraw=200)
bart_model.fit(X_train_bart, y_train_bart)

# BART's prediction is continuous, so it is thresholded at 0.5
# to obtain 0/1 class labels comparable with the other models.
yhat_test = bart_model.predict(X_test_bart)
y_pred_bart = (yhat_test > 0.5).astype(int)
acc_bart = accuracy_score(y_test, y_pred_bart)

# --- Confusion matrices (rows: true class, columns: predicted class) ---
cm_boost, cm_bag, cm_rf, cm_log, cm_bart = (
    confusion_matrix(y_test, preds)
    for preds in (y_pred_boost, y_pred_bag, y_pred_rf, y_pred_log, y_pred_bart)
)

# --- Report: test accuracy followed by the confusion matrix, per model ---
print("Accuracy:")
print(f"Boosting: {acc_boost:.3f}")
print(cm_boost)
print(f"Bagging: {acc_bag:.3f}")
print(cm_bag)
print(f"Random Forest: {acc_rf:.3f}")
print(cm_rf)
print(f"Logistic Regression: {acc_log:.3f}")
print(cm_log)
print(f"BART Test Accuracy: {acc_bart:.3f}")
print(cm_bart)


Accuracy:
Boosting: 0.508
[[74 66]
 [58 54]]
Bagging: 0.488
[[71 69]
 [60 52]]
Random Forest: 0.496
[[76 64]
 [63 49]]
Logistic Regression: 0.536
[[115  25]
 [ 92  20]]
BART Test Accuracy: 0.464
[[75 65]
 [70 42]]


The confusion matrices (rows are the true classes, columns the predicted classes, in the order “Down”, “Up”) and accuracies for the Smarket dataset illustrate how different modeling approaches handle this noisy financial time series. Logistic regression achieves the highest overall accuracy (**53.6%**): it correctly identifies 115 of the 140 “Down” days but only 20 of the 112 “Up” days, because it predicts “Down” on 207 of the 252 test days. Boosting, bagging, and random forests perform worse, with accuracies between **48.8% and 50.8%**; their confusion matrices show many misclassified days in both directions, which is consistent with overfitting to idiosyncratic patterns in the training data. BART performs the worst (**46.4%**) and likewise shows poor separation between “Up” and “Down” days.

The explanation lies in the characteristics of the Smarket data: the lagged returns and volume carry only a weak predictive signal for the next day’s market direction, and the time series is dominated by noise. Ensemble methods like boosting and random forests, which are highly flexible and can capture nonlinear interactions, tend to **overfit the training data** (here a single year, 2001), learning spurious fluctuations that do not generalize to the 2002 test set. Logistic regression, being simpler and linear, captures whatever modest linear relationship exists while fitting far less noise, which explains its slightly better performance. BART, despite its flexibility, also suffers from overfitting and possibly from limited sampling in this implementation (only 50 burn-in iterations and 200 draws), which reduces its predictive accuracy. Note, however, that every model, including logistic regression, falls short of the trivial rule of always predicting “Down”, which would score 140/252 ≈ 55.6% on this test set, so none of them demonstrates real predictive skill. Overall, these results illustrate that in high-noise, weak-signal datasets, **simpler linear models may outperform more flexible ensemble methods**, even though the latter are theoretically more powerful.