1. Training and Fine-Tuning a Decision Tree for the Moons Dataset

In [None]:
# 1. Train and fine-tune a decision tree for the moons dataset
import numpy as np
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import mode


In [2]:
# a. Build the synthetic two-moons classification data
#    (10,000 points, Gaussian noise with std 0.4, fixed seed for repeatability)
X, y = make_moons(
    n_samples=10000,
    noise=0.4,
    random_state=42,
)

In [3]:
# b. Hold out 20% of the samples as a test set; the remaining 80% is used for training
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# c. Tune max_leaf_nodes (candidates 2..99) with 3-fold cross-validated accuracy
param_grid = dict(max_leaf_nodes=[*range(2, 100)])
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,  # use every available core for the search
)
grid_search.fit(X_train, y_train)

# Keep a handle on the winning estimator for the following cells
best_tree = grid_search.best_estimator_


In [5]:
# d. Evaluate the tuned tree on the held-out test set.
# GridSearchCV was built with its default refit=True, so best_tree
# (grid_search.best_estimator_) has already been retrained on the whole
# training set with the winning max_leaf_nodes. Calling fit() again with the
# same data and the same fixed random_state would only repeat that work.
y_pred = best_tree.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Best Decision Tree Test Accuracy: {acc:.4f}")

Best Decision Tree Test Accuracy: 0.8695


2. Grow a forest (ensemble of decision trees)


In [6]:
# a. Draw 1,000 random index subsets of the training set, 100 samples apiece
n_trees, n_instances = 1000, 100
rs = ShuffleSplit(
    n_splits=n_trees,
    train_size=n_instances,
    random_state=42,
)
# Each entry is a (selected indices, remaining indices) pair; only the first
# element is needed to train a tree.
subsets = [(chosen, remainder) for chosen, remainder in rs.split(X_train)]

In [7]:
# b. Fit one small tree per subset, reusing the tuned max_leaf_nodes.
#    Each tree gets its own seed (its position in the list); fit() returns the
#    fitted estimator, so the result can be collected directly.
forest = [
    DecisionTreeClassifier(
        max_leaf_nodes=best_tree.max_leaf_nodes,
        random_state=seed,
    ).fit(X_train[rows], y_train[rows])
    for seed, (rows, _) in enumerate(subsets)
]

In [8]:
# c. Collect every tree's test-set predictions: one row per tree, one column per test instance
all_preds = np.zeros((n_trees, len(X_test)), dtype=np.int8)
for votes_row, member in zip(all_preds, forest):
    # votes_row is a view into all_preds, so this writes the predictions in place
    votes_row[:] = member.predict(X_test)

# Column-wise mode = the label most trees voted for on each test instance
vote_result = mode(all_preds, axis=0, keepdims=False)
maj_vote_preds = np.ravel(vote_result.mode)

# d. Score the majority-vote ensemble against the true test labels
ensemble_acc = accuracy_score(y_test, maj_vote_preds)
print(f"Ensemble (Forest) Test Accuracy: {ensemble_acc:.4f}")

Ensemble (Forest) Test Accuracy: 0.8710
