In [1]:
# === Builder 1: Initialize Module 3 Master Notebook ===
# Creates module3_master/Module_03_Master.ipynb with title, overview, imports, and dataset load.

import os, nbformat as nbf

# -----------------------------
# 1. Create folder for module 3
# -----------------------------
# Output locations; the folder is created up front so the final write
# at the end of this cell cannot fail on a missing directory.
OUT_DIR = "module3_master"
OUT_PATH = os.path.join(OUT_DIR, "Module_03_Master.ipynb")
os.makedirs(OUT_DIR, exist_ok=True)


def md(txt):
    """Wrap ``txt`` in a new nbformat v4 markdown cell and return it."""
    cell = nbf.v4.new_markdown_cell(txt)
    return cell


def code(txt):
    """Wrap ``txt`` in a new nbformat v4 code cell and return it."""
    cell = nbf.v4.new_code_cell(txt)
    return cell


# ---- 2. Start from an empty cell list and an empty notebook ----
cells = []
nb = nbf.v4.new_notebook()

# -----------------------------
# 3. Title & Overview
# -----------------------------
# Title and overview cell. The text uses real Unicode punctuation/emoji
# (em dash, bullet, en dash) instead of mis-decoded byte sequences.
cells += [
    md("""# 📘 Module 3 — Classification

## Logistic Regression • Decision Trees • Ensembles • ROC/AUC • Thresholding • Model Tuning

This module introduces **classification**, one of the core techniques in supervised learning.

We will cover:
1. Binary vs. multiclass classification  
2. Logistic Regression  
3. Decision Trees  
4. Ensemble Methods (Random Forest & Boosting)  
5. Classification Metrics (Confusion Matrix, ROC, Precision–Recall, F1)  
6. Model Tuning & Validation  
7. Hands-On Exercises (Heart Disease, Spam, Wine)

Throughout the module, we will use synthetic datasets from `datasets_module3.py`,
with a consistent random seed (1955) to ensure reproducible examples.
""")
]

# -----------------------------
# 4. Imports + dataset import
# -----------------------------
# All shared imports live in this single cell; SimpleImputer is included
# because the preprocessing pipelines in Section 3.2 onward use it.
cells += [
    code("""# --- Imports for Module 3 ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, validation_curve
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, precision_recall_curve,
    confusion_matrix, classification_report
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Dataset helper functions
from datasets_module3 import make_heart_disease_synth

# Consistent seed
SEED = 1955
""")
]

# -----------------------------
# 5. Load Heart Disease dataset
# -----------------------------
cells += [
    md("## 3.0 — Load Dataset (Heart Disease)\nWe will use this dataset across Sections 3.2–3.6."),
    code("""# Load synthetic heart disease dataset
df = make_heart_disease_synth(n=600, seed=SEED)

# Preview the dataset
df.head()""")
]

# -----------------------------
# 6. Section roadmap
# -----------------------------
# Builders 2-7 append their sections to the end of the notebook rather than
# filling in this cell, so the outline is a plain list instead of H1 headings
# (which would duplicate the real section headings). Each `---` rule is
# preceded by a blank line; a rule placed directly under a text line is
# parsed by Markdown as a setext heading underline.
cells += [
    md("""---

## Section Roadmap

The sections listed here are appended further down in this notebook by the later builders.

- **3.1 — Introduction to Classification**: visual intuition for linear vs. nonlinear decision boundaries *(builder 2)*
- **3.2 — Logistic Regression** *(builder 3)*
- **3.3 — Decision Trees** *(builder 4)*
- **3.4 — Ensemble Methods (Random Forest & Boosting)** *(builder 5)*
- **3.5 — Classification Metrics (Confusion Matrix, ROC, PR Curve)** *(builder 6)*
- **3.6 — Model Tuning & Validation** *(builder 7)*

---
""")
]

# -----------------------------
# 7. Save notebook
# -----------------------------
nb['cells'] = cells

with open(OUT_PATH, "w", encoding="utf-8") as f:
    f.write(nbf.writes(nb))

print(f"Module 3 Master Notebook created at: {OUT_PATH}")


Module 3 Master Notebook created at: module3_master/Module_03_Master.ipynb


In [2]:
# === Builder 2: Append Section 3.1 to Module_03_Master.ipynb ===
# Adds visual demo for classification decision boundaries.

import os, nbformat as nbf

OUT_DIR = "module3_master"
OUT_PATH = os.path.join(OUT_DIR, "Module_03_Master.ipynb")

# Read the existing notebook inside a context manager so the file handle is
# closed before the same path is reopened for writing at the end of this cell.
with open(OUT_PATH, "r", encoding="utf-8") as f:
    nb = nbf.read(f, as_version=4)

def md(txt):
    """Return a new nbformat v4 markdown cell containing ``txt``."""
    return nbf.v4.new_markdown_cell(txt)

def code(txt):
    """Return a new nbformat v4 code cell containing ``txt``."""
    return nbf.v4.new_code_cell(txt)

# Cells collected here are appended to the notebook at the end of the builder.
cells = []

# ---------------------------------------------------------
# Section Header
# ---------------------------------------------------------
cells += [
    md("""# 3.1 — Introduction to Classification

In this section, we build intuition about **binary classification** by visualizing 
decision boundaries for two simple models:

- Logistic Regression (linear boundary)  
- Decision Tree (nonlinear boundary)

We use a toy dataset (`make_moons`) to clearly show the difference between linear and nonlinear classifiers.
""")
]

# ---------------------------------------------------------
# Code: Decision Boundary Demo
# ---------------------------------------------------------
# The generated cell relies on the notebook's imports cell for numpy,
# matplotlib, the estimators, train_test_split and SEED; only the two
# demo-specific names are imported locally.
cells += [
    code("""# --- 3.1 Decision Boundary Visualization ---

# Names used only by this demo (everything else comes from the imports cell)
from sklearn.datasets import make_moons
from matplotlib.colors import ListedColormap

# Generate synthetic 2D dataset for visualization
X_moons, y_moons = make_moons(n_samples=400, noise=0.25, random_state=SEED)

# Split for training/testing (not strictly necessary here, but consistent with workflow)
X_tr_m, X_te_m, y_tr_m, y_te_m = train_test_split(
    X_moons, y_moons, test_size=0.3, random_state=SEED
)

# Train classifiers
log_clf = LogisticRegression().fit(X_tr_m, y_tr_m)
tree_clf = DecisionTreeClassifier(max_depth=4, random_state=SEED).fit(X_tr_m, y_tr_m)

# Helper function to plot decision boundaries
def plot_decision_boundary(model, X, y, title):
    # Evaluate the model on a 300x300 grid padded 0.5 beyond the data range
    x_min, x_max = X[:,0].min() - 0.5, X[:,0].max() + 0.5
    y_min, y_max = X[:,1].min() - 0.5, X[:,1].max() + 0.5
    xx, yy = np.meshgrid(
        np.linspace(x_min, x_max, 300),
        np.linspace(y_min, y_max, 300)
    )

    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    cmap_bg = ListedColormap(["#F6C4C4", "#C4E4F6"])
    cmap_pts = ListedColormap(["#E53935", "#1E88E5"])

    plt.figure(figsize=(5,4))
    plt.contourf(xx, yy, Z, cmap=cmap_bg, alpha=0.7)
    plt.scatter(X[:,0], X[:,1], c=y, cmap=cmap_pts, edgecolor="k", s=20)
    plt.title(title)
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.show()

# Plot logistic regression decision boundary
plot_decision_boundary(log_clf, X_moons, y_moons,
    "Logistic Regression — Linear Decision Boundary")

# Plot decision tree decision boundary
plot_decision_boundary(tree_clf, X_moons, y_moons,
    "Decision Tree (depth=4) — Nonlinear Decision Boundary")
""")
]

# ---------------------------------------------------------
# Append to notebook
# ---------------------------------------------------------
nb.cells.extend(cells)

with open(OUT_PATH, "w", encoding="utf-8") as f:
    f.write(nbf.writes(nb))

print("Builder 2 applied — Section 3.1 added to Module_03_Master.")


Builder 2 applied — Section 3.1 added to Module_03_Master.


In [3]:
# === Builder 3: Append Section 3.2 — Logistic Regression ===

import os, nbformat as nbf

OUT_DIR = "module3_master"
OUT_PATH = os.path.join(OUT_DIR, "Module_03_Master.ipynb")

# Read the existing notebook inside a context manager so the file handle is
# closed before the same path is reopened for writing at the end of this cell.
with open(OUT_PATH, "r", encoding="utf-8") as f:
    nb = nbf.read(f, as_version=4)

def md(txt):
    """Return a new nbformat v4 markdown cell containing ``txt``."""
    return nbf.v4.new_markdown_cell(txt)

def code(txt):
    """Return a new nbformat v4 code cell containing ``txt``."""
    return nbf.v4.new_code_cell(txt)

# Cells collected here are appended to the notebook at the end of the builder.
cells = []

# ---------------------------------------------------------
# Markdown header for Section 3.2
# ---------------------------------------------------------
cells.append(md("""
# 3.2 — Logistic Regression (Heart Disease)

In this section, we fit a **logistic regression classifier** to the heart disease dataset.
We will:

- Clean & prepare the dataset  
- Build a preprocessing pipeline  
- Train a logistic regression model  
- Extract and interpret coefficients  
- Use predicted probabilities  
- Explore threshold tuning  
- Evaluate using a confusion matrix and classification metrics  

This mirrors the process from Module 2 (OLS) but adapted for classification.
"""))

# ---------------------------------------------------------
# Code: Preprocess + Train Logistic Regression
# ---------------------------------------------------------
# The generated cell uses SEED and the sklearn names from the notebook's
# imports cell; only SimpleImputer is imported here.
cells.append(code("""
# --- 3.2 Logistic Regression Pipeline ---

# SimpleImputer is the only name this section needs beyond the imports cell
from sklearn.impute import SimpleImputer

# Separate features and target
X = df.drop('disease', axis=1)
y = df['disease']

# Identify numeric & categorical columns
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

# Preprocessing pipeline:
# - numeric: median impute + standardization
# - categorical: most-frequent impute + one-hot encoding

num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

pre = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
])

# Create train/test split
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.20, random_state=SEED)

# Logistic Regression pipeline
log_reg = Pipeline([
    ('pre', pre),
    ('model', LogisticRegression(max_iter=500, random_state=SEED))
])

# Fit model
log_reg.fit(Xtr, ytr)

# Predict class labels and probabilities
yhat = log_reg.predict(Xte)
yprob = log_reg.predict_proba(Xte)[:, 1]

# Display first few predictions
yprob[:10]
"""))

# ---------------------------------------------------------
# Extract Coefficients
# ---------------------------------------------------------
# The lookup is split over two statements: a backslash line continuation
# inside this (non-raw) builder string would be consumed by the builder
# itself and never reach the generated cell.
cells.append(code("""
# --- Extract Logistic Regression Coefficients ---

# Get underlying model
lr_model = log_reg.named_steps['model']

# Get one-hot-encoded categorical names from the fitted preprocessor
onehot = log_reg.named_steps['pre'].named_transformers_['cat'].named_steps['onehot']
cat_features = onehot.get_feature_names_out(cat_cols)

# Combined feature names after preprocessing
feature_names = np.concatenate([num_cols, cat_features])

coef_table = pd.DataFrame({
    "Feature": feature_names,
    "Coefficient": lr_model.coef_.flatten()
}).sort_values("Coefficient", ascending=False)

coef_table
"""))

# ---------------------------------------------------------
# Threshold tuning
# ---------------------------------------------------------
cells.append(code("""
# --- Threshold Tuning Demo ---

thresholds = [0.3, 0.5, 0.7]

for t in thresholds:
    preds_t = (yprob >= t).astype(int)
    # zero_division=0: report 0 instead of warning if a threshold predicts no positives
    prec = precision_score(yte, preds_t, zero_division=0)
    rec  = recall_score(yte, preds_t)
    print(f"Threshold={t:.2f} → Precision={prec:.3f}, Recall={rec:.3f}")
"""))

# ---------------------------------------------------------
# Confusion Matrix + Metrics
# ---------------------------------------------------------
cells.append(code("""
# --- Classification Metrics ---

cm = confusion_matrix(yte, yhat)

acc = accuracy_score(yte, yhat)
prec = precision_score(yte, yhat)
rec = recall_score(yte, yhat)
f1 = f1_score(yte, yhat)
auc = roc_auc_score(yte, yprob)

print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1 Score:", f1)
print("ROC AUC:", auc)

# Display confusion matrix
cm
"""))

# ---------------------------------------------------------
# Markdown: Placeholder for later interpretation
# ---------------------------------------------------------
cells.append(md("""
### ✍️ Interpretation Notes (to be completed after running the section)

- Discuss which features have the strongest positive/negative coefficients  
- Explain probability outputs and what a 0.7 threshold means  
- Compare precision/recall trade-offs  
- Interpret the confusion matrix  
- Explain ROC AUC in plain English  
"""))

# ---------------------------------------------------------
# Append Builder 3
# ---------------------------------------------------------
nb.cells.extend(cells)

with open(OUT_PATH, "w", encoding="utf-8") as f:
    f.write(nbf.writes(nb))

print("Builder 3 applied — Section 3.2 Logistic Regression added to Module_03_Master.")


Builder 3 applied — Section 3.2 Logistic Regression added to Module_03_Master.


In [4]:
# === Builder 4: Append Section 3.3 — Decision Trees ===

import os, nbformat as nbf

OUT_DIR = "module3_master"
OUT_PATH = os.path.join(OUT_DIR, "Module_03_Master.ipynb")

# Read the existing notebook inside a context manager so the file handle is
# closed before the same path is reopened for writing at the end of this cell.
with open(OUT_PATH, "r", encoding="utf-8") as f:
    nb = nbf.read(f, as_version=4)

def md(txt):
    """Return a new nbformat v4 markdown cell containing ``txt``."""
    return nbf.v4.new_markdown_cell(txt)

def code(txt):
    """Return a new nbformat v4 code cell containing ``txt``."""
    return nbf.v4.new_code_cell(txt)

# Cells collected here are appended to the notebook at the end of the builder.
cells = []

# ---------------------------------------------------------
# Section Header
# ---------------------------------------------------------
cells.append(md("""
# 3.3 — Decision Trees (Heart Disease)

In this section, we fit a **decision tree classifier** to the heart disease dataset.
We will:

- Use the same preprocessing pipeline as logistic regression  
- Fit a decision tree  
- Visualize the tree structure  
- Interpret decision paths  
- Compare training vs test accuracy  
- Explore model complexity (max_depth)  
"""))

# ---------------------------------------------------------
# Fit Decision Tree Classifier
# ---------------------------------------------------------
# Generated code uses the notebook-level SEED constant rather than a
# repeated literal so the seed is controlled from one place.
cells.append(code("""
# --- 3.3 Decision Tree Classifier ---

# Reuse preprocessing from logistic regression section
# (num_cols, cat_cols, pre, Xtr, Xte, ytr, yte already defined)

tree = Pipeline([
    ('pre', pre),
    ('model', DecisionTreeClassifier(max_depth=4, random_state=SEED))
])

tree.fit(Xtr, ytr)

# Predictions
yhat_tree = tree.predict(Xte)

# Performance metrics
acc = accuracy_score(yte, yhat_tree)
prec = precision_score(yte, yhat_tree)
rec = recall_score(yte, yhat_tree)
f1 = f1_score(yte, yhat_tree)

print("Decision Tree Metrics (depth=4)")
print("Accuracy :", acc)
print("Precision:", prec)
print("Recall   :", rec)
print("F1 Score :", f1)
"""))

# ---------------------------------------------------------
# Visualization of the Tree
# ---------------------------------------------------------
cells.append(code("""
# --- Visualize the Decision Tree ---

plt.figure(figsize=(16, 6))
plot_tree(
    tree.named_steps['model'],
    feature_names=num_cols + list(
        tree.named_steps['pre']
        .named_transformers_['cat']
        .named_steps['onehot']
        .get_feature_names_out(cat_cols)
    ),
    class_names=["No Disease", "Disease"],
    filled=True,
    rounded=True,
    fontsize=8
)
plt.show()
"""))

# ---------------------------------------------------------
# Train vs Test Accuracy (Depth Experiment)
# ---------------------------------------------------------
cells.append(code("""
# --- Train vs Test Accuracy for different tree depths ---

depths = [1, 2, 3, 4, 5, 6, 8, 10]

train_acc = []
test_acc = []

for d in depths:
    model = Pipeline([
        ('pre', pre),
        ('model', DecisionTreeClassifier(max_depth=d, random_state=SEED))
    ])
    model.fit(Xtr, ytr)
    train_acc.append(model.score(Xtr, ytr))
    test_acc.append(model.score(Xte, yte))

print("Depths tested:", depths)
print("Train Accuracy:", train_acc)
print("Test Accuracy :", test_acc)
"""))

# ---------------------------------------------------------
# Placeholder for later interpretation
# ---------------------------------------------------------
cells.append(md("""
### ✍️ Interpretation Notes (to complete after running Section 3.3)

- What features appear near the top of the tree? Why?  
- Which splits seem most important?  
- Compare training vs test accuracy:  
  - Where does overfitting start?  
  - Which depth gives the best generalization?  
- Discuss the difference between logistic regression’s linear boundary 
  and the nonlinear, rule-based structure of a tree.

---
"""))

# ---------------------------------------------------------
# Append content and save notebook
# ---------------------------------------------------------
nb.cells.extend(cells)

with open(OUT_PATH, "w", encoding="utf-8") as f:
    f.write(nbf.writes(nb))

print("Builder 4 applied — Section 3.3 Decision Trees added to Module_03_Master.")


Builder 4 applied — Section 3.3 Decision Trees added to Module_03_Master.


In [5]:
# === Builder 5: Append Section 3.4 — Ensemble Methods ===

import os, nbformat as nbf

OUT_DIR = "module3_master"
OUT_PATH = os.path.join(OUT_DIR, "Module_03_Master.ipynb")

# Read the existing notebook inside a context manager so the file handle is
# closed before the same path is reopened for writing at the end of this cell.
with open(OUT_PATH, "r", encoding="utf-8") as f:
    nb = nbf.read(f, as_version=4)

def md(txt):
    """Return a new nbformat v4 markdown cell containing ``txt``."""
    return nbf.v4.new_markdown_cell(txt)

def code(txt):
    """Return a new nbformat v4 code cell containing ``txt``."""
    return nbf.v4.new_code_cell(txt)

# Cells collected here are appended to the notebook at the end of the builder.
cells = []

# ---------------------------------------------------------
# Section Header
# ---------------------------------------------------------
cells.append(md("""
# 3.4 — Ensemble Methods (Random Forest & Boosting)

In this section we explore **ensemble methods**, which combine multiple models to improve
accuracy, stability, and generalization.

We will train and compare:
- **Random Forest** (bagging-based ensemble)  
- **Gradient Boosting** (boosting-based model, similar to XGBoost)

We will examine:
- Performance metrics  
- Feature importances  
- Differences in model behavior  
"""))

# ---------------------------------------------------------
# Random Forest & Gradient Boosting Fit
# ---------------------------------------------------------
# Generated code uses the notebook-level SEED constant rather than a
# repeated literal so the seed is controlled from one place.
cells.append(code("""
# --- 3.4 Ensemble Models: Random Forest & Gradient Boosting ---

# Random Forest Classifier (Bagging)
rf = Pipeline([
    ('pre', pre),
    ('model', RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        random_state=SEED
    ))
])

# Gradient Boosting Classifier (Boosting; XGBoost-like behavior)
gb = Pipeline([
    ('pre', pre),
    ('model', GradientBoostingClassifier(
        learning_rate=0.05,
        n_estimators=200,
        max_depth=3,
        random_state=SEED
    ))
])

# Fit models
rf.fit(Xtr, ytr)
gb.fit(Xtr, ytr)

# Predictions
yhat_rf = rf.predict(Xte)
yhat_gb = gb.predict(Xte)

yprob_rf = rf.predict_proba(Xte)[:, 1]
yprob_gb = gb.predict_proba(Xte)[:, 1]

# Performance Metrics
def metrics_dict(name, yhat, yprob):
    # One summary row per model, scored against the shared test labels yte
    return {
        "Model": name,
        "Accuracy": accuracy_score(yte, yhat),
        "Precision": precision_score(yte, yhat),
        "Recall": recall_score(yte, yhat),
        "F1": f1_score(yte, yhat),
        "AUC": roc_auc_score(yte, yprob),
    }

ensemble_results = pd.DataFrame([
    metrics_dict("Random Forest", yhat_rf, yprob_rf),
    metrics_dict("Gradient Boosting", yhat_gb, yprob_gb)
])

ensemble_results
"""))

# ---------------------------------------------------------
# Feature Importances (RF + GB)
# ---------------------------------------------------------
# Feature names are read from the preprocessor step of the fitted pipeline
# (as Sections 3.2 and 3.3 do) instead of the free-standing `pre` variable,
# and the two panels share one figure with a single tight_layout call.
cells.append(code("""
# --- 3.4 Feature Importances ---

# Extract feature names after one-hot encoding from the fitted pipeline
onehot = rf.named_steps['pre'].named_transformers_['cat'].named_steps['onehot']
cat_features = onehot.get_feature_names_out(cat_cols)
feature_names = np.concatenate([num_cols, cat_features])

# Random Forest importances
rf_importances = rf.named_steps['model'].feature_importances_

# Gradient Boosting importances
gb_importances = gb.named_steps['model'].feature_importances_

# Plot importances side-by-side
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

axes[0].barh(feature_names, rf_importances)
axes[0].set_title("Random Forest Feature Importances")

axes[1].barh(feature_names, gb_importances)
axes[1].set_title("Gradient Boosting Feature Importances")

fig.tight_layout()
plt.show()
"""))

# ---------------------------------------------------------
# Placeholder Markdown for interpretation
# ---------------------------------------------------------
cells.append(md("""
### ✍️ Interpretation Notes (to complete after running Section 3.4)

- Compare Random Forest vs Gradient Boosting metrics  
- Discuss why boosting sometimes outperforms bagging  
- Examine which features are most important and why  
- Connect results to the conceptual slides on bagging vs boosting  
- Note differences in model stability and overfitting behavior  

---
"""))

# ---------------------------------------------------------
# Append Builder 5 output to notebook
# ---------------------------------------------------------
nb.cells.extend(cells)

with open(OUT_PATH, "w", encoding="utf-8") as f:
    f.write(nbf.writes(nb))

print("Builder 5 applied — Section 3.4 (Ensemble Methods) added to Module_03_Master.")


Builder 5 applied — Section 3.4 (Ensemble Methods) added to Module_03_Master.


In [6]:
# === Builder 6: Append Section 3.5 — Classification Metrics (ROC, PR Curve, Confusion Matrix) ===

import os, nbformat as nbf

OUT_DIR = "module3_master"
OUT_PATH = os.path.join(OUT_DIR, "Module_03_Master.ipynb")

# Read the existing notebook inside a context manager so the file handle is
# closed before the same path is reopened for writing at the end of this cell.
with open(OUT_PATH, "r", encoding="utf-8") as f:
    nb = nbf.read(f, as_version=4)

def md(txt):
    """Return a new nbformat v4 markdown cell containing ``txt``."""
    return nbf.v4.new_markdown_cell(txt)

def code(txt):
    """Return a new nbformat v4 code cell containing ``txt``."""
    return nbf.v4.new_code_cell(txt)

# Cells collected here are appended to the notebook at the end of the builder.
cells = []

# ---------------------------------------------------------
# Markdown Header
# ---------------------------------------------------------
cells.append(md("""
# 3.5 — Classification Metrics  
Confusion Matrix • ROC Curve • Precision–Recall Curve • Threshold Effects

In this section we evaluate classifiers using key metrics for binary classification.

We will:
- Visualize the **confusion matrix**  
- Plot the **ROC curve** and compute **AUC**  
- Plot the **Precision–Recall curve**  
- Explore how **threshold changes** affect performance  
"""))

# ---------------------------------------------------------
# Confusion Matrix Visualization
# ---------------------------------------------------------
cells.append(code("""
# --- 3.5 Confusion Matrix (Visualization) ---

# yhat (class predictions) and yprob (probabilities) were created in Section 3.2

cm = confusion_matrix(yte, yhat)

plt.figure(figsize=(5,4))
plt.imshow(cm, cmap='Blues')
plt.title("Confusion Matrix — Logistic Regression")
plt.colorbar()

plt.xticks([0,1], ["Pred 0", "Pred 1"])
plt.yticks([0,1], ["True 0", "True 1"])

for (i, j), value in np.ndenumerate(cm):
    plt.text(j, i, f"{value}", ha='center', va='center', fontsize=14)

plt.show()

cm
"""))

# ---------------------------------------------------------
# ROC Curve
# ---------------------------------------------------------
# The thresholds returned by roc_curve get their own name so they do not
# overwrite the `thresholds` list defined by the Section 3.2 demo.
cells.append(code("""
# --- ROC Curve & AUC ---

fpr, tpr, roc_thresholds = roc_curve(yte, yprob)
auc = roc_auc_score(yte, yprob)

plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f"AUC = {auc:.3f}")
plt.plot([0,1], [0,1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve — Logistic Regression")
plt.legend()
plt.show()
"""))

# ---------------------------------------------------------
# Precision–Recall Curve
# ---------------------------------------------------------
cells.append(code("""
# --- Precision–Recall Curve ---

precisions, recalls, pr_thresholds = precision_recall_curve(yte, yprob)

plt.figure(figsize=(6,5))
plt.plot(recalls, precisions)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision–Recall Curve — Logistic Regression")
plt.show()
"""))

# ---------------------------------------------------------
# Threshold Effects (Precision vs Recall)
# ---------------------------------------------------------
cells.append(code("""
# --- Threshold Effects on Precision & Recall ---

test_thresholds = [0.2, 0.5, 0.8]

for t in test_thresholds:
    preds = (yprob >= t).astype(int)
    # zero_division=0: report 0 instead of warning if a threshold predicts no positives
    prec = precision_score(yte, preds, zero_division=0)
    rec  = recall_score(yte, preds)
    print(f"Threshold={t:.2f} → Precision={prec:.3f} | Recall={rec:.3f}")
"""))

# ---------------------------------------------------------
# Placeholder Markdown
# ---------------------------------------------------------
cells.append(md("""
### ✍️ Interpretation Notes (to complete after running Section 3.5)

- What do the confusion matrix values mean in context?  
- Explain the shape of the ROC curve and why AUC matters  
- Compare ROC vs Precision–Recall curves  
- Explain how lowering or raising the threshold changes FP, FN, precision, and recall  
- Provide examples of when you would prefer:  
  - High recall  
  - High precision  
  - High AUC  

---
"""))

# ---------------------------------------------------------
# Append & Save Notebook
# ---------------------------------------------------------
nb.cells.extend(cells)

with open(OUT_PATH, "w", encoding="utf-8") as f:
    f.write(nbf.writes(nb))

print("Builder 6 applied — Section 3.5 Classification Metrics added to Module_03_Master.")


Builder 6 applied — Section 3.5 Classification Metrics added to Module_03_Master.


In [7]:
# === Builder 7: Append Section 3.6 — Model Tuning & Validation ===

import os, nbformat as nbf

OUT_DIR = "module3_master"
OUT_PATH = os.path.join(OUT_DIR, "Module_03_Master.ipynb")

# Load existing notebook. The context manager closes the file handle before
# the same path is reopened for writing at the end of this cell.
with open(OUT_PATH, "r", encoding="utf-8") as f:
    nb = nbf.read(f, as_version=4)

def md(txt):
    """Return a new nbformat v4 markdown cell containing ``txt``."""
    return nbf.v4.new_markdown_cell(txt)

def code(txt):
    """Return a new nbformat v4 code cell containing ``txt``."""
    return nbf.v4.new_code_cell(txt)

# Cells collected here are appended to the notebook at the end of the builder.
cells = []

# ---------------------------------------------------------
# Section Header
# ---------------------------------------------------------
cells.append(md("""
# 3.6 — Model Tuning & Validation

In this section we explore **model tuning**, **validation curves**, and 
underfitting vs. overfitting. We will:

- Tune Logistic Regression using the regularization parameter **C**
- Tune Decision Trees using **max_depth**
- Use `validation_curve` for visual inspection
- Use GridSearchCV for deeper searches
"""))

# ---------------------------------------------------------
# Validation Curve — Logistic Regression (C parameter)
# ---------------------------------------------------------
# validation_curve is already imported by the notebook's imports cell, so
# the generated code does not import it again.
cells.append(code("""
# --- 3.6 Validation Curve: Logistic Regression C parameter ---

# Values of C to test (inverse of regularization strength)
C_values = np.logspace(-3, 3, 7)

train_scores, val_scores = validation_curve(
    estimator=log_reg,
    X=Xtr, y=ytr,
    param_name="model__C",
    param_range=C_values,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

train_mean = train_scores.mean(axis=1)
val_mean = val_scores.mean(axis=1)

plt.figure(figsize=(6,4))
plt.semilogx(C_values, train_mean, marker="o", label="Train Accuracy")
plt.semilogx(C_values, val_mean, marker="s", label="Validation Accuracy")
plt.xlabel("C (inverse of regularization strength)")
plt.ylabel("Accuracy")
plt.title("Validation Curve — Logistic Regression")
plt.legend()
plt.show()
"""))

# ---------------------------------------------------------
# Validation Curve — Decision Tree (max_depth)
# ---------------------------------------------------------
cells.append(code("""
# --- 3.6 Validation Curve: Decision Tree max_depth ---

depth_range = [1, 2, 3, 4, 5, 6, 8, 10]

train_scores, val_scores = validation_curve(
    estimator=Pipeline([('pre', pre), ('model', DecisionTreeClassifier(random_state=SEED))]),
    X=Xtr, y=ytr,
    param_name="model__max_depth",
    param_range=depth_range,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

train_mean = train_scores.mean(axis=1)
val_mean = val_scores.mean(axis=1)

plt.figure(figsize=(6,4))
plt.plot(depth_range, train_mean, marker="o", label="Train Accuracy")
plt.plot(depth_range, val_mean, marker="s", label="Validation Accuracy")
plt.xlabel("Tree max_depth")
plt.ylabel("Accuracy")
plt.title("Validation Curve — Decision Tree")
plt.legend()
plt.show()
"""))

# ---------------------------------------------------------
# Simple GridSearchCV — Logistic Regression C
# ---------------------------------------------------------
cells.append(code("""
# --- 3.6 GridSearchCV: Logistic Regression C ---

grid_C = {"model__C": np.logspace(-3, 3, 10)}

gs_log = GridSearchCV(
    estimator=log_reg,
    param_grid=grid_C,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

gs_log.fit(Xtr, ytr)

print("Best C:", gs_log.best_params_['model__C'])
print("Best CV Accuracy:", gs_log.best_score_)

best_log = gs_log.best_estimator_
print("Test Accuracy:", best_log.score(Xte, yte))
"""))

# ---------------------------------------------------------
# Simple GridSearchCV — Decision Tree Depth
# ---------------------------------------------------------
cells.append(code("""
# --- 3.6 GridSearchCV: Decision Tree max_depth ---

param_depth = {"model__max_depth": [1, 2, 3, 4, 5, 6, 8, 10]}

gs_tree = GridSearchCV(
    estimator=Pipeline([('pre', pre), ('model', DecisionTreeClassifier(random_state=SEED))]),
    param_grid=param_depth,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

gs_tree.fit(Xtr, ytr)

print("Best max_depth:", gs_tree.best_params_['model__max_depth'])
print("Best CV Accuracy:", gs_tree.best_score_)

best_tree = gs_tree.best_estimator_
print("Test Accuracy:", best_tree.score(Xte, yte))
"""))

# ---------------------------------------------------------
# Placeholder Markdown for future explanation
# ---------------------------------------------------------
cells.append(md("""
### ✍️ Interpretation Notes (to fill after running Section 3.6)

- In the logistic regression validation curve:
  - Why does accuracy drop for very small C?
  - Why might accuracy drop for very large C?
- In the decision tree validation curve:
  - Which depth leads to overfitting?
  - Which depth appears to generalize best?
- Compare GridSearchCV’s selected hyperparameters to the validation curve insights.
- Discuss the trade-offs between:
  - Underfitting vs Overfitting  
  - Simplicity vs Complexity  
  - Accuracy vs Interpretability  

---
"""))

# ---------------------------------------------------------
# Append cells and save
# ---------------------------------------------------------
nb.cells.extend(cells)

with open(OUT_PATH, "w", encoding="utf-8") as f:
    f.write(nbf.writes(nb))

print("Builder 7 applied — Section 3.6 added to Module_03_Master. Module 3 build complete!")


Builder 7 applied — Section 3.6 added to Module_03_Master. Module 3 build complete!


In [8]:
# === Builder 3.7A: Create Hands-On Exercise A (Heart Disease Classification) ===

import os, nbformat as nbf

# Output locations for the exercise notebook; the folder is created if absent.
OUT_DIR = "module3_master"
OUT_PATH = os.path.join(OUT_DIR, "03_Hands_On_Exercise_A_HeartDisease.ipynb")
os.makedirs(OUT_DIR, exist_ok=True)


def md(txt):
    """Wrap ``txt`` in a new nbformat v4 markdown cell and return it."""
    cell = nbf.v4.new_markdown_cell(txt)
    return cell


def code(txt):
    """Wrap ``txt`` in a new nbformat v4 code cell and return it."""
    cell = nbf.v4.new_code_cell(txt)
    return cell


# Start from an empty cell list and a fresh, empty notebook.
cells = []
nb = nbf.v4.new_notebook()

# -----------------------------------------------------------
# TITLE
# -----------------------------------------------------------
cells.append(md("""
# üß™ Module 3 ‚Äî Hands-On Exercise A  
## Heart Disease Classification (Logistic Regression ‚Ä¢ Trees ‚Ä¢ Ensembles ‚Ä¢ Metrics)

### Goal
- Compare **four classifiers** side-by-side  
- Practice evaluating with **multiple metrics** (not just accuracy)  
- Explore **thresholding**, **ROC curves**, and **model tuning**  
- Gain intuition for **trade-offs** between interpretability and performance  
"""))

# -----------------------------------------------------------
# IMPORTS + DATASET LOAD
# -----------------------------------------------------------
# Generated setup cell: all imports used by the exercise (including
# SimpleImputer, needed by Step 2) followed by the seeded dataset load.
cells.append(code("""
# --- Imports ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, precision_recall_curve, confusion_matrix
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Dataset helper
from datasets_module3 import make_heart_disease_synth

SEED = 1955

# --- Step 1: Load Dataset ---
df = make_heart_disease_synth(n=600, seed=SEED)
df.head()
"""))

# Markdown prompt asking the student to inspect the loaded frame.
cells.append(md("""
### 🔍 Step 1 — Explore the Dataset
Use `df.head()`, `df.info()`, and `df.describe()` to understand the features and the target (`disease`).
"""))

# -----------------------------------------------------------
# STEP 2 — CLEAN & PREPARE
# -----------------------------------------------------------
cells.append(md("""
## 🧼 Step 2 — Clean & Prepare the Data
We will:
- Drop missing targets  
- Identify numeric and categorical columns  
- Build a preprocessing pipeline (impute + scale/encode)  
"""))

# Generated Step 2 cell: drops unlabeled rows (as the markdown above promises),
# splits features from the target, and builds the shared `pre` transformer that
# every later model pipeline reuses.
cells.append(code("""
# --- Step 2: Clean & Prepare ---

# Rows without a label cannot be used for supervised training.
df_clean = df.dropna(subset=['disease'])

X = df_clean.drop('disease', axis=1)
y = df_clean['disease']

num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

pre = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
])
"""))

# -----------------------------------------------------------
# STEP 3 — TRAIN/TEST SPLIT
# -----------------------------------------------------------
cells.append(md("## 🔀 Step 3 — Train/Test Split"))

# 80/20 hold-out split. `stratify=y` keeps the class proportions equal in both
# partitions, matching the split used in Exercise C.
cells.append(code("""
Xtr, Xte, ytr, yte = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)
Xtr.shape, Xte.shape
"""))

# -----------------------------------------------------------
# STEP 4 — BASELINE LOGISTIC REGRESSION
# -----------------------------------------------------------
cells.append(md("## ⚙️ Step 4 — Logistic Regression (Baseline)"))

# Generated cell: fits preprocessing + logistic regression on the training
# split and prints five test-set metrics. Column 1 of predict_proba is used
# as the score for AUC.
cells.append(code("""
log_reg = Pipeline([
    ('pre', pre),
    ('model', LogisticRegression(max_iter=500, random_state=SEED))
])

log_reg.fit(Xtr, ytr)
yhat_lr = log_reg.predict(Xte)
yprob_lr = log_reg.predict_proba(Xte)[:, 1]

# Metrics
print("Logistic Regression Metrics:")
print("Accuracy :", accuracy_score(yte, yhat_lr))
print("Precision:", precision_score(yte, yhat_lr))
print("Recall   :", recall_score(yte, yhat_lr))
print("F1 Score :", f1_score(yte, yhat_lr))
print("AUC      :", roc_auc_score(yte, yprob_lr))
"""))

# -----------------------------------------------------------
# STEP 5 — DECISION TREE
# -----------------------------------------------------------
cells.append(md("## 🌳 Step 5 — Decision Tree Classifier"))

# Generated cell: same preprocessing, depth-4 decision tree, same five metrics
# as the logistic-regression cell so the two can be compared directly.
cells.append(code("""
tree = Pipeline([
    ('pre', pre),
    ('model', DecisionTreeClassifier(max_depth=4, random_state=SEED))
])

tree.fit(Xtr, ytr)
yhat_tree = tree.predict(Xte)
yprob_tree = tree.predict_proba(Xte)[:, 1]

print("Decision Tree Metrics:")
print("Accuracy :", accuracy_score(yte, yhat_tree))
print("Precision:", precision_score(yte, yhat_tree))
print("Recall   :", recall_score(yte, yhat_tree))
print("F1 Score :", f1_score(yte, yhat_tree))
print("AUC      :", roc_auc_score(yte, yprob_tree))
"""))

# -----------------------------------------------------------
# STEP 6 — ENSEMBLE MODELS
# -----------------------------------------------------------
cells.append(md("## 🌲 Step 6 — Random Forest & Gradient Boosting"))

# Generated cell: fits both ensembles on the training split and prints their
# test AUC; the predictions are reused by the metrics table and ROC cells.
cells.append(code("""
# Random Forest
rf = Pipeline([
    ('pre', pre),
    ('model', RandomForestClassifier(n_estimators=200, random_state=SEED))
])

# Gradient Boosting
gb = Pipeline([
    ('pre', pre),
    ('model', GradientBoostingClassifier(
        learning_rate=0.05,
        n_estimators=200,
        max_depth=3,
        random_state=SEED))
])

rf.fit(Xtr, ytr)
gb.fit(Xtr, ytr)

yhat_rf = rf.predict(Xte)
yprob_rf = rf.predict_proba(Xte)[:, 1]

yhat_gb = gb.predict(Xte)
yprob_gb = gb.predict_proba(Xte)[:, 1]

print("Random Forest AUC:", roc_auc_score(yte, yprob_rf))
print("Gradient Boosting AUC:", roc_auc_score(yte, yprob_gb))
"""))

# -----------------------------------------------------------
# STEP 7 — METRICS SUMMARY TABLE
# -----------------------------------------------------------
cells.append(md("## 📊 Step 7 — Compare All Models (Metrics Table)"))

# Generated cell: one row of test-set metrics per model, displayed sorted by
# AUC (best first). `evaluate` reads the global `yte` from the split cell.
cells.append(code("""
def evaluate(name, pred, prob):
    return {
        "Model": name,
        "Accuracy": accuracy_score(yte, pred),
        "Precision": precision_score(yte, pred),
        "Recall": recall_score(yte, pred),
        "F1": f1_score(yte, pred),
        "AUC": roc_auc_score(yte, prob)
    }

results = pd.DataFrame([
    evaluate("Logistic Regression", yhat_lr, yprob_lr),
    evaluate("Decision Tree", yhat_tree, yprob_tree),
    evaluate("Random Forest", yhat_rf, yprob_rf),
    evaluate("Gradient Boosting", yhat_gb, yprob_gb)
])

results.sort_values("AUC", ascending=False)
"""))

# -----------------------------------------------------------
# STEP 8 — ROC CURVES FOR ALL MODELS
# -----------------------------------------------------------
cells.append(md("## 📈 Step 8 — ROC Curves (All Models)"))

# Generated cell: overlays the ROC curve of each model on one figure; the
# dashed diagonal is the chance line.
cells.append(code("""
plt.figure(figsize=(7,6))

for name, prob in [
    ("Logistic", yprob_lr),
    ("Tree", yprob_tree),
    ("RF", yprob_rf),
    ("GB", yprob_gb)
]:
    fpr, tpr, _ = roc_curve(yte, prob)
    plt.plot(fpr, tpr, label=name)

plt.plot([0,1],[0,1],'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves — All Models")
plt.legend()
plt.show()
"""))

# -----------------------------------------------------------
# STEP 9 — GRID SEARCH
# -----------------------------------------------------------
cells.append(md("## 🛠️ Step 9 — Model Tuning (Grid Search)"))

# Generated cell: 5-fold grid search over the tree depth. The final print
# scores the refit best pipeline on the held-out test set so the reflection
# question about the effect of tuning can be answered from the output.
cells.append(code("""
params = {"model__max_depth": [2,3,4,5,6,8]}

gs_tree = GridSearchCV(
    Pipeline([('pre', pre), ('model', DecisionTreeClassifier(random_state=SEED))]),
    param_grid=params,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

gs_tree.fit(Xtr, ytr)

print("Best Tree Depth:", gs_tree.best_params_['model__max_depth'])
print("Best CV Acc    :", gs_tree.best_score_)
print("Test Acc (best tree):", gs_tree.best_estimator_.score(Xte, yte))
"""))

# -----------------------------------------------------------
# STEP 10 — REFLECTION
# -----------------------------------------------------------
# Closing markdown cell with discussion prompts; ends with a horizontal rule.
cells.append(md("""
## 🧠 Step 10 — Reflection Questions

- Which model performed best overall? Why?  
- Which metric (Accuracy, Precision, Recall, F1, AUC) changed your opinion the most?  
- When might you prefer Logistic Regression over Random Forest?  
- Would you deploy Gradient Boosting if interpretability mattered?  
- How did tuning the Decision Tree affect performance?  

---
"""))

# -----------------------------------------------------------
# SAVE THE NOTEBOOK
# -----------------------------------------------------------
# Attach the accumulated cells, then serialize the notebook to OUT_PATH.
nb["cells"] = cells

with open(OUT_PATH, mode="w", encoding="utf-8") as notebook_file:
    serialized = nbf.writes(nb)
    notebook_file.write(serialized)

print("3.7A Hands-On Exercise created at:", OUT_PATH)


3.7A Hands-On Exercise created at: module3_master/03_Hands_On_Exercise_A_HeartDisease.ipynb


In [9]:
# === Builder 3.7B: Create Hands-On Exercise B (Spam Classification) ===

import os, nbformat as nbf

# Destination folder and file name of the generated exercise notebook.
OUT_DIR = "module3_master"
OUT_PATH = os.path.join(OUT_DIR, "03_Hands_On_Exercise_B_Spam.ipynb")
os.makedirs(OUT_DIR, exist_ok=True)


def md(text):
    """Return an nbformat v4 markdown cell whose source is ``text``."""
    markdown_cell = nbf.v4.new_markdown_cell(text)
    return markdown_cell


def code(text):
    """Return an nbformat v4 code cell whose source is ``text``."""
    code_cell = nbf.v4.new_code_cell(text)
    return code_cell


# Empty v4 notebook plus the list that accumulates its cells in order.
nb, cells = nbf.v4.new_notebook(), []

# -----------------------------------------------------------
# TITLE
# -----------------------------------------------------------
# Opening markdown cell: exercise title, goals, and a note on the dataset.
# The trailing double spaces are markdown hard line breaks.
cells.append(md("""
# 🧪 Module 3 — Hands-On Exercise B  
## Spam Classification (Binary Classification)

### Goal
- Compare **four classifiers** on a spam-detection dataset  
- Focus on *probabilities, thresholds, and trade-offs*  
- Assess models using **precision, recall, F1, AUC, accuracy**  
- Visualize **ROC curves**  
- Perform simple **hyperparameter tuning**

The dataset is synthetic but realistic, inspired by engineered text features
(e.g., number of links, caps, free-domain senders, spammy words, etc.).
"""))

# -----------------------------------------------------------
# IMPORTS + DATA LOADING
# -----------------------------------------------------------
# Generated setup cell: all imports used by the exercise followed by the
# seeded dataset load.
cells.append(code("""
# --- Imports ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, precision_recall_curve, confusion_matrix
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer

# Dataset helper
from datasets_module3 import make_spam_synth

SEED = 1955

# --- Step 1: Load Dataset ---
df = make_spam_synth(n=1000, seed=SEED)
df.head()
"""))

# Markdown prompt for exploring the frame. The code fence must be closed,
# otherwise the sentence after it renders as part of the code block.
cells.append(md("""
### 🔍 Step 1 — Explore the Dataset
Use:
```python
df.info()
df.describe()
df['spam'].value_counts()
```
to understand feature distributions and class balance.
"""))

# -----------------------------------------------------------
# STEP 2 — CLEAN & PREPARE
# -----------------------------------------------------------
cells.append(md("""
## 🧼 Step 2 — Clean & Prepare the Data

We will:

- Identify numeric and categorical columns
- Build preprocessing pipeline
  - median imputation (numeric)
  - most-frequent imputation (categorical)
  - scaling + one-hot
"""))

# Generated Step 2 cell: splits features from the `spam` target and builds the
# shared `pre` transformer reused by every later model pipeline. The banner
# line must stay a `#` comment or the generated cell fails to parse.
cells.append(code("""
# --- Step 2: Clean & Prepare ---

X = df.drop('spam', axis=1)
y = df['spam']

num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

pre = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
])
"""))

# -----------------------------------------------------------
# STEP 3 — SPLIT
# -----------------------------------------------------------
cells.append(md("## 🔀 Step 3 — Train/Test Split"))

# 80/20 hold-out split. `stratify=y` keeps the spam/non-spam proportions equal
# in both partitions, matching the split used in Exercise C.
cells.append(code("""
Xtr, Xte, ytr, yte = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)
Xtr.shape, Xte.shape
"""))

# -----------------------------------------------------------
# STEP 4 — LOGISTIC REGRESSION
# -----------------------------------------------------------
cells.append(md("## ⚙️ Step 4 — Logistic Regression (Baseline)"))

# Generated cell: fits preprocessing + logistic regression on the training
# split and prints five test-set metrics. Column 1 of predict_proba is used
# as the score for AUC.
cells.append(code("""
log_reg = Pipeline([
    ('pre', pre),
    ('model', LogisticRegression(max_iter=500, random_state=SEED))
])

log_reg.fit(Xtr, ytr)
yhat_lr = log_reg.predict(Xte)
yprob_lr = log_reg.predict_proba(Xte)[:, 1]

print("Logistic Regression Metrics:")
print("Accuracy :", accuracy_score(yte, yhat_lr))
print("Precision:", precision_score(yte, yhat_lr))
print("Recall :", recall_score(yte, yhat_lr))
print("F1 Score :", f1_score(yte, yhat_lr))
print("AUC :", roc_auc_score(yte, yprob_lr))
"""))

# -----------------------------------------------------------
# STEP 5 — DECISION TREE
# -----------------------------------------------------------
cells.append(md("## 🌳 Step 5 — Decision Tree Classifier"))

# Generated cell: same preprocessing, depth-4 decision tree, same five metrics
# as the logistic-regression cell so the two can be compared directly.
cells.append(code("""
tree = Pipeline([
    ('pre', pre),
    ('model', DecisionTreeClassifier(max_depth=4, random_state=SEED))
])

tree.fit(Xtr, ytr)
yhat_tree = tree.predict(Xte)
yprob_tree = tree.predict_proba(Xte)[:, 1]

print("Decision Tree Metrics:")
print("Accuracy :", accuracy_score(yte, yhat_tree))
print("Precision:", precision_score(yte, yhat_tree))
print("Recall :", recall_score(yte, yhat_tree))
print("F1 Score :", f1_score(yte, yhat_tree))
print("AUC :", roc_auc_score(yte, yprob_tree))
"""))

# -----------------------------------------------------------
# STEP 6 — ENSEMBLES
# -----------------------------------------------------------
cells.append(md("## 🌲 Step 6 — Random Forest & Gradient Boosting"))

# Generated cell: fits both ensembles and prints their test AUC. The
# "Random Forest" / "Gradient Boosting" labels must be `#` comments; as bare
# text they make the generated cell a SyntaxError.
cells.append(code("""
# Random Forest
rf = Pipeline([
    ('pre', pre),
    ('model', RandomForestClassifier(n_estimators=200, random_state=SEED))
])

# Gradient Boosting
gb = Pipeline([
    ('pre', pre),
    ('model', GradientBoostingClassifier(
        learning_rate=0.05,
        n_estimators=200,
        max_depth=3,
        random_state=SEED))
])

rf.fit(Xtr, ytr)
gb.fit(Xtr, ytr)

yhat_rf = rf.predict(Xte)
yprob_rf = rf.predict_proba(Xte)[:, 1]

yhat_gb = gb.predict(Xte)
yprob_gb = gb.predict_proba(Xte)[:, 1]

print("Random Forest AUC:", roc_auc_score(yte, yprob_rf))
print("Gradient Boosting AUC:", roc_auc_score(yte, yprob_gb))
"""))

# -----------------------------------------------------------
# STEP 7 — METRICS SUMMARY TABLE
# -----------------------------------------------------------
cells.append(md("## 📊 Step 7 — Compare All Models (Metrics Table)"))

# Generated cell: one row of test-set metrics per model, displayed sorted by
# AUC (best first). The body of `evaluate` must be indented or the generated
# cell raises IndentationError. `evaluate` reads the global `yte`.
cells.append(code("""
def evaluate(name, pred, prob):
    return {
        "Model": name,
        "Accuracy": accuracy_score(yte, pred),
        "Precision": precision_score(yte, pred),
        "Recall": recall_score(yte, pred),
        "F1": f1_score(yte, pred),
        "AUC": roc_auc_score(yte, prob)
    }

results = pd.DataFrame([
    evaluate("Logistic Regression", yhat_lr, yprob_lr),
    evaluate("Decision Tree", yhat_tree, yprob_tree),
    evaluate("Random Forest", yhat_rf, yprob_rf),
    evaluate("Gradient Boosting", yhat_gb, yprob_gb)
])

results.sort_values("AUC", ascending=False)
"""))

# -----------------------------------------------------------
# STEP 8 — ROC CURVES
# -----------------------------------------------------------
cells.append(md("## 📈 Step 8 — ROC Curves (All Models)"))

# Generated cell: overlays the ROC curve of each model on one figure; the
# dashed diagonal is the chance line. The loop body must be indented or the
# generated cell raises IndentationError.
cells.append(code("""
plt.figure(figsize=(7,6))

for name, prob in [
    ("Logistic", yprob_lr),
    ("Tree", yprob_tree),
    ("RF", yprob_rf),
    ("GB", yprob_gb)
]:
    fpr, tpr, _ = roc_curve(yte, prob)
    plt.plot(fpr, tpr, label=name)

plt.plot([0,1],[0,1],'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves — All Models")
plt.legend()
plt.show()
"""))

# -----------------------------------------------------------
# STEP 9 — GRID SEARCH
# -----------------------------------------------------------
cells.append(md("## 🛠 Step 9 — Model Tuning (Grid Search)"))

# Generated cell: 5-fold grid search over the tree depth. The final print
# scores the refit best pipeline on the held-out test set so the reflection
# question about the effect of tuning can be answered from the output.
cells.append(code("""
params = {"model__max_depth": [2,3,4,5,6,8]}

gs_tree = GridSearchCV(
    Pipeline([('pre', pre), ('model', DecisionTreeClassifier(random_state=SEED))]),
    param_grid=params,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

gs_tree.fit(Xtr, ytr)

print("Best Tree Depth:", gs_tree.best_params_['model__max_depth'])
print("Best CV Acc:", gs_tree.best_score_)
print("Test Acc (best tree):", gs_tree.best_estimator_.score(Xte, yte))
"""))

# -----------------------------------------------------------
# STEP 10 — REFLECTION
# -----------------------------------------------------------
# Closing markdown cell with discussion prompts, rendered as a heading
# followed by a bullet list.
cells.append(md("""
## 🧠 Step 10 — Reflection Questions

- Which model performed best overall?
- Which metric (Accuracy, Precision, Recall, F1, AUC) changed your assessment the most?
- When is high precision more important than high recall in spam detection?
- Would you deploy a black-box model (RF/GB) if interpretability matters?
- How did tuning the Decision Tree affect performance?
"""))

# -----------------------------------------------------------
# SAVE NOTEBOOK
# -----------------------------------------------------------
# Attach the accumulated cells, then serialize the notebook to OUT_PATH.
nb["cells"] = cells

with open(OUT_PATH, mode="w", encoding="utf-8") as notebook_file:
    serialized = nbf.writes(nb)
    notebook_file.write(serialized)

print("3.7B (Spam) Hands-On Notebook CREATED at:", OUT_PATH)

3.7B (Spam) Hands-On Notebook CREATED at: module3_master/03_Hands_On_Exercise_B_Spam.ipynb


In [11]:
# === Builder 3.7C: Create Hands-On Exercise C (Wine Multiclass Classification) ===

import os
import nbformat as nbf

# ------------- Output location -------------
# Destination folder and file name of the generated exercise notebook.
OUT_DIR = "module3_master"
OUT_PATH = os.path.join(OUT_DIR, "03_Hands_On_Exercise_C_Wine.ipynb")
os.makedirs(OUT_DIR, exist_ok=True)


# ------------- Cell factories -------------
def md(text: str):
    """Return an nbformat v4 markdown cell whose source is ``text``."""
    markdown_cell = nbf.v4.new_markdown_cell(text)
    return markdown_cell


def code(text: str):
    """Return an nbformat v4 code cell whose source is ``text``."""
    code_cell = nbf.v4.new_code_cell(text)
    return code_cell


# Empty v4 notebook plus the list that accumulates its cells in order.
nb, cells = nbf.v4.new_notebook(), []

# ------------- Title cell -------------
# Opening markdown cell: exercise title, goals, and a note on the dataset.
# The trailing double spaces are markdown hard line breaks.
cells.append(md(
"""# 🧪 Module 3 — Hands-On Exercise C  
## Wine Multiclass Classification (Red vs White vs Rosé)

### Goal
- Work with a **multiclass** classification problem (3 wine types)  
- Compare **four classifiers** (Logistic Regression, Decision Tree, Random Forest, Gradient Boosting)  
- Use metrics such as **accuracy**, **macro precision**, **macro recall**, **macro F1**  
- Visualize a **3×3 confusion matrix**  
- Explore *one-vs-rest* ROC curves for each class  
- Tune tree depth using **GridSearchCV**  

The dataset is synthetic but realistic, based on typical wine chemistry properties.
"""
))

# ------------- Imports + dataset load -------------
# Generated setup cell: all imports used by the exercise followed by the
# seeded dataset load.
cells.append(code(
"""# --- Imports & Data Loading ---

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_curve, roc_auc_score
)
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Dataset helper
from datasets_module3 import make_wine_synth

SEED = 1955

# --- Step 1: Load Dataset ---
df = make_wine_synth(n=800, seed=SEED)
df.head()
"""
))

# Markdown prompt for exploring the frame. The code fence must be closed,
# otherwise the list after it renders as part of the code block.
cells.append(md(
"""### 🔍 Step 1 — Explore the Dataset

Use:
```python
df.info()
df.describe()
df['wine_type'].value_counts()
```
to understand:

- the feature distributions
- class balance among the three wine types
- which columns are numeric vs potential categorical fields.
"""
))

# ------------- Step 2: Clean & Prepare -------------

cells.append(md(
"""## 🧼 Step 2 — Clean & Prepare the Data

We will:

- Separate features and target
- Build a preprocessing pipeline for numeric features

(All features are numeric, so no one-hot encoding is needed here.)
"""
))

# Generated Step 2 cell: splits features from the `wine_type` target and
# builds the numeric-only `pre` transformer reused by every later pipeline.
# The explanatory lines must be `#` comments; as bare text they make the
# generated cell a SyntaxError.
cells.append(code(
"""# --- Step 2: Clean & Prepare ---

# Features and target
X = df.drop('wine_type', axis=1)
y = df['wine_type']  # 0 = red, 1 = white, 2 = rosé

# Identify numeric columns
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = []  # no categorical columns in this synthetic dataset

# Preprocessing: numeric pipeline (median imputation + scaling)
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

# We still use a ColumnTransformer for consistency with other exercises
pre = ColumnTransformer([
    ('num', num_pipe, num_cols)
    # no categorical block here
])
"""
))

# ------------- Step 3: Train/Test Split -------------

cells.append(md(
"""## 🔀 Step 3 — Train/Test Split

We will hold out 20% of the data as a test set for final evaluation.
"""
))

# Generated cell: stratified 80/20 split; the trailing tuple displays the
# partition shapes and per-class counts.
cells.append(code(
"""# --- Step 3: Split into train and test sets ---

Xtr, Xte, ytr, yte = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

Xtr.shape, Xte.shape, ytr.value_counts(), yte.value_counts()
"""
))

# ------------- Step 4: Logistic Regression -------------

cells.append(md(
"""## ⚙️ Step 4 — Multiclass Logistic Regression (One-vs-Rest / Softmax)

Logistic Regression can be extended to multiclass prediction.
We will fit a single model that predicts all three wine types.
"""
))

# Generated cell: multiclass logistic regression with macro-averaged metrics.
# `multi_class` is intentionally not passed: 'auto' was the default behaviour,
# and the parameter is deprecated in scikit-learn 1.5+ (passing it emits a
# FutureWarning and will eventually be rejected).
cells.append(code(
"""# --- Step 4: Logistic Regression (multiclass) ---

log_reg = Pipeline([
    ('pre', pre),
    ('model', LogisticRegression(
        max_iter=1000,
        random_state=SEED
    ))
])

log_reg.fit(Xtr, ytr)

yhat_lr = log_reg.predict(Xte)
yprob_lr = log_reg.predict_proba(Xte)  # shape: (n_samples, 3)

print("Logistic Regression Metrics:")
print("Accuracy :", accuracy_score(yte, yhat_lr))
print("Macro Precision:", precision_score(yte, yhat_lr, average='macro'))
print("Macro Recall :", recall_score(yte, yhat_lr, average='macro'))
print("Macro F1 :", f1_score(yte, yhat_lr, average='macro'))
"""
))

# ------------- Step 5: Decision Tree -------------

cells.append(md(
"""## 🌳 Step 5 — Decision Tree Classifier

We now fit a Decision Tree and compare it with Logistic Regression.
"""
))

# Generated cell: depth-5 decision tree evaluated with the same macro-averaged
# metrics as the logistic-regression cell.
cells.append(code(
"""# --- Step 5: Decision Tree (multiclass) ---

tree = Pipeline([
    ('pre', pre),
    ('model', DecisionTreeClassifier(
        max_depth=5,
        random_state=SEED
    ))
])

tree.fit(Xtr, ytr)

yhat_tree = tree.predict(Xte)
yprob_tree = tree.predict_proba(Xte)

print("Decision Tree Metrics:")
print("Accuracy :", accuracy_score(yte, yhat_tree))
print("Macro Precision:", precision_score(yte, yhat_tree, average='macro'))
print("Macro Recall :", recall_score(yte, yhat_tree, average='macro'))
print("Macro F1 :", f1_score(yte, yhat_tree, average='macro'))
"""
))

# ------------- Step 6: Ensembles -------------

cells.append(md(
"""## 🌲 Step 6 — Random Forest & Gradient Boosting (Multiclass)

We now try ensemble models:

- Random Forest (bagging)
- Gradient Boosting (boosting-style, similar to XGBoost)
"""
))

# Generated cell: fits both ensembles and prints macro-averaged metrics.
# NOTE: the newline escape in the "Gradient Boosting Metrics" header is written
# as a double backslash here. This builder string is not raw, so a single
# backslash-n would become a real line break inside the generated string
# literal and make the generated cell a SyntaxError.
cells.append(code(
"""# --- Step 6: Ensemble Models (RF & GB) ---

rf = Pipeline([
    ('pre', pre),
    ('model', RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        random_state=SEED
    ))
])

gb = Pipeline([
    ('pre', pre),
    ('model', GradientBoostingClassifier(
        learning_rate=0.05,
        n_estimators=200,
        max_depth=3,
        random_state=SEED
    ))
])

rf.fit(Xtr, ytr)
gb.fit(Xtr, ytr)

yhat_rf = rf.predict(Xte)
yprob_rf = rf.predict_proba(Xte)

yhat_gb = gb.predict(Xte)
yprob_gb = gb.predict_proba(Xte)

print("Random Forest Metrics:")
print("Accuracy :", accuracy_score(yte, yhat_rf))
print("Macro Precision:", precision_score(yte, yhat_rf, average='macro'))
print("Macro Recall :", recall_score(yte, yhat_rf, average='macro'))
print("Macro F1 :", f1_score(yte, yhat_rf, average='macro'))

print("\\nGradient Boosting Metrics:")
print("Accuracy :", accuracy_score(yte, yhat_gb))
print("Macro Precision:", precision_score(yte, yhat_gb, average='macro'))
print("Macro Recall :", recall_score(yte, yhat_gb, average='macro'))
print("Macro F1 :", f1_score(yte, yhat_gb, average='macro'))
"""
))

# ------------- Step 7: Metrics Summary Table -------------

cells.append(md(
"""## 📊 Step 7 — Metrics Summary (All Models)

We will compare all models using macro-averaged Precision, Recall, and F1
to treat all three classes equally.
"""
))

# Generated cell: one row of macro-averaged test metrics per model, displayed
# sorted by Macro F1 (best first). The function body must be indented or the
# generated cell raises IndentationError.
cells.append(code(
"""# --- Step 7: Compare All Models (Metrics Table) ---

def evaluate_multiclass(name, y_true, y_pred):
    return {
        "Model": name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Macro Precision": precision_score(y_true, y_pred, average='macro'),
        "Macro Recall": recall_score(y_true, y_pred, average='macro'),
        "Macro F1": f1_score(y_true, y_pred, average='macro')
    }

results = pd.DataFrame([
    evaluate_multiclass("Logistic Regression", yte, yhat_lr),
    evaluate_multiclass("Decision Tree", yte, yhat_tree),
    evaluate_multiclass("Random Forest", yte, yhat_rf),
    evaluate_multiclass("Gradient Boosting", yte, yhat_gb)
])

results.sort_values("Macro F1", ascending=False)
"""
))

# ------------- Step 8: Confusion Matrix -------------

cells.append(md(
"""## 🧩 Step 8 — Confusion Matrix (3×3)

We now inspect a 3×3 confusion matrix to see which wine types
are most often confused with each other.
"""
))

# Generated cell: heat-map of the Random Forest confusion matrix with the
# count written in each square, followed by the raw matrix as cell output.
# The loop body must be indented or the generated cell raises IndentationError.
cells.append(code(
"""# --- Step 8: Confusion Matrix (3×3) ---

cm = confusion_matrix(yte, yhat_rf)  # you can swap RF for any model

plt.figure(figsize=(5,4))
plt.imshow(cm, cmap='Blues')
plt.title("Confusion Matrix — Random Forest")
plt.colorbar()
plt.xticks([0,1,2], ["Red","White","Rosé"])
plt.yticks([0,1,2], ["Red","White","Rosé"])

for (i, j), value in np.ndenumerate(cm):
    plt.text(j, i, str(value), ha='center', va='center', fontsize=12)

plt.xlabel("Predicted Class")
plt.ylabel("True Class")
plt.tight_layout()
plt.show()

cm
"""
))

# ------------- Step 9: One-vs-Rest ROC Curves -------------

cells.append(md(
"""## 📈 Step 9 — One-vs-Rest ROC Curves (3 Classes)

We can treat each class as "positive" in turn and compute a one-vs-rest ROC curve.
"""
))

# Generated cell: for each class label 0..2, binarize the test labels and plot
# the ROC curve of the matching Random Forest probability column. The shape
# note must be a `#` comment and the loop body must be indented, otherwise the
# generated cell fails to parse.
cells.append(code(
"""# --- Step 9: One-vs-Rest ROC Curves (using Random Forest probabilities) ---

# yprob_rf has shape (n_samples, 3)
n_classes = 3
plt.figure(figsize=(7,6))

for c in range(n_classes):
    # Binarize: class c vs all others
    y_true_c = (yte == c).astype(int)
    y_score_c = yprob_rf[:, c]
    fpr, tpr, _ = roc_curve(y_true_c, y_score_c)
    auc_c = roc_auc_score(y_true_c, y_score_c)
    plt.plot(fpr, tpr, label=f"Class {c} (AUC={auc_c:.2f})")

plt.plot([0,1],[0,1],'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("One-vs-Rest ROC Curves — Random Forest")
plt.legend()
plt.show()
"""
))

# ------------- Step 10: Grid Search -------------

cells.append(md(
"""## 🛠 Step 10 — Simple Hyperparameter Tuning (Decision Tree Depth)

We perform a small grid search over `max_depth` for a Decision Tree to see
how depth affects multiclass classification performance.
"""
))

# Generated cell: 5-fold grid search over the tree depth, then reports the
# best depth, its cross-validated accuracy, and the refit model's test accuracy.
cells.append(code(
"""# --- Step 10: GridSearchCV on Decision Tree max_depth ---

param_grid = {"model__max_depth": [2,3,4,5,6,8]}

gs_tree = GridSearchCV(
    estimator=Pipeline([
        ('pre', pre),
        ('model', DecisionTreeClassifier(random_state=SEED))
    ]),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

gs_tree.fit(Xtr, ytr)

print("Best max_depth:", gs_tree.best_params_['model__max_depth'])
print("Best CV Accuracy:", gs_tree.best_score_)
print("Test Accuracy with best tree:", gs_tree.best_estimator_.score(Xte, yte))
"""
))

# ------------- Reflection questions -------------
# Closing markdown cell with discussion prompts, rendered as a bullet list.
cells.append(md(
"""## 🧠 Reflection Questions

- Which model gave the best Macro F1 score?
- Did one model favor certain classes over others? (e.g., always predict "red")
- How does the confusion matrix help you understand which classes are hard to separate?
- Would you choose interpretability (Logistic, Tree) or performance (Random Forest, GB)
  for this task if you were a wine producer or quality-control analyst?
- How does tuning `max_depth` affect tree overfitting vs underfitting in this multiclass setting?
"""
))

# ------------- Write the notebook to disk -------------
# Attach the accumulated cells, then serialize the notebook to OUT_PATH.
nb["cells"] = cells

with open(OUT_PATH, mode="w", encoding="utf-8") as notebook_file:
    serialized = nbf.writes(nb)
    notebook_file.write(serialized)

print("3.7C Hands-On Exercise (Wine Multiclass) CREATED at:", OUT_PATH)

3.7C Hands-On Exercise (Wine Multiclass) CREATED at: module3_master/03_Hands_On_Exercise_C_Wine.ipynb
