In [17]:
# Week 6 — Decision Trees & Random Forests
#Dataset: `carclaims 12.csv` | Target: `FraudFound`


In [18]:
import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score, confusion_matrix, roc_curve, precision_recall_curve

df = pd.read_csv("carclaims 12.csv")
df.columns = [c.strip() for c in df.columns]
target = "FraudFound"
yraw = df[target]
y = yraw if yraw.dtype!=object else yraw.astype(str).str.upper().map(
    {"Y":1,"YES":1,"1":1,"TRUE":1,"T":1,"N":0,"NO":0,"0":0,"FALSE":0,"F":0}
).astype(int)
X = df.drop(columns=[target])
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=42)

numeric = Pipeline([("impute", SimpleImputer(strategy="median")), ("scale", StandardScaler())])
categorical = Pipeline([("impute", SimpleImputer(strategy="most_frequent")), ("onehot", OneHotEncoder(handle_unknown="ignore"))])
pre = ColumnTransformer([("num", numeric, num_cols), ("cat", categorical, cat_cols)])


In [19]:
tree = Pipeline([("pre", pre), ("clf", DecisionTreeClassifier(random_state=42))])
param_tree = {
    "clf__max_depth": [None, 6, 10, 14],
    "clf__min_samples_leaf": [1, 2, 5],
    "clf__min_samples_split": [2, 10, 20]
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gs_tree = GridSearchCV(tree, param_tree, scoring="roc_auc", cv=cv, n_jobs=-1, refit=True)
gs_tree.fit(X_train, y_train)
best_tree = gs_tree.best_estimator_
proba_t = best_tree.predict_proba(X_test)[:,1]
pred_t  = (proba_t>=0.5).astype(int)

acc_t = accuracy_score(y_test, pred_t)
f1_t  = f1_score(y_test, pred_t)
auc_t = roc_auc_score(y_test, proba_t)
ap_t  = average_precision_score(y_test, proba_t)
gs_tree.best_params_, acc_t, f1_t, auc_t, ap_t


({'clf__max_depth': 6,
  'clf__min_samples_leaf': 1,
  'clf__min_samples_split': 20},
 0.9426070038910506,
 0.09230769230769231,
 0.8314022542722095,
 0.2288753423339097)

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score

candidates = [
    {"n_estimators": 200, "max_depth": None, "min_samples_leaf": 1, "max_features": "sqrt"},
    {"n_estimators": 200, "max_depth": 12,   "min_samples_leaf": 2, "max_features": "sqrt"},
    {"n_estimators": 300, "max_depth": 12,   "min_samples_leaf": 2, "max_features": "log2"},
    {"n_estimators": 300, "max_depth": 8,    "min_samples_leaf": 2, "max_features": "sqrt"},
]

best_oob = -1
best_model = None
for cfg in candidates:
    pipe = Pipeline([
        ("pre", pre),
        ("clf", RandomForestClassifier(
            random_state=42, n_jobs=-1, oob_score=True, bootstrap=True,
            max_samples=0.7,              # subsample for speed
            class_weight="balanced_subsample",
            **cfg
        ))
    ])
    pipe.fit(X_train, y_train)
    oob = pipe.named_steps["clf"].oob_score_
    if oob > best_oob:
        best_oob, best_model = oob, pipe

proba = best_model.predict_proba(X_test)[:, 1]
pred  = (proba >= 0.5).astype(int)

acc = accuracy_score(y_test, pred)
f1  = f1_score(y_test, pred)
auc = roc_auc_score(y_test, proba)
ap  = average_precision_score(y_test, proba)

print("Selected by OOB:", best_model.named_steps["clf"].get_params())
print(f"OOB: {best_oob:.3f} | Accuracy: {acc:.3f} | F1: {f1:.3f} | ROC-AUC: {auc:.3f} | PR-AUC: {ap:.3f}")



Selected by OOB: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': 'balanced_subsample', 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': 0.7, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 200, 'n_jobs': -1, 'oob_score': True, 'random_state': 42, 'verbose': 0, 'warm_start': False}
OOB: 0.940 | Accuracy: 0.940 | F1: 0.011 | ROC-AUC: 0.852 | PR-AUC: 0.284


In [None]:


# Safety check (leave this as-is):
try:
    best_tree = BEST_TREE_CHOICE
except NameError:
    try:
        # If Cell 4 already set best_tree, use it directly
        best_tree
    except NameError:
        raise NameError(
            "Set BEST_TREE_CHOICE to your fitted DecisionTree pipeline "
            "(e.g., gs_tree.best_estimator_ or tree_fast) before running this cell."
        )

# ---- 2) Imports ----
import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.metrics import (
    roc_curve, precision_recall_curve, confusion_matrix,
    roc_auc_score, average_precision_score, accuracy_score, f1_score
)

# ---- 3) Ensure output folder ----
os.makedirs("figs_wk6", exist_ok=True)

# ---- 4) Predict proba and labels (Decision Tree) ----
proba_t = best_tree.predict_proba(X_test)[:, 1]
pred_t  = (proba_t >= 0.5).astype(int)

# ---- 5) Metrics ----
acc_t = accuracy_score(y_test, pred_t)
f1_t  = f1_score(y_test, pred_t)
auc_t = roc_auc_score(y_test, proba_t)
ap_t  = average_precision_score(y_test, proba_t)
print(f"[Decision Tree] Accuracy: {acc_t:.3f} | F1: {f1_t:.3f} | ROC-AUC: {auc_t:.3f} | PR-AUC: {ap_t:.3f}")

# ---- 6) ROC curve ----
fpr, tpr, _ = roc_curve(y_test, proba_t)
plt.figure()
plt.plot(fpr, tpr, label=f"AUC = {auc_t:.3f}")
plt.plot([0, 1], [0, 1], "--", linewidth=1)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Week 6 — ROC (Decision Tree)")
plt.legend()
plt.savefig("figs_wk6/roc_tree.png", bbox_inches="tight")
plt.close()

# ---- 7) Precision–Recall curve ----
prec, rec, _ = precision_recall_curve(y_test, proba_t)
plt.figure()
plt.plot(rec, prec, label=f"AP = {ap_t:.3f}")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Week 6 — Precision–Recall (Decision Tree)")
plt.legend()
plt.savefig("figs_wk6/pr_tree.png", bbox_inches="tight")
plt.close()

# ---- 8) Confusion matrix ----
cm = confusion_matrix(y_test, pred_t)
plt.figure()
plt.imshow(cm, cmap="Blues")
for (i, j), v in np.ndenumerate(cm):
    plt.text(j, i, str(v), ha="center", va="center")
plt.title("Week 6 — Confusion Matrix (Decision Tree)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.savefig("figs_wk6/cm_tree.png", bbox_inches="tight")
plt.close()

# ---- 9) Feature importances (handles OneHot naming robustly) ----
# Try to recover feature names from the ColumnTransformer in 'pre'
try:
    pre = best_tree.named_steps.get("pre", None)
    clf = best_tree.named_steps.get("clf", None)
    if pre is None or clf is None:
        raise ValueError("Pipeline must have steps 'pre' and 'clf'.")

    # Numeric feature names (from earlier in the notebook)
    numeric_names = list(num_cols) if 'num_cols' in globals() else []

    # Categorical OHE names
    cat_names = []
    try:
        cat_trans = pre.named_transformers_.get("cat", None)
        if cat_trans is not None and hasattr(cat_trans, "named_steps"):
            ohe = cat_trans.named_steps.get("onehot", None)
            if ohe is not None and hasattr(ohe, "get_feature_names_out") and 'cat_cols' in globals():
                cat_names = list(ohe.get_feature_names_out(cat_cols))
    except Exception:
        pass

    feat_names = numeric_names + cat_names

    importances = clf.feature_importances_
    if len(feat_names) != len(importances):
        # Fallback to generic names if sizes don't match
        feat_names = [f"feat_{i}" for i in range(len(importances))]
except Exception:
    # Last-resort fallback
    importances = best_tree.named_steps["clf"].feature_importances_
    feat_names  = [f"feat_{i}" for i in range(len(importances))]

imp_df = pd.DataFrame({"feature": feat_names, "importance": importances}).sort_values("importance", ascending=False)

plt.figure(figsize=(7, 6))
imp_df.head(20).sort_values("importance").plot(
    kind="barh", x="feature", y="importance", legend=False
)
plt.title("Week 6 — Decision Tree: Top 20 Feature Importances")
plt.tight_layout()
plt.savefig("figs_wk6/feature_importance_tree.png", bbox_inches="tight")
plt.close()

# Display top-20 importances in the notebook
imp_df.head(20)


[Decision Tree] Accuracy: 0.943 | F1: 0.092 | ROC-AUC: 0.831 | PR-AUC: 0.229


Unnamed: 0,feature,importance
76,Fault_Third Party,0.237938
136,AddressChange-Claim_2 to 3 years,0.177253
147,BasePolicy_Liability,0.169087
2,Age,0.128214
3,PolicyNumber,0.090526
78,PolicyType_Sedan - Collision,0.065359
119,AgeOfPolicyHolder_26 to 30,0.036882
9,Month_Aug,0.032383
81,PolicyType_Sport - Collision,0.020874
139,AddressChange-Claim_under 6 months,0.016095


<Figure size 700x600 with 0 Axes>