In [None]:
# Week 6 — Decision Trees & Random Forests
# Dataset: `carclaims 12.csv` | Target: `FraudFound`


In [2]:
import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score, confusion_matrix, roc_curve, precision_recall_curve

# Load the claims table; header names are stripped so lookups by column name
# are not defeated by stray leading/trailing whitespace.
df = pd.read_csv("carclaims 12.csv")
df.columns = [c.strip() for c in df.columns]
target = "FraudFound"
yraw = df[target]

# Encode the target as 0/1.
# Numeric (and boolean) targets pass through unchanged. Anything else is treated
# as text: stripped, upper-cased and mapped through an explicit vocabulary.
if pd.api.types.is_numeric_dtype(yraw):
    y = yraw
else:
    label_map = {"Y":1,"YES":1,"1":1,"TRUE":1,"T":1,"N":0,"NO":0,"0":0,"FALSE":0,"F":0}
    labels = yraw.astype(str).str.strip().str.upper()
    y = labels.map(label_map)
    # A label outside the vocabulary (including a missing value) maps to NaN.
    # Fail here with the offending values instead of letting the integer cast
    # below raise an opaque conversion error.
    unmapped = y.isna()
    if unmapped.any():
        raise ValueError(
            f"Unrecognised {target!r} labels: {sorted(labels[unmapped].unique())}"
        )
    y = y.astype(int)

# Feature matrix, with columns partitioned by dtype so each group can get its
# own preprocessing: numeric dtypes vs. everything else.
X = df.drop(columns=[target])
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

# Hold out 20% of the rows for evaluation; the split is stratified on y and
# seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Numeric columns: median imputation, then standardisation.
numeric = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
])

# Non-numeric columns: most-frequent imputation, then one-hot encoding.
# Categories unseen during fit are ignored at transform time rather than raising.
categorical = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its own sub-pipeline.
pre = ColumnTransformer(transformers=[
    ("num", numeric, num_cols),
    ("cat", categorical, cat_cols),
])


In [3]:
# Decision tree wrapped with the shared preprocessor, so preprocessing is
# re-fitted inside every cross-validation fold.
tree = Pipeline(steps=[
    ("pre", pre),
    ("clf", DecisionTreeClassifier(random_state=42)),
])

# Hyper-parameter grid (keys are prefixed with the pipeline step name "clf").
param_tree = dict(
    clf__max_depth=[None, 6, 10, 14],
    clf__min_samples_leaf=[1, 2, 5],
    clf__min_samples_split=[2, 10, 20],
)

# 5-fold stratified CV, shuffled with a fixed seed; candidates are ranked by ROC AUC
# and the best one is refitted on the full training split.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gs_tree = GridSearchCV(
    estimator=tree,
    param_grid=param_tree,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
    refit=True,
)
gs_tree.fit(X_train, y_train)
best_tree = gs_tree.best_estimator_

# Held-out predictions: positive-class probability, thresholded at 0.5 for hard labels.
proba_t = best_tree.predict_proba(X_test)[:, 1]
pred_t = (proba_t >= 0.5).astype(int)

# Threshold-dependent metrics use the hard labels ...
acc_t = accuracy_score(y_test, pred_t)
f1_t = f1_score(y_test, pred_t)
# ... ranking metrics use the probabilities.
auc_t = roc_auc_score(y_test, proba_t)
ap_t = average_precision_score(y_test, proba_t)

# Displayed as the cell output: (best params, accuracy, F1, ROC AUC, average precision).
gs_tree.best_params_, acc_t, f1_t, auc_t, ap_t


({'clf__max_depth': 6,
  'clf__min_samples_leaf': 1,
  'clf__min_samples_split': 20},
 0.9426070038910506,
 0.09230769230769231,
 0.8314022542722095,
 0.2288753423339097)

KeyboardInterrupt: 

In [None]:
# Save Random Forest diagnostic figures to figs_wk6/ and show the top-20 feature importances.
#
# NOTE(review): this cell reads best_rf, proba_rf and pred_rf, which are presumably
# produced by the Random Forest training cell — confirm that cell ran to completion
# before executing this one, otherwise a NameError is raised below.
os.makedirs("figs_wk6", exist_ok=True)

# ROC (RF)
fpr, tpr, _ = roc_curve(y_test, proba_rf)
fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.plot([0, 1], [0, 1], '--')  # chance diagonal for reference
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Week 6 — ROC (Random Forest)")
fig.savefig("figs_wk6/roc_rf.png", bbox_inches="tight")
plt.close(fig)

# PR (RF)
prec, rec, _ = precision_recall_curve(y_test, proba_rf)
fig, ax = plt.subplots()
ax.plot(rec, prec)
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Week 6 — Precision-Recall (Random Forest)")
fig.savefig("figs_wk6/pr_rf.png", bbox_inches="tight")
plt.close(fig)

# Confusion (RF)
cm = confusion_matrix(y_test, pred_rf)
fig, ax = plt.subplots()
ax.imshow(cm)
# Annotate every cell with its count; text() takes (x, y) = (column, row).
for (i, j), v in np.ndenumerate(cm):
    ax.text(j, i, str(v), ha='center', va='center')
ax.set_title("Week 6 — Confusion Matrix (RF)")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
fig.savefig("figs_wk6/cm_rf.png", bbox_inches="tight")
plt.close(fig)

# Feature importance (top 20)
# extract feature names after one-hot:
ohe = best_rf.named_steps["pre"].named_transformers_["cat"].named_steps["onehot"]
cat_names = ohe.get_feature_names_out(cat_cols).tolist()
# Assumes the fitted preprocessor emits the numeric columns first, followed by the
# one-hot columns — TODO confirm against the ColumnTransformer definition.
feat_names = num_cols + cat_names
importances = best_rf.named_steps["clf"].feature_importances_
imp_df = pd.DataFrame({"feature": feat_names, "importance": importances}).sort_values("importance", ascending=False)

# Pass the axes explicitly: DataFrame.plot() without `ax` opens its own figure, which
# would leave the sized figure empty and unclosed, and drop the requested figsize.
fig, ax = plt.subplots(figsize=(7, 6))
imp_df.head(20).sort_values("importance").plot(
    kind="barh", x="feature", y="importance", legend=False, ax=ax
)
ax.set_title("Week 6 — Random Forest: Top 20 Feature Importances")
fig.tight_layout()
fig.savefig("figs_wk6/feature_importance_rf.png", bbox_inches="tight")
plt.close(fig)

imp_df.head(20)
