In [None]:
# Week 3 — Forward/Backward Selection + PCR-style (PCA + Logistic)
#Dataset: `carclaims 12.csv` | Target: `FraudFound`


In [1]:
import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score, confusion_matrix, roc_curve, precision_recall_curve
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.decomposition import PCA

df = pd.read_csv("carclaims 12.csv")
df.columns = [c.strip() for c in df.columns]
target = "FraudFound"
yraw = df[target]
y = yraw if yraw.dtype!=object else yraw.astype(str).str.upper().map(
    {"Y":1,"YES":1,"1":1,"TRUE":1,"T":1,"N":0,"NO":0,"0":0,"FALSE":0,"F":0}
).astype(int)
X = df.drop(columns=[target])
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=42)

numeric = Pipeline([("impute", SimpleImputer(strategy="median")), ("scale", StandardScaler())])
categorical = Pipeline([("impute", SimpleImputer(strategy="most_frequent")), ("onehot", OneHotEncoder(handle_unknown="ignore"))])
pre = ColumnTransformer([("num", numeric, num_cols), ("cat", categorical, cat_cols)])


In [2]:
subset = num_cols[:min(10, len(num_cols))]
base = Pipeline([("sc", StandardScaler()), ("clf", LogisticRegression(max_iter=5000, solver="liblinear"))])

# Forward selection
sfs_forward = SequentialFeatureSelector(base.named_steps["clf"], n_features_to_select=min(5,len(subset)), direction="forward")
sfs_forward.fit(X_train[subset], y_train)
forward_feats = [f for f, keep in zip(subset, sfs_forward.get_support()) if keep]

# Backward selection
sfs_backward = SequentialFeatureSelector(base.named_steps["clf"], n_features_to_select=min(5,len(subset)), direction="backward")
sfs_backward.fit(X_train[subset], y_train)
backward_feats = [f for f, keep in zip(subset, sfs_backward.get_support()) if keep]

forward_feats, backward_feats


(['WeekOfMonth', 'WeekOfMonthClaimed', 'Age', 'PolicyNumber', 'RepNumber'],
 ['PolicyNumber', 'RepNumber', 'Deductible', 'DriverRating', 'Year'])

In [4]:
from sklearn.preprocessing import FunctionTransformer

pca_ct = ColumnTransformer([
    ("num", numeric, num_cols),
    ("cat", categorical, cat_cols)
])

pca_logit = Pipeline([
    ("pre", pca_ct),
    # ColumnTransformer with OneHotEncoder produces a sparse matrix.
    # Convert to dense before PCA so PCA with svd_solver="full" can operate.
    ("to_dense", FunctionTransformer(lambda X: X.toarray() if hasattr(X, "toarray") else X, accept_sparse=True)),
    ("pca", PCA(n_components=0.90, svd_solver="full")),  # keep ~90% variance
    ("clf", LogisticRegression(max_iter=5000))
])

pca_logit.fit(X_train, y_train)
proba = pca_logit.predict_proba(X_test)[:,1]
pred = (proba>=0.5).astype(int)

acc = accuracy_score(y_test, pred)
f1  = f1_score(y_test, pred)
auc = roc_auc_score(y_test, proba)
ap  = average_precision_score(y_test, proba)
acc, f1, auc, ap


(0.9400129701686122, 0.0, 0.8004493627812015, 0.16894813178728255)

In [5]:
os.makedirs("figs_wk3", exist_ok=True)

# ROC
fpr, tpr, _ = roc_curve(y_test, proba)
plt.figure(); plt.plot(fpr,tpr); plt.plot([0,1],[0,1],'--')
plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate"); plt.title("Week 3 — ROC (PCA+Logit)")
plt.savefig("figs_wk3/roc.png", bbox_inches="tight"); plt.close()

# PR
prec, rec, _ = precision_recall_curve(y_test, proba)
plt.figure(); plt.plot(rec,prec)
plt.xlabel("Recall"); plt.ylabel("Precision"); plt.title("Week 3 — Precision-Recall (PCA+Logit)")
plt.savefig("figs_wk3/pr.png", bbox_inches="tight"); plt.close()

# Confusion
cm = confusion_matrix(y_test, pred)
plt.figure(); plt.imshow(cm)
for (i,j),v in np.ndenumerate(cm): plt.text(j,i,str(v),ha='center',va='center')
plt.title("Week 3 — Confusion Matrix (PCA+Logit)"); plt.xlabel("Predicted"); plt.ylabel("Actual")
plt.savefig("figs_wk3/cm.png", bbox_inches="tight"); plt.close()

print("Saved plots to figs_wk3/")


Saved plots to figs_wk3/
