In [None]:
# Week 4 — Logistic Regression + Feature Scaling
#Dataset: `carclaims 12.csv` | Target: `FraudFound`


In [1]:
import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score, confusion_matrix, roc_curve, precision_recall_curve

df = pd.read_csv("carclaims 12.csv")
df.columns = [c.strip() for c in df.columns]
target = "FraudFound"
yraw = df[target]
y = yraw if yraw.dtype!=object else yraw.astype(str).str.upper().map(
    {"Y":1,"YES":1,"1":1,"TRUE":1,"T":1,"N":0,"NO":0,"0":0,"FALSE":0,"F":0}
).astype(int)
X = df.drop(columns=[target])
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=42)

numeric = Pipeline([("impute", SimpleImputer(strategy="median")), ("scale", StandardScaler())])
categorical = Pipeline([("impute", SimpleImputer(strategy="most_frequent")), ("onehot", OneHotEncoder(handle_unknown="ignore"))])
pre = ColumnTransformer([("num", numeric, num_cols), ("cat", categorical, cat_cols)])


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score

# Quick pipeline
pipe_fast = Pipeline([
    ("pre", pre),
    ("clf", LogisticRegression(max_iter=1000, solver="liblinear"))  # liblinear supports l1 & l2 and is faster
])

# Smaller grid (6 fits total × 3 folds = 18)
param_fast = {
    "clf__penalty": ["l1", "l2"],  # just two options
    "clf__C": [0.1, 1.0, 10.0],    # small C grid
}

cv3 = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
gs_fast = GridSearchCV(pipe_fast, param_fast, scoring="roc_auc", cv=cv3, n_jobs=-1, refit=True)

# Fit
gs_fast.fit(X_train, y_train)
best = gs_fast.best_estimator_

# Predict and evaluate
proba = best.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)

acc = accuracy_score(y_test, pred)
f1  = f1_score(y_test, pred)
auc = roc_auc_score(y_test, proba)
ap  = average_precision_score(y_test, proba)

print("Best parameters:", gs_fast.best_params_)
print(f"Accuracy: {acc:.3f} | F1: {f1:.3f} | ROC-AUC: {auc:.3f} | PR-AUC: {ap:.3f}")



Best parameters: {'clf__C': 1.0, 'clf__penalty': 'l1'}
Accuracy: 0.940 | F1: 0.011 | ROC-AUC: 0.815 | PR-AUC: 0.179


In [4]:
os.makedirs("figs_wk4", exist_ok=True)

# ROC
fpr, tpr, _ = roc_curve(y_test, proba)
plt.figure(); plt.plot(fpr,tpr); plt.plot([0,1],[0,1],'--')
plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate"); plt.title("Week 4 — ROC (Logistic)")
plt.savefig("figs_wk4/roc.png", bbox_inches="tight"); plt.close()

# PR
prec, rec, _ = precision_recall_curve(y_test, proba)
plt.figure(); plt.plot(rec,prec)
plt.xlabel("Recall"); plt.ylabel("Precision"); plt.title("Week 4 — Precision-Recall (Logistic)")
plt.savefig("figs_wk4/pr.png", bbox_inches="tight"); plt.close()

# Confusion
cm = confusion_matrix(y_test, pred)
plt.figure(); plt.imshow(cm)
for (i,j),v in np.ndenumerate(cm): plt.text(j,i,str(v),ha='center',va='center')
plt.title("Week 4 — Confusion Matrix (Logistic)"); plt.xlabel("Predicted"); plt.ylabel("Actual")
plt.savefig("figs_wk4/cm.png", bbox_inches="tight"); plt.close()

print("Saved plots to figs_wk4/")


Saved plots to figs_wk4/
