In [1]:
# Week 1 — Polynomial & Interactions + Multicollinearity (VIF)
# Dataset: 'carclaims 12.csv' Target: `FraudFound` (1=fraud, 0=non-fraud)

In [2]:
import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score, confusion_matrix, roc_curve, precision_recall_curve

CSV = "carclaims 12.csv"
df = pd.read_csv(CSV)
df.columns = [c.strip() for c in df.columns]
target = "FraudFound"

# Map target to 0/1 if needed
y_raw = df[target]
y = y_raw if y_raw.dtype!=object else y_raw.astype(str).str.upper().map(
    {"Y":1,"YES":1,"1":1,"TRUE":1,"T":1,"N":0,"NO":0,"0":0,"FALSE":0,"F":0}
).astype(int)
X = df.drop(columns=[target])

num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [3]:
%pip install statsmodels

from statsmodels.stats.outliers_influence import variance_inflation_factor

X_num = X_train[num_cols].copy()
X_num = X_num.replace([np.inf, -np.inf], np.nan)
X_num = X_num.fillna(X_num.median(numeric_only=True))
X_num_const = X_num.assign(const=1.0)
vif = pd.DataFrame({
    "feature": X_num_const.columns,
    "VIF": [variance_inflation_factor(X_num_const.values, i) for i in range(X_num_const.shape[1])]
}).sort_values("VIF", ascending=False)
vif.head(15)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


Unnamed: 0,feature,VIF
8,const,50151980.0
3,PolicyNumber,8.12846
7,Year,8.128001
0,WeekOfMonth,1.084914
1,WeekOfMonthClaimed,1.084886
2,Age,1.005812
5,Deductible,1.005631
6,DriverRating,1.000988
4,RepNumber,1.000557


In [4]:
# Keep polynomial expansion modest: up to 5 numeric features to prevent explosion
poly_subset = num_cols[:min(5, len(num_cols))]

numeric_poly = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
    ("poly", PolynomialFeatures(degree=2, include_bias=False))  # squares + pairwise interactions
])
categorical = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

pre_poly = ColumnTransformer([
    ("num_poly", numeric_poly, poly_subset),
    ("cat", categorical, cat_cols)
], remainder="drop")

clf = Pipeline([
    ("pre", pre_poly),
    ("logit", LogisticRegression(max_iter=5000, solver="saga", penalty="l2"))
])

clf.fit(X_train, y_train)
proba = clf.predict_proba(X_test)[:,1]
pred = (proba >= 0.5).astype(int)

acc = accuracy_score(y_test, pred)
f1  = f1_score(y_test, pred)
auc = roc_auc_score(y_test, proba)
ap  = average_precision_score(y_test, proba)

acc, f1, auc, ap


(0.9396887159533074, 0.0, 0.8151310330682529, 0.1821128207152426)

In [5]:
os.makedirs("figs_wk1", exist_ok=True)

# ROC
fpr, tpr, _ = roc_curve(y_test, proba)
plt.figure(); plt.plot(fpr,tpr); plt.plot([0,1],[0,1],'--')
plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate"); plt.title("Week 1 — ROC")
plt.savefig("figs_wk1/roc.png", bbox_inches="tight"); plt.close()

# PR
prec, rec, _ = precision_recall_curve(y_test, proba)
plt.figure(); plt.plot(rec,prec)
plt.xlabel("Recall"); plt.ylabel("Precision"); plt.title("Week 1 — Precision-Recall")
plt.savefig("figs_wk1/pr.png", bbox_inches="tight"); plt.close()

# Confusion Matrix
cm = confusion_matrix(y_test, pred)
plt.figure(); plt.imshow(cm)
for (i,j),v in np.ndenumerate(cm): plt.text(j,i,str(v),ha='center',va='center')
plt.title("Week 1 — Confusion Matrix"); plt.xlabel("Predicted"); plt.ylabel("Actual")
plt.savefig("figs_wk1/cm.png", bbox_inches="tight"); plt.close()

print("Saved plots to figs_wk1/")


Saved plots to figs_wk1/
