In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_recall_curve
import numpy as np

# 1) Load & split (drop ID; stratify)
df = pd.read_csv("Social_Network_Ads.csv")
X = df.drop(columns=["User ID", "Purchased"])
y = df["Purchased"]

num = ["Age", "EstimatedSalary"]
cat = ["Gender"]

# 2) Preprocess in a Pipeline
pre = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat),
    ("num", StandardScaler(), num),
])

logreg = LogisticRegression(max_iter=2000)

pipe = Pipeline([
    ("pre", pre),
    ("clf", logreg),
])

# 3) Hyperparameter grid (tune regularization & penalty)
param_grid = {
    "clf__penalty": ["l2", "l1"],                 # L1 requires liblinear or saga
    "clf__C": np.logspace(-3, 2, 9),              # 0.001 ... 100
    "clf__solver": ["liblinear", "saga"],         # both support L1/L2
    "clf__class_weight": [None, "balanced"],      # try balancing
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# If class 1 is the business-positive class, use 'f1' to focus on it
gs = GridSearchCV(pipe, param_grid, cv=cv, n_jobs=-1, scoring="f1", refit=True, verbose=0)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=1/3, stratify=y, random_state=42)
gs.fit(X_tr, y_tr)

print("Best params:", gs.best_params_)

# 4) Threshold tuning on validation-style holdout (use test here for demo; normally use a val split)
proba = gs.predict_proba(X_te)[:, 1]
prec, rec, thr = precision_recall_curve(y_te, proba)

# Pick threshold that maximizes F1 on the curve
f1_vals = (2 * prec * rec) / (prec + rec + 1e-12)
best_idx = int(np.nanargmax(f1_vals))
best_thr = thr[max(0, best_idx-1)]  # align sizes (thr is len-1 vs prec/rec)
print(f"Chosen threshold for best F1: {best_thr:.3f}")

y_pred_thr = (proba >= best_thr).astype(int)

print("\nClassification report @ tuned threshold:")
print(classification_report(y_te, y_pred_thr, digits=2))

print("Confusion matrix @ tuned threshold:")
print(confusion_matrix(y_te, y_pred_thr))


Best params: {'clf__C': np.float64(0.01778279410038923), 'clf__class_weight': 'balanced', 'clf__penalty': 'l2', 'clf__solver': 'liblinear'}
Chosen threshold for best F1: 0.547

Classification report @ tuned threshold:
              precision    recall  f1-score   support

           0       0.88      0.92      0.90        86
           1       0.84      0.77      0.80        48

    accuracy                           0.87       134
   macro avg       0.86      0.84      0.85       134
weighted avg       0.86      0.87      0.86       134

Confusion matrix @ tuned threshold:
[[79  7]
 [11 37]]
