In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from joblib import dump

from src.data_utils import load_breast_cancer_df
from src.preprocess import build_preprocessor, split_features_target
from src.logistic_scratch import LogisticRegressionScratch
from src.logistic_sklearn import build_pipeline, grid_search_pipeline

# Apply seaborn's "whitegrid" style to the plots drawn in the cells below.
sns.set(style="whitegrid")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline


ModuleNotFoundError: No module named 'src'

In [None]:
# Load the dataset through the project helper and preview it.
df = load_breast_cancer_df()

# Report the frame's dimensions, then show the first rows as the cell output.
print(f"Shape: {df.shape}")
df.head()


In [None]:
# Column dtypes and non-null counts. DataFrame.info() prints its report
# directly and returns None, so it must not be wrapped in display() --
# doing so renders a stray "None" underneath the report.
df.info()

# Summary statistics, transposed so each feature is a row.
display(df.describe().T)

# Count of missing values for every column.
print("Missing values per column:")
display(df.isna().sum())


In [None]:
# Class balance of the label column.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x="target", data=df, ax=ax)
ax.set_title("Target distribution (0 = malignant, 1 = benign)")
plt.show()

# Pairwise correlations, computed once and reused for the heatmap below.
corr = df.corr()

# Rank features by absolute correlation with the target and keep the top 12.
# The target's own entry is removed by label rather than by slicing off the
# first position: a positional skip silently picks the wrong row whenever
# another column ties with the target at |corr| == 1.
top_feats = (
    corr["target"]
    .drop("target")
    .abs()
    .sort_values(ascending=False)
    .index[:12]
)

# Sub-matrix of the correlations already computed above; no need to
# recompute them from the raw columns.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr.loc[top_feats, top_feats], annot=True, fmt=".2f", cmap="vlag", ax=ax)
ax.set_title("Correlation among top features")
plt.show()


In [None]:
# Separate the predictors from the "target" label column.
X, y = split_features_target(df, target_col="target")

# Only int64/float64 columns are handed to the preprocessor as numeric
# features; no categorical features are passed.
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = []

# Build the preprocessing transformer and show it as the cell output.
preprocessor = build_preprocessor(numeric_features, categorical_features)
preprocessor


In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded so that re-running the notebook gives the same partition.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Train: {X_train.shape} Test: {X_test.shape}")


In [None]:
# Baseline pipeline: the preprocessor plus an L2-penalised liblinear classifier.
pipe = build_pipeline(preprocessor, solver="liblinear", penalty="l2", C=1.0)

# Search over the classifier's C on the training split only.
# NOTE(review): grid_search_pipeline is a project helper; presumably it wraps
# sklearn's GridSearchCV (the result exposes best_params_/best_estimator_) -- confirm.
param_grid = {"clf__C": [0.01, 0.1, 1.0, 10.0]}
gs = grid_search_pipeline(
    pipe,
    X_train,
    y_train,
    param_grid=param_grid,
    cv=5,
    n_jobs=1,
    scoring="roc_auc",
)

print("Best params:", gs.best_params_)
best_pipeline = gs.best_estimator_

# Column 1 of predict_proba is the probability of label 1.
y_prob_sklearn = best_pipeline.predict_proba(X_test)[:, 1]
y_pred_sklearn = best_pipeline.predict(X_test)

# (label, scorer, model output) rows. Each scorer is called inside the loop so
# the metrics are computed and printed one at a time. Precision/recall/F1 use
# sklearn's default positive label (1).
sklearn_report = (
    ("Accuracy", accuracy_score, y_pred_sklearn),
    ("Precision", precision_score, y_pred_sklearn),
    ("Recall", recall_score, y_pred_sklearn),
    ("F1", f1_score, y_pred_sklearn),
    ("ROC AUC", roc_auc_score, y_prob_sklearn),
)

print("Sklearn model (test):")
for metric_label, scorer, model_output in sklearn_report:
    print(f"{metric_label}:", scorer(y_test, model_output))


In [None]:
# Fit the preprocessor on the training split only, then apply the fitted
# transform to the held-out split.
X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)

# Train the project's from-scratch logistic regression on the processed
# features; labels are passed as a plain array via .values.
scratch = LogisticRegressionScratch(lr=0.1, n_iter=5000, l2=0.01, verbose=True)
scratch.fit(X_train_proc, y_train.values)

# NOTE(review): predict_proba's output is used as-is as the score for ROC AUC,
# so it is presumably a 1-D array of positive-class probabilities -- confirm.
y_prob_scratch = scratch.predict_proba(X_test_proc)
y_pred_scratch = scratch.predict(X_test_proc)

# (label, scorer, model output) rows; each scorer is called inside the loop so
# the metrics are computed and printed one at a time.
scratch_report = (
    ("Accuracy", accuracy_score, y_pred_scratch),
    ("Precision", precision_score, y_pred_scratch),
    ("Recall", recall_score, y_pred_scratch),
    ("F1", f1_score, y_pred_scratch),
    ("ROC AUC", roc_auc_score, y_prob_scratch),
)

print("Scratch model (test):")
for metric_label, scorer, model_output in scratch_report:
    print(f"{metric_label}:", scorer(y_test, model_output))


In [None]:
# ROC curve points for each model on the held-out split.
fpr_s, tpr_s, _ = roc_curve(y_test, y_prob_sklearn)
fpr_c, tpr_c, _ = roc_curve(y_test, y_prob_scratch)

# Area under each curve, shown in the legend to three decimals.
auc_sklearn = roc_auc_score(y_test, y_prob_sklearn)
auc_scratch = roc_auc_score(y_test, y_prob_scratch)

plt.plot(fpr_s, tpr_s, label=f"Sklearn (AUC={auc_sklearn:.3f})")
plt.plot(fpr_c, tpr_c, label=f"Scratch (AUC={auc_scratch:.3f})")

# Dashed grey diagonal for reference.
plt.plot([0, 1], [0, 1], linestyle="--", color="grey")

plt.title("ROC comparison")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()


In [None]:
# Label the coefficients with the numeric input columns.
# NOTE(review): assumes the preprocessor emits exactly one output column per
# numeric feature, in the same order -- confirm against build_preprocessor.
feature_names = numeric_features

# Flatten the fitted classifier's coefficient array to one value per feature.
coef = best_pipeline.named_steps["clf"].coef_.ravel()
coef_df = pd.DataFrame({"feature": feature_names, "coef": coef})

# Order the rows by coefficient magnitude, largest first.
magnitude_order = coef_df["coef"].abs().sort_values(ascending=False).index
coef_df = coef_df.loc[magnitude_order]

# Colour the coefficient column and show the styled table as the cell output.
coef_df.style.background_gradient(subset=["coef"], cmap="coolwarm")


In [None]:
# Persist the tuned sklearn pipeline with joblib. The path is written once so
# the directory creation, the dump, and the message all refer to the same file.
model_path = "models/breast_cancer_model.joblib"

# Create the output directory if it does not already exist.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
dump(best_pipeline, model_path)
print(f"Saved sklearn pipeline to {model_path}")
