In [11]:
# --- Imports & config ---
import os, warnings, json, math, sys, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, confusion_matrix,
    mean_absolute_error, mean_squared_error
)

import mlflow
import mlflow.sklearn

# --- Reproducibility: one seed for the stdlib RNG and NumPy's global RNG ---
SEED = 4120
random.seed(SEED)
np.random.seed(SEED)

# --- Paths (relative, so they resolve against the current working directory) ---
DATA_PATH = "../data/insurance.csv"
PLOTS_DIR = "../plots"
os.makedirs(PLOTS_DIR, exist_ok=True)  # create the output folder if it is missing

# --- MLflow: local file-based tracking store located at ../mlruns ---
mlflow.set_tracking_uri("file:../mlruns")
mlflow.set_experiment("midpoint-baselines")

# --- Plot styling ---
sns.set(context="notebook", style="whitegrid")

# NOTE: this silences *every* warning for the rest of the session.
warnings.filterwarnings("ignore")


In [12]:
# Read the raw CSV, report its (n_rows, n_columns) shape, then preview the first rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print(df.shape)
df.head(n=5)


(1338, 7)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [13]:
# Columns stored as pandas categoricals.
cat_cols = ["sex", "smoker", "region"]

# Age-band edges are right-inclusive: (17, 25], (25, 35], ... (55, 65].
age_bin_edges = [17, 25, 35, 45, 55, 65]
age_bin_labels = ["18-25", "26-35", "36-45", "46-55", "56-65"]

# Cast the categoricals, then append two simple, explainable features:
#   bmi_obese - 1 when bmi >= 30, else 0
#   age_band  - categorical age bucket
df = df.astype({col: "category" for col in cat_cols}).assign(
    bmi_obese=lambda frame: frame["bmi"].ge(30).astype(int),
    age_band=lambda frame: pd.cut(
        frame["age"], bins=age_bin_edges, labels=age_bin_labels, right=True
    ),
)

# Per-column missing-value counts; an age outside (17, 65] would show up
# here as a missing age_band.
df.isna().sum()


age          0
sex          0
bmi          0
children     0
smoker       0
region       0
charges      0
bmi_obese    0
age_band     0
dtype: int64

In [14]:
# Partition first: 70% train, with the remaining 30% halved into validation
# and test (15% each). The class-label threshold is computed afterwards from
# the training rows only, so no information leaks from val/test.
train_df, temp_df = train_test_split(
    df,
    test_size=0.3,
    random_state=SEED,
    shuffle=True,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=SEED,
    shuffle=True,
)

# Median charge of the training split = high-cost / low-cost cut-off.
train_median = train_df["charges"].median()

def add_targets(d, median):
    """Return a copy of ``d`` with the two modelling targets appended.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame containing a ``charges`` column. It is not modified.
    median : float
        Threshold for the binary label; rows with ``charges >= median``
        are labelled 1 (high cost), all others 0 (low cost).

    Returns
    -------
    pandas.DataFrame
        New frame with ``y_class`` (integer 0/1) and ``y_reg`` (``charges``
        cast to float) added.
    """
    charges = d["charges"]
    return d.assign(
        y_class=charges.ge(median).astype(int),
        y_reg=charges.astype(float),
    )

# Label all three splits with the same training-derived threshold.
train_df, val_df, test_df = (
    add_targets(part, train_median) for part in (train_df, val_df, test_df)
)

# Show the threshold alongside the class balance of the training split.
class_share = train_df["y_class"].value_counts(normalize=True).round(3)
(train_median, class_share)


(np.float64(9545.630325),
 y_class
 0    0.5
 1    0.5
 Name: proportion, dtype: float64)

In [15]:
# Model inputs, grouped by how they are preprocessed.
num_cols = ["age", "bmi", "children", "bmi_obese"]   # standardised
cat_cols = ["sex", "smoker", "region", "age_band"]   # one-hot encoded

# One small pipeline per column type. With handle_unknown="ignore" the encoder
# does not raise on a category it did not see during fit.
numeric_tf = Pipeline(steps=[("scaler", StandardScaler())])
categorical_tf = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

# Route each column group to its transformer; columns not listed are dropped
# (ColumnTransformer's default `remainder`).
preprocess = ColumnTransformer(transformers=[
    ("num", numeric_tf, num_cols),
    ("cat", categorical_tf, cat_cols),
])


In [16]:
# Baseline model definitions.
#
# Each pipeline receives its OWN unfitted copy of the preprocessing step via
# `clone(preprocess)`. A Pipeline stores the step objects it is given, so
# passing the same `preprocess` instance to all four pipelines would make them
# share one transformer: fitting any pipeline would silently refit the
# preprocessing inside every other, already-fitted pipeline.

# CLASSIFICATION
logreg_clf = Pipeline([
    ("prep", clone(preprocess)),
    ("clf",  LogisticRegression(max_iter=1000, class_weight="balanced", random_state=SEED))
])

tree_clf = Pipeline([
    ("prep", clone(preprocess)),
    ("clf",  DecisionTreeClassifier(max_depth=None, random_state=SEED))
])

# REGRESSION
lin_reg = Pipeline([
    ("prep", clone(preprocess)),
    ("reg",  LinearRegression())
])

tree_reg = Pipeline([
    ("prep", clone(preprocess)),
    ("reg",  DecisionTreeRegressor(random_state=SEED))
])

# Registries iterated by the evaluation loops; keys become run / table names.
models_classification = {
    "logreg": logreg_clf,
    "tree_clf": tree_clf
}

models_regression = {
    "lin_reg": lin_reg,
    "tree_reg": tree_reg
}


In [17]:
def eval_classification(model_name, model, train, val, test):
    """Fit a classifier on ``train`` and score it on ``val`` and ``test``.

    Everything is recorded in one MLflow run named ``clf_<model_name>``:
    the run parameters, accuracy / F1 / ROC-AUC for both evaluation splits,
    a confusion-matrix PNG for the test split (also written to ``PLOTS_DIR``)
    and the fitted model.

    Parameters
    ----------
    model_name : str
        Short identifier used in the run name, the plot file name and the
        returned row.
    model : estimator
        Object exposing ``fit`` and ``predict``; it is fitted in place.
    train, val, test : pandas.DataFrame
        Frames containing the columns in ``num_cols + cat_cols`` and the
        binary ``y_class`` target.

    Returns
    -------
    tuple
        ``(row, (model, yhat_test, cm))`` where ``row`` is a dict holding
        ``"model"`` plus the ``val_*`` and ``test_*`` metrics, ``yhat_test``
        are the test-split label predictions and ``cm`` is the 2x2 test
        confusion matrix.
    """
    feature_cols = num_cols + cat_cols

    with mlflow.start_run(run_name=f"clf_{model_name}", nested=True):
        # Log params first so a run that fails during fit is still identifiable.
        mlflow.log_params({"task":"classification","model":model_name})

        Xtr, ytr = train[feature_cols], train["y_class"]
        Xva, yva = val[feature_cols],   val["y_class"]
        Xte, yte = test[feature_cols],  test["y_class"]

        model.fit(Xtr, ytr)

        def metrics(X, y, split):
            """Score one split, log its metrics, return (metrics, labels)."""
            p = model.predict(X)
            # ROC-AUC needs a ranking score. Select it by capability instead of
            # catching every exception, so genuine errors are not hidden behind
            # a silent fallback to hard labels.
            if hasattr(model, "predict_proba"):
                scores = model.predict_proba(X)[:, 1]
            elif hasattr(model, "decision_function"):
                scores = model.decision_function(X)
            else:
                scores = p  # last resort: AUC computed from hard labels
            m = {
                f"{split}_accuracy": accuracy_score(y, p),
                f"{split}_f1":       f1_score(y, p),
                f"{split}_roc_auc":  roc_auc_score(y, scores),
            }
            for k, v in m.items():
                mlflow.log_metric(k, float(v))
            return m, p

        m_val,  _         = metrics(Xva, yva, "val")
        m_test, yhat_test = metrics(Xte, yte, "test")

        # Fix the label order so the matrix is always 2x2, even if one class
        # happens to be absent from the test split.
        cm = confusion_matrix(yte, yhat_test, labels=[0, 1])
        fig_path = os.path.join(PLOTS_DIR, f"confmat_test_{model_name}.png")
        fig, ax = plt.subplots(figsize=(4,4))
        try:
            sns.heatmap(cm, annot=True, fmt="d", cbar=False, ax=ax)
            ax.set_title(f"Confusion Matrix (test) — {model_name}")
            ax.set_xlabel("Predicted"); ax.set_ylabel("True")
            fig.tight_layout()
            fig.savefig(fig_path, dpi=160)
        finally:
            # Close this specific figure even when plotting or saving fails.
            plt.close(fig)
        mlflow.log_artifact(fig_path)

        # Log model
        mlflow.sklearn.log_model(model, artifact_path="model")

        # Row for the comparison table + artifacts for later plotting
        row = {"model": model_name, **m_val, **m_test}
        return row, (model, yhat_test, cm)

def eval_regression(model_name, model, train, val, test):
    """Fit a regressor on ``train`` and score it on ``val`` and ``test``.

    Runs inside one MLflow run named ``reg_<model_name>`` and logs MAE and
    RMSE for both evaluation splits, the run parameters, the fitted model and
    a residuals-vs-predicted PNG for the test split (also written to
    ``PLOTS_DIR``).

    Parameters
    ----------
    model_name : str
        Short identifier used in the run name, the plot file name and the
        returned row.
    model : estimator
        Object exposing ``fit`` and ``predict``; it is fitted in place.
    train, val, test : pandas.DataFrame
        Frames containing the columns in ``num_cols + cat_cols`` and the
        ``y_reg`` target.

    Returns
    -------
    tuple
        ``(row, (model, yhat_test, resid))`` where ``row`` is a dict holding
        ``"model"`` plus the ``val_*`` and ``test_*`` metrics, ``yhat_test``
        are the test-split predictions and ``resid`` is ``y - yhat`` on the
        test split.
    """
    with mlflow.start_run(run_name=f"reg_{model_name}", nested=True):
        features = num_cols + cat_cols
        target = "y_reg"

        # Slice every split up front, before the model is fitted.
        X_train, y_train = train[features], train[target]
        X_val, y_val = val[features], val[target]
        X_test, y_test = test[features], test[target]

        model.fit(X_train, y_train)

        def score_split(X, y_true, split):
            """Predict on one split, log MAE/RMSE, return (scores, predictions)."""
            y_pred = model.predict(X)
            scores = {
                f"{split}_mae":  mean_absolute_error(y_true, y_pred),
                f"{split}_rmse": math.sqrt(mean_squared_error(y_true, y_pred)),
            }
            for key, value in scores.items():
                mlflow.log_metric(key, float(value))
            return scores, y_pred

        val_scores, _ = score_split(X_val, y_val, "val")
        test_scores, test_pred = score_split(X_test, y_test, "test")

        mlflow.log_params({"task":"regression","model":model_name})
        mlflow.sklearn.log_model(model, artifact_path="model")

        # Residuals vs predicted on the test split.
        residuals = y_test - test_pred
        fig, ax = plt.subplots(figsize=(5,4))
        ax.scatter(test_pred, residuals, s=10)
        ax.axhline(0, ls="--")
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Residual (y - ŷ)")
        ax.set_title(f"Residuals vs Predicted (test) — {model_name}")
        fig_path = os.path.join(PLOTS_DIR, f"residuals_test_{model_name}.png")
        plt.tight_layout()
        plt.savefig(fig_path, dpi=160)
        plt.close()
        mlflow.log_artifact(fig_path)

        # Row for the comparison table + artifacts for later plotting.
        row = {"model": model_name}
        row.update(val_scores)
        row.update(test_scores)
        return row, (model, test_pred, residuals)


In [18]:
# --- Classification baselines: fit, score and log every candidate ---
clf_rows, clf_artifacts = [], {}
for name, mdl in models_classification.items():
    row, art = eval_classification(name, mdl, train_df, val_df, test_df)
    clf_rows.append(row)
    clf_artifacts[name] = art

# Best validation F1 first.
clf_columns = [
    "model", "val_accuracy","val_f1","val_roc_auc",
    "test_accuracy","test_f1","test_roc_auc"
]
clf_table = (
    pd.DataFrame(clf_rows)[clf_columns]
    .sort_values("val_f1", ascending=False)
    .reset_index(drop=True)
)

# --- Regression baselines: same procedure ---
reg_rows, reg_artifacts = [], {}
for name, mdl in models_regression.items():
    row, art = eval_regression(name, mdl, train_df, val_df, test_df)
    reg_rows.append(row)
    reg_artifacts[name] = art

# Lowest validation MAE first.
reg_columns = ["model","val_mae","val_rmse","test_mae","test_rmse"]
reg_table = (
    pd.DataFrame(reg_rows)[reg_columns]
    .sort_values("val_mae", ascending=True)
    .reset_index(drop=True)
)

# Persist both comparison tables next to the plots (for the PDF report).
clf_table.to_csv(os.path.join(PLOTS_DIR, "table_classification_baselines.csv"), index=False)
reg_table.to_csv(os.path.join(PLOTS_DIR, "table_regression_baselines.csv"), index=False)

clf_table, reg_table




(      model  val_accuracy    val_f1  val_roc_auc  test_accuracy   test_f1  \
 0    logreg      0.920398  0.915789     0.954812       0.920398  0.916667   
 1  tree_clf      0.895522  0.892308     0.896997       0.890547  0.888889   
 
    test_roc_auc  
 0      0.962202  
 1      0.891667  ,
       model      val_mae     val_rmse     test_mae    test_rmse
 0  tree_reg  2820.823827  6013.051695  3005.019679  6517.739977
 1   lin_reg  4261.040885  5858.813026  4309.843640  6155.425917)

In [19]:
def _save_and_close(filename):
    """Tighten the current figure's layout, write it to PLOTS_DIR, then close it."""
    plt.tight_layout()
    plt.savefig(os.path.join(PLOTS_DIR, filename), dpi=160)
    plt.close()


# Plot 1: class balance of the classification target (train split)
fig, ax = plt.subplots(figsize=(4.5,3.5))
class_counts = train_df["y_class"].map({0:"Low cost",1:"High cost"}).value_counts()
class_counts.plot(kind="bar", ax=ax)
ax.set_title("Classification target distribution (train)")
ax.set_xlabel("")
ax.set_ylabel("Count")
_save_and_close("plot1_target_distribution.png")

# Plot 2: pairwise correlations of the raw numeric columns (train split)
corr_cols = ["age","bmi","children","charges"]
fig, ax = plt.subplots(figsize=(4.8,3.8))
corr_matrix = train_df[corr_cols].corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True, fmt=".2f", ax=ax)
ax.set_title("Correlation heatmap (train)")
_save_and_close("plot2_correlation_heatmap.png")

# clf_table is ordered by validation F1 (descending): row 0 is the winner.
best_clf_name = clf_table.iloc[0]["model"]
best_clf_model, best_clf_yhat_test, best_cm = clf_artifacts[best_clf_name]

# Plot 3: test-split confusion matrix of the winning classifier
fig, ax = plt.subplots(figsize=(4,4))
sns.heatmap(best_cm, annot=True, fmt="d", cbar=False, ax=ax)
ax.set_title(f"Confusion Matrix (test) — {best_clf_name}")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
_save_and_close("plot3_confusion_matrix_best.png")

# reg_table is ordered by validation MAE (ascending): row 0 is the winner.
best_reg_name = reg_table.iloc[0]["model"]
best_reg_model, best_reg_yhat_test, best_reg_resid = reg_artifacts[best_reg_name]

# Plot 4: test-split residuals vs predictions of the winning regressor
fig, ax = plt.subplots(figsize=(5,4))
ax.scatter(best_reg_yhat_test, best_reg_resid, s=10)
ax.axhline(0, ls="--")
ax.set_xlabel("Predicted")
ax.set_ylabel("Residual (y - ŷ)")
ax.set_title(f"Residuals vs Predicted (test) — {best_reg_name}")
_save_and_close("plot4_residuals_best.png")

print("Saved required plots to:", PLOTS_DIR)
print("Best (by val F1):", best_clf_name)
print("Best (by val MAE):", best_reg_name)


Saved required plots to: ../plots
Best (by val F1): logreg
Best (by val MAE): tree_reg


In [20]:
# Render both comparison tables: a heading line, then the rounded frame
# (3 decimals for classification scores, 2 for the regression errors).
for heading, table, ndigits in (
    ("Classification table:", clf_table, 3),
    ("\nRegression table:", reg_table, 2),
):
    print(heading)
    display(table.round(ndigits))


Classification table:


Unnamed: 0,model,val_accuracy,val_f1,val_roc_auc,test_accuracy,test_f1,test_roc_auc
0,logreg,0.92,0.916,0.955,0.92,0.917,0.962
1,tree_clf,0.896,0.892,0.897,0.891,0.889,0.892



Regression table:


Unnamed: 0,model,val_mae,val_rmse,test_mae,test_rmse
0,tree_reg,2820.82,6013.05,3005.02,6517.74
1,lin_reg,4261.04,5858.81,4309.84,6155.43
