In [1]:
import json
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from joblib import dump
from lightgbm import LGBMClassifier
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Seed passed to every estimator that accepts a random_state.
RANDOM_STATE = 42

# Input CSVs live in a sibling "data" directory (paths are relative to the
# notebook's working directory).
DATA_DIR = Path("../data")
TRAIN_CSV = DATA_DIR.joinpath("train.csv")
TEST_CSV = DATA_DIR.joinpath("test.csv")

# Fitted pipelines and metadata.json are written here; create it up front so
# later cells can write without checking.
MODELS_DIR = Path("../models")
MODELS_DIR.mkdir(exist_ok=True, parents=True)

In [3]:
# Load both splits with pandas' default CSV settings.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

In [4]:
def split_features_target(df, target="income", positive=">50K", negative="<=50K"):
    """Split a raw frame into a feature frame and a 0/1 target series.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns plus the ``target`` column.
    target : str
        Name of the label column.
    positive, negative : str
        The only two label strings accepted; ``positive`` is encoded as 1 and
        ``negative`` as 0.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``df`` without the ``target`` column, and the integer-encoded labels.

    Raises
    ------
    ValueError
        If the label column holds anything other than the two expected strings
        (including missing values). A plain ``== positive`` comparison would
        silently turn such rows into class 0.
    """
    labels = df[target]
    unexpected = labels[~labels.isin([negative, positive])].unique().tolist()
    if unexpected:
        raise ValueError(
            f"Unexpected values in '{target}' column: {unexpected!r}; "
            f"expected only {negative!r} or {positive!r}"
        )
    return df.drop(columns=[target]), (labels == positive).astype(int)


# Same split logic for both partitions, so train and test cannot drift apart.
X_train, y_train = split_features_target(train_df)
X_test, y_test = split_features_target(test_df)

In [5]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, every other column as numerical.
cat_cols = list(X_train.select_dtypes(include="object").columns)
num_cols = list(X_train.select_dtypes(exclude="object").columns)

In [6]:
# Column-wise preprocessing shared by every model: standardise the numerical
# columns and one-hot encode the categorical ones.
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    # handle_unknown="ignore": do not error on categories absent from training.
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])

In [7]:
# Candidate estimators keyed by the short name used in file names and in the
# metadata. Insertion order is the order in which they are trained.
models = {}

# Majority-class baseline.
models["dummy_most_frequent"] = DummyClassifier(strategy="most_frequent")

# Linear model with class re-weighting.
models["logreg"] = LogisticRegression(
    random_state=RANDOM_STATE,
    class_weight="balanced",
    max_iter=1000,
)

# Tree-based models.
models["dt_depth5"] = DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=5)
models["rf_200"] = RandomForestClassifier(
    random_state=RANDOM_STATE,
    n_estimators=200,
    n_jobs=-1,
)

# Instance-based model.
models["knn_5"] = KNeighborsClassifier(n_neighbors=5)

# Gradient boosting with library defaults.
models["lgbm"] = LGBMClassifier(random_state=RANDOM_STATE)

In [8]:
def evaluate(y_true, y_pred, y_prob=None):
    """Compute binary-classification metrics for the positive class (label 1).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels encoded as 0/1.
    y_pred : array-like
        Predicted hard labels encoded as 0/1. May be a list or an array.
    y_prob : array-like, optional
        Scores for class 1 used for ROC AUC. When omitted, the hard
        predictions are used as scores (1.0 where ``y_pred == 1``, else 0.0).

    Returns
    -------
    dict
        Plain Python floats under the keys ``accuracy``, ``precision_pos``,
        ``recall_pos``, ``f1_pos`` and ``roc_auc``.

    Notes
    -----
    Precision, recall and F1 are called with ``zero_division=0``. Errors raised
    by the underlying metric functions propagate to the caller.
    """
    if y_prob is None:
        # Coerce first: comparing a plain list to 1 yields the scalar False,
        # which would collapse the fallback scores to a single 0-d value.
        y_prob = np.where(np.asarray(y_pred) == 1, 1.0, 0.0)
    return {
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "precision_pos": float(precision_score(y_true, y_pred, pos_label=1, zero_division=0)),
        "recall_pos": float(recall_score(y_true, y_pred, pos_label=1, zero_division=0)),
        "f1_pos": float(f1_score(y_true, y_pred, pos_label=1, zero_division=0)),
        "roc_auc": float(roc_auc_score(y_true, y_prob)),
    }

In [9]:
# Run description written next to the saved models: column schema, label
# encoding, split sizes and seed. The "models" entry starts empty and is
# filled in by the training loop.
metadata = dict(
    schema=dict(
        categorical=cat_cols,
        numerical=num_cols,
        target="income",
        positive_class=">50K",
        mapping={"<=50K": 0, ">50K": 1},
    ),
    splits=dict(
        train_rows=len(X_train),
        test_rows=len(X_test),
    ),
    random_state=RANDOM_STATE,
    models={},
)

In [10]:
# Fit, evaluate and persist one pipeline per candidate model.
for name, model in models.items():
    # Clone the preprocessor so each pipeline owns an independent transformer.
    # Pipeline.fit fits its steps in place, so reusing the same instance would
    # make every in-memory pipeline share (and refit) one transformer object.
    pipe = Pipeline([("prep", clone(preprocessor)), ("model", model)])
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    # Column 1 of predict_proba is taken as the score for label 1; models
    # without predict_proba fall back to hard labels inside evaluate().
    y_prob = pipe.predict_proba(X_test)[:, 1] if hasattr(pipe, "predict_proba") else None
    mets = evaluate(y_test, y_pred, y_prob)

    # save pipeline
    pkl_path = MODELS_DIR / f"model_{name}.pkl"
    dump(pipe, pkl_path)

    # record in metadata
    metadata["models"][name] = {
        "path": str(pkl_path),
        "metrics": mets,
    }

# Persist the run metadata (schema, split sizes, per-model paths and metrics)
# as pretty-printed JSON alongside the saved pipelines.
(MODELS_DIR / "metadata.json").write_text(json.dumps(metadata, indent=2))

# Tabulate the recorded metrics, one row per model, best ROC AUC first.
mdf = pd.DataFrame(
    [
        {
            "model": model_name,
            "accuracy": entry["metrics"]["accuracy"],
            "precision(>50K)": entry["metrics"]["precision_pos"],
            "recall(>50K)": entry["metrics"]["recall_pos"],
            "f1(>50K)": entry["metrics"]["f1_pos"],
            "roc_auc": entry["metrics"]["roc_auc"],
        }
        for model_name, entry in metadata["models"].items()
    ],
    columns=["model", "accuracy", "precision(>50K)", "recall(>50K)", "f1(>50K)", "roc_auc"],
)
mdf = mdf.sort_values(by="roc_auc", ascending=False)

print(mdf.to_string(index=False))
print(f"\nSaved to: {MODELS_DIR}")

[LightGBM] [Info] Number of positive: 6271, number of negative: 19758
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004190 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 733
[LightGBM] [Info] Number of data points in the train set: 26029, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.240924 -> initscore=-1.147623
[LightGBM] [Info] Start training from score -1.147623
              model  accuracy  precision(>50K)  recall(>50K)  f1(>50K)  roc_auc
               lgbm  0.866626         0.769231      0.637755  0.697350 0.921154
             logreg  0.810541         0.573626      0.832270  0.679157 0.902374
             rf_200  0.852028         0.730740      0.610969  0.665509 0.897452
          dt_depth5  0.848187         0.771028      0.526148  0.625474 0.875400
              knn_5  0.827443         0.6608

