In [27]:
import pandas as pd
import joblib
import os
import time
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.exceptions import ConvergenceWarning
from ydata_profiling import ProfileReport

In [28]:
# load data
data_path = "adult_income.csv"
if not os.path.exists(data_path):
    df = pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
        header=None,
        names=[
            "age", "workclass", "fnlwgt", "education", "education-num",
            "marital-status", "occupation", "relationship", "race", "sex",
            "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
        ]
    )
    df.to_csv(data_path, index=False)
else:
    df = pd.read_csv(
        data_path,
        header=None,
        names=[
            "age", "workclass", "fnlwgt", "education", "education-num",
            "marital-status", "occupation", "relationship", "race", "sex",
            "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
        ]
    )


report_data_path = "adult_income_profile_report.html"

if not os.path.exists(report_data_path):
    profile = ProfileReport(df, title="Adult Income Dataset Profile Report", explorative=True)
    profile.to_file("adult_income_profile_report.html")

In [29]:
#  drop columns with missing values
df.replace(" ?", np.nan, inplace=True)
df.dropna(inplace=True)

In [None]:
X = df.drop("income", axis=1)
y = df["income"].apply(lambda x: x.strip() == ">50K")

# identity column types
cat_cols = X.select_dtypes(include="object").columns.tolist()
num_cols = X.select_dtypes(exclude="object").columns.tolist()

print(f"Categorical columns: {cat_cols}")
print(f"Numerical columns: {num_cols}")

Categorical columns: ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
Numerical columns: []


In [31]:
# split data

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [32]:
# Preprocessing: One-hot fpr categorical, scale for numerical
preprocessor = ColumnTransformer([
    ("onehot", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ("scale", StandardScaler(), num_cols)
])

In [33]:
# define models
models = {
    "LogisticRegression": LogisticRegression(max_iter=2000, solver="lbfgs", n_jobs=-1),
    "RandomForest": RandomForestClassifier(n_estimators=200, n_jobs=-1),
    "MLPClassifier": MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=400),
}

In [34]:
def run_pipeline(name, model):
    result = {}

    print(f"\nRunning pipeline for {name}...")

    pipeline = make_pipeline(preprocessor, model)

    # Fit
    start_time = time.perf_counter()
    pipeline.fit(X_train, y_train)
    fit_time = time.perf_counter() - start_time

    # Save model to disk
    filename = f"{name}.joblib"
    joblib.dump(pipeline, filename)
    model_size_kb = os.path.getsize(filename) / 1024

    # Inference timing (single-row prediction)
    start_time = time.perf_counter()
    for _ in range(1000):
        pipeline.predict(X_test.iloc[[0]])  # Correct: 2D input
    latency_ms = (time.perf_counter() - start_time) / 1000 * 1000
    # accuracy
    y_pred = pipeline.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    result = {
        "Train Time (s)": round(fit_time, 3),
        "Latency (ms)": round(latency_ms, 3),
        "Model Size (KB)": round(model_size_kb, 1),
        "Accuracy": round(acc * 100, 2),
    }

    return result


def print_result(model_name, result):
    print("\nModel Benchmark Results:\n")
    print(
        "{:<18} {:>15} {:>15} {:>18} {:>12}".format(
            "Model", "Train Time (s)", "Latency (ms)", "Model Size (KB)", "Accuracy (%)"
        )
    )
    print("-" * 80)

    print(
        "{:<18} {:>15} {:>15} {:>18} {:>12}".format(
            model_name,
            result["Train Time (s)"],
            result["Latency (ms)"],
            result["Model Size (KB)"],
            result["Accuracy"],
        )
    )

In [35]:
for name, model in models.items():
    result = run_pipeline(name, model)
    print_result(name, result)


Running pipeline for LogisticRegression...

Model Benchmark Results:

Model               Train Time (s)    Latency (ms)    Model Size (KB) Accuracy (%)
--------------------------------------------------------------------------------
LogisticRegression          11.631          22.767              296.2        86.03

Running pipeline for RandomForest...

Model Benchmark Results:

Model               Train Time (s)    Latency (ms)    Model Size (KB) Accuracy (%)
--------------------------------------------------------------------------------
RandomForest                13.093          76.521           243419.0        84.83

Running pipeline for MLPClassifier...





Model Benchmark Results:

Model               Train Time (s)    Latency (ms)    Model Size (KB) Accuracy (%)
--------------------------------------------------------------------------------
MLPClassifier              344.885         1829.42            71318.1        82.89
