In [20]:
import pandas as pd
import numpy as np
import joblib
import urllib.request
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


# ================================
# Data folder, UCI source URLs and local file paths
# ================================

# Make sure the download directory is present; an existing one is left alone.
os.makedirs("Data", exist_ok=True)

# Remote locations of the three Adult dataset files on the UCI archive.
train_url, test_url, names_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/" + remote_name
    for remote_name in ("adult.data", "adult.test", "adult.names")
)

# Local destination of each file, listed in the same order as the URLs.
train_path, test_path, names_path = (
    "Data/" + local_name
    for local_name in ("adult.data", "adult.test", "adult.names")
)

# ================================
# Download Function
# ================================
def download_file(url, path):
    """Download *url* to *path* unless *path* already exists.

    The payload is first written to a sibling ``<path>.part`` file and only
    moved to *path* once the transfer has finished. An interrupted or
    truncated download therefore never leaves a partial file at *path* that
    a later run would mistake for a complete one.

    Args:
        url: Location to fetch, in any scheme ``urllib.request`` understands.
        path: Destination file path on the local filesystem.

    Raises:
        Whatever ``urllib.request.urlretrieve`` or ``os.replace`` raises;
        the temporary file is removed before the exception propagates.
    """
    if os.path.exists(path):
        print(f"{os.path.basename(path)} already exists.")
        return

    print(f"Downloading {os.path.basename(path)}...")
    tmp_path = f"{path}.part"
    try:
        urllib.request.urlretrieve(url, tmp_path)
        # Move into place only after the whole file has been written.
        os.replace(tmp_path, path)
    finally:
        # After a successful move the temp file is gone; this only fires
        # when the download or the move failed part-way through.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print("Download complete!")

# ================================
# Download Files
# ================================
# Fetch every remote file into its local target; download_file skips
# any target that is already on disk.
for _url, _path in zip(
    (train_url, test_url, names_url),
    (train_path, test_path, names_path),
):
    download_file(_url, _path)

print("\nAll files ready in Data/ folder.")

# ================================
# Load Dataset
# ================================

column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race",
    "sex", "capital-gain", "capital-loss", "hours-per-week",
    "native-country", "income"
]

# Load training data (the file has no header row, so names are supplied)
data = pd.read_csv("Data/adult.data", header=None, names=column_names)

print(data.shape)
print(data.head())

# =====================================
#  Data Cleaning
# =====================================

# Replace '?' with NaN.
# Fields in this file are separated by ", ", so every string value carries a
# leading space and the missing-value marker is actually " ?". An exact match
# on "?" never fires, which would leave all incomplete rows in place and turn
# "?" into a category of its own. Match the marker with optional surrounding
# whitespace instead; all other values keep their original text.
data.replace(r"^\s*\?\s*$", np.nan, regex=True, inplace=True)

# Drop missing rows
data.dropna(inplace=True)

# Drop fnlwgt (not useful for ML)
data.drop("fnlwgt", axis=1, inplace=True)

# Convert target to binary: 1 for ">50K", 0 for everything else
# (strip() removes the leading space noted above before comparing)
data["income"] = data["income"].apply(
    lambda x: 1 if x.strip() == ">50K" else 0
)

# One-hot encode categorical features, dropping the first level of each
data = pd.get_dummies(data, drop_first=True)

print("After Cleaning:", data.shape)


# ==========================================
# Feature-Target Split
# ==========================================

# Separate the binary income label from the predictor columns.
y = data["income"]
X = data.drop(columns="income")

# Persist the predictor column names (the original note marks this as
# important for the Streamlit app).
os.makedirs("Model", exist_ok=True)
feature_columns = X.columns
joblib.dump(feature_columns, "Model/feature_columns.pkl")

# Stratified 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ==========================================
# Feature Scaling
# ==========================================

# Learn the scaling statistics from the training split only, then apply
# the same transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Keep the fitted scaler next to the models.
joblib.dump(scaler, "Model/scaler.pkl")



# ==========================================
# Evaluation Function (Validation Set)
# ==========================================

def evaluate_model(model):
    """Score a fitted classifier on the module-level validation split.

    Predictions are made on ``X_val_scaled`` and compared with ``y_val``.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.

    Returns:
        A list ``[accuracy, ROC AUC, precision, recall, F1, MCC]``.
    """
    predicted_labels = model.predict(X_val_scaled)
    # Second column of predict_proba is used as the score for ROC AUC.
    positive_scores = model.predict_proba(X_val_scaled)[:, 1]

    accuracy = accuracy_score(y_val, predicted_labels)
    auc = roc_auc_score(y_val, positive_scores)

    # The remaining metrics all share the (y_true, y_pred) signature.
    label_based = [
        metric(y_val, predicted_labels)
        for metric in (precision_score, recall_score, f1_score, matthews_corrcoef)
    ]

    return [accuracy, auc, *label_based]


# ==========================================
# Models
# ==========================================

# Registry of display name -> unfitted estimator, filled in the order the
# models are trained and reported.
models = {}

models["Logistic Regression"] = LogisticRegression(C=1.0, max_iter=1000)

models["Decision Tree"] = DecisionTreeClassifier(
    random_state=42,
    max_depth=10,
    min_samples_split=20,
    min_samples_leaf=10,
)

models["KNN"] = KNeighborsClassifier(n_neighbors=7)

models["Naive Bayes"] = GaussianNB()

models["Random Forest"] = RandomForestClassifier(
    random_state=42,
    n_estimators=200,
    max_depth=15,
    min_samples_split=20,
    min_samples_leaf=10,
)

models["XGBoost"] = XGBClassifier(
    random_state=42,
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
)


# ==========================================
# Train Models (Training Set) & Evaluate (Validation Set)
# ==========================================

# Metric list per model, keyed by the model's display name.
results = {}

for name in models:
    model = models[name]
    print(f"Training {name}...")

    # Fit on the scaled training split, then record the evaluation metrics.
    model.fit(X_train_scaled, y_train)
    results[name] = evaluate_model(model)

    # Persist the fitted model as Model/<lower_snake_case_name>.pkl
    file_name = f"{name.lower().replace(' ', '_')}.pkl"
    joblib.dump(model, os.path.join("Model", file_name))



# ==========================================
# Display Results
# ==========================================

# Build a table with one row per model and one column per metric. The index
# labels follow the order of the list that evaluate_model returns.
results_df = pd.DataFrame(
    results,
    index=["Accuracy", "AUC", "Precision", "Recall", "F1 Score", "MCC"]
).T

# The metrics are computed by evaluate_model on the held-out validation
# split, so the heading says so rather than "Training".
print("\nValidation Results:\n")
print(results_df)

print("\nAll models saved successfully.")



adult.data already exists.
adult.test already exists.
adult.names already exists.

All files ready in Data/ folder.
(32561, 15)
   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-g