# ML Assignment 2

## Import Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, matthews_corrcoef
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

import joblib


## Load Dataset

In [2]:
df = pd.read_csv("adult.csv")
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [3]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


## Handle Missing Values

In [4]:
df.replace(" ?", np.nan, inplace=True)
df.dropna(inplace=True)


## Encode Target Variable

In [5]:
df['income'] = df['income'].apply(lambda x: 1 if ">50K" in x else 0)


## Separate Features and Target

In [6]:
X = df.drop("income", axis=1)
y = df["income"]


## One-Hot Encode Categorical Features

In [7]:
X = pd.get_dummies(X, drop_first=True)


## Train-Test Split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


## Scaling (For Logistic & kNN)

In [9]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


## --Train All 6 Models

### 1. Logistic Regression

In [10]:
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train_scaled, y_train)
y_pred_log = log_model.predict(X_test_scaled)
y_prob_log = log_model.predict_proba(X_test_scaled)[:,1]


### 2. Decision Tree

In [11]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
y_prob_dt = dt_model.predict_proba(X_test)[:,1]


### 3.KNN

In [12]:
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_scaled, y_train)
y_pred_knn = knn_model.predict(X_test_scaled)
y_prob_knn = knn_model.predict_proba(X_test_scaled)[:,1]


### 4. Naive Bayes

In [13]:
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
y_prob_nb = nb_model.predict_proba(X_test)[:,1]


### 5. Random Forest

In [14]:
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
y_prob_rf = rf_model.predict_proba(X_test)[:,1]


### 6. XGBoost

In [16]:
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)
y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]


## Evaluation Function

In [17]:
def evaluate_model(y_test, y_pred, y_prob):
    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }


In [18]:
results = {}

results["Logistic Regression"] = evaluate_model(y_test, y_pred_log, y_prob_log)
results["Decision Tree"] = evaluate_model(y_test, y_pred_dt, y_prob_dt)
results["KNN"] = evaluate_model(y_test, y_pred_knn, y_prob_knn)
results["Naive Bayes"] = evaluate_model(y_test, y_pred_nb, y_prob_nb)
results["Random Forest"] = evaluate_model(y_test, y_pred_rf, y_prob_rf)
results["XGBoost"] = evaluate_model(y_test, y_pred_xgb, y_prob_xgb)

results_df = pd.DataFrame(results).T
results_df


Unnamed: 0,Accuracy,AUC,Precision,Recall,F1,MCC
Logistic Regression,0.847536,0.903119,0.7176,0.583604,0.643703,0.552762
Decision Tree,0.813911,0.747778,0.602266,0.622642,0.612284,0.490022
KNN,0.817442,0.826831,0.62908,0.551724,0.587868,0.47294
Naive Bayes,0.794718,0.829597,0.639665,0.297983,0.406569,0.334111
Random Forest,0.851835,0.901795,0.718321,0.612232,0.661047,0.569978
XGBoost,0.872563,0.926969,0.777255,0.644763,0.704836,0.628907


## Save Models

In [20]:
joblib.dump(log_model, "model/logistic_model.pkl")
joblib.dump(dt_model, "model/dt_model.pkl")
joblib.dump(knn_model, "model/knn_model.pkl")
joblib.dump(nb_model, "model/nb_model.pkl")
joblib.dump(rf_model, "model/rf_model.pkl")
joblib.dump(xgb_model, "model/xgb_model.pkl")
joblib.dump(scaler, "model/scaler.pkl")


['model/scaler.pkl']