ML Assignment 2

Import Required Libraries

In [34]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, roc_auc_score, precision_score,
                             recall_score, f1_score, matthews_corrcoef,
                             confusion_matrix, classification_report)
import pickle
import warnings
warnings.filterwarnings('ignore')

Download the Dataset

In [35]:

# kagglehub is only used by this download step.
import kagglehub

print("LOADING ADULT INCOME PREDICTION DATASET")

# Fetch the dataset; the returned value is used below as a local directory path.
path = kagglehub.dataset_download("mosapabdelghany/adult-income-prediction-dataset")

# os.listdir returns entries in arbitrary order, so sort to pick the same file on every run.
csv_files = sorted(f for f in os.listdir(path) if f.endswith('.csv'))

# Fail with a descriptive message instead of an opaque IndexError on csv_files[0].
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in downloaded dataset directory: {path}")

data_file = os.path.join(path, csv_files[0])
df = pd.read_csv(data_file)


LOADING ADULT INCOME PREDICTION DATASET
Using Colab cache for faster access to the 'adult-income-prediction-dataset' dataset.


In [36]:
# Structural overview of the loaded frame: size, schema, sample rows, dtypes, null counts.
overview = (
    "Dataset loaded successfully!",
    f"Dataset shape: {df.shape}",
    f"\nColumn names:\n{df.columns.tolist()}",
    f"\nFirst few rows:\n{df.head()}",
    f"\nData types:\n{df.dtypes}",
    f"\nMissing values:\n{df.isnull().sum()}",
)
for section in overview:
    print(section)

Dataset loaded successfully!
Dataset shape: (32561, 15)

Column names:
['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'capital.gain', 'capital.loss', 'hours.per.week', 'native.country', 'income']

First few rows:
   age workclass  fnlwgt     education  education.num marital.status  \
0   90         ?   77053       HS-grad              9        Widowed   
1   82   Private  132870       HS-grad              9        Widowed   
2   66         ?  186061  Some-college             10        Widowed   
3   54   Private  140359       7th-8th              4       Divorced   
4   41   Private  264663  Some-college             10      Separated   

          occupation   relationship   race     sex  capital.gain  \
0                  ?  Not-in-family  White  Female             0   
1    Exec-managerial  Not-in-family  White  Female             0   
2                  ?      Unmarried  Black  Female             0   
3  Ma

Data Pre-processing

In [37]:
# Turn the '?' placeholder into NaN, then discard every row with at least one missing field.
df = df.replace('?', np.nan).dropna()

In [38]:
# Use the first column whose name mentions income/salary (case-insensitive);
# when no column matches, fall back to the last column of the frame.
target_col = next(
    (name for name in df.columns
     if 'income' in name.lower() or 'salary' in name.lower()),
    None,
)
if target_col is None:
    target_col = df.columns[-1]

target_values = df[target_col]
print(f"\nTarget column: {target_col}")
print(f"Target classes: {target_values.unique()}")
print(f"Class distribution:\n{target_values.value_counts()}")


Target column: income
Target classes: ['<=50K' '>50K']
Class distribution:
income
<=50K    22654
>50K      7508
Name: count, dtype: int64


In [39]:
# Separate the label column from the feature columns.
y = df[target_col]
X = df.drop(columns=[target_col])

# Map the class labels to integers and show the resulting mapping.
le_target = LabelEncoder()
y = le_target.fit_transform(y)
target_mapping = dict(zip(le_target.classes_, le_target.transform(le_target.classes_)))
print(f"Target encoding: {target_mapping}")

# Partition the feature columns by dtype: object columns vs. int64/float64 columns.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

print(f"\nCategorical features ({len(categorical_cols)}): {categorical_cols}")
print(f"Numerical features ({len(numerical_cols)}): {numerical_cols}")

Target encoding: {'<=50K': np.int64(0), '>50K': np.int64(1)}

Categorical features (8): ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']
Numerical features (6): ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']


In [40]:
# Integer-encode every categorical feature and keep the fitted encoder for each column.
#
# The encoders are fit on the original string values in `df`, not on `X`. This cell
# overwrites X[col] in place, so reading from X would, on a second run of the cell,
# re-encode the integer codes as the strings '0', '1', '10', ... and silently scramble
# them. `df` is not modified here, and X was built from df by dropping only the target
# column, so the rows line up one-to-one.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

print(f"Final features: {X.shape[1]}")
print(f"Total samples: {X.shape[0]}")
X.head()

Final features: 14
Total samples: 30162


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
1,82,2,132870,11,9,6,3,1,4,0,0,4356,18,38
3,54,2,140359,5,4,0,6,4,4,0,0,3900,40,38
4,41,2,264663,15,10,5,9,3,4,0,0,3900,40,38
5,34,2,216864,11,9,0,7,4,4,0,0,3770,45,38
6,38,2,150601,0,6,5,0,4,4,1,0,3770,40,38


Split the Dataset

In [41]:
# 70/30 split, stratified on the label so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Report the label counts of each split.
for split_name, split_labels in (("train", y_train), ("test", y_test)):
    print(f"\nClass distribution in {split_name} set:")
    print(pd.Series(split_labels).value_counts())

Training set: (21113, 14)
Test set: (9049, 14)

Class distribution in train set:
0    15857
1     5256
Name: count, dtype: int64

Class distribution in test set:
0    6797
1    2252
Name: count, dtype: int64


In [42]:
# Learn the scaling statistics from the training split only, then apply them to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling complete!")
for split_label, scaled_matrix in (("training", X_train_scaled), ("test", X_test_scaled)):
    print(f"Scaled {split_label} data shape: {scaled_matrix.shape}")

Feature scaling complete!
Scaled training data shape: (21113, 14)
Scaled test data shape: (9049, 14)


In [43]:
# Maximum number of rows written to the exported test file (sized for the Streamlit
# free tier). Defined once so the size check, the sampling call and the log message
# cannot drift apart.
MAX_TEST_ROWS = 1000

# Rebuild a labelled frame from the scaled test matrix, reusing the feature column names.
test_data = pd.DataFrame(X_test_scaled, columns=X.columns)
test_data['target'] = y_test

# Down-sample reproducibly when the test split exceeds the row limit.
if len(test_data) > MAX_TEST_ROWS:
    test_data = test_data.sample(n=MAX_TEST_ROWS, random_state=42)
    print(f"Test data sampled to {MAX_TEST_ROWS} rows for Streamlit free tier")
else:
    print(f"Using full test set: {len(test_data)} rows")

test_data.to_csv('test_data.csv', index=False)
print(f"Test data saved: test_data.csv ({test_data.shape[0]} rows × {test_data.shape[1]} columns)")
print(f"  - This file can be downloaded from Streamlit app")
# One column of the frame is the target, hence the "- 1".
print(f"  - Features: {test_data.shape[1] - 1}")
print(f"  - Target distribution: Class 0: {(test_data['target']==0).sum()}, Class 1: {(test_data['target']==1).sum()}")

Test data sampled to 1000 rows for Streamlit free tier
Test data saved: test_data.csv (1000 rows × 15 columns)
  - This file can be downloaded from Streamlit app
  - Features: 14
  - Target distribution: Class 0: 768, Class 1: 232


Define Classification Models

In [44]:
# Registry of the six classifiers to compare, keyed by display name.
# Insertion order is the order in which they are later trained and reported.
models = {}
models['Logistic Regression'] = LogisticRegression(random_state=42, max_iter=10000)
models['Decision Tree'] = DecisionTreeClassifier(random_state=42, max_depth=5)
models['K-Nearest Neighbor'] = KNeighborsClassifier(n_neighbors=5)
models['Naive Bayes'] = GaussianNB()
models['Random Forest'] = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)
models['XGBoost'] = XGBClassifier(n_estimators=100, random_state=42, max_depth=5, eval_metric='logloss')

print("Models initialized:")
for name in models:
    print(f"  • {name}")


Models initialized:
  • Logistic Regression
  • Decision Tree
  • K-Nearest Neighbor
  • Naive Bayes
  • Random Forest
  • XGBoost


Train and Evaluate Each Model

In [45]:
# Train every registered model on the scaled training split, score it on the scaled
# test split, collect one summary row per model and pickle the fitted estimator.
results = []
rule = "=" * 80

print(rule)
print("TRAINING AND EVALUATING MODELS")
print(rule)

for model_name, model in models.items():
    print(f"\n{model_name}:")
    print("-" * 40)

    model.fit(X_train_scaled, y_train)

    # Hard labels feed the threshold metrics; the class-1 probability feeds AUC.
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    # Each entry: (label as printed, key in the results row, metric value).
    scored = [
        ("Accuracy:  ", 'Accuracy', accuracy_score(y_test, y_pred)),
        ("AUC:       ", 'AUC', roc_auc_score(y_test, y_pred_proba)),
        ("Precision: ", 'Precision', precision_score(y_test, y_pred, zero_division=0)),
        ("Recall:    ", 'Recall', recall_score(y_test, y_pred, zero_division=0)),
        ("F1 Score:  ", 'F1', f1_score(y_test, y_pred, zero_division=0)),
        ("MCC:       ", 'MCC', matthews_corrcoef(y_test, y_pred)),
    ]

    for printed_label, metric_key, metric_value in scored:
        print(f"{printed_label}{metric_value:.4f}")

    # Summary row: model name followed by each metric rounded to four decimals.
    row = {'Model': model_name}
    for printed_label, metric_key, metric_value in scored:
        row[metric_key] = round(metric_value, 4)
    results.append(row)

    # File name: lower-cased display name with spaces and hyphens turned into underscores.
    model_filename = "model_" + model_name.lower().replace(' ', '_').replace('-', '_') + ".pkl"
    with open(model_filename, 'wb') as handle:
        pickle.dump(model, handle)
    print(f"Model saved: {model_filename}")

print("\n" + rule)
print("All models trained successfully!")
print(rule)

TRAINING AND EVALUATING MODELS

Logistic Regression:
----------------------------------------
Accuracy:  0.8300
AUC:       0.8612
Precision: 0.7532
Recall:    0.4716
F1 Score:  0.5800
MCC:       0.5011
Model saved: model_logistic_regression.pkl

Decision Tree:
----------------------------------------
Accuracy:  0.8447
AUC:       0.8840
Precision: 0.7907
Recall:    0.5115
F1 Score:  0.6212
MCC:       0.5490
Model saved: model_decision_tree.pkl

K-Nearest Neighbor:
----------------------------------------
Accuracy:  0.8280
AUC:       0.8554
Precision: 0.6738
Recall:    0.5990
F1 Score:  0.6342
MCC:       0.5239
Model saved: model_k_nearest_neighbor.pkl

Naive Bayes:
----------------------------------------
Accuracy:  0.7947
AUC:       0.8556
Precision: 0.6759
Recall:    0.3361
F1 Score:  0.4490
MCC:       0.3712
Model saved: model_naive_bayes.pkl

Random Forest:
----------------------------------------
Accuracy:  0.8550
AUC:       0.9192
Precision: 0.8052
Recall:    0.5506
F1 Score:  0.6

Results Summary

In [46]:
# Tabulate the per-model metric rows, best accuracy first.
results_df = pd.DataFrame(results).sort_values('Accuracy', ascending=False)

rule = "=" * 80
print("\n" + rule)
print("FINAL RESULTS - ALL MODELS")
print(rule)
print(results_df.to_string(index=False))


FINAL RESULTS - ALL MODELS
              Model  Accuracy    AUC  Precision  Recall     F1    MCC
            XGBoost    0.8719 0.9288     0.7851  0.6683 0.7220 0.6430
      Random Forest    0.8550 0.9192     0.8052  0.5506 0.6540 0.5827
      Decision Tree    0.8447 0.8840     0.7907  0.5115 0.6212 0.5490
Logistic Regression    0.8300 0.8612     0.7532  0.4716 0.5800 0.5011
 K-Nearest Neighbor    0.8280 0.8554     0.6738  0.5990 0.6342 0.5239
        Naive Bayes    0.7947 0.8556     0.6759  0.3361 0.4490 0.3712


Verify All Files Created

In [47]:
import os
import glob

# Confirm that every artifact written by the earlier cells exists in the working
# directory, reporting the size of each one that is found.
rule = "=" * 80

print("\n" + rule)
print("FILE VERIFICATION")
print(rule)

required_files = [
    'model_logistic_regression.pkl',
    'model_decision_tree.pkl',
    'model_k_nearest_neighbor.pkl',
    'model_naive_bayes.pkl',
    'model_random_forest.pkl',
    'model_xgboost.pkl',
    'test_data.csv',
]

all_present = True
for filename in required_files:
    if not os.path.exists(filename):
        print(f"{filename:40s} NOT FOUND")
        all_present = False
        continue
    size = os.path.getsize(filename)
    print(f"{filename:40s} ({size:,} bytes)")

print("\n" + rule)
if not all_present:
    print(" Some files are missing. Please check the errors above.")
else:
    print(" All files created successfully!")
print(rule)


FILE VERIFICATION
model_logistic_regression.pkl            (829 bytes)
model_decision_tree.pkl                  (5,545 bytes)
model_k_nearest_neighbor.pkl             (2,965,553 bytes)
model_naive_bayes.pkl                    (1,026 bytes)
model_random_forest.pkl                  (4,736,615 bytes)
model_xgboost.pkl                        (210,721 bytes)
test_data.csv                            (278,084 bytes)

 All files created successfully!
