# ML Assignment 2 - Model Training Notebook
## Adult Income Prediction - Classification Models

**BITS Pilani - M.Tech (AIML/DSE)**  
**Course:** Machine Learning  
**Assignment:** ML Assignment 2

---

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import kagglehub
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef,
    confusion_matrix, classification_report
)
import pickle
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

# Install a blanket "ignore" filter so library warnings do not clutter the
# cell outputs, then confirm that the import cell ran to completion.
warnings.filterwarnings(action='ignore')
print("‚úÖ All libraries imported successfully!")

## 2. Load Dataset from Kaggle

In [None]:
# Download the Adult Income dataset through kagglehub and load its CSV
# into `df`. `path` is the directory returned by kagglehub for the download.
print("Downloading Adult Income Prediction Dataset...")
path = kagglehub.dataset_download("mosapabdelghany/adult-income-prediction-dataset")
print(f"Dataset downloaded to: {path}")

# Find CSV files
import os
# os.listdir() returns entries in arbitrary order, so sort the matches to make
# the file picked below reproducible across runs and platforms. The extension
# check is case-insensitive so a ".CSV" file is not silently skipped.
csv_files = sorted(f for f in os.listdir(path) if f.lower().endswith('.csv'))
print(f"Available CSV files: {csv_files}")

# Fail with an explicit message instead of a bare IndexError on csv_files[0].
if not csv_files:
    raise FileNotFoundError(
        f"No CSV files found in downloaded dataset directory: {path}"
    )

# Load the dataset (first CSV in sorted order)
data_file = os.path.join(path, csv_files[0])
df = pd.read_csv(data_file)

print(f"\n‚úÖ Dataset loaded successfully!")
print(f"Shape: {df.shape}")
# Last expression of the cell: the notebook renders the first rows.
df.head()

## 3. Exploratory Data Analysis

In [None]:
# Dataset overview: dimensions, column names, dtypes and per-column NaN counts.
print("Dataset Info:")
for overview_line in (
    f"Rows: {df.shape[0]}",
    f"Columns: {df.shape[1]}",
    f"\nColumn Names:\n{df.columns.tolist()}",
    f"\nData Types:\n{df.dtypes}",
    f"\nMissing Values:\n{df.isnull().sum()}",
):
    print(overview_line)

# Placeholder scan: count cells that are exactly '?' in object-typed columns
# and report only the columns where at least one was found.
print("\nChecking for '?' as missing values...")
for col in df.columns:
    if df[col].dtype != 'object':
        continue
    missing = (df[col] == '?').sum()
    if missing > 0:
        print(f"{col}: {missing} missing values")

## 4. Data Preprocessing

In [None]:
# Turn the '?' placeholder into NaN, then drop every row that contains a NaN,
# reporting the row count before and after the drop.
print("Handling missing values...")
df = df.replace('?', np.nan)
print(f"Rows before removing NaN: {len(df)}")
df = df.dropna()
print(f"Rows after removing NaN: {len(df)}")

# Target column: the first column whose lowercase name contains 'income' or
# 'salary'; when no column matches, fall back to the last column.
target_col = next(
    (
        name
        for name in df.columns
        if any(keyword in name.lower() for keyword in ('income', 'salary'))
    ),
    None,
)
if target_col is None:
    target_col = df.columns[-1]

print(f"\nTarget column: {target_col}")
print(f"Target classes: {df[target_col].unique()}")
print(f"Class distribution:\n{df[target_col].value_counts()}")

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns=[target_col])

# Target vector: class labels mapped to integer codes by a LabelEncoder that
# is kept around (le_target) so the codes can be mapped back later.
le_target = LabelEncoder()
y = le_target.fit_transform(df[target_col])
print(f"Target encoding: {dict(zip(le_target.classes_, le_target.transform(le_target.classes_)))}")

# Partition the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
object_features = X.select_dtypes(include=['object'])
numeric_features = X.select_dtypes(include=['int64', 'float64'])
categorical_cols = object_features.columns.tolist()
numerical_cols = numeric_features.columns.tolist()

print(f"\nCategorical features ({len(categorical_cols)}): {categorical_cols}")
print(f"Numerical features ({len(numerical_cols)}): {numerical_cols}")

In [None]:
# One LabelEncoder per categorical column, keyed by column name so the same
# mapping can be reused later (the dict is pickled further down).
label_encoders = {col: LabelEncoder() for col in categorical_cols}

# Replace each categorical column in place with its integer codes; values are
# cast to str first so the encoder always sees strings.
for col, encoder in label_encoders.items():
    X[col] = encoder.fit_transform(X[col].astype(str))

print(f"\n‚úÖ Encoding complete!")
print(f"Final features: {X.shape[1]}")
print(f"Total samples: {X.shape[0]}")
# Last expression of the cell: the notebook renders the first encoded rows.
X.head()

## 5. Train-Test Split and Scaling

In [None]:
# 70/30 split with a fixed seed; stratifying on y keeps the class ratio the
# same in both partitions.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Per-class counts for each partition.
for heading, labels in (
    (f"\nClass distribution in train set:", y_train),
    (f"\nClass distribution in test set:", y_test),
):
    print(heading)
    print(pd.Series(labels).value_counts())

In [None]:
# Learn the standardisation parameters from the training split only, then
# apply that same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("‚úÖ Feature scaling complete!")
print(f"Scaled training data shape: {X_train_scaled.shape}")
print(f"Scaled test data shape: {X_test_scaled.shape}")

## 6. Train Classification Models

In [None]:
# Registry of the six classifiers, keyed by display name. Insertion order is
# the order in which they are later trained and reported.
models = {}
models['Logistic Regression'] = LogisticRegression(random_state=42, max_iter=10000)
models['Decision Tree'] = DecisionTreeClassifier(random_state=42, max_depth=5)
models['K-Nearest Neighbor'] = KNeighborsClassifier(n_neighbors=5)
models['Naive Bayes'] = GaussianNB()
models['Random Forest'] = RandomForestClassifier(
    n_estimators=100, random_state=42, max_depth=10
)
models['XGBoost'] = XGBClassifier(
    n_estimators=100, random_state=42, max_depth=5, eval_metric='logloss'
)

print("‚úÖ Models initialized:")
for name in models:
    print(f"  ‚Ä¢ {name}")

## 7. Train and Evaluate Each Model

In [None]:
# One summary row (model name plus six rounded metrics) is appended per model.
results = []

rule = "=" * 80
print(rule)
print("TRAINING AND EVALUATING MODELS")
print(rule)

# Column names of the summary row, in the order the values are zipped below.
metric_names = ('Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC')

for model_name, model in models.items():
    print(f"\n{model_name}:")
    print("-" * 40)

    # Fit on the scaled training split.
    model.fit(X_train_scaled, y_train)

    # Hard predictions plus the second predict_proba column, which is the
    # score passed to roc_auc_score.
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    # Evaluation metrics on the held-out split; zero_division=0 makes the
    # precision/recall/F1 scorers return 0 rather than warn when undefined.
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    mcc = matthews_corrcoef(y_test, y_pred)

    print(f"Accuracy:  {accuracy:.4f}")
    print(f"AUC:       {auc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print(f"MCC:       {mcc:.4f}")

    # Summary row: metrics rounded to four decimals, keyed by metric name.
    metric_values = (accuracy, auc, precision, recall, f1, mcc)
    results.append({
        'Model': model_name,
        **{key: round(value, 4) for key, value in zip(metric_names, metric_values)},
    })

    # Persist the fitted model; the file name is the lower-cased display name
    # with spaces and hyphens turned into underscores.
    model_filename = f"model_{model_name.lower().replace(' ', '_').replace('-', '_')}.pkl"
    with open(model_filename, 'wb') as f:
        pickle.dump(model, f)
    print(f"‚úÖ Model saved: {model_filename}")

print("\n" + rule)
print("‚úÖ All models trained successfully!")
print(rule)

## 8. Results Summary

In [None]:
# Tabulate the per-model rows, best accuracy first.
results_df = pd.DataFrame(results).sort_values('Accuracy', ascending=False)

divider = "=" * 80
print("\n" + divider)
print("FINAL RESULTS - ALL MODELS")
print(divider)
print(results_df.to_string(index=False))

# Write the table to disk without the index column.
results_df.to_csv('model_results.csv', index=False)
print("\n‚úÖ Results saved to model_results.csv")

In [None]:
# 2x3 grid of horizontal bar charts, one panel per metric.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold')

metrics = ['Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC']
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']

for idx, metric in enumerate(metrics):
    ax = axes.flat[idx]
    # Each panel ranks the models by its own metric.
    ranked = results_df.sort_values(metric, ascending=False)
    ax.barh(ranked['Model'], ranked[metric], color=colors[idx])
    ax.set_xlabel(metric, fontweight='bold')
    ax.set_title(f'{metric} Comparison', fontweight='bold')
    ax.grid(axis='x', alpha=0.3)

    # Write the metric value at the end of each bar; the y position is the
    # bar's rank within this panel.
    for bar_pos, value in enumerate(ranked[metric]):
        ax.text(value, bar_pos, f' {value:.3f}', va='center')

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Visualization saved as model_comparison.png")

## 9. Save Supporting Files

In [None]:
# Pickle the preprocessing artifacts. Each entry is
# (output file, object to pickle, confirmation message).
artifacts = (
    ('scaler.pkl', scaler, "‚úÖ Scaler saved"),
    ('label_encoders.pkl', label_encoders, "‚úÖ Label encoders saved"),
    ('target_encoder.pkl', le_target, "‚úÖ Target encoder saved"),
    ('feature_names.pkl', X.columns.tolist(), "‚úÖ Feature names saved"),
)
for artifact_path, artifact, confirmation in artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)
    print(confirmation)

# Save test data sample for Streamlit: at most the first 1000 scaled test
# rows, labelled with the feature names, plus the matching encoded targets.
test_data = pd.DataFrame(X_test_scaled[:1000], columns=X.columns).assign(
    target=y_test[:1000]
)
test_data.to_csv('test_data.csv', index=False)
print(f"‚úÖ Test data saved ({test_data.shape[0]} samples)")

## 10. Verify All Files Created

In [None]:
import os
import glob

separator = "=" * 80
print("\n" + separator)
print("FILE VERIFICATION")
print(separator)

# Artifacts the earlier cells write to the working directory.
required_files = [
    'model_logistic_regression.pkl',
    'model_decision_tree.pkl',
    'model_k_nearest_neighbor.pkl',
    'model_naive_bayes.pkl',
    'model_random_forest.pkl',
    'model_xgboost.pkl',
    'scaler.pkl',
    'label_encoders.pkl',
    'target_encoder.pkl',
    'feature_names.pkl',
    'test_data.csv',
    'model_results.csv'
]

# Report each file in list order: its size when present, otherwise a
# NOT FOUND line. A single missing file clears the all_present flag.
all_present = True
for artifact in required_files:
    if not os.path.exists(artifact):
        print(f"‚ùå {artifact:40s} NOT FOUND")
        all_present = False
        continue
    size = os.path.getsize(artifact)
    print(f"‚úÖ {artifact:40s} ({size:,} bytes)")

print("\n" + separator)
if not all_present:
    print("‚ö†Ô∏è  Some files are missing. Please check the errors above.")
else:
    print("üéâ SUCCESS! All files created successfully!")
    print("You are ready to deploy on Streamlit!")
print(separator)

## ✅ Next Steps

1. **Upload to GitHub:**
   - Create a new repository
   - Push all files including .pkl files

2. **Deploy on Streamlit:**
   - Go to share.streamlit.io
   - Connect your GitHub repository
   - Select `app.py` as the main file
   - Deploy!

3. **Create Submission PDF:**
   - Include GitHub link
   - Include Streamlit app link
   - Add screenshot from BITS Lab
   - Copy README content

4. **Submit on Taxila:**
   - Upload the PDF before the deadline
   - Verify submission confirmation

---

**Good luck with your assignment! 🚀**