In [12]:
# model_training.py (FINAL VERSION)
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report

# Load and preprocess data
df = pd.read_csv("C:/Users/nabil/Downloads/archive (2)/income_inequality_prediction.csv")

# Encode the target as 1 ('Above limit') / 0 ('Below limit'). Series.map
# produces NaN for any label outside the mapping (including labels that were
# already missing). Those rows are dropped here: otherwise the mode-based
# imputation below would silently assign them the majority class.
df['income_above_limit'] = df['income_above_limit'].map({'Above limit': 1, 'Below limit': 0})
df = df.dropna(subset=['income_above_limit']).astype({'income_above_limit': 'int64'})
if df.empty:
    # Fail with a clear message instead of an IndexError from df.mode() below.
    raise ValueError(
        "No rows with a recognised 'income_above_limit' label "
        "('Above limit' / 'Below limit') were found in the input data."
    )

# Identifier and origin columns that are not used as features.
df = df.drop(columns=["ID", "is_hispanic", "country_of_birth_mother", "country_of_birth_own"])

# Handle missing values
# Keep only the columns that are non-null in at least half of the rows ...
df = df.dropna(thresh=len(df)*0.5, axis=1)
# ... and fill the remaining gaps with each column's most frequent value.
# NOTE(review): the modes are computed on the full dataset, before the
# train/test split further down, so test rows influence the imputed values.
df = df.fillna(df.mode().iloc[0])

# Categorical features are the columns pandas stores with object dtype.
categorical_cols = list(df.select_dtypes(include=['object']).columns)

# Record the distinct raw values of every categorical column while they are
# still unencoded, and persist that lookup to disk.
category_mappings = {
    name: values.unique().tolist()
    for name, values in df[categorical_cols].items()
}
joblib.dump(category_mappings, 'category_mappings.joblib')

# Replace each categorical column by indicator columns; drop_first=True omits
# the first level of every column.
df = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
)

# Persist the final feature column order (every column except the target).
joblib.dump(
    df.columns.drop('income_above_limit').tolist(),
    'feature_columns.joblib',
)

# Separate the target from the predictors, then hold out 20% of the rows for
# testing. The split is stratified on y and uses a fixed seed so it is
# repeatable.
y = df['income_above_limit']
X = df.drop(columns=['income_above_limit'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Two-step pipeline: SMOTE oversampling followed by a random forest.
# Both steps are seeded with 42.
# NOTE(review): class imbalance is addressed twice here (SMOTE and
# class_weight='balanced_subsample') -- confirm that this is intended.
model = Pipeline(
    steps=[
        ('smote', SMOTE(random_state=42)),
        (
            'clf',
            RandomForestClassifier(
                random_state=42,
                n_jobs=-1,
                class_weight='balanced_subsample',
            ),
        ),
    ]
)

# Hyperparameter tuning
# Search space; keys follow the "<pipeline step>__<parameter>" convention so
# they are routed to the 'clf' and 'smote' steps of the pipeline.
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 10, 20],
    'clf__min_samples_split': [2, 5],
    'smote__sampling_strategy': [0.3, 0.5]
}

# The space holds 24 combinations and only 10 of them are sampled, so the
# sampler is seeded with the same value as the split, SMOTE and the forest.
# Without it, each run evaluates a different subset of candidates and the
# selected (and saved) model is not reproducible.
search = RandomizedSearchCV(
    model,
    param_grid,
    n_iter=10,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    random_state=42,
)
search.fit(X_train, y_train)

# Persist the best pipeline selected by the search, then report the chosen
# parameters and the per-class metrics on the held-out test split.
best_model = search.best_estimator_
joblib.dump(value=best_model, filename='income_model.joblib')

print(f"Best Parameters: {search.best_params_}")
test_predictions = best_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=test_predictions))

Best Parameters: {'smote__sampling_strategy': 0.3, 'clf__n_estimators': 100, 'clf__min_samples_split': 5, 'clf__max_depth': None}
              precision    recall  f1-score   support

           0       0.97      0.98      0.97     39300
           1       0.65      0.51      0.57      2600

    accuracy                           0.95     41900
   macro avg       0.81      0.75      0.77     41900
weighted avg       0.95      0.95      0.95     41900

