In [None]:
import pandas as pd
import numpy as np
import pickle
import os

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, average_precision_score
import lightgbm as lgb 

# Load the train/test feature matrices, label vectors and feature-name list
# from the processed-data directory.
processed_dir = '../data/processed'
print("Loading data...")
X_train_processed = np.load(f'{processed_dir}/X_train_processed.npy')
X_test_processed = np.load(f'{processed_dir}/X_test_processed.npy')
# squeeze("columns") turns a single-column frame into a Series but, unlike a
# bare squeeze(), never collapses a one-row file all the way down to a scalar.
y_train = pd.read_csv(f'{processed_dir}/y_train.csv').squeeze("columns")
y_test = pd.read_csv(f'{processed_dir}/y_test.csv').squeeze("columns")
# NOTE: pickle.load can execute arbitrary code; only load files this project produced.
with open(f'{processed_dir}/feature_names.pkl', 'rb') as f:
    feature_names = pickle.load(f)
print(f"X_train_processed shape: {X_train_processed.shape}")
print(f"y_train shape: {y_train.shape}")

# Fail fast if the artifacts on disk are out of sync with each other, instead
# of surfacing the mismatch later inside model.fit / the metric functions.
assert X_train_processed.shape[0] == len(y_train), "X_train / y_train row counts differ"
assert X_test_processed.shape[0] == len(y_test), "X_test / y_test row counts differ"
assert X_train_processed.shape[1] == X_test_processed.shape[1], "train / test feature counts differ"
assert len(feature_names) == X_train_processed.shape[1], "feature_names length does not match feature count"

Loading data...
X_train_processed shape: (156156, 32)
y_train shape: (156156,)


In [None]:

# Candidate classifiers, keyed by the name used for reporting and for the
# pickle file each fitted model is saved to: a linear baseline, a bagged tree
# ensemble, and a gradient-boosted model. All three use random_state=42.

models = {}

# Linear baseline, allowed up to 1000 solver iterations
models['LogisticRegression'] = LogisticRegression(max_iter=1000, random_state=42)

# Forest of 50 trees, each capped at depth 10; n_jobs=-1 uses every CPU core
models['RandomForest'] = RandomForestClassifier(
    n_estimators=50,
    max_depth=10,
    n_jobs=-1,
    random_state=42,
)

# Boosted trees; verbosity=-1 keeps LightGBM from printing its logs
models['LightGBM'] = lgb.LGBMClassifier(verbosity=-1, random_state=42)

In [None]:
# TRAIN AND EVALUATE MODELS

# Per-model metrics and fitted estimators, both keyed by model name
results, trained_models = {}, {}

# Make sure the folder the pickled models are written to exists
os.makedirs('../models', exist_ok=True)

for name, model in models.items():
    print(f"--- Training {name} ---")

    # Only LightGBM is fitted with column names and an explicit categorical
    # list; every other estimator is fitted on the bare arrays.
    fit_kwargs = {}
    if name == 'LightGBM':
        # Categorical columns are the feature names starting with 'cat__'
        # (presumably a prefix attached during preprocessing -- confirm there).
        categorical_cols = [col for col in feature_names if col.startswith('cat__')]
        fit_kwargs = {
            'feature_name': feature_names,
            'categorical_feature': categorical_cols,
        }
    model.fit(X_train_processed, y_train, **fit_kwargs)

    # Hard labels, plus the second predict_proba column (score for class '1')
    y_pred = model.predict(X_test_processed)
    y_pred_proba = model.predict_proba(X_test_processed)[:, 1]

    # Test-set metrics; 'pr_auc' is average precision, 'report' is the
    # dict form of the classification report.
    scores = {
        'accuracy': accuracy_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_pred_proba),
        'pr_auc': average_precision_score(y_test, y_pred_proba),
        'report': classification_report(y_test, y_pred, output_dict=True),
    }

    print(f"Test Accuracy: {scores['accuracy']:.4f}")
    print(f"Test ROC AUC: {scores['roc_auc']:.4f}")
    print(f"Test PR AUC (Avg. Precision): {scores['pr_auc']:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Keep the metrics and the fitted estimator for later cells
    results[name] = scores
    trained_models[name] = model

    # Pickle the fitted estimator into the models folder
    model_path = f'../models/{name}.pkl'
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)

    print(f"Saved trained {name} model to ../models/{name}.pkl\n")

--- Training LogisticRegression ---
Test Accuracy: 0.7344
Test ROC AUC: 0.7134
Test PR AUC (Avg. Precision): 0.5283

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.95      0.84     27522
           1       0.66      0.21      0.32     11518

    accuracy                           0.73     39040
   macro avg       0.70      0.58      0.58     39040
weighted avg       0.72      0.73      0.68     39040

Saved trained LogisticRegression model to ../models/LogisticRegression.pkl

--- Training RandomForest ---
Test Accuracy: 0.7391
Test ROC AUC: 0.7269
Test PR AUC (Avg. Precision): 0.5530

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.97      0.84     27522
           1       0.71      0.19      0.31     11518

    accuracy                           0.74     39040
   macro avg       0.73      0.58      0.57     39040
weighted avg       0.73      0.74      0.68     39040

Saved trained RandomForest model to ../models/RandomForest.pkl

--- Training LightGBM ---
Test Accuracy: 0.7427
Test ROC AUC: 0.7337
Test PR AUC (Avg. Precision): 0.5597

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.95      0.84     27522
           1       0.67      0.26      0.37     11518

    accuracy                           0.74     39040
   macro avg       0.71      0.60      0.60     39040
weighted avg       0.73      0.74      0.70     39040

Saved trained LightGBM model to ../models/LightGBM.pkl



In [None]:
# SAVE RESULTS FOR EVALUATION NOTEBOOK

# Pickle the per-model metrics dict so it can be reloaded without retraining.
results_path = '../data/processed/model_results.pkl'
with open(results_path, 'wb') as results_file:
    pickle.dump(results, results_file)

print("All model results saved to ../data/processed/model_results.pkl")

All model results saved to ../data/processed/model_results.pkl
