In [2]:
import os

import joblib
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# ==========================================
# 1. SETUP & DATA
# ==========================================
# Route every run below to the local MLflow tracking server, grouped under
# a single experiment name.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("Gobest_Cab_Safety_Prediction")

print("Loading Data...")
df = pd.read_csv("../data/bi_dataset.csv")

# --- THE AGGRESSIVE FEATURE LIST (16 Heavy Hitters) ---
# Only the columns that survived pruning are used as model inputs. The same
# list is persisted next to the model later on, so its order matters.
keep_features = [
    "trip_duration_sec",
    "n_points",
    "speed_max",
    "speed_mean",
    "speed_std",
    "pct_time_speed_over_80",
    "accel_mag_max",
    "n_hard_accels",
    "n_hard_brakes",
    "pct_time_high_accel",
    "gyro_mag_max",
    "gyro_jerk_mag_mean",
    "n_hard_turns",
    "pct_time_high_gyro",
    "n_zigzag_events",
    "turn_sharpness_index",
]

# Feature matrix: the kept columns only, with missing values replaced by 0.
# The target is taken from the separate 'label' column.
selected_columns = df[keep_features]
X = selected_columns.fillna(0)
y = df["label"]

print(f"Final Feature Count: {X.shape[1]}")
print(f"Features: {X.columns.tolist()}")

# Split & SMOTE
# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Oversampling is fitted on the training portion only; the test split is
# left untouched so evaluation is done on original rows.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# ==========================================
# 2. CONFIGURE THE WINNERS (Updated with your results)
# ==========================================
# Three base estimators are configured here with fixed hyperparameters;
# none of them is fitted in this section.

# A. Logistic Regression (The Winner: C=10)
# CRITICAL FIX: Wrapped in StandardScaler to stop Convergence Warnings.
# Because the scaler lives inside the pipeline, it is fitted together with
# the classifier on whatever data the pipeline is trained on.
lr_pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='lbfgs', max_iter=3000, C=10),
)

# B. Random Forest (The Runner Up: n_est=500)
# max_depth=None leaves tree depth unbounded; random_state pins the seed.
rf_best = RandomForestClassifier(
    n_estimators=500,
    min_samples_split=5,
    max_depth=None,
    bootstrap=True,
    random_state=42,
)

# C. XGBoost (The Tank: n_est=200, depth=10)
# NOTE: `use_label_encoder` is intentionally not passed. The installed
# XGBoost reports it as an unused parameter ("Parameters:
# { "use_label_encoder" } are not used."), so it only produced a warning.
xgb_best = XGBClassifier(
    n_estimators=200,
    max_depth=10,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='auc',
    random_state=42,
)

# ==========================================
# 3. BUILD THE FINAL ENSEMBLE
# ==========================================
# Trains a soft-voting ensemble of the three configured estimators inside a
# single MLflow run, evaluates it on the held-out test split, and saves the
# fitted model plus the feature-column list to disk.
print("Training Final Ensemble...")

with mlflow.start_run(run_name="Final_Voting_Ensemble_Aggressive"):

    # voting='soft' combines the estimators' predicted probabilities
    # rather than their hard class votes.
    ensemble = VotingClassifier(
        estimators=[
            ('lr', lr_pipeline),
            ('rf', rf_best),
            ('xgb', xgb_best),
        ],
        voting='soft',
    )

    # Fit on the resampled training data.
    ensemble.fit(X_train_res, y_train_res)

    # Evaluate on the test split: column 1 of predict_proba is used as the
    # score for ROC AUC, and hard predictions feed the report.
    y_prob = ensemble.predict_proba(X_test)[:, 1]
    y_pred = ensemble.predict(X_test)

    auc = roc_auc_score(y_test, y_prob)

    print("\n=== FINAL ENSEMBLE SCORE ===")
    print(f"AUC Score: {auc:.4f}")
    print(classification_report(y_test, y_pred))

    # Log metrics
    mlflow.log_metric("auc", auc)
    mlflow.log_param("features", "Aggressive_16")

    # SAVE THE FINAL ASSETS
    mlflow.sklearn.log_model(ensemble, "model")

    # Make sure the output directory exists before dumping; otherwise a
    # missing folder would raise only after the ensemble has been trained.
    os.makedirs("../models", exist_ok=True)
    joblib.dump(ensemble, "../models/final_model.pkl")
    # The column list is stored alongside the model so that consumers can
    # rebuild inputs with the same features in the same order.
    joblib.dump(keep_features, "../models/model_columns.pkl")

    print("SUCCESS: Saved 'final_model.pkl' and 'model_columns.pkl'")

Loading Data...
Final Feature Count: 16
Features: ['trip_duration_sec', 'n_points', 'speed_max', 'speed_mean', 'speed_std', 'pct_time_speed_over_80', 'accel_mag_max', 'n_hard_accels', 'n_hard_brakes', 'pct_time_high_accel', 'gyro_mag_max', 'gyro_jerk_mag_mean', 'n_hard_turns', 'pct_time_high_gyro', 'n_zigzag_events', 'turn_sharpness_index']
Training Final Ensemble...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== FINAL ENSEMBLE SCORE ===
AUC Score: 0.7183
              precision    recall  f1-score   support

           0       0.81      0.86      0.83      3001
           1       0.48      0.40      0.44       999

    accuracy                           0.74      4000
   macro avg       0.65      0.63      0.64      4000
weighted avg       0.73      0.74      0.74      4000





SUCCESS: Saved 'final_model.pkl' and 'model_columns.pkl'
🏃 View run Final_Voting_Ensemble_Aggressive at: http://localhost:5000/#/experiments/671472067514299670/runs/96fc3f4bbe2543d0a914d13d4b39b4e2
🧪 View experiment at: http://localhost:5000/#/experiments/671472067514299670
