In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

y_test  = pd.read_csv("../Data/Processed/y_test.csv").squeeze()
y_proba_logreg = pd.read_csv("../Data/Processed/logreg_results.csv")
y_proba_rf  = pd.read_csv("../Data/Processed/rf_results.csv")

# Use the prob_score column from each model results (not the whole DataFrame)
df_compare = pd.DataFrame({
    'actual': y_test.reset_index(drop=True),
    'score_logreg': y_proba_logreg['prob_score'].reset_index(drop=True),
    'score_rf': y_proba_rf['prob_score'].reset_index(drop=True)
})


target_depth = 0.30
thresh_logreg = df_compare['score_logreg'].quantile(1 - target_depth)
thresh_rf     = df_compare['score_rf'].quantile(1 - target_depth)

print(f"Logistic Regression Threshold (Top 30%): {thresh_logreg:.4f}")
print(f"Random Forest Threshold (Top 30%):       {thresh_rf:.4f}")

# APPLY CUTOFFS & MEASURE PERFORMANCE 

df_compare['target_logreg'] = (df_compare['score_logreg'] >= thresh_logreg).astype(int)
df_compare['target_rf']     = (df_compare['score_rf'] >= thresh_rf).astype(int)

# CALCULATE PROFITABILITY (How many actual buyers did we catch?)
# We filter for where we targeted (prediction=1) AND they bought (actual=1)
hits_logreg = df_compare[(df_compare['target_logreg'] == 1) & (df_compare['actual'] == 1)].shape[0]
hits_rf     = df_compare[(df_compare['target_rf'] == 1)     & (df_compare['actual'] == 1)].shape[0]

total_calls = int(len(df_compare) * target_depth)

print(f"Results for Top {target_depth*100}% (approx. {total_calls} customers targeted):\n")

print(f"MODEL A: Logistic Regression")
print(f" Actual Buyers Found: {hits_logreg}")
print(f" Precision (Hit Rate): {hits_logreg / total_calls:.2%}")

print(f"\nMODEL B: Random Forest")
print(f" Actual Buyers Found: {hits_rf}")
print(f" Precision (Hit Rate): {hits_rf / total_calls:.2%}")

#  THE VERDICT 
if hits_rf > hits_logreg:
    print(f"Random Forest found {hits_rf - hits_logreg} MORE buyers.")
elif hits_logreg > hits_rf:
    print(f"Logistic Regression found {hits_logreg - hits_rf} MORE buyers.")
else:
    print("Both models performed exactly the same.")

Logistic Regression Threshold (Top 30%): 0.5995
Random Forest Threshold (Top 30%):       0.3236
Results for Top 30.0% (approx. 739 customers targeted):

MODEL A: Logistic Regression
 Actual Buyers Found: 201
 Precision (Hit Rate): 27.20%

MODEL B: Random Forest
 Actual Buyers Found: 208
 Precision (Hit Rate): 28.15%
Random Forest found 7 MORE buyers.
