# <center><font color='#F1B03D'>**Revenue Intelligence Enhancement for BrokerChooser - Predictive Modeling**</font></center>
### <center><font color='#F1B03D'>Central European University, 2024-2025</font></center>
### <center><font color='#F1B03D'>CEU Capstone Project</font></center>

### <left><font color='#F1B03D'>Author: Péter Bence Török (torokpe@gmail.com)</font></left>
### <left><font color='#F1B03D'>BrokerChooser Contact Person: Zoltán Molnár (zoltan.molnar@brokerchooser.com)</font></left>

---
<p style="font-size:22px;">This notebook focuses on the predictive modeling and classification phase of the project. It uses the previously cleaned and feature-engineered dataset to train and evaluate multiple classification models, including logistic regression (in five variations), LASSO, Random Forest, and XGBoost. The models are assessed using cross-validated performance metrics such as AUC and RMSE, and custom misclassification costs are applied to find optimal probability thresholds. The goal of this notebook is to identify the most effective model for predicting revenue-generating sessions based on user behavior and session attributes.</p>

In [None]:
# Fit and evaluate each logistic regression specification.
# For every (name, feature list) pair in `Logits`, a model is fitted on the
# sample and stored in `logit_models`; its 5-fold cross-validated AUC and RMSE
# are registered in `results`.
for name, features in Logits.items():
    X_features = X_sampled[features]

    model = LogisticRegression(max_iter=1000)
    model.fit(X_features, y_sampled)
    logit_models[name] = model

    # Cross-validated AUC
    auc = cross_val_score(model, X_features, y_sampled,
                          cv=5, scoring='roc_auc', n_jobs=-1).mean()

    # Cross-validated Brier score.
    # The built-in 'neg_brier_score' scorer replaces
    # make_scorer(brier_score_loss, needs_proba=True): the `needs_proba`
    # keyword was deprecated in scikit-learn 1.4 and dropped in later releases.
    # The scorer returns the *negated* Brier score, hence the sign flip.
    brier_scores = -cross_val_score(
        model, X_features, y_sampled,
        cv=5,
        scoring='neg_brier_score',
        n_jobs=-1
    )
    rmse = np.mean(np.sqrt(brier_scores))  # per-fold RMSE = sqrt(Brier), then averaged

    # Add result to collector
    results.add_model(name, len(features), auc, rmse)

---
## 2.2 Lasso + Logit

In [None]:
# Set regularization strength values (Cs are the inverse of λ)
lambdas = 10 ** np.arange(-1, -4.01, -1 / 3)  # λ from 0.1 to 0.0001
n_obs = int(len(y_sampled) * 4 / 5)  # Approximate train set size for scaling Cs
C_values = [1 / (lam * n_obs) for lam in lambdas]

# Fit logistic regression with L1 regularization; C is selected by 5-fold CV on AUC
log_lasso_model = LogisticRegressionCV(
    Cs=C_values,
    penalty="l1",
    cv=5,
    solver="liblinear",
    refit=True,
    verbose=1,
    scoring="roc_auc",
    random_state=42
)
log_lasso_model.fit(X_sampled, y_sampled)

# In-sample fitted probabilities. They are kept available under the original
# name, but are NOT used for the reported metrics: scoring the model on its own
# training data is optimistic and not comparable with the cross-validated
# numbers reported for the other models.
y_pred_proba = log_lasso_model.predict_proba(X_sampled)[:, 1]

# Cross-validated metrics at the selected regularization strength, computed the
# same way as for the plain logit models (5-fold CV AUC and sqrt of Brier score).
best_C = float(np.ravel(log_lasso_model.C_)[0])
lasso_at_best_C = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    C=best_C,
    random_state=42
)
avg_auc = cross_val_score(
    lasso_at_best_C, X_sampled, y_sampled,
    cv=5, scoring="roc_auc", n_jobs=-1
).mean()
lasso_brier_scores = -cross_val_score(
    lasso_at_best_C, X_sampled, y_sampled,
    cv=5, scoring="neg_brier_score", n_jobs=-1
)  # scorer returns the negated Brier score
avg_rmse = np.mean(np.sqrt(lasso_brier_scores))

# Store results
results.add_model(
    name="LASSO",
    num_var=X_sampled.shape[1],
    auc=avg_auc,
    rmse=avg_rmse
)

print(f"Logit + LASSO - RMSE: {avg_rmse:.4f}, AUC: {avg_auc:.4f}")

# Storing model (the CV-tuned estimator refitted on the full sample)
logit_models["Logit + LASSO"] = log_lasso_model

---
## 2.3 Probability Forest

In [None]:
# Define the parameter grid
param_grid = {
    'max_features': [5, 10, 20],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [10, 21, 30]
}

# Store feature names
rf_feature_names = list(X_sampled.columns)

# Initialize Random Forest model
rf_model = RandomForestClassifier(
    n_estimators=200,
    oob_score=True,
    random_state=42
)

# Set up GridSearchCV with ROC AUC and Brier score as scoring metrics;
# the best candidate is chosen (and refitted) by ROC AUC
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    scoring=['roc_auc', 'neg_brier_score'],
    refit='roc_auc',
    n_jobs=-1,
    verbose=2
)

# Fit the model
grid_search.fit(X_sampled, y_sampled)

# Extract best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Each cv_results_["split{i}_test_*"] entry is an array with one score per
# parameter combination. Select the best candidate's entry; averaging the
# whole array would mix in the scores of every other candidate in the grid.
rf_best_idx = grid_search.best_index_
rf_n_splits = grid_search.n_splits_

# Average CV RMSE of the best candidate (sqrt of the per-fold Brier score)
cv_rmse = np.mean([
    np.sqrt(-grid_search.cv_results_[f"split{i}_test_neg_brier_score"][rf_best_idx])
    for i in range(rf_n_splits)
])

# Average CV AUC of the best candidate
cv_auc = np.mean([
    grid_search.cv_results_[f"split{i}_test_roc_auc"][rf_best_idx]
    for i in range(rf_n_splits)
])

# Save results
results.add_model(
    name="Random Forest",
    num_var=X_sampled.shape[1],
    auc=cv_auc,
    rmse=cv_rmse
)

# Best performing model (refitted on the full sample by GridSearchCV)
best_random_forest = grid_search.best_estimator_
print(f"Random Forest - CV RMSE: {cv_rmse:.4f}, CV AUC: {cv_auc:.4f}")

## 2.4 XGBoost

In [None]:
# Define parameter grid for tuning
xgb_param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [3, 6],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Initialize model
xgb_model = XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

# Define scoring
scoring = {
    'roc_auc': 'roc_auc',
    'neg_brier_score': 'neg_brier_score'
}

# Grid search with 5-fold CV; the best candidate is chosen (and refitted) by ROC AUC
xgb_grid = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_param_grid,
    scoring=scoring,
    refit='roc_auc',
    cv=5,
    verbose=2,
    n_jobs=-1
)

# Fit model
xgb_grid.fit(X_sampled, y_sampled)

# Extract best model and evaluate performance
best_xgb = xgb_grid.best_estimator_

# Each cv_results_["split{i}_test_*"] entry is an array with one score per
# parameter combination. Select the best candidate's entry; averaging the
# whole array would mix in the scores of every other candidate in the grid.
xgb_best_idx = xgb_grid.best_index_
xgb_n_splits = xgb_grid.n_splits_

# Average CV AUC of the best candidate
cv_auc = np.mean([
    xgb_grid.cv_results_[f"split{i}_test_roc_auc"][xgb_best_idx]
    for i in range(xgb_n_splits)
])

# Average CV RMSE of the best candidate (sqrt of the per-fold Brier score)
cv_rmse = np.mean([
    np.sqrt(-xgb_grid.cv_results_[f"split{i}_test_neg_brier_score"][xgb_best_idx])
    for i in range(xgb_n_splits)
])

# Save results
results.add_model(
    name="XGBoost",
    num_var=X_sampled.shape[1],
    auc=cv_auc,
    rmse=cv_rmse
)

print(f"XGBoost - CV RMSE: {cv_rmse:.4f}, CV AUC: {cv_auc:.4f}")

In [None]:
# Show the contents of the `results` collector filled by the add_model(...)
# calls above (presumably the model comparison table - confirm against its definition)
results.get_table()

# Classification

In [None]:
# Misclassification costs: a false negative is weighted 20x as much as a false positive
FP_cost = 1
FN_cost = 20

In [None]:
# Function to find the best classification threshold based on custom misclassification costs
def find_optimal_threshold(model, X, y_true, FP_cost=1, FN_cost=20):
    """Find the probability cut-off that minimizes total misclassification cost.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba(X)``; column 1 of its
        output is used as the positive-class probability.
    X : features passed unchanged to ``model.predict_proba``.
    y_true : array-like of binary labels, where 1 marks the positive class.
    FP_cost : cost of one false positive.
    FN_cost : cost of one false negative.

    Returns
    -------
    best_threshold : the first threshold on the grid with the lowest cost.
    min_cost : total cost at ``best_threshold``.
    thresholds : the evaluated grid, ``np.linspace(0, 1, 100)``.
    costs : list with the total cost at each threshold (same order as the grid).
    """
    # Get predicted probabilities for the positive class
    y_probs = np.asarray(model.predict_proba(X))[:, 1]

    labels = np.asarray(y_true)
    is_positive = labels == 1
    is_negative = ~is_positive

    # Define a range of threshold values from 0 to 1
    thresholds = np.linspace(0, 1, 100)
    costs = []  # Total cost at each threshold

    for t in thresholds:
        predicted_positive = y_probs >= t

        # Count errors directly instead of unpacking confusion_matrix(...).ravel():
        # that matrix collapses to 1x1 when labels and predictions contain a
        # single class, which makes the 4-way unpacking raise.
        fp = np.count_nonzero(predicted_positive & is_negative)
        fn = np.count_nonzero(~predicted_positive & is_positive)

        costs.append(FP_cost * fp + FN_cost * fn)

    # Find the threshold that minimizes the total cost (first one on ties)
    best_index = int(np.argmin(costs))
    best_threshold = thresholds[best_index]
    min_cost = costs[best_index]

    # Return the best threshold, its cost, and the full cost curve for plotting
    return best_threshold, min_cost, thresholds, costs

In [295]:
# Run the threshold search for each model on the test set, using the
# misclassification costs defined above (passed explicitly so that editing
# FP_cost / FN_cost actually changes the result).
# Logit_M5 was fitted on its own feature list (Logits["Logit_M5"]), so the
# test matrix is restricted to the same columns before predicting.
# Every call evaluates the same threshold grid, so it is captured once from
# the first call and reused for plotting.
threshold_logit, cost_logit, thresholds, costs_logit = find_optimal_threshold(
    logit_models["Logit_M5"], X_test[Logits["Logit_M5"]], y_test,
    FP_cost=FP_cost, FN_cost=FN_cost
)
threshold_lasso, cost_lasso, _, costs_lasso = find_optimal_threshold(
    log_lasso_model, X_test, y_test, FP_cost=FP_cost, FN_cost=FN_cost
)
threshold_rf, cost_rf, _, costs_rf = find_optimal_threshold(
    best_random_forest, X_test, y_test, FP_cost=FP_cost, FN_cost=FN_cost
)
threshold_xgb, cost_xgb, _, costs_xgb = find_optimal_threshold(
    best_xgb, X_test, y_test, FP_cost=FP_cost, FN_cost=FN_cost
)

In [None]:
# Plot the cost curve of every model and mark its cost-minimizing threshold
fig, ax = plt.subplots(figsize=(8, 4))

# (legend label, cost curve, minimum cost, best threshold, marker alpha)
# Marker alpha values are carried over unchanged from the original styling.
cost_curves = [
    ('Logit M5', costs_logit, cost_logit, threshold_logit, 0.1),
    ('Lasso', costs_lasso, cost_lasso, threshold_lasso, 0.3),
    ('Random Forest', costs_rf, cost_rf, threshold_rf, 0.3),
    ('XGBoost', costs_xgb, cost_xgb, threshold_xgb, 0.3),
]

for label, curve, min_cost, best_threshold, marker_alpha in cost_curves:
    (line,) = ax.plot(thresholds, curve,
                      label=f'{label} (min cost: {min_cost:.0f})', linewidth=2)
    # Reuse the curve's own color for the vertical marker: hard-coded names
    # such as 'blue' or 'orange' are not the same shades as the default color
    # cycle used for the curves, so markers could not be matched to lines.
    ax.axvline(x=best_threshold, color=line.get_color(),
               linestyle='--', alpha=marker_alpha)

# Final plot formatting
ax.set_xlabel('Threshold')
ax.set_ylabel('Total Misclassification Cost')
ax.set_title('Cost Curves for Different Models')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Compute ROC values
# NOTE(review): `y_probs` is not defined in any cell visible here (inside
# find_optimal_threshold it is only a local variable) - confirm which model's
# test-set probabilities this curve is meant to show.
# The ROC thresholds get their own name so they do not overwrite the shared
# `thresholds` grid that the cost-curve plot relies on.
fpr, tpr, roc_thresholds = roc_curve(y_test, y_probs)
roc_auc = roc_auc_score(y_test, y_probs)

# Plot
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(fpr, tpr, color='#EFC64A', label=f'ROC Curve (AUC = {roc_auc:.2f})')
ax.fill_between(fpr, tpr, alpha=0.2, color='#81C6C7')
ax.plot([0, 1], [0, 1], linestyle='--', color='black', label='Random Guess')

# Labels and title
ax.set_xlabel('False Positive Rate (1 - Specificity)')
ax.set_ylabel('True Positive Rate (Sensitivity)')
ax.set_title('ROC Curve')
ax.legend(loc='lower right')
ax.grid(True)
fig.tight_layout()
plt.show()