In [None]:
import logging
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import joblib
import time
import sys

In [None]:
import os
# Record the kernel's current working directory and echo it as the cell output.
notebooks_path = os.getcwd()
notebooks_path

In [None]:
# Parent of the current working directory; the same expression is appended to sys.path in the next cell.
str(Path().resolve().parent)

In [None]:
import sys
from pathlib import Path

# Make the `src` package importable by putting the parent of the current working
# directory on the module search path. The membership check keeps the cell
# idempotent: re-running it does not pile up duplicate sys.path entries.
project_root = str(Path().resolve().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

from src.create_folds import create_folds
from src.data_cleaning import drop_columns, impute_missing_values
from src.data_preprocessing import preprocess_data
from src.log_reg_model import train_logistic_regression
from src.model_utils import evaluate_model, load_data

In [None]:
# Step 2: Data Cleaning -- load ../input/train_folds.csv and pass it through drop_columns
data = load_data(Path("../input/train_folds.csv"))
df = drop_columns(data)

all_accuracies = []

for fold in range(5):
    logging.info(f"Processing fold {fold}...")

    # Steps 3-4: impute first, then preprocess; both helpers receive the fold number
    fold_df = preprocess_data(impute_missing_values(df, fold), fold)

    # Step 5: target and features, then a seeded 80/20 split for training
    y = fold_df["Converted"]
    X = fold_df.drop(columns=["Converted", "kfold"])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = train_logistic_regression(X_train, y_train)

    # Step 6: evaluate on the held-out 20% (report and matrix are returned but not printed)
    accuracy, report, matrix = evaluate_model(model, X_test, y_test)
    all_accuracies.append(accuracy)

    print(f"Fold {fold} Results:")
    print(f"Accuracy: {accuracy}")

In [None]:
fold = 0
logging.info(f"Processing fold {fold}...")

# Steps 3-4: impute first, then preprocess; both helpers receive the fold number
fold_df = preprocess_data(impute_missing_values(df, fold), fold)

# Step 5: target and features, then a seeded 80/20 split for training
y = fold_df["Converted"]
X = fold_df.drop(columns=["Converted", "kfold"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = train_logistic_regression(X_train, y_train)

# Step 6: evaluate on the held-out 20% (report and matrix are returned but not printed)
accuracy, report, matrix = evaluate_model(model, X_test, y_test)

print(f"Fold {fold} Results:")
print(f"Accuracy: {accuracy}")



## Coefficients

In [None]:
# Feature importance from the fitted model's coefficients.
# exp(coefficient) is reported alongside as the odds ratio
# (assumes `model` is a logistic regression -- confirm against train_logistic_regression).
coefficients = model.coef_[0]
odds_ratios = np.exp(coefficients)

feature_importance = pd.DataFrame(
    {'Feature': X.columns, 'Coefficient': coefficients, 'Odds Ratio': odds_ratios}
)

# Rank rows by descending absolute coefficient
ranked_index = feature_importance['Coefficient'].abs().sort_values(ascending=False).index
feature_importance = feature_importance.loc[ranked_index]
print(feature_importance)

## Model Probabilities

In [None]:
# Get probability of conversion
# predict_proba returns one column per class; column index 1 is kept as the positive class.
probs = model.predict_proba(X_test)[:, 1] # Probability of the positive class

In [None]:
# Apply a custom threshold to get the predicted class
# (a row is labelled 1 when its predicted probability is >= threshold, otherwise 0)
threshold = 0.45
y_pred = (probs >= threshold).astype(int)

# Evaluate the model with the custom threshold
# NOTE(review): `threshold`, `y_pred` and `accuracy` are reassigned by later cells.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy with threshold {threshold}: {accuracy}")

## Plotting the ROC Curve and Calculating AUC

In [None]:
# Calculate ROC curve
# NOTE(review): `thresholds` is reassigned by the threshold-sweep cell further down.
fpr, tpr, thresholds = roc_curve(y_test, probs)

# Calculate AUC
# Computed from the probabilities, so it does not depend on any decision threshold.
auc_score = roc_auc_score(y_test, probs)
print(f"AUC: {auc_score}")

In [None]:
# Draw the ROC curve together with the chance diagonal for reference
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f"ROC curve (AUC = {auc_score:.2f})")
ax.plot([0, 1], [0, 1], 'k--', label="Random Guess")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
plt.show()

## Finding Optimal Cut-off Point

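The optimal cut-off point is the probability threshold that gives the best trade-off for the metric we care about. A common choice is the threshold at which the sum of sensitivity and specificity is at its maximum (Youden's index, covered below); note that this is not necessarily the threshold with the highest accuracy, so the cell below reports the best threshold for accuracy, precision, recall and F1 separately.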

In [None]:
# Sweep 100 probability thresholds and record accuracy / precision / recall / F1 at each
thresholds = np.linspace(0, 0.99, 100)
metrics = []
best_threshold = 0.5  # fallback when no threshold gives non-zero precision and recall
best_f1, best_accuracy, best_precision, best_recall = 0, 0, 0, 0
best_f1_threshold, best_accuracy_threshold, best_precision_threshold, best_recall_threshold = 0, 0, 0, 0
smallest_gap = np.inf  # smallest |precision - recall| seen so far

for threshold in thresholds:
    y_pred = (probs >= threshold).astype(int)
    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0: score an undefined metric (e.g. no predicted positives) as 0 without a warning
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    metrics.append([threshold, accuracy, precision, recall, f1])
    if f1 > best_f1:
        best_f1 = f1
        best_f1_threshold = threshold
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_accuracy_threshold = threshold
    if precision > best_precision:
        best_precision = precision
        best_precision_threshold = threshold
    if recall > best_recall:
        best_recall = recall
        best_recall_threshold = threshold
    # Balance point: the threshold where precision and recall are closest.
    # Exact float equality almost never holds on a grid, and the degenerate
    # 0 == 0 case (no true positives predicted) must not count as balanced.
    gap = abs(precision - recall)
    if precision > 0 and recall > 0 and gap < smallest_gap:
        smallest_gap = gap
        best_threshold = threshold

metrics_df = pd.DataFrame(metrics, columns=["Threshold", "Accuracy", "Precision", "Recall", "F1"])
metrics_df = metrics_df.set_index("Threshold")

print(f"Best F1-Score threshold: {best_f1_threshold}, Best F1-Score: {best_f1}")
print(f"Best Accuracy threshold: {best_accuracy_threshold}, Best Accuracy: {best_accuracy}")
print(f"Best Precision threshold: {best_precision_threshold}, Best Precision: {best_precision}")
print(f"Best Recall threshold: {best_recall_threshold}, Best Recall: {best_recall}")
print(f"Best threshold: {best_threshold}")

# Plot the metrics
# DataFrame.plot opens its own figure, so the size is passed to it directly;
# a preceding plt.figure() would only leave an empty extra figure behind.
ax = metrics_df.plot(figsize=(8, 6))
ax.set_xlabel("Threshold")
ax.set_ylabel("Score")
ax.set_title("Metrics vs. Threshold")
ax.legend(title="Metric")
ax.grid(True)
plt.show()

## Youden's Index

Youden's Index is a statistic that is often used to evaluate the effectiveness of a diagnostic test or, in the context of machine learning, to assess the performance of a binary classification model. It is defined as $J = \text{sensitivity} + \text{specificity} - 1 = \text{TPR} - \text{FPR}$, and it helps to identify the optimal threshold: the one that maximizes the gap between the true positive rate (sensitivity) and the false positive rate (1 - specificity).

In [None]:
def find_optimal_threshold(y_true, y_prob):
    """Return the score cut-off that maximises Youden's J statistic (TPR - FPR).

    Parameters
    ----------
    y_true : array-like
        True binary labels.
    y_prob : array-like
        Predicted scores or probabilities for the positive class.

    Returns
    -------
    float
        The threshold from ``roc_curve`` at which ``tpr - fpr`` is largest.
    """
    fpr, tpr, thresholds = roc_curve(y_true, y_prob)
    youdens_index = tpr - fpr
    # thresholds[0] is an artificial sentinel added by roc_curve for the
    # "predict nothing" point (J == 0 there); it is not a usable cut-off, so it
    # is excluded whenever at least one real threshold exists.
    if len(thresholds) > 1:
        best_position = 1 + np.argmax(youdens_index[1:])
    else:
        best_position = 0
    optimal_threshold = thresholds[best_position]
    return optimal_threshold

# Apply the Youden's-index search to the held-out labels and the positive-class probabilities.
youden_threshold = find_optimal_threshold(y_test, probs)
print(f"Optimal threshold: {youden_threshold}")

## Precision-Recall Curve

In [None]:
# Point metrics at the Youden-optimal threshold computed in the previous section.
# Predictions are rebuilt here explicitly so this cell does not depend on whatever
# `y_pred` an earlier cell left behind (after the threshold sweep that is the
# prediction vector for its final threshold, 0.99).
y_pred = (probs >= youden_threshold).astype(int)
# zero_division=0: score an undefined metric as 0 instead of emitting a warning
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
print(f"Threshold: {youden_threshold}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

In [None]:
# Precision and recall across score thresholds
# (rebinds `precision`, `recall` and `thresholds` to the arrays returned by the curve)
precision, recall, thresholds = precision_recall_curve(y_test, probs)

# Single-number summary of the curve
average_precision = average_precision_score(y_test, probs)
print(f'Average Precision Score: {average_precision:.2f}')

# Precision-Recall plot
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, marker='.', label=f'Logistic Regression (AP = {average_precision:.2f})')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
def evaluate_thresholds(probs, y_test, thresholds=np.linspace(0, 0.99, 1000), tol=0.01):
    """Sweep decision thresholds, print the best one per metric and return the full table.

    Parameters
    ----------
    probs : array-like
        Predicted probability of the positive class for each sample.
    y_test : array-like
        True binary labels aligned with ``probs``.
    thresholds : iterable of float, optional
        Cut-offs to evaluate; a sample is predicted positive when ``probs >= threshold``.
    tol : float, optional
        Largest ``|recall - precision|`` for a threshold to count as balanced.

    Returns
    -------
    tuple
        ``(metrics_df, best_f1_threshold, best_accuracy_threshold,
        best_precision_threshold, best_recall_threshold, best_threshold)``.
        ``metrics_df`` is indexed by threshold with columns Accuracy, Precision,
        Recall and F1. ``best_threshold`` is the cut-off whose precision and
        recall are closest to each other (within ``tol``); it stays at 0.1 when
        no threshold qualifies.
    """
    metrics = []
    best_f1, best_accuracy, best_precision, best_recall = 0, 0, 0, 0
    best_f1_threshold, best_accuracy_threshold, best_precision_threshold, best_recall_threshold = 0, 0, 0, 0
    best_threshold = 0.1
    smallest_gap = np.inf  # smallest qualifying |recall - precision| seen so far

    for threshold in thresholds:
        y_pred = (probs >= threshold).astype(int)
        accuracy = accuracy_score(y_test, y_pred)
        # zero_division=0: score an undefined metric as 0 instead of emitting a warning
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)
        metrics.append([threshold, accuracy, precision, recall, f1])

        if f1 > best_f1:
            best_f1 = f1
            best_f1_threshold = threshold
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_accuracy_threshold = threshold
        if precision > best_precision:
            best_precision = precision
            best_precision_threshold = threshold
        if recall > best_recall:
            best_recall = recall
            best_recall_threshold = threshold

        # Balance point: keep the closest precision/recall pair within tol.
        # Thresholds where both are 0 (no true positives predicted) also have a
        # gap of 0 and must be excluded, otherwise a degenerate high cut-off
        # would be reported as the balance point.
        gap = abs(recall - precision)
        if precision > 0 and recall > 0 and gap <= tol and gap < smallest_gap:
            smallest_gap = gap
            best_threshold = threshold

    metrics_df = pd.DataFrame(metrics, columns=["Threshold", "Accuracy", "Precision", "Recall", "F1"])
    metrics_df = metrics_df.set_index("Threshold")

    print(f"Best F1-Score threshold: {best_f1_threshold}, Best F1-Score: {best_f1}")
    print(f"Best Accuracy threshold: {best_accuracy_threshold}, Best Accuracy: {best_accuracy}")
    print(f"Best Precision threshold: {best_precision_threshold}, Best Precision: {best_precision}")
    print(f"Best Recall threshold: {best_recall_threshold}, Best Recall: {best_recall}")
    print(f"Best threshold (Recall ≈ Precision): {best_threshold}")

    return metrics_df, best_f1_threshold, best_accuracy_threshold, best_precision_threshold, best_recall_threshold, best_threshold


def plot_metrics(metrics_df):
    """Plot every metric column of ``metrics_df`` against its threshold index.

    Parameters
    ----------
    metrics_df : pandas.DataFrame
        One column per metric, indexed by threshold.
    """
    # DataFrame.plot opens its own figure, so the size is passed to it directly;
    # a preceding plt.figure() would only leave an empty extra figure behind.
    ax = metrics_df.plot(figsize=(15, 10))
    ax.set_xlabel("Threshold")
    ax.set_ylabel("Score")
    ax.set_title("Metrics vs. Threshold")
    ax.legend(title="Metric")
    ax.grid(True, which='both')
    ax.minorticks_on()
    ax.grid(True, which='minor', linestyle=':', linewidth=0.5, color='gray', alpha=0.3)

    plt.show()

# Example usage:
# Recompute the positive-class probabilities, sweep the default threshold grid
# (evaluate_thresholds prints its own summary), then plot the resulting metrics table.
probs = model.predict_proba(X_test)[:, 1]
metrics_df, best_f1_threshold, best_accuracy_threshold, best_precision_threshold, best_recall_threshold, best_threshold = evaluate_thresholds(probs, y_test)
plot_metrics(metrics_df)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_metrics(metrics_df, figsize=(15, 10), title="Metrics vs. Threshold",
                 custom_colors=None, save_path=None):
    """
    Plot each metric against the threshold index and annotate every curve's maximum.

    Parameters:
    - metrics_df: DataFrame with metrics (columns) and thresholds (index)
    - figsize: Tuple for figure size
    - title: String for plot title
    - custom_colors: Dict mapping metric names to colors (optional). Metrics that
      are not listed are still drawn, using the default color cycle.
    - save_path: String path to save the figure (optional)

    Raises:
    - KeyError: if custom_colors names a metric that is not a column of metrics_df
    """
    palette = custom_colors or {}
    unknown = [name for name in palette if name not in metrics_df.columns]
    if unknown:
        raise KeyError(f"custom_colors refers to unknown metric(s): {unknown}")

    fig, ax = plt.subplots(figsize=figsize)

    # Draw every column so the curves always match the annotations added below;
    # a custom color is applied only where one was supplied.
    for column in metrics_df.columns:
        line_style = {"label": column, "linewidth": 2}
        if column in palette:
            line_style["color"] = palette[column]
        ax.plot(metrics_df.index, metrics_df[column], **line_style)

    ax.set_xlabel("Threshold", fontsize=12)
    ax.set_ylabel("Score", fontsize=12)
    ax.set_title(title, fontsize=16)
    # Legend sits outside the axes, to the right, so it never covers the curves
    ax.legend(title="Metric", title_fontsize=12, fontsize=10, loc='center left', bbox_to_anchor=(1, 0.5))

    # Major ticks every 0.1 on both axes
    ax.set_xticks(np.arange(0, 1.1, 0.1))
    ax.set_yticks(np.arange(0, 1.1, 0.1))

    # Major grid dashed, minor grid faint and dotted
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)
    ax.minorticks_on()
    ax.grid(which='minor', linestyle=':', linewidth=0.5, color='gray', alpha=0.3)

    # Horizontal guide lines at 0.25, 0.5 and 0.75 for easier reading
    for y in [0.25, 0.5, 0.75]:
        ax.axhline(y=y, color='gray', linestyle='--', linewidth=0.5)

    fig.tight_layout()

    # Label the maximum of each metric at the threshold where it occurs
    for column in metrics_df.columns:
        max_value = metrics_df[column].max()
        max_threshold = metrics_df[column].idxmax()
        ax.annotate(f'Max {column}: {max_value:.2f} at {max_threshold:.2f}',
                    xy=(max_threshold, max_value), xytext=(5, 5),
                    textcoords='offset points', ha='left', va='bottom',
                    bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                    arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

    # Save before showing: the figure may be released once it has been shown
    if save_path:
        fig.savefig(save_path, dpi=300, bbox_inches='tight')

    plt.show()

# Example usage:
# One fixed color per metric; the keys are the column names of metrics_df.
custom_colors = {'Accuracy': 'blue', 'Precision': 'red', 'Recall': 'green', 'F1': 'purple'}
plot_metrics(metrics_df, custom_colors=custom_colors)
# plot_metrics(metrics_df, custom_colors=custom_colors, save_path='metrics_plot.png')

## Metrics based on the best threshold

In [None]:
# Set the chosen threshold
chosen_threshold = 0.46

# Apply the threshold to get predictions
# (a row is labelled 1 when its predicted probability is >= chosen_threshold)
y_pred = (probs >= chosen_threshold).astype(int)

# Calculate the metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# AUC is computed from the probabilities, so it does not depend on chosen_threshold.
auc_score = roc_auc_score(y_test, probs)

# Print the results
print(f"Metrics at Threshold {chosen_threshold}:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"AUC: {auc_score}")

In [None]:
# 18 points over [0.30, 0.64] give an exact 0.02 step (0.30, 0.32, ..., 0.64);
# linspace counts points, not intervals, so 17 would yield a 0.02125 step.
thresholds = np.linspace(0.30, 0.64, 18)
thresholds

In [None]:
# List of thresholds to evaluate.
# 35 points over [0.30, 0.64] give an exact 0.01 step, so the two-decimal label
# printed for each row is the threshold that was actually applied.
thresholds = np.linspace(0.30, 0.64, 35)

# AUC is computed from the probabilities and is the same for every threshold,
# so it is calculated once outside the loop.
auc_score = roc_auc_score(y_test, probs)

for chosen_threshold in thresholds:
    # Apply the threshold to get predictions
    y_pred = (probs >= chosen_threshold).astype(int)

    # Calculate the metrics (zero_division=0: score an undefined metric as 0 without a warning)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"Threshold {chosen_threshold:.2f}: Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}, AUC: {auc_score:.2f}")

Considering the business requirement, where the CEO has given a ballpark target lead conversion rate of around 80%, we need a precision of at least 80% to consider the model good.

At a threshold of 0.62, we start getting a precision of 80%, so we will consider this the best threshold for our model.
Beyond this threshold precision keeps increasing, but recall and the F1 score decrease, so 0.62 is the lowest threshold that meets the precision target while giving up as little recall and F1 as possible.

## Metrics of all folds

In [None]:
# NOTE(review): the section heading says "all folds", but only fold 0 is processed here.
fold = 0

logging.info(f"Processing fold {fold}...")

# Steps 3-4: impute first, then preprocess; both helpers receive the fold number
fold_df = preprocess_data(impute_missing_values(df, fold), fold)

# Step 5: target and features, a seeded 80/20 split, then fit and score the hold-out
y = fold_df["Converted"]
X = fold_df.drop(columns=["Converted", "kfold"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = train_logistic_regression(X_train, y_train)
probs = model.predict_proba(X_test)[:, 1]

# Threshold-dependent metrics at the chosen cut-off
threshold = 0.62
y_pred = (probs >= threshold).astype(int)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Threshold-independent ranking quality
auc_score = roc_auc_score(y_test, probs)

print(f"Fold {fold} Results: Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}, AUC: {auc_score:.2f}")
print()

## Feature Importance

In [None]:
# Coefficient table for the fold-0 model: raw coefficient plus exp(coefficient) as the odds ratio
coefficients = model.coef_[0]
odds_ratios = np.exp(coefficients)

feature_importance = pd.DataFrame(
    {'Feature': X.columns, 'Coefficient': coefficients, 'Odds Ratio': odds_ratios}
)

# Rank rows by descending absolute coefficient
ranked_index = feature_importance['Coefficient'].abs().sort_values(ascending=False).index
feature_importance = feature_importance.loc[ranked_index]
print(feature_importance)

In [None]:
# Signed coefficients as a Series keyed by training column, largest value first.
# This rebinds `feature_importance` from the DataFrame built in the previous cell to a Series.
feature_importance = pd.Series(model.coef_[0], index=X_train.columns).sort_values(ascending=False)
print(f"Feature importance:\n{feature_importance}")