# Training binary classification model for Jivi restart writers

## TODO: Tune hyper-parameters and continue the search for a champion model

In [0]:
%pip install shap

In [0]:
%restart_python

In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from typing import Tuple, List, Dict, Any
import warnings
warnings.filterwarnings('ignore')
import mlflow
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.base import clone
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, precision_score, recall_score, 
    f1_score, precision_recall_curve, auc, confusion_matrix, classification_report
)
import shap

In [0]:
%run "../00_config/set-up"

In [0]:
# Month parameters (YYYY-MM strings), set by hand to control the modelling windows.
# NOTE(review): first_month / last_month presumably describe the overall span of
# cohort months in the table; neither is referenced in the visible cells - confirm.
first_month = "2019-12"
last_month = "2024-11"

# Intended train / test windows.
# NOTE(review): only train_end_month is passed to prepare_data in the visible cells.
# train_start_month, test_start_month and test_end_month are not applied there, so
# the split itself does not drop rows that fall outside these windows.
train_start_month = "2023-01"
train_end_month = "2024-04"
test_start_month = "2024-05"
test_end_month = "2024-11"

In [0]:
# Load the HCP feature master table (features plus target column) through Spark SQL.
hcp_feats_master_w_target_sdf = spark.sql(
    "SELECT * FROM jivi_new_writer_model.hcp_feats_master_w_target"
)

# Sanity-check the table dimensions; count() runs a Spark job over the table.
print("Row count: ", hcp_feats_master_w_target_sdf.count(),
      "Column Count: ", len(hcp_feats_master_w_target_sdf.columns))

In [0]:
# Converting Spark dataframe to Pandas dataframe.
# toPandas() collects every row onto the driver, so the full table has to fit in
# driver memory; all later cells work on this in-memory pandas copy.
hcp_feats_master_w_target_pdf = hcp_feats_master_w_target_sdf.toPandas()

In [0]:
# Name of the binary target column.
target_col_nm = 'JIVI_NEW_WRITER_FLG'

# Everything except BH_ID (presumably the HCP identifier - confirm), the cohort
# month and the target is treated as a model feature.
feat_cols = [
    col
    for col in hcp_feats_master_w_target_pdf.columns
    if col not in ['BH_ID', 'COHORT_MONTH', target_col_nm]
]

# Binary flags are listed by hand; every remaining feature is treated as numeric
# (and is therefore eligible for scaling in prepare_data).
binary_cols = ['AFFL_WI_INSN', 'AFFL_WI_JIVI_HCP_12M']
numeric_cols = [col for col in feat_cols if col not in binary_cols]

print("Names of binary feats", binary_cols)
print("Names of numeric feats", numeric_cols)
print("Number of features: ", len(feat_cols))

In [0]:
def prepare_data(
    df: pd.DataFrame,
    target_col: str,
    feature_cols: List[str],
    numeric_cols: List[str],
    train_end_month: str,
    scale: bool = False
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """
    Split a frame into train and test sets by COHORT_MONTH, optionally scaling.

    Rows whose COHORT_MONTH (formatted as YYYY-MM) is on or before
    ``train_end_month`` go to the training set; every other row goes to the
    test set. The returned objects are independent copies, so modifying them
    never touches ``df``.

    Args:
        df: Input Pandas DataFrame; must contain a 'COHORT_MONTH' column that
            ``pd.to_datetime`` can parse, plus the feature and target columns.
        target_col: Name of target column
        feature_cols: List of feature column names
        numeric_cols: Subset of ``feature_cols`` to standardise when ``scale``
            is True; all other features are returned unchanged.
        train_end_month: End month for training data (YYYY-MM format), inclusive
        scale: Whether to apply StandardScaler to ``numeric_cols``. The scaler
            is fitted on the training split only and then applied to the test
            split. The fitted scaler is not returned.

    Returns:
        X_train, X_test, y_train, y_test as Pandas DataFrames/Series

    Raises:
        TypeError: If ``df`` is not a pandas DataFrame.
    """
    # Ensure input is a pandas DataFrame
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame")

    # Zero-padded YYYY-MM strings sort chronologically, so a plain string
    # comparison against train_end_month is a valid month comparison.
    train_mask = pd.to_datetime(df['COHORT_MONTH']).dt.strftime('%Y-%m') <= train_end_month

    # Single-step .loc selection plus an explicit copy: the splits are written
    # to below when scaling, and assigning into a chained-indexing slice is
    # what raises pandas' SettingWithCopyWarning.
    X_train = df.loc[train_mask, feature_cols].copy()
    X_test = df.loc[~train_mask, feature_cols].copy()
    y_train = df.loc[train_mask, target_col].copy()
    y_test = df.loc[~train_mask, target_col].copy()

    print("No. of features in input dataframe: ", len(feature_cols))
    print("Positives/Negatives in train: \n", y_train.value_counts())
    print("Positives/Negatives in test: \n", y_test.value_counts())
    print("Shape of X_train: ", X_train.shape)
    print("Shape of X_test: ", X_test.shape)

    # Scale only when asked to and when there is something to scale; fitting a
    # scaler on zero columns would raise.
    if scale and len(numeric_cols) > 0:
        scaler = StandardScaler()
        # Fit on train only so no test-set statistics leak into the features.
        X_train[numeric_cols] = pd.DataFrame(
            scaler.fit_transform(X_train[numeric_cols]),
            columns=numeric_cols,
            index=X_train.index
        )
        # An empty test split (train_end_month at or after the last cohort
        # month) is left as-is; transforming zero rows would raise.
        if len(X_test) > 0:
            X_test[numeric_cols] = pd.DataFrame(
                scaler.transform(X_test[numeric_cols]),
                columns=numeric_cols,
                index=X_test.index
            )

    return X_train, X_test, y_train, y_test

In [0]:
# COHORT_MONTH is kept next to the features so that the resampled frames built
# from X still carry the month column that prepare_data splits on.
X = hcp_feats_master_w_target_pdf[["COHORT_MONTH", *feat_cols]]
y = hcp_feats_master_w_target_pdf[target_col_nm]

# Class balance and size before any resampling.
print("Positives/Negatives in the dataset: \n", y.value_counts())
print("Shape of dataset before oversampling: ", hcp_feats_master_w_target_pdf.shape)

**Applying oversampling**

In [0]:
# Randomly oversample the minority class (rows are duplicated with replacement).
# A fixed random_state makes the resampled frame identical on every re-run;
# 42 matches the seed already used for the logistic regression in this notebook.
ros = RandomOverSampler(random_state=42)
X_oversampled, y_oversampled = ros.fit_resample(X, y)

# Re-attach the target so the frame has the same layout as the original table.
hcp_feats_master_w_target_oversampled_pdf = pd.concat([X_oversampled, y_oversampled], axis=1)

In [0]:
# Class balance and size of the oversampled frame.
print(
    "Positives/Negatives in dataset after oversampling: \n",
    hcp_feats_master_w_target_oversampled_pdf[target_col_nm].value_counts(),
)
print(
    "Shape of dataset after oversampling: ",
    hcp_feats_master_w_target_oversampled_pdf.shape,
)

**Applying undersampling**

In [0]:
# Randomly undersample the majority class (a random subset of its rows is kept).
# A fixed random_state makes the retained subset identical on every re-run;
# 42 matches the seed already used for the logistic regression in this notebook.
rus = RandomUnderSampler(random_state=42)
X_undersampled, y_undersampled = rus.fit_resample(X, y)

# Re-attach the target so the frame has the same layout as the original table.
hcp_feats_master_w_target_undersampled_pdf = pd.concat([X_undersampled, y_undersampled], axis=1)

In [0]:
# Class balance and size of the undersampled frame.
print(
    "Positives/Negatives in dataset after undersampling: \n",
    hcp_feats_master_w_target_undersampled_pdf[target_col_nm].value_counts(),
)
print(
    "Shape of dataset after undersampling: ",
    hcp_feats_master_w_target_undersampled_pdf.shape,
)

### Overall, logistic regression performs consistently without overfitting, and undersampling seems to work better than oversampling for handling class imbalance

In [0]:
# Create train and test dataset.
# The time-based split is taken from the original (un-resampled) frame so that
# the test set keeps its natural class balance. Splitting a frame that was
# undersampled as a whole also balances the test set, which misstates
# precision, F1 and PR-AUC relative to what the model would see in practice.
X_train, X_test, y_train, y_test = prepare_data(
    df=hcp_feats_master_w_target_pdf,
    target_col=target_col_nm,
    feature_cols=feat_cols,
    numeric_cols=numeric_cols,
    train_end_month=train_end_month,
    scale=True,
)

# Undersample the majority class in the training split only; the seed keeps the
# retained rows reproducible across re-runs.
X_train, y_train = RandomUnderSampler(random_state=42).fit_resample(X_train, y_train)
print("Positives/Negatives in train after undersampling: \n", y_train.value_counts())
print("Shape of X_train after undersampling: ", X_train.shape)

In [0]:
# Enable MLflow autologging before the model is fitted.
mlflow.autolog()

# Lasso (L1-penalised) logistic regression; liblinear is one of the solvers that
# supports the L1 penalty. Ridge alternative kept for reference:
# logit_reg = LogisticRegression(penalty='l2', class_weight='balanced', random_state=42, max_iter=1000)
logit_reg = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    class_weight='balanced',
    random_state=42,
    max_iter=1000,
)
logit_reg.fit(X_train, y_train)

# Hard labels plus the probability of the second class in logit_reg.classes_
# (the positive class for a 0/1 target) on the hold-out set.
y_pred = logit_reg.predict(X_test)
y_pred_proba = logit_reg.predict_proba(X_test)[:, 1]

# Ranking metric from the probabilities, threshold metrics from the hard labels.
metrics = {
    'auc_roc': roc_auc_score(y_test, y_pred_proba),
    'precision': precision_score(y_test, y_pred),
    'recall': recall_score(y_test, y_pred),
    'f1': f1_score(y_test, y_pred),
}

# Area under the precision-recall curve (trapezoidal rule over the curve points).
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
metrics['auc_pr'] = auc(recall, precision)

# Confusion-matrix cells: rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
metrics.update({
    'TNs': cm[0, 0],
    'FPs': cm[0, 1],
    'FNs': cm[1, 0],
    'TPs': cm[1, 1],
})

for metric_name, value in metrics.items():
    print(f"{metric_name}: {value:.3f}")

print("Classification Report: ")
print(classification_report(y_test, y_pred))

# Distribution of the predicted positive-class probabilities.
plt.figure(figsize=(8, 6))
sns.histplot(y_pred_proba, bins=50)
plt.title('Prediction Probability Distribution')
plt.show()

In [0]:
# # Get model co_efficients
# co_eff = logit_reg.coef_[0]

# # Put in DataFrame and sort by effect size
# co_eff_df = pd.DataFrame()
# co_eff_df['feature'] = feat_cols
# co_eff_df['co_eff'] = co_eff
# co_eff_df['abs_co_eff'] = np.abs(co_eff)
# co_eff_df_sorted = co_eff_df.sort_values(by='abs_co_eff', ascending=False, inplace=False)
# display(co_eff_df_sorted)

### SHAP feature importance

In [0]:
# Initialize the SHAP explainer for the fitted logistic regression.
# X_train is passed as the background data the explainer uses as its reference distribution.
explainer = shap.Explainer(logit_reg, X_train)

# Calculate SHAP values for the hold-out rows (one explanation per row of X_test).
shap_values = explainer(X_test)
# Alternative kept by the author: explain the training rows instead.
# shap_values = explainer(X_train)

# Beeswarm summary: one dot per row and feature, showing each feature's SHAP value spread.
# Legacy summary-plot call kept by the author for reference:
# shap.summary_plot(shap_values, X_test, feature_names=feat_cols)
shap.plots.beeswarm(shap_values)

In [0]:
# Bar-type SHAP summary of the 15 highest-ranked features, computed on the training rows.
# Note that explainer(X_train) is evaluated inline here, separately from the
# X_test-based shap_values computed in the previous cell.
# show=False keeps the figure open so the layout can be adjusted before plt.show().
shap.summary_plot(shap_values = explainer(X_train), 
                  features = X_train.values,
                  feature_names = X_train.columns.values,
                  plot_type='bar',
                  max_display=15,
                  show=False)
# NOTE(review): rect's right edge of 2 lies beyond the normalised figure width of 1;
# presumably a deliberate trick to widen the rendered plot - confirm.
plt.tight_layout(rect=[0, 0, 2, 1])
plt.show()

In [0]:
# Initialize SHAP JavaScript visualization
# NOTE(review): the force plot below is drawn with matplotlib=True, so the JS
# setup may be unnecessary here - confirm before removing.
shap.initjs()

# Select an index for the SHAP force plot.
# This is a positional row index into shap_values, which the earlier cell
# computed on X_test (so ind = 1 is the second test row).
ind = 1

# Plot the SHAP force plot for that single row as a static matplotlib figure.
shap.force_plot(shap_values[ind], matplotlib=True)

In [0]:
# Number of top features to show in the importance chart.
max_display = 20

# For a linear model the absolute coefficient is used as the importance score.
# Names are taken from X_train.columns because coef_ follows the column order of
# the matrix the model was fitted on.
# NOTE(review): magnitudes are only directly comparable between features on the
# same scale; prepare_data standardises numeric_cols but not the binary flags.
importance = np.abs(logit_reg.coef_[0])
feature_importance_df = pd.DataFrame({
    'feature': list(X_train.columns),
    'importance': importance
})
feature_importance_df = feature_importance_df.sort_values(
    'importance', ascending=False
).head(max_display)

bar_positions = range(len(feature_importance_df))

plt.figure(figsize=(10, 8))
plt.barh(bar_positions, feature_importance_df['importance'])
plt.yticks(bar_positions, feature_importance_df['feature'])
# barh draws position 0 at the bottom; flip the axis so the largest coefficient
# (first row after the descending sort) appears at the top of the chart.
plt.gca().invert_yaxis()
plt.xlabel('|Coefficient|')
plt.title('Feature Importance (Logistic Regression Coefficients)')
plt.tight_layout()
plt.show()