# Training binary classification model for Jivi restart writers

In [0]:
%pip install xgboost

In [0]:
%pip install shap

In [0]:
%restart_python

In [0]:
import pandas as pd
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.metrics import (
    roc_auc_score, precision_score, recall_score, 
    f1_score, precision_recall_curve, auc
)
import shap
import matplotlib.pyplot as plt
from typing import Tuple, List, Dict, Any
import warnings
warnings.filterwarnings('ignore')

In [0]:
%run "../00_config/set-up"

In [0]:
# Manually controlled month boundaries, all written as YYYY-MM strings.
# In the visible cells only train_end_month is consumed (as the default
# train/test cut-off of run_ml_pipeline).

# First and last month
first_month, last_month = "2019-12", "2024-11"

# Start and end of the training window
train_start_month, train_end_month = "2023-01", "2024-04"

# Start and end of the test window
test_start_month, test_end_month = "2024-05", "2024-11"

In [0]:
# Reading the feature master table from Hivestore
# Every column of jivi_new_writer_model.hcp_feats_master_w_target is loaded
# into a Spark DataFrame. `spark` is not created in the visible cells --
# presumably it is the session supplied by the notebook runtime / set-up
# notebook (TODO confirm).
hcp_feats_master_w_target_sdf = spark.sql("SELECT * FROM jivi_new_writer_model.hcp_feats_master_w_target")
# Sanity check: print the number of rows and columns that were read.
print(
    "Row count: ",
    hcp_feats_master_w_target_sdf.count(),
    "Column Count: ",
    len(hcp_feats_master_w_target_sdf.columns),
)

In [0]:
# Converting Spark dataframe to Pandas dataframe
# toPandas() collects the whole table onto the driver, so the table has to
# fit in driver memory. The pandas copy is what the modelling code below uses.
hcp_feats_master_w_target_pdf = hcp_feats_master_w_target_sdf.toPandas()

In [0]:
# Name of the binary target column
target_col_nm = 'JIVI_NEW_WRITER_FLG'

# Every column except BH_ID, COHORT_MONTH and the target is used as a feature;
# the original column order is preserved.
feat_cols_nm_lst = list(filter(
    lambda col_nm: col_nm not in ['BH_ID', 'COHORT_MONTH', target_col_nm],
    hcp_feats_master_w_target_pdf.columns,
))

print("Names of feats", feat_cols_nm_lst)
print("Number of features: ", len(feat_cols_nm_lst))

In [0]:
def prepare_data(
    df: pd.DataFrame,
    target_col: str,
    feature_cols: List[str],
    train_end_month: str,
    scale: bool = False
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """
    Split a feature table into train and test parts using COHORT_MONTH.

    A row goes to the training set when its COHORT_MONTH, rendered as
    YYYY-MM, compares less than or equal to train_end_month; every other
    row goes to the test set. No lower bound is applied to the training
    months and no upper bound to the test months.

    Args:
        df: Input Pandas DataFrame; must contain a COHORT_MONTH column
            that pd.to_datetime can parse
        target_col: Name of target column
        feature_cols: List of feature column names
        train_end_month: Last month (inclusive) of the training data,
            in YYYY-MM format
        scale: When True, a StandardScaler is fitted on the training
            features only and applied to both splits

    Returns:
        X_train, X_test, y_train, y_test as Pandas DataFrames/Series,
        keeping the original row index of df

    Raises:
        TypeError: If df is not a pandas DataFrame
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame")

    # YYYY-MM strings sort chronologically, so a plain string comparison
    # is enough to find the training rows.
    cohort_month = pd.to_datetime(df['COHORT_MONTH']).dt.strftime('%Y-%m')
    is_train = cohort_month <= train_end_month
    is_test = ~is_train

    X_train = df.loc[is_train, feature_cols]
    X_test = df.loc[is_test, feature_cols]
    y_train = df.loc[is_train, target_col]
    y_test = df.loc[is_test, target_col]

    if scale:
        # Statistics come from the training rows only to avoid leaking
        # information from the test period.
        scaler = StandardScaler().fit(X_train)
        X_train, X_test = (
            pd.DataFrame(
                scaler.transform(part),
                columns=feature_cols,
                index=part.index
            )
            for part in (X_train, X_test)
        )

    return X_train, X_test, y_train, y_test

In [0]:
def select_features(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    feature_cols: List[str],
    method: str = None,
) -> Tuple[pd.DataFrame, List[str]]:
    """
    Perform feature selection using either Random Forest or RFE.

    At most 50 features are kept (fewer if X_train has fewer columns).

    Args:
        X_train: Training features DataFrame
        y_train: Training target Series
        feature_cols: List of feature names. Kept for interface
            compatibility; the selected names are read from
            X_train.columns so that they always line up with the
            columns the selector was fitted on.
        method: 'rf' selects with SelectFromModel around a
            RandomForestClassifier. Any other value, including the
            default None, selects with RFE around a LogisticRegression.

    Returns:
        Selected X_train DataFrame and list of selected feature names
    """
    # Cap the number of selected features at 50
    max_features = min(50, X_train.shape[1])

    if method == 'rf':
        selector = SelectFromModel(
            RandomForestClassifier(n_estimators=100, random_state=42),
            max_features=max_features
        )
    else:
        selector = RFE(
            estimator=LogisticRegression(random_state=42),
            n_features_to_select=max_features
        )

    # Fit selector
    selector.fit(X_train, y_train)

    # The support mask is positional with respect to the fitted columns,
    # so map it onto X_train.columns rather than a caller-supplied list
    # (zip() against a reordered or shorter list would silently return
    # the wrong names).
    support_mask = selector.get_support()
    selected_features = list(X_train.columns[support_mask])

    # Return selected features DataFrame
    X_train_selected = X_train[selected_features]

    return X_train_selected, selected_features

In [0]:
def create_time_series_cv(
    X: pd.DataFrame,
    n_splits: int = 7 # NOTE(review): described as the number of training months -- confirm against the training window
) -> TimeSeriesSplit:
    """
    Build the time-series cross-validation splitter used by the pipeline.

    Args:
        X: Feature DataFrame (accepted for interface symmetry; it is not
            read when the splitter is built)
        n_splits: Number of splits for cross-validation

    Returns:
        An unfitted TimeSeriesSplit configured with n_splits. The splitter
        works on row position, so the data passed to its split() should
        already be in chronological order.
    """
    splitter = TimeSeriesSplit(n_splits=n_splits)
    return splitter

In [0]:
def train_evaluate_model(
    model: Any,
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.Series,
    y_test: pd.Series,
    cv: TimeSeriesSplit
) -> Dict[str, float]:
    """
    Fit the model on the training split and score it on the test split.

    The model is fitted in place, so the caller's instance is trained
    when this function returns.

    Args:
        model: ML model instance exposing fit, predict and predict_proba
        X_train, X_test: Training and test DataFrames
        y_train, y_test: Training and test Series
        cv: Cross-validation splitter (accepted but not used here)

    Returns:
        Dictionary with keys 'auc_roc', 'precision', 'recall', 'f1' and
        'auc_pr', all computed on the test split
    """
    model.fit(X_train, y_train)

    # Hard labels feed the threshold metrics; positive-class probabilities
    # (second predict_proba column) feed the ranking metrics.
    predicted_labels = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]

    roc_value = roc_auc_score(y_test, positive_scores)
    precision_value = precision_score(y_test, predicted_labels)
    recall_value = recall_score(y_test, predicted_labels)
    f1_value = f1_score(y_test, predicted_labels)

    # Area under the precision-recall curve
    pr_precision, pr_recall, _ = precision_recall_curve(y_test, positive_scores)
    pr_value = auc(pr_recall, pr_precision)

    return {
        'auc_roc': roc_value,
        'precision': precision_value,
        'recall': recall_value,
        'f1': f1_value,
        'auc_pr': pr_value,
    }

In [0]:
def plot_learning_curves(
    model: Any,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    cv: TimeSeriesSplit
) -> None:
    """
    Plot learning curves (training vs. validation score) for the model.

    For every CV split, a fresh clone of the model is fitted on the first
    10%, 20%, ..., 100% of that split's training rows and scored with the
    estimator's own score() on the rows it was fitted on and on the
    split's validation rows. The mean and standard deviation across
    splits are plotted against the fraction used.

    Subset sizes are computed relative to each split's own training rows.
    Sizing them from the full training set would exceed the size of the
    early, smaller splits, so most points would refit the same data.

    A fit that raises ValueError (for example a subset that contains a
    single class) is recorded as NaN instead of aborting the plot, and a
    summary of skipped fits is printed.

    Args:
        model: ML model instance (only clones are fitted; the instance
            itself is left untouched)
        X_train: Training features DataFrame
        y_train: Training target Series
        cv: Cross-validation splitter
    """
    train_fractions = np.linspace(0.1, 1.0, 10)
    train_scores_list = []
    val_scores_list = []

    skipped_fits = 0
    last_error = None

    # For each CV split
    for train_idx, val_idx in cv.split(X_train):
        X_train_cv = X_train.iloc[train_idx]
        X_val_cv = X_train.iloc[val_idx]
        y_train_cv = y_train.iloc[train_idx]
        y_val_cv = y_train.iloc[val_idx]

        fold_size = len(X_train_cv)
        train_scores_split = []
        val_scores_split = []

        # For each fraction of this split's training rows
        for fraction in train_fractions:
            # At least one row, never more than the split provides
            subset_size = max(1, int(fraction * fold_size))
            X_subset = X_train_cv.iloc[:subset_size]
            y_subset = y_train_cv.iloc[:subset_size]

            model_clone = clone(model)  # Create a fresh clone of the model
            try:
                model_clone.fit(X_subset, y_subset)
            except ValueError as err:
                # Keep the score grid rectangular; NaN is ignored by the
                # nan-aware aggregations below.
                skipped_fits += 1
                last_error = err
                train_scores_split.append(np.nan)
                val_scores_split.append(np.nan)
                continue

            train_scores_split.append(model_clone.score(X_subset, y_subset))
            val_scores_split.append(model_clone.score(X_val_cv, y_val_cv))

        train_scores_list.append(train_scores_split)
        val_scores_list.append(val_scores_split)

    if skipped_fits:
        print(
            f"Skipped {skipped_fits} learning-curve fit(s) that raised "
            f"ValueError (last error: {last_error})"
        )

    # Rows are splits, columns are training fractions
    train_scores = np.array(train_scores_list, dtype=float)
    val_scores = np.array(val_scores_list, dtype=float)

    # Aggregate across splits, ignoring skipped fits
    train_mean = np.nanmean(train_scores, axis=0)
    train_std = np.nanstd(train_scores, axis=0)
    val_mean = np.nanmean(val_scores, axis=0)
    val_std = np.nanstd(val_scores, axis=0)

    # Plot learning curves
    plt.figure(figsize=(10, 6))
    plt.grid()

    # Plot training scores
    plt.fill_between(train_fractions,
                     train_mean - train_std,
                     train_mean + train_std,
                     alpha=0.1,
                     color="r")
    plt.plot(train_fractions, train_mean, 'o-', color="r",
             label="Training score")

    # Plot cross-validation scores
    plt.fill_between(train_fractions,
                     val_mean - val_std,
                     val_mean + val_std,
                     alpha=0.1,
                     color="g")
    plt.plot(train_fractions, val_mean, 'o-', color="g",
             label="Cross-validation score")

    # The x axis carries fractions, not absolute row counts
    plt.xlabel("Fraction of fold training data used")
    plt.ylabel("Score")
    plt.title("Learning Curves")
    plt.legend(loc="best")
    plt.show()

In [0]:
def _rank_importances(
    feature_names: List[str],
    importance: Any,
    max_display: int
) -> pd.DataFrame:
    """Return the max_display largest importances as a feature/importance table."""
    ranked = pd.DataFrame({
        'feature': feature_names,
        'importance': importance
    })
    return ranked.sort_values('importance', ascending=False).head(max_display)


def _plot_importance_bars(
    importance_df: pd.DataFrame,
    xlabel: str,
    title: str
) -> None:
    """Draw and show a horizontal bar chart of a feature/importance table."""
    positions = range(len(importance_df))
    plt.figure(figsize=(10, 8))
    plt.barh(positions, importance_df['importance'])
    plt.yticks(positions, importance_df['feature'])
    plt.xlabel(xlabel)
    plt.title(title)
    plt.tight_layout()
    plt.show()


def analyze_feature_importance(
    model: Any,
    X_train: pd.DataFrame,
    feature_names: List[str],
    max_display: int = 20,
    y_train: pd.Series = None
) -> None:
    """
    Plot and print feature importance, choosing the method by model type.

    - RandomForestClassifier / XGBClassifier: mean absolute SHAP value
      (TreeExplainer) on a random sample of at most 1000 training rows.
    - LogisticRegression: absolute value of the fitted coefficients.
    - MLPClassifier: permutation importance on the training data; this
      needs the training labels, so y_train must be supplied.
    - Any other model type: a message is printed and nothing is plotted.

    If the analysis raises, the error is printed and, when the model
    exposes feature_importances_, those are plotted instead (best-effort
    fallback).

    Args:
        model: Trained ML model
        X_train: Training features DataFrame
        feature_names: List of feature names, in the column order of X_train
        max_display: Maximum number of features to display
        y_train: Training target Series; only required for MLPClassifier
    """
    try:
        # For tree-based models (Random Forest, XGBoost)
        if isinstance(model, (RandomForestClassifier, XGBClassifier)):
            explainer = shap.TreeExplainer(model)

            # Cap the sample so SHAP stays tractable on large training sets
            sample_size = min(1000, len(X_train))
            X_sample = X_train.sample(n=sample_size, random_state=42)

            shap_values = explainer.shap_values(X_sample)

            # SHAP may return one array per class (a list) or a single
            # array with a trailing per-class axis; keep class 1 in both
            # cases so the importances are one value per feature.
            if isinstance(shap_values, list):
                shap_values = shap_values[1]
            shap_values = np.asarray(shap_values)
            if shap_values.ndim == 3:
                shap_values = shap_values[:, :, 1]

            importance = np.abs(shap_values).mean(axis=0)
            xlabel = 'mean(|SHAP value|)'
            title = 'Feature Importance (SHAP values)'

        # For linear models (Logistic Regression)
        elif isinstance(model, LogisticRegression):
            importance = np.abs(model.coef_[0])
            xlabel = '|Coefficient|'
            title = 'Feature Importance (Logistic Regression Coefficients)'

        # For neural networks (MLPClassifier)
        elif isinstance(model, MLPClassifier):
            if y_train is None:
                # Permutation importance scores the model against labels,
                # so it cannot run without them.
                print(
                    "Permutation importance for MLPClassifier requires "
                    "y_train; skipping feature importance analysis."
                )
                return

            from sklearn.inspection import permutation_importance

            result = permutation_importance(
                model, X_train, y_train,
                n_repeats=10,
                random_state=42
            )
            importance = result.importances_mean
            xlabel = 'Permutation Importance'
            title = 'Feature Importance (Permutation)'

        else:
            print(f"Feature importance analysis not implemented for model type: {type(model)}")
            return

        feature_importance_df = _rank_importances(
            feature_names, importance, max_display
        )
        _plot_importance_bars(feature_importance_df, xlabel, title)

        # Print numerical values
        print("\nFeature Importance Values:")
        print(feature_importance_df.to_string(index=False))

    except Exception as e:
        # Deliberately broad: this is a diagnostic step and must not
        # abort the surrounding training run.
        print(f"Error in feature importance analysis: {str(e)}")

        # Fallback to basic feature importance for tree-based models
        if hasattr(model, 'feature_importances_'):
            importances = _rank_importances(
                feature_names, model.feature_importances_, max_display
            )
            _plot_importance_bars(
                importances,
                'Feature Importance',
                "Feature Importance (Model's Built-in Method)"
            )

            print("\nFeature Importance Values (Fallback Method):")
            print(importances.to_string(index=False))

In [0]:
def run_ml_pipeline(
    df: pd.DataFrame,
    target_col: str,
    feature_cols: List[str],
    train_end_month: str = train_end_month,
    scale: bool = False,
    use_feature_selection: bool = False,
    feature_selection_method: str = None,
    use_shap: bool = False 
) -> Dict[str, Dict[str, float]]:
    """
    Run the complete ML pipeline with optional feature selection and SHAP analysis.

    Steps: split by COHORT_MONTH, optionally scale, optionally select
    features on the training split, then train, evaluate and plot
    learning curves for four classifiers (logistic regression, random
    forest, XGBoost, MLP).

    Args:
        df: Input Pandas DataFrame
        target_col: Name of target column
        feature_cols: List of feature column names
        train_end_month: Last training month (YYYY-MM); later rows form
            the test set. Defaults to the module-level train_end_month
            as it was when this function was defined.
        scale: Whether to standardise features (default: False)
        use_feature_selection: Whether to use feature selection (default: False)
        feature_selection_method: Method for feature selection ('rf' or 'rfe')
        use_shap: Whether to use SHAP for feature importance analysis (default: False)

    Returns:
        Dictionary of model performances, keyed by model name

    Raises:
        TypeError: If df is not a pandas DataFrame
        ValueError: If the training split contains no positive samples
    """
    # Ensure input is a pandas DataFrame
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame")

    # Prepare data, honouring the caller's scale flag
    X_train, X_test, y_train, y_test = prepare_data(
        df, target_col, feature_cols, train_end_month, scale=scale
    )

    # Feature selection (if enabled); fitted on the training split only
    if use_feature_selection:
        print("\nPerforming feature selection...")
        X_train_final, selected_features = select_features(
            X_train,
            y_train,
            feature_cols,
            method=feature_selection_method
        )
        X_test_final = X_test[selected_features]
        print(f"Selected {len(selected_features)} features")
        print("Selected features:", selected_features)
    else:
        X_train_final = X_train
        X_test_final = X_test
        selected_features = feature_cols
        print("All features used")

    # Create CV splits
    cv = create_time_series_cv(X_train_final)

    # Positive-class weight for XGBoost: n_samples / (2 * n_positives)
    n_samples = len(y_train)
    n_positives = y_train.sum()
    if n_positives == 0:
        # Without positives the weight is undefined (division by zero)
        # and no classifier below could be trained meaningfully.
        raise ValueError(
            "Training data contains no positive samples; "
            "cannot compute class weight"
        )
    class_weight = n_samples / (2 * n_positives)  # adjusted class weight calculation

    # Initialize models with class weights.
    # NOTE(review): the MLP is configured without any class weighting.
    models = {
        'logistic': LogisticRegression(
            class_weight='balanced', 
            random_state=42,
            max_iter=1000
        ),
        'random_forest': RandomForestClassifier(
            n_estimators=100, 
            class_weight='balanced', 
            random_state=42
        ),
        'xgboost': XGBClassifier(
            scale_pos_weight=class_weight, 
            random_state=42
        ),
        'neural_network': MLPClassifier(
            hidden_layer_sizes=(100, 50), 
            max_iter=1000, 
            random_state=42
        )
    }

    # Train and evaluate models
    results = {}
    for name, model in models.items():
        print(f"\nTraining {name}...")
        results[name] = train_evaluate_model(
            model, 
            X_train_final, 
            X_test_final, 
            y_train, 
            y_test, 
            cv
        )

        print(f"Plotting learning curves for {name}...")
        plot_learning_curves(model, X_train_final, y_train, cv)

        if use_shap:
            # NOTE(review): no training labels are handed over here, so
            # the importance step cannot compute label-based measures.
            print(f"Analyzing feature importance for {name} using SHAP...")
            analyze_feature_importance(model, X_train_final, selected_features)

        # Print metrics
        print(f"\nMetrics for {name}:")
        for metric_name, value in results[name].items():
            print(f"{metric_name}: {value:.3f}")

    return results

In [0]:
# Example run: all features, scaling requested, SHAP / importance plots on.
# feature_selection_method is supplied but has no effect while
# use_feature_selection is False.
results = run_ml_pipeline(
    hcp_feats_master_w_target_pdf,
    target_col_nm,
    feat_cols_nm_lst,
    scale=True,
    use_feature_selection=False,
    feature_selection_method='rf',
    use_shap=True,
)

# Recap of the returned metrics, one section per model
for model_name in results:
    metrics = results[model_name]
    print(f"\nResults for {model_name}:")
    for metric_name in metrics:
        value = metrics[metric_name]
        print(f"{metric_name}: {value:.3f}")