# Training binary classification model for Jivi new writers

In [0]:
%pip install imbalanced-learn

In [0]:
%pip install tensorflow

In [0]:
# %restart_python

In [0]:
import pandas as pd
from typing import Tuple, List, Dict, Any
import mlflow
from sklearn import metrics

from imblearn.over_sampling import RandomOverSampler

In [0]:
%run "../00_config/set-up"

In [0]:
# Month and Date parameters for manual control (all values are "YYYY-MM" strings)
# NOTE(review): in the cells visible in this notebook only `train_end_month` is
# consumed (as the cut-off of the train/test split). The other bounds are not
# referenced here — confirm whether the feature table is already restricted to
# these windows upstream, or whether they still need to be applied.
first_month = "2019-12"
last_month = "2024-11"

train_start_month = "2023-01"
train_end_month = "2024-04"
test_start_month = "2024-05"
test_end_month = "2024-11"

In [0]:
# Load the feature master table (features plus target) as a Spark DataFrame
hcp_feats_master_w_target_sdf = spark.sql("SELECT * FROM jivi_new_writer_model.hcp_feats_master_w_target")

# Sanity-check the size of what was loaded; count() triggers a Spark job
n_rows = hcp_feats_master_w_target_sdf.count()
n_cols = len(hcp_feats_master_w_target_sdf.columns)
print("Row count: ", n_rows, "Column Count: ", n_cols)

In [0]:
# Converting Spark dataframe to Pandas dataframe
# NOTE: toPandas() collects the whole table onto the driver, so the full
# feature table has to fit in driver memory.
hcp_feats_master_w_target_pdf = hcp_feats_master_w_target_sdf.toPandas()

In [0]:
# Every column that is not the HCP identifier, the cohort month or the target
# itself is treated as a model feature.
target_col_nm = 'JIVI_NEW_WRITER_FLG'
non_feat_cols = {'BH_ID', 'COHORT_MONTH', target_col_nm}
feat_cols_nm_lst = [c for c in hcp_feats_master_w_target_pdf.columns if c not in non_feat_cols]

print("Names of feats", feat_cols_nm_lst)
print("Number of features: ", len(feat_cols_nm_lst))

In [0]:
def train_test_split_udf(
    df: pd.DataFrame,
    target_col: str,
    feature_cols: List[str],
    train_end_month: str,
    scale: bool = False
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """
    Split a feature table into train and test sets based on COHORT_MONTH.

    Rows whose COHORT_MONTH (formatted as YYYY-MM) is less than or equal to
    ``train_end_month`` form the training set; all remaining rows form the
    test set.

    Args:
        df: Input Pandas DataFrame; must contain a COHORT_MONTH column that
            ``pd.to_datetime`` can parse.
        target_col: Name of target column
        feature_cols: List of feature column names
        train_end_month: Last month (inclusive) of the training data (YYYY-MM format)
        scale: Whether to apply StandardScaler to the features. The scaler is
            fitted on the training rows only and then applied to the test rows.
            Note that the fitted scaler is not returned to the caller.

    Returns:
        X_train, X_test, y_train, y_test as Pandas DataFrames/Series

    Raises:
        TypeError: If ``df`` is not a pandas DataFrame.
        KeyError: If COHORT_MONTH, the target column or any feature column is
            missing from ``df``.
    """
    # Ensure input is a pandas DataFrame
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame")

    # Fail early, with every missing name listed, instead of deep inside indexing
    feature_cols = list(feature_cols)
    required_cols = ['COHORT_MONTH', target_col, *feature_cols]
    missing_cols = [c for c in required_cols if c not in df.columns]
    if missing_cols:
        raise KeyError(f"Columns missing from input dataframe: {missing_cols}")

    # Zero-padded YYYY-MM strings sort chronologically, so a plain string
    # comparison against the cut-off month is sufficient.
    cohort_month = pd.to_datetime(df['COHORT_MONTH']).dt.strftime('%Y-%m')
    train_mask = cohort_month <= train_end_month

    # Single .loc selection per split (rows and columns at once) instead of
    # chained boolean-then-column indexing
    X_train = df.loc[train_mask, feature_cols]
    X_test = df.loc[~train_mask, feature_cols]
    y_train = df.loc[train_mask, target_col]
    y_test = df.loc[~train_mask, target_col]

    print("No. of features in input dataframe: ", len(feature_cols))
    print("Positives/Negatives in train: \n", y_train.value_counts())
    print("Positives/Negatives in test: \n", y_test.value_counts())
    print("Shape of X_train: ", X_train.shape)
    print("Shape of X_test: ", X_test.shape)

    # Scale features if scale is True
    if scale:
        # Imported here because the notebook's import cell does not bring
        # StandardScaler into scope; without this the scale=True path raises
        # NameError.
        from sklearn.preprocessing import StandardScaler

        scaler = StandardScaler()
        # Fit on train only so that no test-set statistics leak into training
        X_train = pd.DataFrame(
            scaler.fit_transform(X_train),
            columns=feature_cols,
            index=X_train.index
        )
        X_test = pd.DataFrame(
            scaler.transform(X_test),
            columns=feature_cols,
            index=X_test.index
        )

    return X_train, X_test, y_train, y_test

In [0]:
# Split at `train_end_month` and standardise the features (scaler fitted on the training rows)
X_train, X_test, y_train, y_test = train_test_split_udf(
    df=hcp_feats_master_w_target_pdf,
    target_col=target_col_nm,
    feature_cols=feat_cols_nm_lst,
    train_end_month=train_end_month,
    scale=True,
)

In [0]:
# Balance the classes of the training set only; the test set is left
# untouched so the evaluation reflects the original class distribution.
# A fixed random_state makes the resampled training set (and therefore the
# trained model's input) reproducible across notebook runs.
ros = RandomOverSampler(random_state=42)
X_train_oversampled, y_train_oversampled = ros.fit_resample(X_train, y_train)
print("Positives/Negatives in train: \n", y_train_oversampled.value_counts())
print("Positives/Negatives in test: \n", y_test.value_counts())
print("Shape of X_train: ", X_train_oversampled.shape)
print("Shape of X_test: ", X_test.shape)

In [0]:
def train_keras_model(X_train, y_train, X_test, y_test, feature_cols, epochs=20, batch_size=10):
    """
    Build, compile and fit a small feed-forward binary classifier.

    Architecture: Dense(16, relu) -> Dropout(0.2) -> Dense(8, relu) ->
    Dropout(0.2) -> Dense(1, sigmoid). Both hidden layers use a MaxNorm(3)
    kernel constraint. The model is compiled with binary cross-entropy, the
    Adam optimizer, and AUC / precision / recall metrics.

    Args:
        X_train: Training features; expected to have one column per entry of
            ``feature_cols``.
        y_train: Training labels.
        X_test: Features passed to ``fit`` as validation data.
        y_test: Labels passed to ``fit`` as validation data.
        feature_cols: Feature names; only the length is used, to size the
            input layer.
        epochs: Number of training epochs (default 20).
        batch_size: Mini-batch size (default 10).

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    # TensorFlow is imported lazily so the notebook's other cells do not
    # depend on it being importable.
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.constraints import MaxNorm
    from tensorflow.keras.layers import Dense, Dropout, Input

    model_dl = Sequential()
    # Explicit Input layer instead of the `input_dim` shortcut on the first
    # Dense layer, which newer Keras releases flag as deprecated.
    model_dl.add(Input(shape=(len(feature_cols),)))
    model_dl.add(Dense(16, activation='relu', kernel_constraint=MaxNorm(3)))
    model_dl.add(Dropout(rate=0.2))
    model_dl.add(Dense(8, activation='relu', kernel_constraint=MaxNorm(3)))
    model_dl.add(Dropout(rate=0.2))
    model_dl.add(Dense(1, activation='sigmoid'))

    model_dl.compile(
        loss="binary_crossentropy",
        optimizer='adam',
        metrics=[
            tf.keras.metrics.AUC(name='auc'),
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall'),
        ],
    )

    # validation_data is only reported per epoch here: no callback is passed,
    # so nothing stops or selects the model based on it.
    model_dl.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        epochs=epochs,
        batch_size=batch_size,
    )

    return model_dl



In [0]:
# model_dl_temp = train_keras_model(X_train, y_train, X_test, y_test, feat_cols_nm_lst)

In [0]:
# Train inside an MLflow run; TensorFlow autologging is relied on to record
# the fitted model under the run's "model" artifact path, which is what gets
# registered below.
with mlflow.start_run(run_name='Jivi_HCP_new_writer_keras_dl') as run:
    mlflow.tensorflow.autolog()

    train_keras_model(
        X_train_oversampled,
        y_train_oversampled,
        X_test,
        y_test,
        feat_cols_nm_lst,
    )

    run_name = run.data.tags['mlflow.runName']
    run_id = run.info.run_id

# Register the model artifact logged by this run in the model registry
model_uri = f"runs:/{run_id}/model"
print("model_uri: \n", model_uri)

model_details = mlflow.register_model(model_uri=model_uri, name="Jivi_HCP_new_writer_keras_dl")
print("model_details: \n", model_details)


In [0]:
# Load the registered model back from the MLflow model registry.
# `stage` is interpolated into the URI as-is, so None produces a URI ending
# in "/None" — presumably this targets versions not yet moved to any stage;
# verify against the registry in use.
model_name = "Jivi_HCP_new_writer_keras_dl"
stage = None
model_dl = mlflow.keras.load_model(model_uri="models:/{}/{}".format(model_name, stage))


In [0]:
# For test set

print("Model Metrics:")
print("")
# Score with `model_dl`, the model loaded from the registry. `model_dl_temp`
# is only created by a commented-out cell, so referencing it raises NameError
# on a fresh kernel.
y_pred = model_dl.predict(X_test)

# The network ends in a single sigmoid unit, so predict() gives one probability
# per row; flatten and apply the 0.5 decision threshold in one vectorised step.
predictions_t = (y_pred.ravel() > 0.5).astype(int).tolist()

y_true = y_test
print(metrics.classification_report(y_true, predictions_t))


In [0]:
# For train set (original, non-oversampled training rows)

print("Model Metrics:")
print("")
# Score with `model_dl`, the model loaded from the registry. `model_dl_temp`
# is only created by a commented-out cell, so referencing it raises NameError
# on a fresh kernel.
y_pred = model_dl.predict(X_train)

# The network ends in a single sigmoid unit, so predict() gives one probability
# per row; flatten and apply the 0.5 decision threshold in one vectorised step.
predictions_t = (y_pred.ravel() > 0.5).astype(int).tolist()

y_true = y_train
print(metrics.classification_report(y_true, predictions_t))
