In [8]:
import argparse
import os
import glob
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F

from pyspark.sql.functions import col, to_date
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [9]:
# Initialize SparkSession
spark = pyspark.sql.SparkSession.builder \
    .appName("dev") \
    .master("local[*]") \
    .getOrCreate()

# Set log level to ERROR to hide warnings
spark.sparkContext.setLogLevel("ERROR")

## set up config

In [10]:
snapshot_date_str = "2024-01-01"
model_name = "credit_model_2024_09_01.pkl"

In [11]:
# --- set up config ---
config = {}
config["snapshot_date_str"] = snapshot_date_str
config["snapshot_date"] = datetime.strptime(config["snapshot_date_str"], "%Y-%m-%d")

pprint.pprint(config)

{'snapshot_date': datetime.datetime(2024, 1, 1, 0, 0),
 'snapshot_date_str': '2024-01-01'}


## load prediction result

In [28]:
# --- load prediction result ---
folder_path_1 = "datamart/gold/model_predictions/credit_model_2024_09_01/"
files_list_1 = [folder_path_1+os.path.basename(f) for f in glob.glob(os.path.join(folder_path_1, '*'))]

# Load CSV into DataFrame - connect to prediction store
prediction_sdf = spark.read.parquet(*files_list_1)
prediction_sdf.show()

+-----------+-------------+--------------------+-------------------+
|Customer_ID|snapshot_date|          model_name|  model_predictions|
+-----------+-------------+--------------------+-------------------+
| CUS_0x9e5f|   2024-11-01|credit_model_2024...| 0.6832869052886963|
| CUS_0xbe64|   2024-11-01|credit_model_2024...|0.14518484473228455|
| CUS_0xc3d5|   2024-11-01|credit_model_2024...|0.25410333275794983|
| CUS_0x9ce3|   2024-11-01|credit_model_2024...|0.47862982749938965|
| CUS_0x26da|   2024-11-01|credit_model_2024...|0.13304248452186584|
| CUS_0x4da9|   2024-11-01|credit_model_2024...| 0.5791769623756409|
| CUS_0x3d09|   2024-11-01|credit_model_2024...|  0.455234169960022|
| CUS_0x1a2f|   2024-11-01|credit_model_2024...|0.13843978941440582|
| CUS_0x8643|   2024-11-01|credit_model_2024...| 0.4067968428134918|
| CUS_0x13a8|   2024-11-01|credit_model_2024...|0.17660976946353912|
| CUS_0xc2a1|   2024-11-01|credit_model_2024...| 0.6888681054115295|
| CUS_0x1233|   2024-11-01|credit_

In [29]:
# Ensure snapshot_date is in date format
prediction_sdf = prediction_sdf.withColumn("snapshot_date", to_date(col("snapshot_date")))

# Filter the DataFrame for the specific snapshot_date
prediction_sdf = prediction_sdf.filter(col("snapshot_date") == config["snapshot_date"])
prediction_sdf = prediction_sdf.withColumn("predicted_label", F.when(col("model_predictions") >= 0.5, 1).otherwise(0))
prediction_sdf.show()



+-----------+-------------+--------------------+-------------------+---------------+
|Customer_ID|snapshot_date|          model_name|  model_predictions|predicted_label|
+-----------+-------------+--------------------+-------------------+---------------+
| CUS_0xc5cc|   2024-01-01|credit_model_2024...|  0.286805659532547|              0|
| CUS_0x5f86|   2024-01-01|credit_model_2024...|0.16115139424800873|              0|
| CUS_0xa788|   2024-01-01|credit_model_2024...| 0.6790107488632202|              1|
| CUS_0xb756|   2024-01-01|credit_model_2024...| 0.4280065894126892|              0|
| CUS_0x8b96|   2024-01-01|credit_model_2024...| 0.3511424660682678|              0|
| CUS_0x5a7d|   2024-01-01|credit_model_2024...|0.09726028144359589|              0|
| CUS_0xc653|   2024-01-01|credit_model_2024...|0.08351144194602966|              0|
| CUS_0x8d74|   2024-01-01|credit_model_2024...|0.17322394251823425|              0|
| CUS_0x94f4|   2024-01-01|credit_model_2024...|0.123945333063602

                                                                                

## load label

In [32]:
# --- load label  ---
folder_path_2 = "datamart/gold/label_store/"
files_list_2 = [folder_path_2+os.path.basename(f) for f in glob.glob(os.path.join(folder_path_2, '*'))]

# Load CSV into DataFrame - connect to label store
label_sdf = spark.read.option("header", "true").parquet(*files_list_2)

# Ensure snapshot_date is in date format
label_sdf = label_sdf.withColumn("snapshot_date", to_date(col("snapshot_date")))

# Filter the DataFrame for the specific snapshot_date
label_sdf = label_sdf.filter(col("snapshot_date") == config["snapshot_date"])

label_sdf.show()

+--------------------+-----------+-----+----------+-------------+
|             loan_id|Customer_ID|label| label_def|snapshot_date|
+--------------------+-----------+-----+----------+-------------+
|CUS_0x1130_2023_0...| CUS_0x1130|    0|30dpd_6mob|   2024-01-01|
|CUS_0x11d1_2023_0...| CUS_0x11d1|    0|30dpd_6mob|   2024-01-01|
|CUS_0x11eb_2023_0...| CUS_0x11eb|    0|30dpd_6mob|   2024-01-01|
|CUS_0x120c_2023_0...| CUS_0x120c|    1|30dpd_6mob|   2024-01-01|
|CUS_0x124a_2023_0...| CUS_0x124a|    0|30dpd_6mob|   2024-01-01|
|CUS_0x12ce_2023_0...| CUS_0x12ce|    1|30dpd_6mob|   2024-01-01|
|CUS_0x13b0_2023_0...| CUS_0x13b0|    0|30dpd_6mob|   2024-01-01|
|CUS_0x13d6_2023_0...| CUS_0x13d6|    1|30dpd_6mob|   2024-01-01|
|CUS_0x13de_2023_0...| CUS_0x13de|    0|30dpd_6mob|   2024-01-01|
|CUS_0x13ec_2023_0...| CUS_0x13ec|    0|30dpd_6mob|   2024-01-01|
|CUS_0x13f6_2023_0...| CUS_0x13f6|    0|30dpd_6mob|   2024-01-01|
|CUS_0x14a3_2023_0...| CUS_0x14a3|    0|30dpd_6mob|   2024-01-01|
|CUS_0x14b

In [33]:
# join two feature tables
monitor_sdf = label_sdf.join(prediction_sdf, on=["Customer_ID"], how="left")
monitor_pdf = monitor_sdf.toPandas()
monitor_pdf

                                                                                

Unnamed: 0,Customer_ID,loan_id,label,label_def,snapshot_date,snapshot_date.1,model_name,model_predictions,predicted_label
0,CUS_0x1130,CUS_0x1130_2023_07_01,0,30dpd_6mob,2024-01-01,2024-01-01,credit_model_2024_09_01.pkl,0.096445,0
1,CUS_0x11d1,CUS_0x11d1_2023_07_01,0,30dpd_6mob,2024-01-01,2024-01-01,credit_model_2024_09_01.pkl,0.177976,0
2,CUS_0x11eb,CUS_0x11eb_2023_07_01,0,30dpd_6mob,2024-01-01,2024-01-01,credit_model_2024_09_01.pkl,0.129160,0
3,CUS_0x120c,CUS_0x120c_2023_07_01,1,30dpd_6mob,2024-01-01,2024-01-01,credit_model_2024_09_01.pkl,0.577241,1
4,CUS_0x124a,CUS_0x124a_2023_07_01,0,30dpd_6mob,2024-01-01,2024-01-01,credit_model_2024_09_01.pkl,0.113944,0
...,...,...,...,...,...,...,...,...,...
466,CUS_0xf2e,CUS_0xf2e_2023_07_01,0,30dpd_6mob,2024-01-01,2024-01-01,credit_model_2024_09_01.pkl,0.789582,1
467,CUS_0xf5d,CUS_0xf5d_2023_07_01,1,30dpd_6mob,2024-01-01,2024-01-01,credit_model_2024_09_01.pkl,0.264715,0
468,CUS_0xf8f,CUS_0xf8f_2023_07_01,1,30dpd_6mob,2024-01-01,2024-01-01,credit_model_2024_09_01.pkl,0.614932,1
469,CUS_0xfaa,CUS_0xfaa_2023_07_01,0,30dpd_6mob,2024-01-01,2024-01-01,credit_model_2024_09_01.pkl,0.471838,0


## evaluate

In [37]:
# evaluation metrics
y_true = monitor_pdf['label']
y_pred = monitor_pdf['predicted_label']
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
roc_auc = roc_auc_score(y_true, monitor_pdf['model_predictions'])
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")

Accuracy: 0.7707
F1 Score: 0.5714
ROC AUC Score: 0.8177


In [48]:
evaluation = pd.DataFrame()
evaluation['model'] = [model_name[:-4] + "_predictions_" + snapshot_date_str.replace('-','_')]
evaluation['accuracy'] = [accuracy]
evaluation['f1'] = [f1]
evaluation['ROC AUC'] = [roc_auc]
evaluation

Unnamed: 0,model,accuracy,f1,ROC AUC
0,credit_model_2024_09_01_predictions_2024_01_01,0.770701,0.571429,0.817695


In [51]:
# save evaluation
evaluation_directory = f"datamart/gold/model_predictions/{model_name[:-4]}/"
partition_name = model_name[:-4] + "_predictions_" + snapshot_date_str.replace('-','_') + '.csv'
filepath = evaluation_directory + partition_name
evaluation.to_csv(filepath)
print('saved to:', filepath)

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_01_01.csv
