In [1]:
import os
import glob
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F

from pyspark.sql.functions import col, to_date
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, roc_auc_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split


In [2]:
# Build a .py script that takes a snapshot date, trains a model and outputs artefact into storage.

## set up pyspark session

In [3]:
# Initialize SparkSession
spark = pyspark.sql.SparkSession.builder \
    .appName("dev") \
    .master("local[*]") \
    .getOrCreate()

# Set log level to ERROR to hide warnings
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/25 12:42:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## set up config

In [4]:
# set up config
model_train_date_str = "2023-12-01"
train_test_period_months = 12
oot_period_months = 2
train_test_ratio = 0.8

config = {}
config["model_train_date_str"] = model_train_date_str
config["train_test_period_months"] = train_test_period_months
config["oot_period_months"] =  oot_period_months
config["model_train_date"] =  datetime.strptime(model_train_date_str, "%Y-%m-%d")
config["oot_end_date"] =  config['model_train_date'] - timedelta(days = 1)
config["oot_start_date"] =  config['model_train_date'] - relativedelta(months = oot_period_months)
config["train_test_end_date"] =  config["oot_start_date"] - timedelta(days = 1)
config["train_test_start_date"] =  config["oot_start_date"] - relativedelta(months = train_test_period_months)
config["train_test_ratio"] = train_test_ratio 


pprint.pprint(config)

{'model_train_date': datetime.datetime(2023, 12, 1, 0, 0),
 'model_train_date_str': '2023-12-01',
 'oot_end_date': datetime.datetime(2023, 11, 30, 0, 0),
 'oot_period_months': 2,
 'oot_start_date': datetime.datetime(2023, 10, 1, 0, 0),
 'train_test_end_date': datetime.datetime(2023, 9, 30, 0, 0),
 'train_test_period_months': 12,
 'train_test_ratio': 0.8,
 'train_test_start_date': datetime.datetime(2022, 10, 1, 0, 0)}


## get label store

In [5]:
# connect to label store
folder_path = "datamart/gold/label_store/"
files_list = [folder_path+os.path.basename(f) for f in glob.glob(os.path.join(folder_path, '*'))]
label_store_sdf = spark.read.option("header", "true").parquet(*files_list)
print("row_count:",label_store_sdf.count())

label_store_sdf.show()

row_count: 9963
+--------------------+-----------+-----+----------+-------------+
|             loan_id|Customer_ID|label| label_def|snapshot_date|
+--------------------+-----------+-----+----------+-------------+
|CUS_0x1037_2023_0...| CUS_0x1037|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1069_2023_0...| CUS_0x1069|    0|30dpd_6mob|   2023-07-01|
|CUS_0x114a_2023_0...| CUS_0x114a|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1184_2023_0...| CUS_0x1184|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1297_2023_0...| CUS_0x1297|    1|30dpd_6mob|   2023-07-01|
|CUS_0x12fb_2023_0...| CUS_0x12fb|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1325_2023_0...| CUS_0x1325|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1341_2023_0...| CUS_0x1341|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1375_2023_0...| CUS_0x1375|    1|30dpd_6mob|   2023-07-01|
|CUS_0x13a8_2023_0...| CUS_0x13a8|    0|30dpd_6mob|   2023-07-01|
|CUS_0x13ef_2023_0...| CUS_0x13ef|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1440_2023_0...| CUS_0x1440|    0|30dpd_6mob|   2023-0

In [6]:
# extract label store
labels_sdf = label_store_sdf.filter((col("snapshot_date") >= config["train_test_start_date"]) & (col("snapshot_date") <= config["oot_end_date"]))

print("extracted labels_sdf", labels_sdf.count(), config["train_test_start_date"], config["oot_end_date"])

extracted labels_sdf 2568 2022-10-01 00:00:00 2023-11-30 00:00:00


## get features

In [7]:
gold_clks_directory = "datamart/gold/feature_store/eng/"

folder_path = gold_clks_directory
files_list = [folder_path+os.path.basename(f) for f in glob.glob(os.path.join(folder_path, '*'))]
feature_store_sdf_1 = spark.read.parquet(*files_list)
print("row_count:",feature_store_sdf_1.count())
feature_store_sdf_1.printSchema()
feature_store_sdf_1.show(5)

row_count: 224350
root
 |-- Customer_ID: string (nullable = true)
 |-- snapshot_date: string (nullable = true)
 |-- click_1m: integer (nullable = true)
 |-- click_2m: integer (nullable = true)
 |-- click_3m: integer (nullable = true)
 |-- click_4m: integer (nullable = true)
 |-- click_5m: integer (nullable = true)
 |-- click_6m: integer (nullable = true)

+-----------+-------------+--------+--------+--------+--------+--------+--------+
|Customer_ID|snapshot_date|click_1m|click_2m|click_3m|click_4m|click_5m|click_6m|
+-----------+-------------+--------+--------+--------+--------+--------+--------+
| CUS_0xc65a|   2024-03-01|     239|     236|      44|     309|       0|      35|
| CUS_0x5e1f|   2024-03-01|     103|      26|     182|     133|      27|      43|
| CUS_0x78d3|   2024-03-01|      69|     124|     140|       0|       0|     149|
| CUS_0x1844|   2024-03-01|     142|     232|      25|     145|       0|      97|
| CUS_0x7f07|   2024-03-01|      53|       0|     256|     158|     

In [8]:
gold_fin_directory = "datamart/gold/feature_store/cust_fin_risk/"

folder_path = gold_fin_directory
files_list = [folder_path+os.path.basename(f) for f in glob.glob(os.path.join(folder_path, '*'))]
feature_store_sdf_2 = spark.read.parquet(*files_list)
print("row_count:",feature_store_sdf_2.count())
feature_store_sdf_2.printSchema()
feature_store_sdf_2.show(5)

row_count: 12977
root
 |-- Customer_ID: string (nullable = true)
 |-- snapshot_date: date (nullable = true)
 |-- Credit_History_Age: integer (nullable = true)
 |-- Num_Fin_Pdts: integer (nullable = true)
 |-- EMI_to_Salary: double (nullable = true)
 |-- Debt_to_Salary: double (nullable = true)
 |-- Repayment_Ability: double (nullable = true)
 |-- Loans_per_Credit_Item: double (nullable = true)
 |-- Loan_Extent: integer (nullable = true)
 |-- Outstanding_Debt: double (nullable = true)
 |-- Interest_Rate: integer (nullable = true)
 |-- Delay_from_due_date: integer (nullable = true)
 |-- Changed_Credit_Limit: double (nullable = true)

+-----------+-------------+------------------+------------+--------------------+-------------------+-----------------+---------------------+-----------+----------------+-------------+-------------------+--------------------+
|Customer_ID|snapshot_date|Credit_History_Age|Num_Fin_Pdts|       EMI_to_Salary|     Debt_to_Salary|Repayment_Ability|Loans_per_Credit_

In [9]:
feature_store_sdf_1 = feature_store_sdf_1.withColumn("snapshot_date", to_date(col("snapshot_date")))
feature_store_sdf_2 = feature_store_sdf_2.withColumn("snapshot_date", to_date(col("snapshot_date")))
feature_store_sdf_1.printSchema()
feature_store_sdf_2.printSchema()

root
 |-- Customer_ID: string (nullable = true)
 |-- snapshot_date: date (nullable = true)
 |-- click_1m: integer (nullable = true)
 |-- click_2m: integer (nullable = true)
 |-- click_3m: integer (nullable = true)
 |-- click_4m: integer (nullable = true)
 |-- click_5m: integer (nullable = true)
 |-- click_6m: integer (nullable = true)

root
 |-- Customer_ID: string (nullable = true)
 |-- snapshot_date: date (nullable = true)
 |-- Credit_History_Age: integer (nullable = true)
 |-- Num_Fin_Pdts: integer (nullable = true)
 |-- EMI_to_Salary: double (nullable = true)
 |-- Debt_to_Salary: double (nullable = true)
 |-- Repayment_Ability: double (nullable = true)
 |-- Loans_per_Credit_Item: double (nullable = true)
 |-- Loan_Extent: integer (nullable = true)
 |-- Outstanding_Debt: double (nullable = true)
 |-- Interest_Rate: integer (nullable = true)
 |-- Delay_from_due_date: integer (nullable = true)
 |-- Changed_Credit_Limit: double (nullable = true)



In [10]:
# extract feature store
features_sdf_1 = feature_store_sdf_1.filter((col("snapshot_date") >= config["train_test_start_date"]) & (col("snapshot_date") <= config["oot_end_date"]))
features_sdf_2 = feature_store_sdf_2
features_sdf_2 = features_sdf_2.drop('snapshot_date')

print("extracted features_sdf_1", features_sdf_1.count(), config["train_test_start_date"], config["oot_end_date"])
print("extracted features_sdf_2", features_sdf_2.count(), config["train_test_start_date"], config["oot_end_date"])

extracted features_sdf_1 89740 2022-10-01 00:00:00 2023-11-30 00:00:00
extracted features_sdf_2 12977 2022-10-01 00:00:00 2023-11-30 00:00:00


In [11]:
# join two feature tables
features_sdf = features_sdf_1.join(features_sdf_2, on=["Customer_ID"], how="left")

print("extracted features_sdf", features_sdf.count(), config["train_test_start_date"], config["oot_end_date"])
features_sdf.show(5)

extracted features_sdf 89740 2022-10-01 00:00:00 2023-11-30 00:00:00
+-----------+-------------+--------+--------+--------+--------+--------+--------+------------------+------------+--------------------+------------------+------------------+---------------------+-----------+----------------+-------------+-------------------+--------------------+
|Customer_ID|snapshot_date|click_1m|click_2m|click_3m|click_4m|click_5m|click_6m|Credit_History_Age|Num_Fin_Pdts|       EMI_to_Salary|    Debt_to_Salary| Repayment_Ability|Loans_per_Credit_Item|Loan_Extent|Outstanding_Debt|Interest_Rate|Delay_from_due_date|Changed_Credit_Limit|
+-----------+-------------+--------+--------+--------+--------+--------+--------+------------------+------------+--------------------+------------------+------------------+---------------------+-----------+----------------+-------------+-------------------+--------------------+
| CUS_0xc65a|   2023-07-01|      55|       0|      78|     165|      53|     150|             

In [12]:
features_pdf = features_sdf.toPandas()
columns_to_exclude = ['Customer_ID', 'snapshot_date']
columns_to_rename = [col for col in features_pdf.columns if col not in columns_to_exclude]
rename_dict = {col: 'feature_' + col for col in columns_to_rename}
features_pdf.rename(columns=rename_dict, inplace=True)
features_pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89740 entries, 0 to 89739
Data columns (total 19 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Customer_ID                    89740 non-null  object 
 1   snapshot_date                  89740 non-null  object 
 2   feature_click_1m               89740 non-null  int32  
 3   feature_click_2m               80766 non-null  float64
 4   feature_click_3m               71792 non-null  float64
 5   feature_click_4m               62818 non-null  float64
 6   feature_click_5m               53844 non-null  float64
 7   feature_click_6m               44870 non-null  float64
 8   feature_Credit_History_Age     89740 non-null  int32  
 9   feature_Num_Fin_Pdts           89740 non-null  int32  
 10  feature_EMI_to_Salary          89740 non-null  float64
 11  feature_Debt_to_Salary         89740 non-null  float64
 12  feature_Repayment_Ability      89740 non-null 

In [13]:
features_sdf = spark.createDataFrame(features_pdf)
features_sdf.show(5)

+-----------+-------------+----------------+----------------+----------------+----------------+----------------+----------------+--------------------------+--------------------+---------------------+----------------------+-------------------------+-----------------------------+-------------------+------------------------+---------------------+---------------------------+----------------------------+
|Customer_ID|snapshot_date|feature_click_1m|feature_click_2m|feature_click_3m|feature_click_4m|feature_click_5m|feature_click_6m|feature_Credit_History_Age|feature_Num_Fin_Pdts|feature_EMI_to_Salary|feature_Debt_to_Salary|feature_Repayment_Ability|feature_Loans_per_Credit_Item|feature_Loan_Extent|feature_Outstanding_Debt|feature_Interest_Rate|feature_Delay_from_due_date|feature_Changed_Credit_Limit|
+-----------+-------------+----------------+----------------+----------------+----------------+----------------+----------------+--------------------------+--------------------+-----------------

                                                                                

## prepare data for modeling

In [14]:
# prepare data for modeling
data_pdf = labels_sdf.join(features_sdf, on=["Customer_ID", "snapshot_date"], how="left").toPandas()
data_pdf

                                                                                

Unnamed: 0,Customer_ID,snapshot_date,loan_id,label,label_def,feature_click_1m,feature_click_2m,feature_click_3m,feature_click_4m,feature_click_5m,...,feature_Num_Fin_Pdts,feature_EMI_to_Salary,feature_Debt_to_Salary,feature_Repayment_Ability,feature_Loans_per_Credit_Item,feature_Loan_Extent,feature_Outstanding_Debt,feature_Interest_Rate,feature_Delay_from_due_date,feature_Changed_Credit_Limit
0,CUS_0x10eb,2023-09-01,CUS_0x10eb_2023_03_01,0,30dpd_6mob,63,0.0,167.0,216.0,66.0,...,13,0.012509,0.280304,2385.432,0.166667,16,677.40,10,8,10.34
1,CUS_0x1192,2023-11-01,CUS_0x1192_2023_05_01,0,30dpd_6mob,0,0.0,85.0,80.0,69.0,...,12,0.010718,0.839055,1502.657,0.181818,32,1275.32,1,16,7.67
2,CUS_0x12dd,2023-09-01,CUS_0x12dd_2023_03_01,0,30dpd_6mob,94,131.0,34.0,183.0,0.0,...,8,0.030106,0.254875,2464.653,0.800000,40,647.94,11,10,6.81
3,CUS_0x1325,2023-07-01,CUS_0x1325_2023_01_01,0,30dpd_6mob,0,39.0,65.0,135.0,103.0,...,21,0.038265,1.031171,2824.894,0.294118,195,3029.92,22,39,7.35
4,CUS_0x1431,2023-10-01,CUS_0x1431_2023_04_01,1,30dpd_6mob,135,166.0,91.0,168.0,0.0,...,19,0.028222,0.104905,9204.163,0.333333,105,993.71,9,21,10.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2563,CUS_0xc5f3,2023-10-01,CUS_0xc5f3_2023_04_01,0,30dpd_6mob,137,139.0,218.0,170.0,201.0,...,15,0.023566,1.884252,1036.644,0.142857,78,2002.37,31,39,13.59
2564,CUS_0xc8c,2023-10-01,CUS_0xc8c_2023_04_01,0,30dpd_6mob,76,0.0,211.0,154.0,16.0,...,17,0.024053,0.250703,1038.546,0.200000,39,267.04,14,13,6.76
2565,CUS_0xd6f,2023-07-01,CUS_0xd6f_2023_01_01,0,30dpd_6mob,23,0.0,80.0,158.0,0.0,...,12,0.109600,0.260777,4911.178,0.181818,4,1438.66,6,2,11.75
2566,CUS_0xf45,2023-11-01,CUS_0xf45_2023_05_01,1,30dpd_6mob,281,0.0,162.0,136.0,207.0,...,24,0.063226,0.509028,2441.583,0.388889,315,1327.26,16,45,1.40


In [15]:
# split data into train - test - oot
oot_pdf = data_pdf[(data_pdf['snapshot_date'] >= config["oot_start_date"].date()) & (data_pdf['snapshot_date'] <= config["oot_end_date"].date())]
train_test_pdf = data_pdf[(data_pdf['snapshot_date'] >= config["train_test_start_date"].date()) & (data_pdf['snapshot_date'] <= config["train_test_end_date"].date())]

feature_cols = [fe_col for fe_col in data_pdf.columns if fe_col.startswith('feature_')]

X_oot = oot_pdf[feature_cols]
y_oot = oot_pdf["label"]
X_train, X_test, y_train, y_test = train_test_split(
    train_test_pdf[feature_cols], train_test_pdf["label"], 
    test_size= 1 - config["train_test_ratio"],
    random_state=88,     # Ensures reproducibility
    shuffle=True,        # Shuffle the data before splitting
    stratify=train_test_pdf["label"]           # Stratify based on the label column
)


print('X_train', X_train.shape[0])
print('X_test', X_test.shape[0])
print('X_oot', X_oot.shape[0])
print('y_train', y_train.shape[0], round(y_train.mean(),2))
print('y_test', y_test.shape[0], round(y_test.mean(),2))
print('y_oot', y_oot.shape[0], round(y_oot.mean(),2))

X_train

X_train 1229
X_test 308
X_oot 1031
y_train 1229 0.3
y_test 308 0.3
y_oot 1031 0.27


Unnamed: 0,feature_click_1m,feature_click_2m,feature_click_3m,feature_click_4m,feature_click_5m,feature_click_6m,feature_Credit_History_Age,feature_Num_Fin_Pdts,feature_EMI_to_Salary,feature_Debt_to_Salary,feature_Repayment_Ability,feature_Loans_per_Credit_Item,feature_Loan_Extent,feature_Outstanding_Debt,feature_Interest_Rate,feature_Delay_from_due_date,feature_Changed_Credit_Limit
1137,50,0.0,25.0,128.0,0.0,311.0,300,13,0.024048,0.251246,2360.357,0.272727,72,607.90,14,24,14.13
2281,287,0.0,64.0,0.0,31.0,150.0,226,15,0.023584,0.418578,3721.424,0.230769,36,1595.76,33,12,15.02
426,196,0.0,153.0,0.0,0.0,111.0,23,21,0.059455,0.402882,5862.430,0.375000,234,2511.60,22,39,25.82
138,168,111.0,129.0,248.0,108.0,172.0,320,13,0.039186,0.175077,7344.454,0.400000,84,1338.47,17,21,14.93
1001,64,210.0,18.0,0.0,111.0,156.0,375,14,0.126492,0.268373,4204.694,0.250000,54,1292.14,11,18,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2157,0,129.0,180.0,136.0,186.0,303.0,239,22,0.046716,0.283625,5117.745,0.352941,264,1522.95,19,44,5.87
43,0,130.0,107.0,97.0,71.0,141.0,300,15,0.019909,0.136191,7071.718,0.230769,36,982.81,15,12,15.83
1698,0,129.0,0.0,0.0,0.0,83.0,219,9,0.000000,0.318598,3402.228,0.000000,0,1084.26,8,9,1.75
683,272,0.0,195.0,32.0,261.0,38.0,101,19,0.029761,0.923319,2666.822,0.176471,54,2538.81,19,18,10.17


## preprocess data

In [16]:
# set up standard scalar preprocessing
scaler = StandardScaler()

transformer_stdscaler = scaler.fit(X_train) # Q which should we use? train? test? oot? all?

# transform data
X_train_processed = transformer_stdscaler.transform(X_train)
X_test_processed = transformer_stdscaler.transform(X_test)
X_oot_processed = transformer_stdscaler.transform(X_oot)

print('X_train_processed', X_train_processed.shape[0])
print('X_test_processed', X_test_processed.shape[0])
print('X_oot_processed', X_oot_processed.shape[0])

pd.DataFrame(X_train_processed)

X_train_processed 1229
X_test_processed 308
X_oot_processed 1031


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,-0.696170,-1.212500,-1.016788,0.229985,-1.285757,2.397081,0.766328,-0.289919,-0.146970,-0.423281,0.009011,-0.169685,-0.198297,-0.704819,-0.106567,0.199795,0.556287
1,1.997733,-1.212500,-0.556366,-1.259614,-0.928591,0.485196,0.013867,0.056686,-0.147069,-0.272922,0.142508,-0.339882,-0.505501,0.153196,1.951706,-0.603743,0.692367
2,0.963365,-1.212500,0.494341,-1.259614,-1.285757,0.022069,-2.050317,1.096503,-0.139407,-0.287025,0.352505,0.245168,1.184123,0.948657,0.760074,1.204216,2.343676
3,0.645098,0.107533,0.211004,1.626484,-0.041436,0.746447,0.969696,-0.289919,-0.143737,-0.491723,0.497866,0.346577,-0.095896,-0.070276,0.218423,-0.001090,0.678606
4,-0.537036,1.284860,-1.099428,-1.259614,-0.006872,0.556447,1.528958,-0.116616,-0.125090,-0.407891,0.189909,-0.261875,-0.351899,-0.110516,-0.431558,-0.201974,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1224,-1.264504,0.321592,0.813095,0.323085,0.857240,2.302080,0.146056,1.269806,-0.142128,-0.394186,0.279464,0.155690,1.440127,0.089956,0.435084,1.539024,-0.706659
1225,-1.264504,0.333484,-0.048721,-0.130777,-0.467731,0.378321,0.766328,0.056686,-0.147854,-0.526665,0.471115,-0.339882,-0.505501,-0.379187,0.001763,-0.603743,0.816215
1226,-1.264504,0.321592,-1.311931,-1.259614,-1.285757,-0.310433,-0.057312,-0.983130,-0.152106,-0.362761,0.111201,-1.275962,-0.812706,-0.291072,-0.756548,-0.804627,-1.336603
1227,1.827233,-1.212500,0.990180,-0.887214,1.721351,-0.844810,-1.257183,0.749897,-0.145749,0.180621,0.039070,-0.560136,-0.351899,0.972290,0.435084,-0.201974,-0.049194


## train model

In [17]:
# Define the XGBoost classifier
xgb_clf = xgb.XGBClassifier(eval_metric='logloss', random_state=88)

# Define the hyperparameter space to search
param_dist = {
    'n_estimators': [25, 50],
    'max_depth': [2, 3],  # lower max_depth to simplify the model
    'learning_rate': [0.01, 0.1],
    'subsample': [0.6, 0.8],
    'colsample_bytree': [0.6, 0.8],
    'gamma': [0, 0.1],
    'min_child_weight': [1, 3, 5],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 1.5, 2]
}

# Create a scorer based on AUC score
auc_scorer = make_scorer(roc_auc_score)

# Set up the random search with cross-validation
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_dist,
    scoring=auc_scorer,
    n_iter=100,  # Number of iterations for random search
    cv=3,       # Number of folds in cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1   # Use all available cores
)

# Perform the random search
random_search.fit(X_train_processed, y_train)

# Output the best parameters and best score
print("Best parameters found: ", random_search.best_params_)
print("Best AUC score: ", random_search.best_score_)

# Evaluate the model on the train set
best_model = random_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_train_processed)[:, 1]
train_auc_score = roc_auc_score(y_train, y_pred_proba)
print("Train AUC score: ", train_auc_score)

# Evaluate the model on the test set
best_model = random_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_test_processed)[:, 1]
test_auc_score = roc_auc_score(y_test, y_pred_proba)
print("Test AUC score: ", test_auc_score)

# Evaluate the model on the oot set
best_model = random_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_oot_processed)[:, 1]
oot_auc_score = roc_auc_score(y_oot, y_pred_proba)
print("OOT AUC score: ", oot_auc_score)

print("TRAIN GINI score: ", round(2*train_auc_score-1,3))
print("Test GINI score: ", round(2*test_auc_score-1,3))
print("OOT GINI score: ", round(2*oot_auc_score-1,3))

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters found:  {'subsample': 0.8, 'reg_lambda': 1.5, 'reg_alpha': 1, 'n_estimators': 50, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.6}
Best AUC score:  0.6819716194483334
Train AUC score:  0.8774955606291223
Test AUC score:  0.7021319694130754
OOT AUC score:  0.7956281792878395
TRAIN GINI score:  0.755
Test GINI score:  0.404
OOT GINI score:  0.591


## prepare model artefact to save

In [18]:
model_artefact = {}

model_artefact['model'] = best_model
model_artefact['model_version'] = "credit_model_"+config["model_train_date_str"].replace('-','_')
model_artefact['preprocessing_transformers'] = {}
model_artefact['preprocessing_transformers']['stdscaler'] = transformer_stdscaler
model_artefact['data_dates'] = config
model_artefact['data_stats'] = {}
model_artefact['data_stats']['X_train'] = X_train.shape[0]
model_artefact['data_stats']['X_test'] = X_test.shape[0]
model_artefact['data_stats']['X_oot'] = X_oot.shape[0]
model_artefact['data_stats']['y_train'] = round(y_train.mean(),2)
model_artefact['data_stats']['y_test'] = round(y_test.mean(),2)
model_artefact['data_stats']['y_oot'] = round(y_oot.mean(),2)
model_artefact['results'] = {}
model_artefact['results']['auc_train'] = train_auc_score
model_artefact['results']['auc_test'] = test_auc_score
model_artefact['results']['auc_oot'] = oot_auc_score
model_artefact['results']['gini_train'] = round(2*train_auc_score-1,3)
model_artefact['results']['gini_test'] = round(2*test_auc_score-1,3)
model_artefact['results']['gini_oot'] = round(2*oot_auc_score-1,3)
model_artefact['hp_params'] = random_search.best_params_


pprint.pprint(model_artefact)

{'data_dates': {'model_train_date': datetime.datetime(2023, 12, 1, 0, 0),
                'model_train_date_str': '2023-12-01',
                'oot_end_date': datetime.datetime(2023, 11, 30, 0, 0),
                'oot_period_months': 2,
                'oot_start_date': datetime.datetime(2023, 10, 1, 0, 0),
                'train_test_end_date': datetime.datetime(2023, 9, 30, 0, 0),
                'train_test_period_months': 12,
                'train_test_ratio': 0.8,
                'train_test_start_date': datetime.datetime(2022, 10, 1, 0, 0)},
 'data_stats': {'X_oot': 1031,
                'X_test': 308,
                'X_train': 1229,
                'y_oot': 0.27,
                'y_test': 0.3,
                'y_train': 0.3},
 'hp_params': {'colsample_bytree': 0.6,
               'gamma': 0.1,
               'learning_rate': 0.1,
               'max_depth': 3,
               'min_child_weight': 5,
               'n_estimators': 50,
               'reg_alpha': 1,
            

## save artefact to model bank

In [19]:
# create model_bank dir
model_bank_directory = "model_bank/"

if not os.path.exists(model_bank_directory):
    os.makedirs(model_bank_directory)

In [20]:
# Full path to the file
file_path = os.path.join(model_bank_directory, model_artefact['model_version'] + '.pkl')

# Write the model to a pickle file
with open(file_path, 'wb') as file:
    pickle.dump(model_artefact, file)

print(f"Model saved to {file_path}")


Model saved to model_bank/credit_model_2023_12_01.pkl


## test load pickle and make model inference

In [21]:
# Load the model from the pickle file
with open(file_path, 'rb') as file:
    loaded_model_artefact = pickle.load(file)

y_pred_proba = loaded_model_artefact['model'].predict_proba(X_oot_processed)[:, 1]
oot_auc_score = roc_auc_score(y_oot, y_pred_proba)
print("OOT AUC score: ", oot_auc_score)

print("Model loaded successfully!")

OOT AUC score:  0.7956281792878395
Model loaded successfully!
