In [1]:
import os
import glob
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F

from pyspark.sql.functions import col, to_date
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, roc_auc_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

import model_inference


In [2]:
# Build a .py script that takes a snapshot date, loads a model artefact, makes an inference and saves it to the datamart

## set up pyspark session

In [3]:
# Build (or reuse) a local SparkSession for this notebook
spark = (
    pyspark.sql.SparkSession.builder
    .appName("dev")
    .master("local[*]")
    .getOrCreate()
)

# Only surface ERROR-level Spark logs so warnings do not clutter the cell output
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/25 09:46:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## set up config

In [4]:
# Snapshot to score, as a YYYY-MM-DD string (parsed with "%Y-%m-%d" when the config is built)
snapshot_date_str = "2024-01-01"
# File name of the pickled model artefact; the config cell resolves it under model_bank/
model_name = "credit_model_2024_09_01.pkl"


In [5]:
# Gather the run parameters into one dict so later cells read from a single place
config = {
    "snapshot_date_str": snapshot_date_str,
    "snapshot_date": datetime.strptime(snapshot_date_str, "%Y-%m-%d"),
    "model_name": model_name,
    "model_bank_directory": "model_bank/",
}
# Derived last because it is composed from two of the entries above
config["model_artefact_filepath"] = config["model_bank_directory"] + config["model_name"]

pprint.pprint(config)

{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 1, 1, 0, 0),
 'snapshot_date_str': '2024-01-01'}


## load model artefact from model bank

In [6]:
# Deserialize the model artefact named in the config.
# NOTE(review): pickle.load can execute arbitrary code from the file - only
# load artefacts that come from a trusted model bank.
with open(config["model_artefact_filepath"], mode='rb') as artefact_file:
    model_artefact = pickle.load(artefact_file)

print(f"Model loaded successfully! {config['model_artefact_filepath']}")

Model loaded successfully! model_bank/credit_model_2024_09_01.pkl


## load feature store

In [7]:
# --- load feature store ---
# Each gold feature table is a folder of parquet partitions; collect every entry in it.
folder_path_1 = "datamart/gold/feature_store/eng/"
folder_path_2 = "datamart/gold/feature_store/cust_fin_risk/"
files_list_1 = [folder_path_1+os.path.basename(f) for f in glob.glob(os.path.join(folder_path_1, '*'))]
files_list_2 = [folder_path_2+os.path.basename(f) for f in glob.glob(os.path.join(folder_path_2, '*'))]

# Fail with a clear message (instead of Spark's schema-inference error) when a
# feature folder is missing or empty.
for _folder, _files in ((folder_path_1, files_list_1), (folder_path_2, files_list_2)):
    if not _files:
        raise FileNotFoundError(f"no feature store partitions found under {_folder}")

# Load the parquet partitions into Spark DataFrames - connect to feature store.
# No "header" option here: it is a CSV reader setting and has no effect on parquet.
feature_store_sdf_1 = spark.read.parquet(*files_list_1)
feature_store_sdf_2 = spark.read.parquet(*files_list_2)

# Ensure snapshot_date is in date format
feature_store_sdf_1 = feature_store_sdf_1.withColumn("snapshot_date", to_date(col("snapshot_date")))
feature_store_sdf_2 = feature_store_sdf_2.withColumn("snapshot_date", to_date(col("snapshot_date")))

# extract feature store: table 1 is restricted to the requested snapshot
features_sdf_1 = feature_store_sdf_1.filter(col("snapshot_date") == config["snapshot_date"])
# Table 2 is NOT filtered by date; its snapshot_date is dropped so the join
# does not produce a second snapshot_date column.
# NOTE(review): if table 2 holds more than one row per Customer_ID, the left
# join below multiplies rows - confirm it is one row per customer.
features_sdf_2 = feature_store_sdf_2.drop('snapshot_date')

# join two feature tables
features_sdf = features_sdf_1.join(features_sdf_2, on=["Customer_ID"], how="left")

print("extracted features_sdf", features_sdf.count(), config["snapshot_date"])
features_sdf.show(5)

                                                                                

extracted features_sdf 8974 2024-01-01 00:00:00
+-----------+-------------+--------+--------+--------+--------+--------+--------+------------------+------------+--------------------+------------------+------------------+---------------------+-----------+----------------+-------------+-------------------+--------------------+
|Customer_ID|snapshot_date|click_1m|click_2m|click_3m|click_4m|click_5m|click_6m|Credit_History_Age|Num_Fin_Pdts|       EMI_to_Salary|    Debt_to_Salary| Repayment_Ability|Loans_per_Credit_Item|Loan_Extent|Outstanding_Debt|Interest_Rate|Delay_from_due_date|Changed_Credit_Limit|
+-----------+-------------+--------+--------+--------+--------+--------+--------+------------------+------------+--------------------+------------------+------------------+---------------------+-----------+----------------+-------------+-------------------+--------------------+
| CUS_0xc65a|   2024-01-01|      44|     309|       0|      35|     179|     186|               324|           5| 0

In [8]:
# Bring the joined features into pandas and prefix every non-key column with
# 'feature_' so the modelling cells can select features by name prefix.
features_pdf = features_sdf.toPandas()
columns_to_exclude = ['Customer_ID', 'snapshot_date']
columns_to_rename = [name for name in features_pdf.columns if name not in columns_to_exclude]
rename_dict = {name: 'feature_' + name for name in columns_to_rename}
features_pdf = features_pdf.rename(columns=rename_dict)
features_pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8974 entries, 0 to 8973
Data columns (total 19 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Customer_ID                    8974 non-null   object 
 1   snapshot_date                  8974 non-null   object 
 2   feature_click_1m               8974 non-null   int32  
 3   feature_click_2m               8974 non-null   int32  
 4   feature_click_3m               8974 non-null   int32  
 5   feature_click_4m               8974 non-null   int32  
 6   feature_click_5m               8974 non-null   int32  
 7   feature_click_6m               8974 non-null   int32  
 8   feature_Credit_History_Age     8974 non-null   int32  
 9   feature_Num_Fin_Pdts           8974 non-null   int32  
 10  feature_EMI_to_Salary          8974 non-null   float64
 11  feature_Debt_to_Salary         8974 non-null   float64
 12  feature_Repayment_Ability      8974 non-null   f

## preprocess data for modeling

In [9]:
# prepare X_inference: every 'feature_'-prefixed column, in frame order
# NOTE(review): assumes this column order matches what the scaler was fitted on - confirm
feature_cols = [name for name in features_pdf.columns if name.startswith('feature_')]

# apply transformer - the standard scaler stored in the model artefact
transformer_stdscaler = model_artefact["preprocessing_transformers"]["stdscaler"]
X_inference = transformer_stdscaler.transform(features_pdf[feature_cols])

print('X_inference', X_inference.shape[0])
X_inference

X_inference 8974


array([[-0.75456548,  2.26145307, -1.26834788, ..., -0.31306294,
        -1.4236262 , -0.31738886],
       [ 0.83064097,  0.26763229, -0.95839539, ..., -0.63895838,
         0.00788083, -0.46845846],
       [ 0.34818683, -1.23906182, -1.26834788, ..., -1.39938108,
        -1.08279119, -0.87181429],
       ...,
       [ 0.46305686,  0.59615958, -1.21094927, ..., -0.63895838,
        -0.87829019,  1.36099438],
       [-0.63969545, -0.85389189,  0.2469754 , ...,  1.42504609,
        -0.33295418, -0.13610534],
       [-1.25999362, -0.92186306, -0.24665264, ..., -0.74759019,
        -1.15095819, -0.19502248]])

## model prediction inference

In [10]:
# load model from the artefact
model = model_artefact["model"]

# predict model: keep column index 1 of predict_proba
# (presumably the positive class - confirm against the model's classes_)
y_inference = model.predict_proba(X_inference)[:, 1]

# prepare output: row identifiers plus the model name and its score
y_inference_pdf = features_pdf[["Customer_ID", "snapshot_date"]].assign(
    model_name=config["model_name"],
    model_predictions=y_inference,
)
y_inference_pdf

Unnamed: 0,Customer_ID,snapshot_date,model_name,model_predictions
0,CUS_0xc65a,2024-01-01,credit_model_2024_09_01.pkl,0.167670
1,CUS_0x5e1f,2024-01-01,credit_model_2024_09_01.pkl,0.159867
2,CUS_0x78d3,2024-01-01,credit_model_2024_09_01.pkl,0.126014
3,CUS_0x1844,2024-01-01,credit_model_2024_09_01.pkl,0.102062
4,CUS_0x7f07,2024-01-01,credit_model_2024_09_01.pkl,0.086357
...,...,...,...,...
8969,CUS_0x8a04,2024-01-01,credit_model_2024_09_01.pkl,0.452574
8970,CUS_0x7f3f,2024-01-01,credit_model_2024_09_01.pkl,0.502555
8971,CUS_0x915,2024-01-01,credit_model_2024_09_01.pkl,0.109380
8972,CUS_0x4b67,2024-01-01,credit_model_2024_09_01.pkl,0.603051


## save model inference to datamart gold table

In [11]:
# create the gold directory that holds this model's predictions
# (strip the artefact's file extension, whatever its length, to get the model's base name)
model_stem = os.path.splitext(config["model_name"])[0]
gold_directory = f"datamart/gold/model_predictions/{model_stem}/"
print(gold_directory)

# exist_ok avoids the check-then-create race and makes the cell safe to re-run
os.makedirs(gold_directory, exist_ok=True)

# save gold table - IRL connect to database to write
# The partition name is built from the config's snapshot date (the one the
# features were filtered on), not from the loose notebook-level variable, so
# the file name always matches the data it contains.
partition_name = model_stem + "_predictions_" + config["snapshot_date_str"].replace('-','_') + '.parquet'
filepath = gold_directory + partition_name
spark.createDataFrame(y_inference_pdf).write.mode("overwrite").parquet(filepath)
print('saved to:', filepath)

datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_01_01.parquet


## backfill

In [12]:
# set up config
# NOTE(review): re-assigns the same value as the earlier config cell; the
# backfill loop below iterates dates_str_lst and does not read this variable.
snapshot_date_str = "2024-01-01"

# Date range to backfill (YYYY-MM-DD); the end date is included when it is a first-of-month
start_date_str = "2023-02-01"
end_date_str = "2024-12-01"

In [13]:
# generate list of dates to process
def generate_first_of_month_dates(start_date_str, end_date_str):
    """List every first-of-month date in a range as "YYYY-MM-DD" strings.

    The sequence starts at the first day of the month containing
    ``start_date_str`` (even if that day precedes the start date itself) and
    continues month by month while the month start is on or before
    ``end_date_str``.

    Args:
        start_date_str: Range start, formatted "YYYY-MM-DD".
        end_date_str: Range end, formatted "YYYY-MM-DD" (inclusive).

    Returns:
        A list of "YYYY-MM-DD" strings in ascending order; empty when the
        first month start already lies after the end date.

    Raises:
        ValueError: If either argument does not match "%Y-%m-%d".
    """
    range_start = datetime.strptime(start_date_str, "%Y-%m-%d")
    range_end = datetime.strptime(end_date_str, "%Y-%m-%d")

    # Track the month as one integer (year * 12 + zero-based month) so that
    # stepping across a year boundary needs no special case.
    month_index = range_start.year * 12 + (range_start.month - 1)

    month_starts = []
    while True:
        year, zero_based_month = divmod(month_index, 12)
        month_start = datetime(year, zero_based_month + 1, 1)
        if month_start > range_end:
            break
        month_starts.append(month_start.strftime("%Y-%m-%d"))
        month_index += 1

    return month_starts

# One "YYYY-MM-DD" string per month start in the backfill range; consumed by the backfill loop
dates_str_lst = generate_first_of_month_dates(start_date_str, end_date_str)


In [None]:
# Backfill: run the packaged inference job once per month start, always with
# the same model artefact name.
# NOTE(review): model_inference.main is defined outside this notebook; judging
# by the captured output it mirrors the single-date cells above (print config,
# load model, extract features, save a parquet partition) - confirm in the module.
for snapshot_date in dates_str_lst:
    print(snapshot_date)  # progress marker: which snapshot is being processed
    model_inference.main(snapshot_date, model_name)

2023-02-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 2, 1, 0, 0),
 'snapshot_date_str': '2023-02-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-02-01 00:00:00
+-----------+-------------+--------+--------+--------+--------+--------+--------+------------------+------------+--------------------+------------------+------------------+---------------------+-----------+----------------+-------------+-------------------+--------------------+
|Customer_ID|snapshot_date|click_1m|click_2m|click_3m|click_4m|click_5m|click_6m|Credit_History_Age|Num_Fin_Pdts|       EMI_to_Salary|    Debt_to_Salary| Repayment_Ability|Loans_per_Credit_Item|Loan_Extent|Outstanding_Debt|Interest_Rate|Delay_from_due_date|Changed_Credit_Limit|
+-----------+-------------+--------+--------+--

                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_02_01.parquet
2023-03-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 3, 1, 0, 0),
 'snapshot_date_str': '2023-03-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-03-01 00:00:00
+-----------+-------------+--------+--------+--------+--------+--------+--------+------------------+------------+--------------------+------------------+------------------+---------------------+-----------+----------------+-------------+-------------------+--------------------+
|Customer_ID|snapshot_date|click_1m|click_2m|click_3m|click_4m|click_5m|click_6m|Credit_History_Age|Num_Fin_Pdts|       EMI_to_Salary|    Debt_to_Salary| Repayment_Ability|Loans_per_Credit_Item|Loan_Extent

                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_03_01.parquet
2023-04-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 4, 1, 0, 0),
 'snapshot_date_str': '2023-04-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-04-01 00:00:00
+-----------+-------------+--------+--------+--------+--------+--------+--------+------------------+------------+--------------------+------------------+------------------+---------------------+-----------+----------------+-------------+-------------------+--------------------+
|Customer_ID|snapshot_date|click_1m|click_2m|click_3m|click_4m|click_5m|click_6m|Credit_History_Age|Num_Fin_Pdts|       EMI_to_Salary|    Debt_to_Salary| Repayment_Ability|Loans_per_Credit_Item|Loan_Extent

                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_04_01.parquet
2023-05-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 5, 1, 0, 0),
 'snapshot_date_str': '2023-05-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-05-01 00:00:00
+-----------+-------------+--------+--------+--------+--------+--------+--------+------------------+------------+--------------------+------------------+------------------+---------------------+-----------+----------------+-------------+-------------------+--------------------+
|Customer_ID|snapshot_date|click_1m|click_2m|click_3m|click_4m|click_5m|click_6m|Credit_History_Age|Num_Fin_Pdts|       EMI_to_Salary|    Debt_to_Salary| Repayment_Ability|Loans_per_Credit_Item|Loan_Extent

                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_05_01.parquet
2023-06-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 6, 1, 0, 0),
 'snapshot_date_str': '2023-06-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-06-01 00:00:00
+-----------+-------------+--------+--------+--------+--------+--------+--------+------------------+------------+--------------------+------------------+------------------+---------------------+-----------+----------------+-------------+-------------------+--------------------+
|Customer_ID|snapshot_date|click_1m|click_2m|click_3m|click_4m|click_5m|click_6m|Credit_History_Age|Num_Fin_Pdts|       EMI_to_Salary|    Debt_to_Salary| Repayment_Ability|Loans_per_Credit_Item|Loan_Extent

                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_06_01.parquet
2023-07-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 7, 1, 0, 0),
 'snapshot_date_str': '2023-07-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-07-01 00:00:00


                                                                                

+-----------+-------------+--------+--------+--------+--------+--------+--------+------------------+------------+--------------------+------------------+------------------+---------------------+-----------+----------------+-------------+-------------------+--------------------+
|Customer_ID|snapshot_date|click_1m|click_2m|click_3m|click_4m|click_5m|click_6m|Credit_History_Age|Num_Fin_Pdts|       EMI_to_Salary|    Debt_to_Salary| Repayment_Ability|Loans_per_Credit_Item|Loan_Extent|Outstanding_Debt|Interest_Rate|Delay_from_due_date|Changed_Credit_Limit|
+-----------+-------------+--------+--------+--------+--------+--------+--------+------------------+------------+--------------------+------------------+------------------+---------------------+-----------+----------------+-------------+-------------------+--------------------+
| CUS_0xc65a|   2023-07-01|      55|       0|      78|     165|      53|     150|               324|           5| 0.01968154351776198|0.3397862526066755|          

                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_07_01.parquet
2023-08-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 8, 1, 0, 0),
 'snapshot_date_str': '2023-08-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-08-01 00:00:00
+-----------+-------------+--------+--------+--------+--------+--------+--------+------------------+------------+--------------------+------------------+------------------+---------------------+-----------+----------------+-------------+-------------------+--------------------+
|Customer_ID|snapshot_date|click_1m|click_2m|click_3m|click_4m|click_5m|click_6m|Credit_History_Age|Num_Fin_Pdts|       EMI_to_Salary|    Debt_to_Salary| Repayment_Ability|Loans_per_Credit_Item|Loan_Extent

                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_08_01.parquet
2023-09-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 9, 1, 0, 0),
 'snapshot_date_str': '2023-09-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-09-01 00:00:00
+-----------+-------------+--------+--------+--------+--------+--------+--------+------------------+------------+--------------------+------------------+------------------+---------------------+-----------+----------------+-------------+-------------------+--------------------+
|Customer_ID|snapshot_date|click_1m|click_2m|click_3m|click_4m|click_5m|click_6m|Credit_History_Age|Num_Fin_Pdts|       EMI_to_Salary|    Debt_to_Salary| Repayment_Ability|Loans_per_Credit_Item|Loan_Extent

                                                                                

## Check datamart

In [36]:
# Build (or reuse) a local SparkSession before inspecting the datamart
spark = (
    pyspark.sql.SparkSession.builder
    .appName("dev")
    .master("local[*]")
    .getOrCreate()
)

# Only surface ERROR-level Spark logs so warnings do not clutter the cell output
spark.sparkContext.setLogLevel("ERROR")

In [37]:
# Read back every prediction partition written for this model as a sanity check
folder_path = "datamart/gold/model_predictions/credit_model_2024_09_01/"
files_list = [folder_path+os.path.basename(f) for f in glob.glob(os.path.join(folder_path, '*'))]

# Fail with a clear message (instead of Spark's schema-inference error) when
# nothing has been written yet.
if not files_list:
    raise FileNotFoundError(f"no prediction partitions found under {folder_path}")

# No "header" option here: it is a CSV reader setting and has no effect on parquet
df = spark.read.parquet(*files_list)
print("row_count:",df.count())

df.show()

                                                                                

row_count: 215376
+-----------+-------------+--------------------+--------------------+
|Customer_ID|snapshot_date|          model_name|   model_predictions|
+-----------+-------------+--------------------+--------------------+
| CUS_0xc5cc|   2024-10-01|credit_model_2024...|  0.3085257112979889|
| CUS_0x5f86|   2024-10-01|credit_model_2024...|  0.1593918651342392|
| CUS_0xa788|   2024-10-01|credit_model_2024...|  0.6805830597877502|
| CUS_0xb756|   2024-10-01|credit_model_2024...| 0.40769729018211365|
| CUS_0x8b96|   2024-10-01|credit_model_2024...|  0.2909873425960541|
| CUS_0x5a7d|   2024-10-01|credit_model_2024...| 0.09536822140216827|
| CUS_0xc653|   2024-10-01|credit_model_2024...| 0.07869965583086014|
| CUS_0x8d74|   2024-10-01|credit_model_2024...|  0.1666775345802307|
| CUS_0x94f4|   2024-10-01|credit_model_2024...|0.053171638399362564|
| CUS_0x2296|   2024-10-01|credit_model_2024...| 0.10931466519832611|
| CUS_0x85f4|   2024-10-01|credit_model_2024...| 0.09389250725507736|
| 