In [1]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lower, trim, regexp_replace, split, udf, explode, array_contains
from pyspark.sql.types import FloatType, IntegerType, DateType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline

import utils.financials_processing_bronze_table
import utils.financials_processing_silver_table
import utils.financials_processing_gold_table

## Set Up PySpark Session

In [2]:
# Start (or reuse, via getOrCreate) a Spark session for this notebook,
# running against the local master "local[*]".
spark = (
    SparkSession.builder
    .appName("features_financials_preprocessing")
    .master("local[*]")
    .getOrCreate()
)

# Only show ERROR-level Spark log messages from here on.
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/22 06:25:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Set Up Config

In [3]:
# Notebook configuration; every date is a "YYYY-MM-DD" string.

# First and last month of the backfill window (the generated list of monthly
# snapshot dates includes both ends).
start_date_str, end_date_str = "2023-01-01", "2024-12-01"

# Single reference snapshot date.
# NOTE(review): not read by any other cell visible in this notebook — confirm
# whether it is still needed.
snapshot_date_str = "2023-01-01"

In [4]:
# generate list of dates to process
def generate_first_of_month_dates(start_date_str, end_date_str):
    """List the first day of every month in a date range.

    Args:
        start_date_str: Range start as "YYYY-MM-DD". The walk begins on day 1
            of this date's month, so the first entry can be earlier than the
            start date itself when it is not already a first-of-month.
        end_date_str: Range end as "YYYY-MM-DD". A month is included when its
            first day is on or before this date.

    Returns:
        A list of "YYYY-MM-DD" strings, one per month, in chronological order.
        The list is empty when the start month begins after the end date.

    Raises:
        ValueError: If either argument does not match the "%Y-%m-%d" format.
    """
    date_format = "%Y-%m-%d"
    range_start = datetime.strptime(start_date_str, date_format)
    range_end = datetime.strptime(end_date_str, date_format)

    month_starts = []

    # Snap the cursor back to day 1 of the starting month.
    cursor = range_start.replace(day=1)

    while cursor <= range_end:
        month_starts.append(cursor.strftime(date_format))

        # Step to the first of the next month:
        #   month // 12 is 1 only for December, carrying over into the year;
        #   month % 12 + 1 wraps December (12) around to January (1).
        cursor = datetime(
            cursor.year + cursor.month // 12,
            cursor.month % 12 + 1,
            1,
        )

    return month_starts

# Expand the configured window into one snapshot date per month.
dates_str_lst = generate_first_of_month_dates(
    start_date_str=start_date_str,
    end_date_str=end_date_str,
)

# Echo the list as the cell output.
dates_str_lst

['2023-01-01',
 '2023-02-01',
 '2023-03-01',
 '2023-04-01',
 '2023-05-01',
 '2023-06-01',
 '2023-07-01',
 '2023-08-01',
 '2023-09-01',
 '2023-10-01',
 '2023-11-01',
 '2023-12-01',
 '2024-01-01',
 '2024-02-01',
 '2024-03-01',
 '2024-04-01',
 '2024-05-01',
 '2024-06-01',
 '2024-07-01',
 '2024-08-01',
 '2024-09-01',
 '2024-10-01',
 '2024-11-01',
 '2024-12-01']

## Build Bronze Table

In [5]:
# Output directory for the bronze-layer financial feature snapshots.
bronze_features_financials_directory = "datamart/bronze/features_financials/"

# Create the directory (and any missing parents). exist_ok=True makes this a
# no-op when it already exists, without a separate check-then-create step that
# could race with another process.
os.makedirs(bronze_features_financials_directory, exist_ok=True)

In [6]:
# Bronze backfill: process one monthly snapshot per iteration. Per the
# recorded output, each call prints the snapshot's row count and the CSV path
# it saved under the bronze directory.
# Note: `date_str` stays bound to the last snapshot date after the loop and is
# read again by later inspection cells.
for date_str in dates_str_lst:
    utils.financials_processing_bronze_table.process_bronze_table(
        date_str,
        bronze_features_financials_directory,
        spark,
    )

2023-01-01row count: 530
saved to: datamart/bronze/features_financials/bronze_features_financials_2023_01_01.csv
2023-02-01row count: 501
saved to: datamart/bronze/features_financials/bronze_features_financials_2023_02_01.csv
2023-03-01row count: 506
saved to: datamart/bronze/features_financials/bronze_features_financials_2023_03_01.csv
2023-04-01row count: 510
saved to: datamart/bronze/features_financials/bronze_features_financials_2023_04_01.csv
2023-05-01row count: 521
saved to: datamart/bronze/features_financials/bronze_features_financials_2023_05_01.csv
2023-06-01row count: 517
saved to: datamart/bronze/features_financials/bronze_features_financials_2023_06_01.csv
2023-07-01row count: 471
saved to: datamart/bronze/features_financials/bronze_features_financials_2023_07_01.csv
2023-08-01row count: 481
saved to: datamart/bronze/features_financials/bronze_features_financials_2023_08_01.csv
2023-09-01row count: 454
saved to: datamart/bronze/features_financials/bronze_features_financial

In [7]:
# Inspect the bronze table for one snapshot as a pandas DataFrame.
# `date_str` is the value left behind by the preceding backfill loop
# (2024-12-01 in the recorded output); this call processes and saves that
# snapshot again.
(
    utils.financials_processing_bronze_table
    .process_bronze_table(date_str, bronze_features_financials_directory, spark)
    .toPandas()
)

2024-12-01row count: 515
saved to: datamart/bronze/features_financials/bronze_features_financials_2024_12_01.csv


Unnamed: 0,Customer_ID,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,snapshot_date
0,CUS_0x103e,98690.8,8262.233333,4,6,9,1_,Student Loan,6,17,...,Good,706.96,26.860663,26 Years and 11 Months,No,55.004408,913.4813186573292,Low_spent_Small_value_payments,147.7376071067124,2024-12-01
1,CUS_0x1195,30429.91,2808.825833,4,6,16,2,"Auto Loan, and Auto Loan",22,17,...,Standard,362.48,33.349050,28 Years and 11 Months,No,29.914076,82.87878577514347,Low_spent_Large_value_payments,438.08972109416084,2024-12-01
2,CUS_0x1197,92300.01,7437.667500,2,4,11,3,"Credit-Builder Loan, Not Specified, and Credit...",27,9,...,_,755.17_,26.989787,18 Years and 11 Months,Yes,49236.000000,220.8621525417414,Low_spent_Large_value_payments,581.1567885447394,2024-12-01
3,CUS_0x11e2,44986.55,3689.879167,6,5,11,1,Credit-Builder Loan,0,4,...,Good,753.21,25.586286,20 Years and 0 Months,No,23.267135,43.20363344633164,High_spent_Large_value_payments,542.5171477430948,2024-12-01
4,CUS_0x11ec,14867.69,1005.974167,9,9,18,6,"Debt Consolidation Loan, Student Loan, Persona...",39,15,...,Standard,2344.06,24.344388,17 Years and 2 Months,Yes,55.459604,100.14574834721886,Low_spent_Medium_value_payments,224.99206407779144,2024-12-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510,CUS_0xe6c,125597.52,9367.500187,1,3,12,4,"Debt Consolidation Loan, Not Specified, Studen...",2,9,...,Good,1294.94,30.324257,30 Years and 5 Months,NM,1278.186251,964.5381161830327,Low_spent_Medium_value_payments,763.3982127892344,2024-12-01
511,CUS_0xe99,45461.54,3917.461667,6,3,5,2,"Credit-Builder Loan, and Payday Loan",20,9,...,Standard,647.24,27.264685,16 Years and 9 Months,No,69.318349,42.941001590068666,High_spent_Large_value_payments,519.4868162135749,2024-12-01
512,CUS_0xf55,78443.48_,6358.956667,7,5,23,4,"Personal Loan, Home Equity Loan, Mortgage Loan...",39,19,...,Bad,1527.77,24.704429,15 Years and 10 Months,NM,177.387563,528.7469053018515,Low_spent_Medium_value_payments,209.76119880079318,2024-12-01
513,CUS_0xfd1,78666.56999999999,6485.547500,3,4,17,4,"Not Specified, Personal Loan, Home Equity Loan...",29,10,...,Standard,1498.7,37.831762,22 Years and 5 Months,No,247.851145,252.3461368272953,High_spent_Small_value_payments,408.35746850506007,2024-12-01


## Build Silver Table

In [8]:
# Output directory for the silver-layer financial feature snapshots.
silver_features_financials_directory = "datamart/silver/features_financials/"

# Create the directory (and any missing parents). exist_ok=True makes this a
# no-op when it already exists, without a separate check-then-create step that
# could race with another process.
os.makedirs(silver_features_financials_directory, exist_ok=True)

In [9]:
# Silver backfill: process one monthly snapshot per iteration. Per the
# recorded output, each call loads that month's bronze CSV and saves a parquet
# file under the silver directory.
# Note: `date_str` stays bound to the last snapshot date after the loop and is
# read again by later inspection cells.
for date_str in dates_str_lst:
    utils.financials_processing_silver_table.process_silver_table(
        date_str,
        bronze_features_financials_directory,
        silver_features_financials_directory,
        spark,
    )

loaded from: datamart/bronze/features_financials/bronze_features_financials_2023_01_01.csv row count: 530


                                                                                

saved to: datamart/silver/features_financials/silver_features_financials_2023_01_01.parquet
loaded from: datamart/bronze/features_financials/bronze_features_financials_2023_02_01.csv row count: 501
saved to: datamart/silver/features_financials/silver_features_financials_2023_02_01.parquet
loaded from: datamart/bronze/features_financials/bronze_features_financials_2023_03_01.csv row count: 506
saved to: datamart/silver/features_financials/silver_features_financials_2023_03_01.parquet
loaded from: datamart/bronze/features_financials/bronze_features_financials_2023_04_01.csv row count: 510
saved to: datamart/silver/features_financials/silver_features_financials_2023_04_01.parquet
loaded from: datamart/bronze/features_financials/bronze_features_financials_2023_05_01.csv row count: 521
saved to: datamart/silver/features_financials/silver_features_financials_2023_05_01.parquet
loaded from: datamart/bronze/features_financials/bronze_features_financials_2023_06_01.csv row count: 517


                                                                                

saved to: datamart/silver/features_financials/silver_features_financials_2023_06_01.parquet
loaded from: datamart/bronze/features_financials/bronze_features_financials_2023_07_01.csv row count: 471


                                                                                

saved to: datamart/silver/features_financials/silver_features_financials_2023_07_01.parquet
loaded from: datamart/bronze/features_financials/bronze_features_financials_2023_08_01.csv row count: 481
saved to: datamart/silver/features_financials/silver_features_financials_2023_08_01.parquet
loaded from: datamart/bronze/features_financials/bronze_features_financials_2023_09_01.csv row count: 454
saved to: datamart/silver/features_financials/silver_features_financials_2023_09_01.parquet
loaded from: datamart/bronze/features_financials/bronze_features_financials_2023_10_01.csv row count: 487
saved to: datamart/silver/features_financials/silver_features_financials_2023_10_01.parquet
loaded from: datamart/bronze/features_financials/bronze_features_financials_2023_11_01.csv row count: 491
saved to: datamart/silver/features_financials/silver_features_financials_2023_11_01.parquet
loaded from: datamart/bronze/features_financials/bronze_features_financials_2023_12_01.csv row count: 489
saved to: 

In [10]:
# Inspect the silver table for one snapshot as a pandas DataFrame.
# `date_str` is the value left behind by the preceding backfill loop
# (2024-12-01 in the recorded output); this call processes and saves that
# snapshot again.
(
    utils.financials_processing_silver_table
    .process_silver_table(
        date_str,
        bronze_features_financials_directory,
        silver_features_financials_directory,
        spark,
    )
    .toPandas()
)

loaded from: datamart/bronze/features_financials/bronze_features_financials_2024_12_01.csv row count: 515
saved to: datamart/silver/features_financials/silver_features_financials_2024_12_01.parquet


Unnamed: 0,Customer_ID,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,snapshot_date
0,CUS_0x103e,98690.796875,8262.233398,4,6,9.0,1,student loan,6,17,...,good,706.960022,26.860664,26 Years and 11 Months,no,55.004406,913.481323,low_spent_small_value_payments,147.737610,2024-12-01
1,CUS_0x1195,30429.910156,2808.825928,4,6,16.0,2,"auto loan, and auto loan",22,17,...,standard,362.480011,33.349049,28 Years and 11 Months,no,29.914076,82.878784,low_spent_large_value_payments,438.089722,2024-12-01
2,CUS_0x1197,92300.007812,7437.667480,2,4,11.0,3,"credit-builder loan, not specified, and credit...",27,9,...,,755.169983,26.989786,18 Years and 11 Months,yes,49236.000000,220.862152,low_spent_large_value_payments,581.156799,2024-12-01
3,CUS_0x11e2,44986.550781,3689.879150,6,5,11.0,1,credit-builder loan,0,4,...,good,753.210022,25.586287,20 Years and 0 Months,no,23.267136,43.203632,high_spent_large_value_payments,542.517151,2024-12-01
4,CUS_0x11ec,14867.690430,1005.974182,9,9,18.0,6,"debt consolidation loan, student loan, persona...",39,15,...,standard,2344.060059,24.344387,17 Years and 2 Months,yes,55.459602,100.145752,low_spent_medium_value_payments,224.992065,2024-12-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391,CUS_0xe6c,125597.523438,9367.500000,1,3,12.0,4,"debt consolidation loan, not specified, studen...",2,9,...,good,1294.939941,30.324257,30 Years and 5 Months,nm,1278.186279,964.538086,low_spent_medium_value_payments,763.398193,2024-12-01
392,CUS_0xe99,45461.539062,3917.461670,6,3,5.0,2,"credit-builder loan, and payday loan",20,9,...,standard,647.239990,27.264685,16 Years and 9 Months,no,69.318352,42.941002,high_spent_large_value_payments,519.486816,2024-12-01
393,CUS_0xf55,78443.476562,6358.956543,7,5,23.0,4,"personal loan, home equity loan, mortgage loan...",39,19,...,bad,1527.770020,24.704428,15 Years and 10 Months,nm,177.387558,528.746887,low_spent_medium_value_payments,209.761200,2024-12-01
394,CUS_0xfd1,78666.570312,6485.547363,3,4,17.0,4,"not specified, personal loan, home equity loan...",29,10,...,standard,1498.699951,37.831760,22 Years and 5 Months,no,247.851151,252.346130,high_spent_small_value_payments,408.357483,2024-12-01


## EDA

In [11]:
# List the silver table's column names for the last backfilled snapshot
# (`date_str` is left over from the backfill loop). The snapshot is processed
# and saved again just to read its columns.
(
    utils.financials_processing_silver_table
    .process_silver_table(
        date_str,
        bronze_features_financials_directory,
        silver_features_financials_directory,
        spark,
    )
    .toPandas()
    .columns
)

loaded from: datamart/bronze/features_financials/bronze_features_financials_2024_12_01.csv row count: 515
saved to: datamart/silver/features_financials/silver_features_financials_2024_12_01.parquet


Index(['Customer_ID', 'Annual_Income', 'Monthly_Inhand_Salary',
       'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan',
       'Type_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment',
       'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Credit_Mix',
       'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance',
       'snapshot_date'],
      dtype='object')

In [12]:
# Distinct raw Type_of_Loan strings in the silver table for the last
# backfilled snapshot (`date_str` is left over from the backfill loop). In the
# recorded output these are free-text, comma-separated lists of loan types.
(
    utils.financials_processing_silver_table
    .process_silver_table(
        date_str,
        bronze_features_financials_directory,
        silver_features_financials_directory,
        spark,
    )
    .toPandas()['Type_of_Loan']
    .unique()
)

loaded from: datamart/bronze/features_financials/bronze_features_financials_2024_12_01.csv row count: 515
saved to: datamart/silver/features_financials/silver_features_financials_2024_12_01.parquet


array(['student loan', 'auto loan, and auto loan',
       'credit-builder loan, not specified, and credit-builder loan',
       'credit-builder loan',
       'debt consolidation loan, student loan, personal loan, credit-builder loan, auto loan, and debt consolidation loan',
       'not specified, and auto loan',
       'payday loan, auto loan, debt consolidation loan, and auto loan',
       'personal loan, credit-builder loan, and debt consolidation loan',
       'auto loan',
       'payday loan, not specified, not specified, not specified, and student loan',
       'debt consolidation loan, debt consolidation loan, and mortgage loan',
       'mortgage loan, not specified, personal loan, and credit-builder loan',
       'mortgage loan',
       'credit-builder loan, home equity loan, mortgage loan, home equity loan, and mortgage loan',
       'not specified', 'student loan, mortgage loan, and not specified',
       'payday loan', 'debt consolidation loan',
       'credit-builder loan, a

In [13]:
# Frequency of each Payment_Behaviour category (missing values counted too)
# in the silver table for the last backfilled snapshot (`date_str` is left
# over from the backfill loop).
(
    utils.financials_processing_silver_table
    .process_silver_table(
        date_str,
        bronze_features_financials_directory,
        silver_features_financials_directory,
        spark,
    )
    .toPandas()['Payment_Behaviour']
    .value_counts(dropna=False)
)

loaded from: datamart/bronze/features_financials/bronze_features_financials_2024_12_01.csv row count: 515
saved to: datamart/silver/features_financials/silver_features_financials_2024_12_01.parquet


Payment_Behaviour
low_spent_small_value_payments      95
high_spent_medium_value_payments    78
high_spent_large_value_payments     72
low_spent_medium_value_payments     60
high_spent_small_value_payments     47
low_spent_large_value_payments      44
Name: count, dtype: int64

In [14]:
# Frequency of each Payment_of_Min_Amount value (missing values counted too)
# in the silver table for the last backfilled snapshot (`date_str` is left
# over from the backfill loop).
(
    utils.financials_processing_silver_table
    .process_silver_table(
        date_str,
        bronze_features_financials_directory,
        silver_features_financials_directory,
        spark,
    )
    .toPandas()['Payment_of_Min_Amount']
    .value_counts(dropna=False)
)

loaded from: datamart/bronze/features_financials/bronze_features_financials_2024_12_01.csv row count: 515
saved to: datamart/silver/features_financials/silver_features_financials_2024_12_01.parquet


Payment_of_Min_Amount
yes    203
no     148
nm      45
Name: count, dtype: int64

In [15]:
# Frequency of each Credit_Mix value in the silver table for the last
# backfilled snapshot (`date_str` is left over from the backfill loop).
# dropna=False keeps missing values visible; the recorded output shows them
# as a `None` bucket.
(
    utils.financials_processing_silver_table
    .process_silver_table(
        date_str,
        bronze_features_financials_directory,
        silver_features_financials_directory,
        spark,
    )
    .toPandas()['Credit_Mix']
    .value_counts(dropna=False)
)

loaded from: datamart/bronze/features_financials/bronze_features_financials_2024_12_01.csv row count: 515
saved to: datamart/silver/features_financials/silver_features_financials_2024_12_01.parquet


Credit_Mix
standard    149
good        102
None         80
bad          65
Name: count, dtype: int64

## Build Gold Table

In [16]:
# Output directory for the gold-layer financial snapshots.
# NOTE(review): the path and variable say "label_store", but the recorded gold
# output contains feature columns — confirm the naming is intended.
gold_label_store_financials_directory = "datamart/gold/label_store_financials/"

# Create the directory (and any missing parents). exist_ok=True makes this a
# no-op when it already exists, without a separate check-then-create step that
# could race with another process.
os.makedirs(gold_label_store_financials_directory, exist_ok=True)

In [17]:
# Gold backfill: process one monthly snapshot per iteration. Per the recorded
# output, each call loads that month's silver parquet file and saves a parquet
# file under the gold directory.
# Note: `date_str` stays bound to the last snapshot date after the loop and is
# read again by later inspection cells.
for date_str in dates_str_lst:
    utils.financials_processing_gold_table.process_labels_gold_table(
        date_str,
        silver_features_financials_directory,
        gold_label_store_financials_directory,
        spark,
    )

loaded from: datamart/silver/features_financials/silver_features_financials_2023_01_01.parquet row count: 392


                                                                                

saved to: datamart/gold/label_store_financials/gold_label_store_financials_2023_01_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_2023_02_01.parquet row count: 394
saved to: datamart/gold/label_store_financials/gold_label_store_financials_2023_02_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_2023_03_01.parquet row count: 384
saved to: datamart/gold/label_store_financials/gold_label_store_financials_2023_03_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_2023_04_01.parquet row count: 378
saved to: datamart/gold/label_store_financials/gold_label_store_financials_2023_04_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_2023_05_01.parquet row count: 415
saved to: datamart/gold/label_store_financials/gold_label_store_financials_2023_05_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_2023_06_01.par

                                                                                

saved to: datamart/gold/label_store_financials/gold_label_store_financials_2024_04_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_2024_05_01.parquet row count: 389
saved to: datamart/gold/label_store_financials/gold_label_store_financials_2024_05_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_2024_06_01.parquet row count: 393
saved to: datamart/gold/label_store_financials/gold_label_store_financials_2024_06_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_2024_07_01.parquet row count: 415
saved to: datamart/gold/label_store_financials/gold_label_store_financials_2024_07_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_2024_08_01.parquet row count: 424
saved to: datamart/gold/label_store_financials/gold_label_store_financials_2024_08_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_2024_09_01.par

In [18]:
# Inspect the gold table for one snapshot as a pandas DataFrame.
# `date_str` is the value left behind by the preceding backfill loop
# (2024-12-01 in the recorded output); this call processes and saves that
# snapshot again.
(
    utils.financials_processing_gold_table
    .process_labels_gold_table(
        date_str,
        silver_features_financials_directory,
        gold_label_store_financials_directory,
        spark,
    )
    .toPandas()
)

loaded from: datamart/silver/features_financials/silver_features_financials_2024_12_01.parquet row count: 396
saved to: datamart/gold/label_store_financials/gold_label_store_financials_2024_12_01.parquet


Unnamed: 0,Customer_ID,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,...,EMI_to_Income_Ratio,debt consolidation loan,personal loan,payday loan,mortgage loan,not specified,credit-builder loan,auto loan,home equity loan,student loan
0,CUS_0x103e,98690.796875,8262.233398,4,6,9.0,1,6,17,10.760000,...,0.006657,0,0,0,0,0,0,0,0,1
1,CUS_0x1195,30429.910156,2808.825928,4,6,16.0,2,22,17,1.670000,...,0.010650,0,0,0,0,0,0,1,0,0
2,CUS_0x1197,92300.007812,7437.667480,2,4,11.0,3,27,9,18.959999,...,6.619817,0,0,0,0,1,1,0,0,0
3,CUS_0x11e2,44986.550781,3689.879150,6,5,11.0,1,0,4,10.260000,...,0.006306,0,0,0,0,0,1,0,0,0
4,CUS_0x11ec,14867.690430,1005.974182,9,9,18.0,6,39,15,18.500000,...,0.055130,1,1,0,0,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391,CUS_0xe6c,125597.523438,9367.500000,1,3,12.0,4,2,9,9.890000,...,0.136449,1,0,0,0,1,0,0,0,1
392,CUS_0xe99,45461.539062,3917.461670,6,3,5.0,2,20,9,11.380000,...,0.017695,0,0,1,0,0,1,0,0,0
393,CUS_0xf55,78443.476562,6358.956543,7,5,23.0,4,39,19,6.370000,...,0.027896,0,1,0,1,0,0,0,1,1
394,CUS_0xfd1,78666.570312,6485.547363,3,4,17.0,4,29,10,-5.370000,...,0.038216,1,1,0,0,1,0,0,1,0


In [19]:
# List the gold table's column names for the last backfilled snapshot
# (`date_str` is left over from the backfill loop). The snapshot is processed
# and saved again just to read its columns.
(
    utils.financials_processing_gold_table
    .process_labels_gold_table(
        date_str,
        silver_features_financials_directory,
        gold_label_store_financials_directory,
        spark,
    )
    .toPandas()
    .columns
)

loaded from: datamart/silver/features_financials/silver_features_financials_2024_12_01.parquet row count: 396
saved to: datamart/gold/label_store_financials/gold_label_store_financials_2024_12_01.parquet


Index(['Customer_ID', 'Annual_Income', 'Monthly_Inhand_Salary',
       'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Payment_of_Min_Amount',
       'Total_EMI_per_month', 'Amount_invested_monthly', 'Payment_Behaviour',
       'Monthly_Balance', 'snapshot_date', 'Credit_History_Years',
       'Credit_History_Months', 'EMI_to_Income_Ratio',
       'debt consolidation loan', 'personal loan', 'payday loan',
       'mortgage loan', 'not specified', 'credit-builder loan', 'auto loan',
       'home equity loan', 'student loan'],
      dtype='object')

In [21]:
# Frequency of each Credit_Mix value in the gold table for the last
# backfilled snapshot (`date_str` is left over from the backfill loop).
# In the recorded output there is an "unknown" bucket and no missing-value
# bucket, even with dropna=False.
(
    utils.financials_processing_gold_table
    .process_labels_gold_table(
        date_str,
        silver_features_financials_directory,
        gold_label_store_financials_directory,
        spark,
    )
    .toPandas()['Credit_Mix']
    .value_counts(dropna=False)
)

loaded from: datamart/silver/features_financials/silver_features_financials_2024_12_01.parquet row count: 396
saved to: datamart/gold/label_store_financials/gold_label_store_financials_2024_12_01.parquet


Credit_Mix
standard    149
good        102
unknown      80
bad          65
Name: count, dtype: int64