In [2]:
import pandas as pd
import re
import numpy as np

In [3]:
# Load the four raw CSV inputs; paths are relative to the notebook's working directory.
clickstream_df = pd.read_csv('data/feature_clickstream.csv')
# NOTE: "attributtes" (sic) — later cells refer to this variable with the same spelling.
feature_attributtes_df = pd.read_csv('data/features_attributes.csv')
feature_financials_df = pd.read_csv('data/features_financials.csv')
lms_loan_daily_df = pd.read_csv('data/lms_loan_daily.csv')

In [4]:
# Inspect column names, non-null counts and dtypes of the clickstream table.
clickstream_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215376 entries, 0 to 215375
Data columns (total 22 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   fe_1           215376 non-null  int64 
 1   fe_2           215376 non-null  int64 
 2   fe_3           215376 non-null  int64 
 3   fe_4           215376 non-null  int64 
 4   fe_5           215376 non-null  int64 
 5   fe_6           215376 non-null  int64 
 6   fe_7           215376 non-null  int64 
 7   fe_8           215376 non-null  int64 
 8   fe_9           215376 non-null  int64 
 9   fe_10          215376 non-null  int64 
 10  fe_11          215376 non-null  int64 
 11  fe_12          215376 non-null  int64 
 12  fe_13          215376 non-null  int64 
 13  fe_14          215376 non-null  int64 
 14  fe_15          215376 non-null  int64 
 15  fe_16          215376 non-null  int64 
 16  fe_17          215376 non-null  int64 
 17  fe_18          215376 non-null  int64 
 18  fe_1

In [5]:
# Inspect the financial features. The output below shows numeric-looking columns
# (e.g. Annual_Income, Num_of_Loan) loaded as object dtype, and Type_of_Loan with missing values.
feature_financials_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12500 entries, 0 to 12499
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Customer_ID               12500 non-null  object 
 1   Annual_Income             12500 non-null  object 
 2   Monthly_Inhand_Salary     12500 non-null  float64
 3   Num_Bank_Accounts         12500 non-null  int64  
 4   Num_Credit_Card           12500 non-null  int64  
 5   Interest_Rate             12500 non-null  int64  
 6   Num_of_Loan               12500 non-null  object 
 7   Type_of_Loan              11074 non-null  object 
 8   Delay_from_due_date       12500 non-null  int64  
 9   Num_of_Delayed_Payment    12500 non-null  object 
 10  Changed_Credit_Limit      12500 non-null  object 
 11  Num_Credit_Inquiries      12500 non-null  int64  
 12  Credit_Mix                12500 non-null  object 
 13  Outstanding_Debt          12500 non-null  object 
 14  Credit

In [6]:
# Inspect the loan table. The output below shows loan_start_date and snapshot_date
# loaded as object dtype (i.e. not parsed as datetimes by read_csv).
lms_loan_daily_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137500 entries, 0 to 137499
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   loan_id          137500 non-null  object 
 1   Customer_ID      137500 non-null  object 
 2   loan_start_date  137500 non-null  object 
 3   tenure           137500 non-null  int64  
 4   installment_num  137500 non-null  int64  
 5   loan_amt         137500 non-null  int64  
 6   due_amt          137500 non-null  float64
 7   paid_amt         137500 non-null  float64
 8   overdue_amt      137500 non-null  float64
 9   balance          137500 non-null  float64
 10  snapshot_date    137500 non-null  object 
dtypes: float64(4), int64(3), object(4)
memory usage: 11.5+ MB


## Preprocess Feature Attributes

In [7]:
def process_age(age, min_age=1, max_age=115):
    """Clean a single raw ``Age`` value.

    Any '_' characters are removed from the value, then it is validated as an
    integer inside the inclusive range ``[min_age, max_age]``.

    Args:
        age: Raw age (string or number); may be missing.
        min_age: Smallest age accepted as valid (default 1).
        max_age: Largest age accepted as valid. The default of 115 is the
            oldest recorded age according to the BBC:
            https://www.bbc.com/news/articles/cwy0zxzpdd4o

    Returns:
        The age with underscores removed (its type is otherwise left as-is, so
        a string input yields a string), or ``np.nan`` when the value is
        missing, cannot be parsed as an integer, or is out of range.
    """
    if pd.isna(age):
        return np.nan
    # remove '_'
    if '_' in str(age):
        age = str(age).replace('_', '')
    try:
        age_int = int(age)
    except (TypeError, ValueError, OverflowError):
        # A single malformed value must not abort the whole column-wise apply;
        # treat it as missing so the caller can drop it.
        return np.nan
    if age_int < min_age or age_int > max_age:
        return np.nan
    return age

def process_feature_attributes(input_df):
    """Return a cleaned copy of the customer attributes table.

    Steps, in order:
      1. Parse ``snapshot_date`` to datetime (unparseable values become NaT).
      2. Cast ``Occupation`` to a categorical dtype.
      3. Keep ``SSN`` only when it matches ``ddd-dd-dddd``; otherwise set NaN.
      4. Clean ``Age`` with ``process_age`` and convert it to an integer dtype.
      5. Drop every row that still contains a missing value in any column.

    Args:
        input_df: DataFrame with at least the columns ``snapshot_date``,
            ``Occupation``, ``SSN`` and ``Age``. It is not modified.

    Returns:
        A new DataFrame containing only fully populated rows; the original
        index labels are kept (the index is not reset).
    """
    df = input_df.copy()
    df['snapshot_date'] = pd.to_datetime(df['snapshot_date'], errors='coerce')
    df['Occupation'] = df['Occupation'].astype('category')

    ssn_pattern = re.compile(r'^\d{3}-\d{2}-\d{4}$')
    # isinstance guard: re.match raises TypeError on non-string, non-null values.
    df['SSN'] = df['SSN'].apply(
        lambda x: x if isinstance(x, str) and ssn_pattern.match(x) else np.nan
    )

    df['Age'] = df['Age'].apply(process_age)
    # Explicit copy so the assignment below targets an independent frame
    # rather than a possible view of `df` (avoids SettingWithCopyWarning).
    df_cleaned = df.dropna(subset=['Age']).copy()
    # Reassign the column instead of writing through .loc[:, 'Age']: the .loc
    # form writes into the existing column and can leave it as object dtype.
    df_cleaned['Age'] = df_cleaned['Age'].apply(int).astype('int64')
    return df_cleaned.dropna()

In [8]:
# Clean the customer attributes table and preview the first rows of the result.
cleaned_feature_attributtes_df = process_feature_attributes(feature_attributtes_df)
cleaned_feature_attributtes_df.head()

Unnamed: 0,Customer_ID,Name,Age,SSN,Occupation,snapshot_date
0,CUS_0x1000,Alistair Barrf,18,913-74-1218,Lawyer,2023-05-01
1,CUS_0x1009,Arunah,26,063-67-6938,Mechanic,2025-01-01
3,CUS_0x1011,Schneyerh,44,793-05-8223,Doctor,2023-11-01
4,CUS_0x1013,Cameront,44,930-49-9615,Mechanic,2023-12-01
5,CUS_0x1015,Holtono,27,810-97-7024,Journalist,2023-08-01


## Preprocess Feature Financials

In [19]:
def process_income(income):
    """Parse a raw monetary value into a float rounded to two decimals.

    Any '_' characters are removed before parsing.

    Args:
        income: Raw value (string or number); may be missing.

    Returns:
        The value as a ``float`` rounded to 2 decimal places, or ``np.nan``
        when the value is missing or cannot be parsed as a number.
    """
    if pd.isna(income):
        return np.nan
    # remove '_'
    if '_' in str(income):
        income = str(income).replace('_', '')
    try:
        return round(float(income), 2)
    except (TypeError, ValueError):
        # Malformed entries become missing instead of aborting the apply.
        return np.nan

def process_number_loan(loan):
    """Parse a raw ``Num_of_Loan`` value into a non-negative integer.

    Any '_' characters and surrounding whitespace are removed before parsing.

    Args:
        loan: Raw value (string or number); may be missing.

    Returns:
        The loan count as an ``int``, or ``np.nan`` when the value is missing,
        is not an integer literal, or is negative.
    """
    if pd.isna(loan):
        return np.nan
    loan_cleaned = str(loan).replace('_', '').strip()
    try:
        # int() is the source of truth for "is this an integer": str.isdigit()
        # rejects a leading '-' (making a later sign check unreachable) and
        # accepts characters such as superscript digits that int() refuses.
        loan_int = int(loan_cleaned)
    except ValueError:
        return np.nan
    if loan_int < 0:
        return np.nan
    return loan_int

def process_feature_financials_df(input_df, verbose=True):
    """Return a cleaned copy of the customer financials table.

    Steps, in order:
      1. Parse ``Annual_Income`` and ``Monthly_Inhand_Salary`` with
         ``process_income``.
      2. Drop rows whose ``Num_Bank_Accounts`` is negative.
      3. Parse ``Num_of_Loan`` with ``process_number_loan``, drop rows where it
         is invalid, and store it as an integer column.
      4. Normalise ``Type_of_Loan`` to a plain comma-separated list and expand
         it into ``Type_of_Loan_1`` ... ``Type_of_Loan_N`` columns.

    Args:
        input_df: Raw financials DataFrame. It is not modified.
        verbose: When True (the default), print the unique values of every
            column of the result as a quick data-quality dump.

    Returns:
        The cleaned DataFrame, including the added ``Type_of_Loan_i`` columns.
    """
    df = input_df.copy()
    df['Annual_Income'] = df['Annual_Income'].apply(process_income)
    df['Monthly_Inhand_Salary'] = df['Monthly_Inhand_Salary'].apply(process_income)

    # NOTE(review): only negative counts are removed here. The notebook output
    # for this column also shows values in the hundreds and thousands (e.g.
    # 312, 933, 1756); an upper bound is probably needed — confirm the limit.
    # .copy() so the column assignment below does not write into a slice.
    df = df[df['Num_Bank_Accounts'] >= 0].copy()

    df['Num_of_Loan'] = df['Num_of_Loan'].apply(process_number_loan)
    df_cleaned = df.dropna(subset=['Num_of_Loan']).copy()
    # No missing values remain in this column, so it can be a true integer column.
    df_cleaned['Num_of_Loan'] = df_cleaned['Num_of_Loan'].astype('int64')

    # Collapse every separator (", " and ", and ") to a bare comma in one pass
    # so no element keeps a leading space after the split. `and\s+` requires
    # whitespace after "and", so a loan type that merely starts with the
    # letters "and" is left intact.
    df_cleaned['Type_of_Loan'] = (
        df_cleaned['Type_of_Loan']
        .str.strip()
        .str.replace(r'\s*,\s*(?:and\s+)?', ',', regex=True)
    )
    df_split_columns = df_cleaned['Type_of_Loan'].str.split(',', expand=True)
    df_split_columns.columns = [
        f'Type_of_Loan_{i+1}' for i in range(df_split_columns.shape[1])
    ]
    df_cleaned = pd.concat([df_cleaned, df_split_columns], axis=1)

    if verbose:
        for col in df_cleaned.columns:
            print(col)
            print(df_cleaned[col].unique())
            print("*"*100)

    return df_cleaned

In [20]:
# Run the financial-features cleaning; the function prints the unique values of every column.
process_feature_financials_df(feature_financials_df)

Customer_ID
['CUS_0x1000' 'CUS_0x1009' 'CUS_0x100b' ... 'CUS_0xff6' 'CUS_0xffc'
 'CUS_0xffd']
****************************************************************************************************
Annual_Income
[ 30625.94  52312.68 113781.39 ... 117639.92  60877.17  41398.44]
****************************************************************************************************
Monthly_Inhand_Salary
[2706.16 4250.39 9549.78 ... 9727.33 5218.1  3749.87]
****************************************************************************************************
Num_Bank_Accounts
[   6    1    3    7    2    5    8    4    0   10    9  312  933  162
  955 1075 1275 1279 1201  882  334  689  791 1355  794 1712 1605  271
  611 1190  512 1413 1483  805 1320 1741   57  224  196  307  587 1169
 1231  560 1434  848  374  330 1277  670 1540 1067  485  588 1574  210
  983  489   28  823 1070  555 1102  535  813  795 1665 1076  194 1213
 1756 1662 1645  275 1291  511  809  888  229  427  854   11  649 1240
  9