In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, accuracy_score, classification_report, precision_recall_fscore_support
import warnings
import os

In [2]:
# Loading the datasets
df1 = pd.read_excel("case_study1.xlsx") # Internal BOB Dataset
df2 = pd.read_excel("case_study2.xlsx") # CIBIL Dataset

In [4]:
df1.head()

Unnamed: 0,PROSPECTID,Total_TL,Tot_Closed_TL,Tot_Active_TL,Total_TL_opened_L6M,Tot_TL_closed_L6M,pct_tl_open_L6M,pct_tl_closed_L6M,pct_active_tl,pct_closed_tl,...,CC_TL,Consumer_TL,Gold_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,Other_TL,Age_Oldest_TL,Age_Newest_TL
0,1,5,4,1,0,0,0.0,0.0,0.2,0.8,...,0,0,1,0,4,1,4,0,72,18
1,2,1,0,1,0,0,0.0,0.0,1.0,0.0,...,0,1,0,0,0,0,1,0,7,7
2,3,8,0,8,1,0,0.125,0.0,1.0,0.0,...,0,6,1,0,0,2,6,0,47,2
3,4,1,0,1,1,0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,1,1,5,5
4,5,3,2,1,0,0,0.0,0.0,0.333,0.667,...,0,0,0,0,0,3,0,2,131,32


In [5]:
df2.head()

Unnamed: 0,PROSPECTID,time_since_recent_payment,time_since_first_deliquency,time_since_recent_deliquency,num_times_delinquent,max_delinquency_level,max_recent_level_of_deliq,num_deliq_6mts,num_deliq_12mts,num_deliq_6_12mts,...,pct_CC_enq_L6m_of_L12m,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,max_unsec_exposure_inPct,HL_Flag,GL_Flag,last_prod_enq2,first_prod_enq2,Credit_Score,Approved_Flag
0,1,549,35,15,11,29,29,0,0,0,...,0.0,0.0,0.0,13.333,1,0,PL,PL,696,P2
1,2,47,-99999,-99999,0,-99999,0,0,0,0,...,0.0,0.0,0.0,0.86,0,0,ConsumerLoan,ConsumerLoan,685,P2
2,3,302,11,3,9,25,25,1,9,8,...,0.0,0.0,0.0,5741.667,1,0,ConsumerLoan,others,693,P2
3,4,-99999,-99999,-99999,0,-99999,0,0,0,0,...,0.0,0.0,0.0,9.9,0,0,others,others,673,P2
4,5,583,-99999,-99999,0,-99999,0,0,0,0,...,0.0,0.0,0.0,-99999.0,0,0,AL,AL,753,P1


In [6]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51336 entries, 0 to 51335
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   PROSPECTID            51336 non-null  int64  
 1   Total_TL              51336 non-null  int64  
 2   Tot_Closed_TL         51336 non-null  int64  
 3   Tot_Active_TL         51336 non-null  int64  
 4   Total_TL_opened_L6M   51336 non-null  int64  
 5   Tot_TL_closed_L6M     51336 non-null  int64  
 6   pct_tl_open_L6M       51336 non-null  float64
 7   pct_tl_closed_L6M     51336 non-null  float64
 8   pct_active_tl         51336 non-null  float64
 9   pct_closed_tl         51336 non-null  float64
 10  Total_TL_opened_L12M  51336 non-null  int64  
 11  Tot_TL_closed_L12M    51336 non-null  int64  
 12  pct_tl_open_L12M      51336 non-null  float64
 13  pct_tl_closed_L12M    51336 non-null  float64
 14  Tot_Missed_Pmnt       51336 non-null  int64  
 15  Auto_TL            

In [7]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51336 entries, 0 to 51335
Data columns (total 62 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   PROSPECTID                    51336 non-null  int64  
 1   time_since_recent_payment     51336 non-null  int64  
 2   time_since_first_deliquency   51336 non-null  int64  
 3   time_since_recent_deliquency  51336 non-null  int64  
 4   num_times_delinquent          51336 non-null  int64  
 5   max_delinquency_level         51336 non-null  int64  
 6   max_recent_level_of_deliq     51336 non-null  int64  
 7   num_deliq_6mts                51336 non-null  int64  
 8   num_deliq_12mts               51336 non-null  int64  
 9   num_deliq_6_12mts             51336 non-null  int64  
 10  max_deliq_6mts                51336 non-null  int64  
 11  max_deliq_12mts               51336 non-null  int64  
 12  num_times_30p_dpd             51336 non-null  int64  
 13  n

In [8]:
# -99999 are actually the null values in the dataset so we need to handle these values

# Removing null values from df1
for col in df1.columns:
    df1 = df1.loc[df1[col] != -99999]

# Removing null values from df2
columns_to_be_removed = []
for col in df2.columns:
    if df2.loc[df2[col] == -99999].shape[0] > 10000:
        columns_to_be_removed.append(col)
df2 = df2.drop(columns_to_be_removed, axis = 1)

for col in df2.columns:
    df2 = df2.loc[df2[col] != -99999]

In [9]:
print("Null values remaining in DF1", df1.isna().sum().sum())
print("Null values remaining in DF2", df2.isna().sum().sum())

Null values remaining in DF1 0
Null values remaining in DF2 0


In [10]:
# Checking for common columns names in df1 and df2

for i in df1.columns:
    if i in df2.columns:
        print(i)

PROSPECTID


In [11]:
# Merging the two dataframes using inner join so that no null values are present
df = pd.merge(df1, df2, how = "inner", left_on = ["PROSPECTID"], right_on = ["PROSPECTID"])
df.head()

Unnamed: 0,PROSPECTID,Total_TL,Tot_Closed_TL,Tot_Active_TL,Total_TL_opened_L6M,Tot_TL_closed_L6M,pct_tl_open_L6M,pct_tl_closed_L6M,pct_active_tl,pct_closed_tl,...,pct_PL_enq_L6m_of_L12m,pct_CC_enq_L6m_of_L12m,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,HL_Flag,GL_Flag,last_prod_enq2,first_prod_enq2,Credit_Score,Approved_Flag
0,1,5,4,1,0,0,0.0,0.0,0.2,0.8,...,0.0,0.0,0.0,0.0,1,0,PL,PL,696,P2
1,2,1,0,1,0,0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0,0,ConsumerLoan,ConsumerLoan,685,P2
2,3,8,0,8,1,0,0.125,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1,0,ConsumerLoan,others,693,P2
3,5,3,2,1,0,0,0.0,0.0,0.333,0.667,...,0.0,0.0,0.0,0.0,0,0,AL,AL,753,P1
4,6,6,5,1,0,0,0.0,0.0,0.167,0.833,...,1.0,0.0,0.429,0.0,1,0,ConsumerLoan,PL,668,P3


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42064 entries, 0 to 42063
Data columns (total 79 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   PROSPECTID                  42064 non-null  int64  
 1   Total_TL                    42064 non-null  int64  
 2   Tot_Closed_TL               42064 non-null  int64  
 3   Tot_Active_TL               42064 non-null  int64  
 4   Total_TL_opened_L6M         42064 non-null  int64  
 5   Tot_TL_closed_L6M           42064 non-null  int64  
 6   pct_tl_open_L6M             42064 non-null  float64
 7   pct_tl_closed_L6M           42064 non-null  float64
 8   pct_active_tl               42064 non-null  float64
 9   pct_closed_tl               42064 non-null  float64
 10  Total_TL_opened_L12M        42064 non-null  int64  
 11  Tot_TL_closed_L12M          42064 non-null  int64  
 12  pct_tl_open_L12M            42064 non-null  float64
 13  pct_tl_closed_L12M          420

In [13]:
df.isna().sum().sum()

0

In [14]:
# Checking for categorical columns
categorical_columns = []
for cat_col in df.columns:
    if df[cat_col].dtype == "object":
        categorical_columns.append(cat_col)

In [15]:
for cat_col in categorical_columns:
    print(df[cat_col].value_counts())

Married    30886
Single     11178
Name: MARITALSTATUS, dtype: int64
GRADUATE          14140
12TH              11703
SSC                7241
UNDER GRADUATE     4572
OTHERS             2291
POST-GRADUATE      1898
PROFESSIONAL        219
Name: EDUCATION, dtype: int64
M    37345
F     4719
Name: GENDER, dtype: int64
ConsumerLoan    16480
others          13653
PL               7553
CC               2195
AL               1353
HL                830
Name: last_prod_enq2, dtype: int64
others          20640
ConsumerLoan    11075
PL               4431
AL               2641
CC               1988
HL               1289
Name: first_prod_enq2, dtype: int64
P2    25452
P3     6440
P4     5264
P1     4908
Name: Approved_Flag, dtype: int64


In [16]:
for cat_col in categorical_columns:
    chi2, p_val, _, _ = chi2_contingency(pd.crosstab(df[cat_col], df["Approved_Flag"]))
    print("{}: {}".format(cat_col, p_val))

MARITALSTATUS: 3.578180861038862e-233
EDUCATION: 2.6942265249737532e-30
GENDER: 1.907936100186563e-05
last_prod_enq2: 0.0
first_prod_enq2: 7.84997610555419e-287
Approved_Flag: 0.0


###### Regarding the hypothesis to be tested, all chi-square tests have the same general null and alternate hypothesis. The null hypothesis states that there is no relationship between the two variables, while the research hypothesis states that there is a relationship between the two variables.

###### Since all the categorical features have p_value <= 0.05, we will accept all the features

In [17]:
# Checking for numerical columns
numerical_columns = []
for num_col in df.columns:
    if df[num_col].dtype != "object" and num_col not in ["PROSPECTID", "Approved_Flag"]:
        numerical_columns.append(num_col)

In [19]:
# Checking for multicollinearity and removing irrelevant columns using sequential VIF
vif_data = df[numerical_columns]
total_columns = vif_data.shape[1]
columns_to_be_kept = []
column_index = 0

for i in range(0, total_columns):
    vif_value = variance_inflation_factor(vif_data.values, column_index)
    print(column_index, "---", vif_value)
    if vif_value <= 6:
        columns_to_be_kept.append(numerical_columns[i])
        column_index += 1
    else:
        vif_data = vif_data.drop([numerical_columns[i]], axis = 1)

  vif = 1. / (1. - r_squared_i)


0 --- inf
0 --- inf
0 --- 11.320180023967996
0 --- 8.363698035000336
0 --- 6.520647877790928
0 --- 5.149501618212625
1 --- 2.611111040579735
2 --- inf
2 --- 1788.7926256209232
2 --- 8.601028256477228
2 --- 3.8328007921530785
3 --- 6.099653381646731
3 --- 5.581352009642762
4 --- 1.9855843530987785
5 --- inf
5 --- 4.809538302819343
6 --- 23.270628983464636
6 --- 30.595522588100053
6 --- 4.3843464059655854
7 --- 3.064658415523423
8 --- 2.898639771299253
9 --- 4.377876915347322
10 --- 2.2078535836958433
11 --- 4.916914200506864
12 --- 5.214702030064725
13 --- 3.3861625024231476
14 --- 7.840583309478997
14 --- 5.255034641721438
15 --- inf
15 --- 7.380634506427232
15 --- 1.421005001517573
16 --- 8.083255010190316
16 --- 1.6241227524040112
17 --- 7.257811920140003
17 --- 15.59624383268298
17 --- 1.8258570471324318
18 --- 1.5080839450032661
19 --- 2.172088834824578
20 --- 2.623397553527229
21 --- 2.2959970812106176
22 --- 7.360578319196446
22 --- 2.1602387773102554
23 --- 2.8686288267891467
24

In [20]:
vif_data.shape

(42064, 39)

In [21]:
# Check ANOVA for columns_to_be_kept
from scipy.stats import f_oneway
numerical_columns_to_be_kept = []
for i in columns_to_be_kept:
    a = list(df[i])
    b = list(df["Approved_Flag"])
    group_P1 = [value for value, group in zip(a, b) if group == "P1"]
    group_P2 = [value for value, group in zip(a, b) if group == "P2"]
    group_P3 = [value for value, group in zip(a, b) if group == "P3"]
    group_P4 = [value for value, group in zip(a, b) if group == "P4"]
    f_statistic, p_value = f_oneway(group_P1, group_P2, group_P3, group_P4)
    if p_value <= 0.05:
        numerical_columns_to_be_kept.append(i)
print(numerical_columns_to_be_kept)
print(len(numerical_columns_to_be_kept))

['pct_tl_open_L6M', 'pct_tl_closed_L6M', 'Tot_TL_closed_L12M', 'pct_tl_closed_L12M', 'Tot_Missed_Pmnt', 'CC_TL', 'Home_TL', 'PL_TL', 'Secured_TL', 'Unsecured_TL', 'Other_TL', 'Age_Oldest_TL', 'Age_Newest_TL', 'time_since_recent_payment', 'max_recent_level_of_deliq', 'num_deliq_6_12mts', 'num_times_60p_dpd', 'num_std_12mts', 'num_sub', 'num_sub_6mts', 'num_sub_12mts', 'num_dbt', 'num_dbt_12mts', 'num_lss', 'recent_level_of_deliq', 'CC_enq_L12m', 'PL_enq_L12m', 'time_since_recent_enq', 'enq_L3m', 'NETMONTHLYINCOME', 'Time_With_Curr_Empr', 'CC_Flag', 'PL_Flag', 'pct_PL_enq_L6m_of_ever', 'pct_CC_enq_L6m_of_ever', 'HL_Flag', 'GL_Flag']
37


In [22]:
# Listing all the final features
features = numerical_columns_to_be_kept + ["MARITALSTATUS", "EDUCATION", "GENDER", "last_prod_enq2", "first_prod_enq2", "Approved_Flag"]
df = df[features]

In [23]:
print(df["MARITALSTATUS"].unique())
print(df["EDUCATION"].unique())
print(df["GENDER"].unique())
print(df["last_prod_enq2"].unique())
print(df["first_prod_enq2"].unique())

['Married' 'Single']
['12TH' 'GRADUATE' 'SSC' 'POST-GRADUATE' 'UNDER GRADUATE' 'OTHERS'
 'PROFESSIONAL']
['M' 'F']
['PL' 'ConsumerLoan' 'AL' 'CC' 'others' 'HL']
['PL' 'ConsumerLoan' 'others' 'AL' 'HL' 'CC']


In [22]:
# Ordinal feature - EDUCATION
# Label encoding of the EDUCATION feature
df.loc[df["EDUCATION"] == "SSC", ["EDUCATION"]] = 1
df.loc[df["EDUCATION"] == "12TH", ["EDUCATION"]] = 2
df.loc[df["EDUCATION"] == "GRADUATE", ["EDUCATION"]] = 3
df.loc[df["EDUCATION"] == "UNDER GRADUATE", ["EDUCATION"]] = 3
df.loc[df["EDUCATION"] == "POST-GRADUATE", ["EDUCATION"]] = 4
df.loc[df["EDUCATION"] == "OTHERS", ["EDUCATION"]] = 1
df.loc[df["EDUCATION"] == "PROFESSIONAL", ["EDUCATION"]] = 3

print(df["EDUCATION"].value_counts())
df["EDUCATION"] = df["EDUCATION"].astype(int)
df.info()

3    18931
2    11703
1     9532
4     1898
Name: EDUCATION, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 42064 entries, 0 to 42063
Data columns (total 43 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   pct_tl_open_L6M            42064 non-null  float64
 1   pct_tl_closed_L6M          42064 non-null  float64
 2   Tot_TL_closed_L12M         42064 non-null  int64  
 3   pct_tl_closed_L12M         42064 non-null  float64
 4   Tot_Missed_Pmnt            42064 non-null  int64  
 5   CC_TL                      42064 non-null  int64  
 6   Home_TL                    42064 non-null  int64  
 7   PL_TL                      42064 non-null  int64  
 8   Secured_TL                 42064 non-null  int64  
 9   Unsecured_TL               42064 non-null  int64  
 10  Other_TL                   42064 non-null  int64  
 11  Age_Oldest_TL              42064 non-null  int64  
 12  Age_Newest_TL              4

In [23]:
# One Hot Encoding for the other categorical variables
df_encoded = pd.get_dummies(df, columns = ["MARITALSTATUS", "GENDER", "last_prod_enq2", "first_prod_enq2"])
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42064 entries, 0 to 42063
Data columns (total 55 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pct_tl_open_L6M               42064 non-null  float64
 1   pct_tl_closed_L6M             42064 non-null  float64
 2   Tot_TL_closed_L12M            42064 non-null  int64  
 3   pct_tl_closed_L12M            42064 non-null  float64
 4   Tot_Missed_Pmnt               42064 non-null  int64  
 5   CC_TL                         42064 non-null  int64  
 6   Home_TL                       42064 non-null  int64  
 7   PL_TL                         42064 non-null  int64  
 8   Secured_TL                    42064 non-null  int64  
 9   Unsecured_TL                  42064 non-null  int64  
 10  Other_TL                      42064 non-null  int64  
 11  Age_Oldest_TL                 42064 non-null  int64  
 12  Age_Newest_TL                 42064 non-null  int64  
 13  t

In [24]:
k = df_encoded.describe()
print(k)

       pct_tl_open_L6M  pct_tl_closed_L6M  Tot_TL_closed_L12M  \
count     42064.000000       42064.000000        42064.000000   
mean          0.179032           0.097783            0.825504   
std           0.278043           0.210957            1.537208   
min           0.000000           0.000000            0.000000   
25%           0.000000           0.000000            0.000000   
50%           0.000000           0.000000            0.000000   
75%           0.333000           0.100000            1.000000   
max           1.000000           1.000000           33.000000   

       pct_tl_closed_L12M  Tot_Missed_Pmnt         CC_TL       Home_TL  \
count        42064.000000     42064.000000  42064.000000  42064.000000   
mean             0.160365         0.525746      0.145921      0.076241   
std              0.258831         1.106442      0.549314      0.358582   
min              0.000000         0.000000      0.000000      0.000000   
25%              0.000000         0.000000  

In [25]:
df_encoded.head()

Unnamed: 0,pct_tl_open_L6M,pct_tl_closed_L6M,Tot_TL_closed_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,CC_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,...,last_prod_enq2_ConsumerLoan,last_prod_enq2_HL,last_prod_enq2_PL,last_prod_enq2_others,first_prod_enq2_AL,first_prod_enq2_CC,first_prod_enq2_ConsumerLoan,first_prod_enq2_HL,first_prod_enq2_PL,first_prod_enq2_others
0,0.0,0.0,0,0.0,0,0,0,4,1,4,...,0,0,1,0,0,0,0,0,1,0
1,0.0,0.0,0,0.0,0,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0
2,0.125,0.0,0,0.0,1,0,0,0,2,6,...,1,0,0,0,0,0,0,0,0,1
3,0.0,0.0,0,0.0,0,0,0,0,3,0,...,0,0,0,0,1,0,0,0,0,0
4,0.0,0.0,1,0.167,0,0,0,0,6,0,...,1,0,0,0,0,0,0,0,1,0


### Machine Learning Model Training

In [26]:
# Train test split
y = df_encoded["Approved_Flag"]
X = df_encoded.drop(["Approved_Flag"], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

##### 1. Random Forest Classifier

In [27]:
rf_classifier = RandomForestClassifier(n_estimators = 200, random_state = 42)
rf_classifier.fit(X_train, y_train)

In [28]:
y_pred = rf_classifier.predict(X_test)

In [29]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

Accuracy: 0.7636990372043266


In [30]:
for i, v in enumerate(["P1", "P2", "P3", "P4"]):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 score: {f1_score[i]}")
    print()

Class P1:
Precision: 0.8370457209847597
Recall: 0.7041420118343196
F1 score: 0.7648634172469203

Class P2:
Precision: 0.7957519116397621
Recall: 0.9282457879088206
F1 score: 0.8569075937785909

Class P3:
Precision: 0.4423380726698262
Recall: 0.21132075471698114
F1 score: 0.28600612870275793

Class P4:
Precision: 0.7178502879078695
Recall: 0.7269193391642371
F1 score: 0.7223563495895703



##### 2. XGBoost

In [31]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

xgb_classifier = xgb.XGBClassifier(objective = "multi:softmax", num_class = 4)

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size = 0.2, random_state = 42)
xgb_classifier.fit(X_train, y_train)

In [32]:
y_pred = xgb_classifier.predict(X_test)

In [33]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

Accuracy: 0.7783192677998336


In [34]:
for i, v in enumerate(["P1", "P2", "P3", "P4"]):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 score: {f1_score[i]}")
    print()

Class P1:
Precision: 0.823906083244397
Recall: 0.7613412228796844
F1 score: 0.7913890312660173

Class P2:
Precision: 0.8255418233924413
Recall: 0.913577799801784
F1 score: 0.8673315769665036

Class P3:
Precision: 0.4756380510440835
Recall: 0.30943396226415093
F1 score: 0.3749428440786465

Class P4:
Precision: 0.7342386032977691
Recall: 0.7356656948493683
F1 score: 0.7349514563106796



##### 3. Decision Tree

In [35]:
from sklearn.tree import DecisionTreeClassifier

dt_classifier = DecisionTreeClassifier(max_depth = 20, min_samples_split = 10)
dt_classifier.fit(X_train, y_train)

In [36]:
y_pred = dt_classifier.predict(X_test)

In [37]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

Accuracy: 0.7118744799714727


In [38]:
for i, v in enumerate(["P1", "P2", "P3", "P4"]):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 score: {f1_score[i]}")
    print()

Class P1:
Precision: 0.7232142857142857
Recall: 0.7189349112426036
F1 score: 0.7210682492581603

Class P2:
Precision: 0.8095330362332881
Recall: 0.8281466798810704
F1 score: 0.8187340779933373

Class P3:
Precision: 0.3492822966507177
Recall: 0.3305660377358491
F1 score: 0.3396665374176037

Class P4:
Precision: 0.6505050505050505
Recall: 0.6258503401360545
F1 score: 0.6379395740465577



In [39]:
df_encoded["Approved_Flag"].value_counts()

P2    25452
P3     6440
P4     5264
P1     4908
Name: Approved_Flag, dtype: int64

##### Since XGBoost is giving the best accuracy, we will finetune it

### Hyperparameter Tuning

##### Motive of hyperparamter tuning: It decides how fast the algorithm wants to finish/converge
##### In XGBoost, learning rate reduces the overfitting of the model

In [40]:
param_grid = {
    "colsample_bytree": [0.1, 0.3, 0.5, 0.7, 0.9],
    "learning_rate": [0.001, 0.01, 0.1, 1],
    "max_depth": [3, 5, 8, 10, 16],
    "alpha": [1, 10, 100],
    "n_estimators": [10, 50, 100, 150, 200]
}

index = 0

answers_grid = {
    "combination": [],
    "train_Accuracy": [],
    "test_Accuracy": [],
    "colsample_bytree": [],
    "learning_rate": [],
    "max_depth": [],
    "alpha": [],
    "n_estimators": []
}

In [41]:
# Loop through each combination of hyperparameters
for colsample_bytree in param_grid['colsample_bytree']:
    for learning_rate in param_grid['learning_rate']:
        for max_depth in param_grid['max_depth']:
            for alpha in param_grid['alpha']:
                for n_estimators in param_grid['n_estimators']:
                    index = index + 1

                    # Define and train the XGBoost model
                    model = xgb.XGBClassifier(objective = 'multi:softmax',
                                              num_class = 4,
                                              colsample_bytree = colsample_bytree,
                                              learning_rate = learning_rate,
                                              max_depth = max_depth,
                                              alpha = alpha,
                                              n_estimators = n_estimators)
                    
                    y = df_encoded['Approved_Flag']
                    x = df_encoded.drop(['Approved_Flag'], axis = 1)

                    label_encoder = LabelEncoder()
                    y_encoded = label_encoder.fit_transform(y)

                    x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

                    model.fit(x_train, y_train)

                    # Predict on training and testing sets
                    y_pred_train = model.predict(x_train)
                    y_pred_test = model.predict(x_test)

                    # Calculate train and test results

                    train_accuracy = accuracy_score(y_train, y_pred_train)
                    test_accuracy = accuracy_score(y_test , y_pred_test)

                    # Include into the lists
                    answers_grid['combination'].append(index)
                    answers_grid['train_Accuracy'].append(train_accuracy)
                    answers_grid['test_Accuracy'].append(test_accuracy)
                    answers_grid['colsample_bytree'].append(colsample_bytree)
                    answers_grid['learning_rate'].append(learning_rate)
                    answers_grid['max_depth'].append(max_depth)
                    answers_grid['alpha'].append(alpha)
                    answers_grid['n_estimators'].append(n_estimators)

                    # Print results for this combination
                    print(f"Combination {index}")
                    print(f"colsample_bytree: {colsample_bytree}, learning_rate: {learning_rate}, max_depth: {max_depth}, alpha: {alpha}, n_estimators: {n_estimators}")
                    print(f"Train Accuracy: {train_accuracy:.2f}")
                    print(f"Test Accuracy : {test_accuracy :.2f}")
                    print("-" * 30)

Combination 1
colsample_bytree: 0.1, learning_rate: 0.001, max_depth: 3, alpha: 1, n_estimators: 10
Train Accuracy: 0.61
Test Accuracy : 0.60
------------------------------
Combination 2
colsample_bytree: 0.1, learning_rate: 0.001, max_depth: 3, alpha: 1, n_estimators: 50
Train Accuracy: 0.61
Test Accuracy : 0.60
------------------------------
Combination 3
colsample_bytree: 0.1, learning_rate: 0.001, max_depth: 3, alpha: 1, n_estimators: 100
Train Accuracy: 0.61
Test Accuracy : 0.60
------------------------------
Combination 4
colsample_bytree: 0.1, learning_rate: 0.001, max_depth: 3, alpha: 1, n_estimators: 150
Train Accuracy: 0.61
Test Accuracy : 0.60
------------------------------
Combination 5
colsample_bytree: 0.1, learning_rate: 0.001, max_depth: 3, alpha: 1, n_estimators: 200
Train Accuracy: 0.61
Test Accuracy : 0.60
------------------------------
Combination 6
colsample_bytree: 0.1, learning_rate: 0.001, max_depth: 3, alpha: 10, n_estimators: 10
Train Accuracy: 0.61
Test Accur

In [43]:
ht_results = pd.DataFrame(answers_grid, index = answers_grid["combination"])
ht_results.head()

Unnamed: 0,combination,train_Accuracy,test_Accuracy,colsample_bytree,learning_rate,max_depth,alpha,n_estimators
1,1,0.606431,0.599667,0.1,0.001,3,1,10
2,2,0.60646,0.599667,0.1,0.001,3,1,50
3,3,0.606431,0.599667,0.1,0.001,3,1,100
4,4,0.606431,0.599667,0.1,0.001,3,1,150
5,5,0.606579,0.599667,0.1,0.001,3,1,200


In [46]:
ht_results = ht_results.sort_values(by = "test_Accuracy", ascending = False)
ht_results.head()

Unnamed: 0,combination,train_Accuracy,test_Accuracy,colsample_bytree,learning_rate,max_depth,alpha,n_estimators
1104,1104,0.860925,0.780815,0.7,0.1,10,10,150
1070,1070,0.82895,0.780697,0.7,0.1,5,1,200
1103,1103,0.837746,0.780221,0.7,0.1,10,10,100
832,832,0.797123,0.779983,0.5,1.0,3,10,50
770,770,0.824552,0.779627,0.5,0.1,5,1,200


In [47]:
# final_params = {
#     "colsample_bytree": 0.7,
#     "learning_rate": 0.1,
#     "max_depth": 10,
#     "alpha": 10,
#     "n_estimators": 150
# }

final_model = xgb.XGBClassifier(objective='multi:softmax',
                                              num_class = 4,
                                              colsample_bytree = 0.7,
                                              learning_rate = 0.1,
                                              max_depth = 10,
                                              alpha = 10,
                                              n_estimators = 150)

In [48]:
df3 = pd.read_excel("Unseen_Dataset.xlsx")

In [51]:
features_of_unseen_dataset = [feat for feat in features if feat != "Approved_Flag"]
df_unseen = df3[features_of_unseen_dataset]
df_unseen.head()

Unnamed: 0,pct_tl_open_L6M,pct_tl_closed_L6M,Tot_TL_closed_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,CC_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,...,PL_Flag,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,HL_Flag,GL_Flag,MARITALSTATUS,EDUCATION,GENDER,last_prod_enq2,first_prod_enq2
0,0.0,0.0,0,0.0,0,0,0,4,1,4,...,1,0.0,0.0,1,0,Married,12TH,M,PL,PL
1,0.0,0.0,0,0.0,0,0,0,0,0,1,...,0,0.0,0.0,0,0,Single,GRADUATE,F,ConsumerLoan,ConsumerLoan
2,0.125,0.0,0,0.0,1,0,0,0,2,6,...,0,0.0,0.0,1,0,Married,SSC,M,ConsumerLoan,others
3,0.0,0.0,0,0.0,0,0,0,0,3,0,...,0,0.0,0.0,0,0,Married,POST-GRADUATE,M,AL,AL
4,0.0,0.0,1,0.167,0,0,0,0,6,0,...,0,0.429,0.0,1,0,Married,12TH,M,ConsumerLoan,PL


In [53]:
df_unseen.loc[df_unseen["EDUCATION"] == "SSC", ["EDUCATION"]] = 1
df_unseen.loc[df_unseen["EDUCATION"] == "12TH", ["EDUCATION"]] = 2
df_unseen.loc[df_unseen["EDUCATION"] == "GRADUATE", ["EDUCATION"]] = 3
df_unseen.loc[df_unseen["EDUCATION"] == "UNDER GRADUATE", ["EDUCATION"]] = 3
df_unseen.loc[df_unseen["EDUCATION"] == "POST-GRADUATE", ["EDUCATION"]] = 4
df_unseen.loc[df_unseen["EDUCATION"] == "OTHERS", ["EDUCATION"]] = 1
df_unseen.loc[df_unseen["EDUCATION"] == "PROFESSIONAL", ["EDUCATION"]] = 3

df_unseen["EDUCATION"] = df_unseen["EDUCATION"].astype(int)

df_unseen_encoded = pd.get_dummies(df_unseen, columns = ["MARITALSTATUS", "GENDER", "last_prod_enq2", "first_prod_enq2"])
df_unseen_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 54 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pct_tl_open_L6M               100 non-null    float64
 1   pct_tl_closed_L6M             100 non-null    float64
 2   Tot_TL_closed_L12M            100 non-null    int64  
 3   pct_tl_closed_L12M            100 non-null    float64
 4   Tot_Missed_Pmnt               100 non-null    int64  
 5   CC_TL                         100 non-null    int64  
 6   Home_TL                       100 non-null    int64  
 7   PL_TL                         100 non-null    int64  
 8   Secured_TL                    100 non-null    int64  
 9   Unsecured_TL                  100 non-null    int64  
 10  Other_TL                      100 non-null    int64  
 11  Age_Oldest_TL                 100 non-null    int64  
 12  Age_Newest_TL                 100 non-null    int64  
 13  time_s

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size = 0.2, random_state = 42)
final_model.fit(X_train, y_train)

In [56]:
y_pred_unseen = final_model.predict(df_unseen_encoded)

In [58]:
df3["Target Variable"] = y_pred_unseen
df3.to_excel("Final_Prediction.xlsx", index = False)