## Objective: Prepare a model for supervised classification of applicants into loan accepted and loan rejected!

Refer to this link for an example of how entropy is calculated for a decision tree model:

https://www.saedsayad.com/decision_tree.htm

In [1]:
import pandas as pd
import plotly.express as pe

from sklearn import tree
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier

## step 1: Data loading

In [2]:
# Location of the credit-risk dataset on disk.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs on a machine where this exact file exists.
path = r"C:\Users\harsh\Desktop\NPCI-Python-ML\datasets\Balanced_credit_Risk.txt"

# Parse the file as CSV using the pandas defaults.
df = pd.read_csv(path)

# Display the loaded frame.
df

Unnamed: 0.1,Unnamed: 0,index,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,0,0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,1,2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
2,2,3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
3,3,4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
4,4,5,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12401,12401,26464,30,102540,MORTGAGE,6.0,HOMEIMPROVEMENT,A,1500,7.90,0,0.01,N,5
12402,12402,12567,24,60000,RENT,0.0,PERSONAL,B,12000,12.21,0,0.20,N,2
12403,12403,6443,22,40000,RENT,0.0,EDUCATION,C,6000,12.87,0,0.15,Y,3
12404,12404,8967,22,50000,RENT,2.0,PERSONAL,C,8000,13.16,0,0.16,Y,2


### step 2: Data exploration

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(12406, 14)

In [4]:
# Row index of the loaded frame.
df.index

RangeIndex(start=0, stop=12406, step=1)

In [5]:
# Column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'index', 'person_age', 'person_income',
       'person_home_ownership', 'person_emp_length', 'loan_intent',
       'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_status',
       'loan_percent_income', 'cb_person_default_on_file',
       'cb_person_cred_hist_length'],
      dtype='object')

In [6]:
# Per-column dtype and non-null count summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12406 entries, 0 to 12405
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  12406 non-null  int64  
 1   index                       12406 non-null  int64  
 2   person_age                  12406 non-null  int64  
 3   person_income               12406 non-null  int64  
 4   person_home_ownership       12406 non-null  object 
 5   person_emp_length           12406 non-null  float64
 6   loan_intent                 12406 non-null  object 
 7   loan_grade                  12406 non-null  object 
 8   loan_amnt                   12406 non-null  int64  
 9   loan_int_rate               12406 non-null  float64
 10  loan_status                 12406 non-null  int64  
 11  loan_percent_income         12406 non-null  float64
 12  cb_person_default_on_file   12406 non-null  object 
 13  cb_person_cred_hist_length  124

In [7]:
# Number of missing values in each column.
df.isna().sum()

Unnamed: 0                    0
index                         0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

In [8]:
# Number of distinct values in each column.
df.nunique()

Unnamed: 0                    12406
index                         12406
person_age                       50
person_income                  2184
person_home_ownership             4
person_emp_length                33
loan_intent                       6
loan_grade                        7
loan_amnt                       576
loan_int_rate                   340
loan_status                       2
loan_percent_income              77
cb_person_default_on_file         2
cb_person_cred_hist_length       29
dtype: int64

In [9]:
# String-valued columns that are label-encoded before modelling.
# NOTE(review): `cb_person_default_on_file` is also an object column but is not
# listed here — confirm it is excluded on purpose.
categorical_columns = [
    "person_home_ownership",
    "loan_intent",
    "loan_grade",
]

# Numeric columns that are standard-scaled before modelling.
real_value_columns = [
    "person_age",
    "person_income",
    "loan_amnt",
    "loan_int_rate",
]

In [10]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df[real_value_columns].describe()

Unnamed: 0,person_age,person_income,loan_amnt,loan_int_rate
count,12406.0,12406.0,12406.0,12406.0
mean,27.59447,60404.5,10187.790182,11.784964
std,6.247178,44547.57,6692.722063,3.391123
min,20.0,4000.0,800.0,5.42
25%,23.0,34615.5,5000.0,8.94
50%,26.0,50915.0,8800.0,11.83
75%,30.0,73000.0,14000.0,14.42
max,144.0,1362000.0,35000.0,23.22


In [11]:
# Count, number of unique values, most frequent value and its frequency
# for each string-valued column.
df[categorical_columns].describe(include="object")

Unnamed: 0,person_home_ownership,loan_intent,loan_grade
count,12406,12406,12406
unique,4,6,7
top,RENT,MEDICAL,B
freq,7299,2480,3550


## step 2b) Pre-processing

### scaling & encoding

In [12]:
# Encoder used to turn string categories into integer codes.
le = LabelEncoder()

# Scaler used to standardise numeric columns (zero mean, unit variance).
sc = StandardScaler()

![image](https://miro.medium.com/max/660/1*qXXqcZXhSTgw9tVhx7zNNw.gif)

In [13]:
# Standardise every numeric column with a single fit (rather than refitting the
# scaler once per column) so that `sc` keeps the mean/scale of all columns in
# `real_value_columns` and can later transform or inverse-transform new data.
# StandardScaler works column-wise, so the scaled values are the same as when
# each column is fitted separately.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split further down, so test-set statistics influence the training
# features — consider fitting on the training split only.
df[real_value_columns] = sc.fit_transform(df[real_value_columns])

# Show the scaled columns.
df[real_value_columns]

Unnamed: 0,person_age,person_income,loan_amnt,loan_int_rate
0,-0.895556,-0.031529,3.707491,1.248910
1,-0.415320,-1.140501,-0.700459,0.319977
2,-0.735477,0.114388,3.707491,1.015939
3,-0.575398,-0.134794,3.707491,0.732836
4,-1.055634,-1.133766,-1.148725,-1.369797
...,...,...,...,...
12401,0.385074,0.945892,-1.298147,-1.145674
12402,-0.575398,-0.009081,0.270784,0.125343
12403,-0.895556,-0.458057,-0.625748,0.319977
12404,-0.895556,-0.233569,-0.326904,0.405497


In [14]:
# Fit a separate LabelEncoder for each categorical column and keep every one of
# them, so the string <-> integer mapping of each column stays available
# afterwards (e.g. label_encoders["loan_grade"].classes_ or
# label_encoders["loan_grade"].inverse_transform(...)).
label_encoders = {}
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    # Replace the string column with its integer codes.
    df[col] = label_encoders[col].fit_transform(df[col])

# Show the encoded columns.
df[categorical_columns]

Unnamed: 0,person_home_ownership,loan_intent,loan_grade
0,3,4,3
1,0,3,2
2,3,3,2
3,3,3,2
4,2,5,0
...,...,...,...
12401,0,2,0
12402,3,4,1
12403,3,1,2
12404,3,4,2


### step 3: separate features & labels

In [15]:
# Model inputs: the scaled numeric columns followed by the encoded categorical ones.
features = [*real_value_columns, *categorical_columns]

# Column to predict, kept as a one-element list so df[target] is a DataFrame.
target = ["loan_status"]

## step 4: split the data into training and testing sets

## x refers to features
## y refers to target



The split produces four objects:

- `x_train`: features for training
- `x_test`: features for testing
- `y_train`: target for training (corresponding to the features for training)
- `y_test`: target for testing (corresponding to the features for testing)

In [16]:
# Number of rows per target class (checks class balance before splitting).
df[target].value_counts()

loan_status
0              6203
1              6203
dtype: int64

In [17]:
# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions equal in both splits; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.2,
    random_state=10,
    stratify=df[target],
)

### step 5: training the model

In [18]:
# Fix the seed so the fitted tree, and therefore the reported metrics, are
# reproducible across runs: scikit-learn permutes the features at every split,
# so ties between equally good splits are otherwise broken differently each run.
model = DecisionTreeClassifier(random_state=10)

In [19]:
# Learn the decision tree from the training features and their labels.
model.fit(x_train, y_train)

### step 6: evaluation of model

In [20]:
# Predict labels for the held-out test features.
ans = model.predict(x_test)

# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, ans))

              precision    recall  f1-score   support

           0       0.81      0.78      0.79      1241
           1       0.79      0.82      0.80      1241

    accuracy                           0.80      2482
   macro avg       0.80      0.80      0.80      2482
weighted avg       0.80      0.80      0.80      2482



In [21]:
# True labels of the test rows, re-indexed from 0 so they line up with the predictions.
actual_ans_df = y_test.reset_index(drop=True)

# Predicted labels as a single-column frame with a fresh 0..n-1 index.
predicted_ans_df = pd.DataFrame(ans)

# Place actual and predicted labels side by side and name the two columns.
result_df = pd.concat([actual_ans_df, predicted_ans_df], axis="columns")
result_df.columns = ["Actual_Answer", "Predicted_ans"]

result_df

Unnamed: 0,Actual_Answer,Predicted_ans
0,0,0
1,1,1
2,1,1
3,1,1
4,0,0
...,...,...
2477,0,0
2478,1,0
2479,1,1
2480,1,1


In [None]:

# Render the fitted tree's decision rules as indented text, labelling each
# split with the corresponding feature column name. Uses the `model` trained
# above (there is no `clf` in this notebook) and `sklearn.tree` imported as `tree`.
text_representation = tree.export_text(model, feature_names=features)
print(text_representation)