In [1]:
# import libraries

import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

# Downloading the dataset 

In [2]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-06-trees/CreditScoring.csv'

In [3]:
!wget $data

--2021-10-15 09:12:29--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-06-trees/CreditScoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 182489 (178K) [text/plain]
Saving to: 'CreditScoring.csv.4'

     0K .......... .......... .......... .......... .......... 28% 1.42M 0s
    50K .......... .......... .......... .......... .......... 56% 59.1M 0s
   100K .......... .......... .......... .......... .......... 84%  107M 0s
   150K .......... .......... ........                        100% 67.8M=0.04s

2021-10-15 09:12:30 (4.84 MB/s) - 'CreditScoring.csv.4' saved [182489/182489]



In [4]:
# head command provides first 10 rows of the dataset
!head CreditScoring.csv

"Status","Seniority","Home","Time","Age","Marital","Records","Job","Expenses","Income","Assets","Debt","Amount","Price"
1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
1,0,1,36,26,1,1,1,46,107,0,0,310,910
1,1,2,60,36,2,1,1,75,214,3500,0,650,1645
1,29,2,60,44,2,1,1,75,125,10000,0,1600,1800
1,9,5,12,27,1,1,1,35,80,0,0,200,1093
1,0,2,60,32,2,1,3,90,107,15000,0,1200,1957


In [5]:
df = pd.read_csv(data)

In [6]:
df.head()

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [7]:
# As we can see that some of the categorical variablea are coded as numbers. For example, Status, Marital, home, records, and job
# We want to understand whar 2,3 in the Marital column means?

In [8]:
# first we will lower-case the column names
df.columns = df.columns.str.lower()
df.columns

Index(['status', 'seniority', 'home', 'time', 'age', 'marital', 'records',
       'job', 'expenses', 'income', 'assets', 'debt', 'amount', 'price'],
      dtype='object')

In [9]:
# To understand the categorical variables :
# This r script provides an overview how categorical variables are coded:https://github.com/gastonstat/CreditScoring/blob/master/Part1_CredScoring_Processing.R
# For example, missing values are coded as 99999999
# From the R code, this is how categorical variable are formatted:
# change factor levels (i.e. categories)
# levels(dd$Status) = c("good", "bad")
# levels(dd$Home) = c("rent", "owner", "priv", "ignore", "parents", "other")
# levels(dd$Marital) = c("single", "married", "widow", "separated", "divorced")
# levels(dd$Records) = c("no_rec", "yes_rec")
# levels(dd$Job) = c("fixed", "partime", "freelance", "others")
# in R indices start with 1.That means 1 is "good" and 2 is "bad"
# Let's translate this R code into Python

# Re-encoding the categorical variables

In [10]:
df.status.value_counts()

1    3200
2    1254
0       1
Name: status, dtype: int64

In [11]:
# Translating these numbers into strings using the function map()
# Python's map() is a built-in function that allows you to process and transform all the items in an iterable without using an explicit for loop, a technique commonly known as mapping. 
# map() is useful when you need to apply a transformation function to each item in an iterable and transform them into a new iterable.

In [12]:
df.status.map({1:'ok', 2:'default',0:'unk'})

0            ok
1            ok
2       default
3            ok
4            ok
5            ok
6            ok
7            ok
8            ok
9       default
10           ok
11           ok
12           ok
13           ok
14      default
15           ok
16           ok
17           ok
18      default
19           ok
20      default
21           ok
22      default
23           ok
24           ok
25           ok
26           ok
27           ok
28      default
29      default
         ...   
4425         ok
4426         ok
4427         ok
4428         ok
4429         ok
4430    default
4431         ok
4432    default
4433         ok
4434    default
4435         ok
4436    default
4437    default
4438         ok
4439    default
4440         ok
4441         ok
4442         ok
4443    default
4444         ok
4445    default
4446         ok
4447         ok
4448         ok
4449    default
4450    default
4451         ok
4452    default
4453         ok
4454         ok
Name: status, Length: 44

In [13]:
status_values = {
    1:'ok', 
    2:'default',
    0:'unk'
}

df.status = df.status.map(status_values)

In [14]:
# now using the same code for  Marital, home, records, and job
home_values = {
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignore',
    5: 'parents',
    6: 'other',
    0: 'unk'
}

df.home = df.home.map(home_values)

marital_values = {
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unk'
}

df.marital = df.marital.map(marital_values)

records_values = {
    1: 'no',
    2: 'yes',
    0: 'unk'
}

df.records = df.records.map(records_values)

job_values = {
    1: 'fixed',
    2: 'partime',
    3: 'freelance',
    4: 'others',
    0: 'unk'
}

df.job = df.job.map(job_values)

In [15]:
df.head()
# now 'status' variable has been transformed to a categorical variable (a string)

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,rent,60,30,married,no,freelance,73,129,0,0,800,846
1,ok,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658
2,default,10,owner,36,46,married,yes,freelance,90,200,3000,0,2000,2985
3,ok,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325
4,ok,0,rent,36,26,single,no,fixed,46,107,0,0,310,910


In [16]:
# now before dealing with missing values, let's look at the descriptive statistics of the data
df.describe().round()
# we can see that varibles 'income', 'assessts', and 'debt' has large values (99999999.0)

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,763317.0,1060341.0,404382.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,8703625.0,10217569.0,6344253.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,99999999.0,99999999.0,99999999.0,5000.0,11140.0


In [17]:
df.income.max()

99999999

In [18]:
# we can replace the large values (99999999.0) with NA's
df.income.replace(to_replace=99999999, value=np.nan)

0       129.0
1       131.0
2       200.0
3       182.0
4       107.0
5       214.0
6       125.0
7        80.0
8       107.0
9        80.0
10      125.0
11      121.0
12      199.0
13      170.0
14       50.0
15      131.0
16      330.0
17      200.0
18      130.0
19      137.0
20      107.0
21      324.0
22      112.0
23      140.0
24      143.0
25      130.0
26      180.0
27      251.0
28       85.0
29        NaN
        ...  
4425    140.0
4426    154.0
4427    150.0
4428    175.0
4429    260.0
4430    170.0
4431    172.0
4432    150.0
4433      NaN
4434     92.0
4435    199.0
4436      0.0
4437    130.0
4438    200.0
4439     50.0
4440      0.0
4441      0.0
4442    219.0
4443     63.0
4444    242.0
4445    100.0
4446     69.0
4447    190.0
4448    160.0
4449     77.0
4450     92.0
4451     75.0
4452     90.0
4453    140.0
4454    140.0
Name: income, Length: 4455, dtype: float64

In [19]:
# check the maximum value again
df.income.replace(to_replace=99999999, value=np.nan). max()
# and it is not a large value anymore

959.0

In [20]:
# we can perfrom the above opetation in a loop for three columns (income, assets, debt) with large values
for c in ['assets','debt']:
    df[c]=df[c].replace(to_replace=99999999, value=np.nan)

In [21]:
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4408.0,4437.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,763317.0,5403.0,343.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,8703625.0,11573.0,1246.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3000.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,99999999.0,300000.0,30000.0,5000.0,11140.0


In [22]:
# Previously we saw that the variable 'status' has an unknown value
df.status.value_counts()
# but we are interested in whether customer is defaulted 

ok         3200
default    1254
unk           1
Name: status, dtype: int64

In [23]:
# let' filter the data which is not unknwon
df = df[df.status != 'unk'].reset_index(drop=True)

# Doing the train/validation/test split 

In [24]:
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=11)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=11)

In [25]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [26]:
# defining target variable
# because we re-encoded 'ok', and 'default' variables, now we will code them again into numbers
y_train = (df_train.status == 'default').astype('int').values # converting 'default' into 1(True) otherwise 0(False)
y_val = (df_val.status == 'default').astype('int').values
y_test = (df_test.status == 'default').astype('int').values

In [27]:
# removing default variables from the default dataframes
del df_train['status']
del df_val['status']
del df_test['status']

In [28]:
df_train

Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,10,owner,36,36,married,no,freelance,75,0,10000.0,0.0,1000,1400
1,6,parents,48,32,single,yes,fixed,35,85,0.0,0.0,1100,1330
2,1,parents,48,40,married,no,fixed,75,121,0.0,0.0,1320,1600
3,1,parents,48,23,single,no,partime,35,72,0.0,0.0,1078,1079
4,5,owner,36,46,married,no,freelance,60,100,4000.0,0.0,1100,1897
5,0,parents,48,28,divorced,yes,fixed,35,280,0.0,0.0,1300,2058
6,3,parents,60,24,single,no,fixed,35,100,0.0,0.0,1500,2100
7,1,rent,60,33,married,no,partime,85,250,5000.0,0.0,1700,1878
8,0,owner,60,36,married,no,partime,90,113,4000.0,198.0,400,790
9,1,owner,24,32,married,no,partime,45,182,4000.0,200.0,175,175


# Decision Trees

In [39]:
# creating a function which is implementing a decision tree in credit scoring data
# if and else condition: if a customer has a part-time job then they might 'default' else the case is 'ok' 
# second case, if customer has assessts then they will be 'ok', otherwise 'default'

def assess_risk(client):
    if client['records'] == 'yes':
        if client['job'] == 'parttime':
            return 'default'
        else:
            return 'ok'
    else:
        if client['assets'] > 6000:
            return 'ok'
        else:
            return 'default'

In [30]:
# turning first record of the train dataset into dictionary
df_train.iloc[0].to_dict()
# we can see that the job doesn't fall in our criteria of the decision tree

{'seniority': 10,
 'home': 'owner',
 'time': 36,
 'age': 36,
 'marital': 'married',
 'records': 'no',
 'job': 'freelance',
 'expenses': 75,
 'income': 0,
 'assets': 10000.0,
 'debt': 0.0,
 'amount': 1000,
 'price': 1400}

In [33]:
xi = df_train.iloc[0].to_dict()

In [40]:
assess_risk(xi)
# the job with freelance description shows that this customer will be fine. They will not default.

'ok'

In [41]:
xi # becasue they have 10000 assets that's why this customer is 'ok'

{'seniority': 10,
 'home': 'owner',
 'time': 36,
 'age': 36,
 'marital': 'married',
 'records': 'no',
 'job': 'freelance',
 'expenses': 75,
 'income': 0,
 'assets': 10000.0,
 'debt': 0.0,
 'amount': 1000,
 'price': 1400}

In [57]:
# The rules encoded above can be learned from data using decision tree algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score

In [49]:
# Now we will turn our data frame into list of dictionaries and then turn them into feature matrix and then train the model
train_dicts = df_train.fillna(0).to_dict(orient='records')

In [50]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

In [51]:
X_train
# this is our feature matrix

array([[3.60e+01, 1.00e+03, 1.00e+04, ..., 0.00e+00, 1.00e+01, 3.60e+01],
       [3.20e+01, 1.10e+03, 0.00e+00, ..., 1.00e+00, 6.00e+00, 4.80e+01],
       [4.00e+01, 1.32e+03, 0.00e+00, ..., 0.00e+00, 1.00e+00, 4.80e+01],
       ...,
       [1.90e+01, 4.00e+02, 0.00e+00, ..., 0.00e+00, 1.00e+00, 2.40e+01],
       [4.30e+01, 2.50e+03, 1.80e+04, ..., 0.00e+00, 1.50e+01, 4.80e+01],
       [2.70e+01, 4.50e+02, 5.00e+03, ..., 1.00e+00, 1.20e+01, 4.80e+01]])

In [52]:
# looking at the feature names
dv.get_feature_names()
# as we can see that all numerical features are left intact and our categorical variables are encoded

['age',
 'amount',
 'assets',
 'debt',
 'expenses',
 'home=ignore',
 'home=other',
 'home=owner',
 'home=parents',
 'home=private',
 'home=rent',
 'home=unk',
 'income',
 'job=fixed',
 'job=freelance',
 'job=others',
 'job=partime',
 'job=unk',
 'marital=divorced',
 'marital=married',
 'marital=separated',
 'marital=single',
 'marital=unk',
 'marital=widow',
 'price',
 'records=no',
 'records=yes',
 'seniority',
 'time']

In [53]:
# training our decision tree
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
# intially it gives an error due to Nan values. Therefore, we will fillna with 0 in train_dicts

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [54]:
#let's test the validation dataset
val_dicts = df_val.fillna(0).to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [59]:
y_pred = dt.predict_proba(X_val)[:,1]

In [60]:
# calculating AUC
roc_auc_score(y_val,y_pred)
# not the best AUC score

0.6304339928798044

In [61]:
# let's see what is the AUC score for the training data
y_pred = dt.predict_proba(X_train)[:,1]
roc_auc_score(y_train,y_pred)


1.0

In [63]:
# As we can see that the AUC is 0.63 on the validation data and 1 on the training data
# This case is known as OVERFITTING
# OVERFITTING is when our model simply memorises the data in such a way that when it sees a new example and doesn't know how to deal with this example
# becasue it doesn't look like any of the memorized data points
# so the model memorizes the data but fails to generalize because for new unseen examples none of the memorized examples look like this new one
# and the output is completely incorrect
# the reason is because the depth of our tree is high. We can try now to restrict the depth of the tree and rearrange the code a bit

In [64]:
dt = DecisionTreeClassifier(max_depth=3)
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [65]:
y_pred = dt.predict_proba(X_train)[:,1]
auc = roc_auc_score(y_train,y_pred)
print('train:', auc)

y_pred = dt.predict_proba(X_val)[:,1]
auc = roc_auc_score(y_val,y_pred)
print('val:', auc)

# we see that after restricting the depth of the tree, the performance is significantly better

train: 0.7727018900343643
val: 0.7376244217868301
