In [1]:
import os, sys, io, requests
import numpy as np
import pandas as pd
import sklearn
import xgboost as xgb
import matplotlib

In [2]:
#Download the Dataset and store in /data/raw
url = 'https://github.com/gastonstat/CreditScoring/raw/master/CreditScoring.csv'
save_file_path = os.path.join('data', 'raw', 'CreditScoring.csv')

#Download the file if it does not already exist
if not os.path.exists(save_file_path):
    file_stream = requests.get(url, allow_redirects=True, stream=True)
    with open(save_file_path, 'wb+') as save_file:
        #save_file.write(file_stream)
        for chunk in file_stream.iter_content(chunk_size=1024 * 8):
                    if chunk:
                        save_file.write(chunk)
                        save_file.flush()
                        os.fsync(save_file.fileno())


In [3]:
#import the csv file into a pandas dataframe
column_names = ["Status","Seniority","Home","Time","Age","Marital","Records","Job","Expenses","Income", \
                "Assets","Debt","Amount","Price"]

df = pd.read_csv(save_file_path, names=None)
df.head()

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [4]:
df.describe()

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,1.281257,7.987205,2.657015,46.441751,37.077666,1.879012,1.173513,1.67587,55.568799,763317.0,1060341.0,404382.0,1039.021773,1462.875645
std,0.450162,8.173444,1.610467,14.655225,10.984856,0.643748,0.378733,0.954035,19.515878,8703625.0,10217570.0,6344253.0,474.543007,628.089913
min,0.0,0.0,0.0,6.0,18.0,0.0,1.0,0.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,1.0,2.0,2.0,36.0,28.0,2.0,1.0,1.0,35.0,80.0,0.0,0.0,700.0,1117.5
50%,1.0,5.0,2.0,48.0,36.0,2.0,1.0,1.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,2.0,12.0,4.0,60.0,45.0,2.0,1.0,3.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,2.0,48.0,6.0,72.0,68.0,5.0,2.0,4.0,180.0,100000000.0,100000000.0,100000000.0,5000.0,11140.0


In [5]:
#Check how many nulls there are, and decide a manner of filling these nulls in.
df.isna().sum()

Status       0
Seniority    0
Home         0
Time         0
Age          0
Marital      0
Records      0
Job          0
Expenses     0
Income       0
Assets       0
Debt         0
Amount       0
Price        0
dtype: int64

In [6]:
#reduce the column names to lower case for convention's sake
df.columns = df.columns.str.lower()
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


# The column significance, according to the dataset source:
1 Status	--- credit status
2 Seniority	--- job seniority (years)
3 Home	    --- type of home ownership
4 Time	    --- time of requested loan
5 Age	    --- client's age
6 Marital	--- marital status
7 Records	--- existance of records
8 Job	    --- type of job
9 Expenses	--- amount of expenses
10 Income	--- amount of income
11 Assets	--- amount of assets
12 Debt	    --- amount of debt
13 Amount	--- amount requested of loan
14 Price	--- price of good

Status is a categorical field that has been encoded as follows:
   “1” means “OK”, and the value “2” means “default”, and “0” means that the value is missing.


In [7]:
#make dictionaries that will map from the encoding to their categorical values to create a dataframe that is more readable

#for column status
status_column_mapping = {
    0 : 'unknown',
    1 : 'ok',
    2 : 'default'
}

#for column home
home_column_mapping = {
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignore',
    5: 'parents',
    6: 'other',
    0: 'unknown'
}

#for column marital
marital_column_mapping = {
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unknown'
}

#for column records
records_column_mapping = {
     1: 'no',
     2: 'yes',
     0: 'unknown'
}

#for column job
job_column_mapping = {
    1: 'fixed',
    2: 'partime',
    3: 'freelance',
    4: 'others',
    0: 'unknown'
}

def dictForStr(dict_name: str) :
    if dict_name=='home':
        return home_column_mapping
    elif dict_name=='job':
        return job_column_mapping
    elif dict_name=='status':
        return status_column_mapping
    elif dict_name=='marital':
        return marital_column_mapping
    elif dict_name=='records':
        return records_column_mapping

df_full = df.copy()
cols = ['home', 'job', 'status', 'marital', 'records'] 
for c in cols:
    df_full[c] = df_full[c].map(dictForStr(c))



df_full.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,rent,60,30,married,no,freelance,73,129,0,0,800,846
1,ok,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658
2,default,10,owner,36,46,married,yes,freelance,90,200,3000,0,2000,2985
3,ok,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325
4,ok,0,rent,36,26,single,no,fixed,46,107,0,0,310,910


In [8]:
for col in ['home', 'job', 'status', 'marital', 'records'] :
    print("value_count for column", col, "\n",df_full[col].value_counts(), "\n---------\n")

value_count for column home 
 owner      2107
rent        973
parents     783
other       319
private     247
ignore       20
unknown       6
Name: home, dtype: int64 
---------

value_count for column job 
 fixed        2806
freelance    1024
partime       452
others        171
unknown         2
Name: job, dtype: int64 
---------

value_count for column status 
 ok         3200
default    1254
unknown       1
Name: status, dtype: int64 
---------

value_count for column marital 
 married      3241
single        978
separated     130
widow          67
divorced       38
unknown         1
Name: marital, dtype: int64 
---------

value_count for column records 
 no     3682
yes     773
Name: records, dtype: int64 
---------



In [9]:
#status is the label, hence unknown values are removed
df_full = df_full[df_full.status != 'unknown']

#ensure the label col has usable values
df_full['status'].value_counts()

ok         3200
default    1254
Name: status, dtype: int64

In [10]:
#columns income, assets and debt are numeric columns that have the value 99999999 to represent missing values
#convert these to np.nan for a clearer understanding
for c in ['income', 'assets', 'debt']:
    df_full[c] = df_full[c].replace(to_replace=99999999, value=np.nan)

In [11]:
#View the distribution of the numerical columns of the full, un-encoded dataset
df_full.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4454.0,4454.0,4454.0,4454.0,4420.0,4407.0,4436.0,4454.0,4454.0
mean,8.0,46.0,37.0,56.0,131.0,5404.0,343.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,86.0,11574.0,1246.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1117.0
50%,5.0,48.0,36.0,51.0,120.0,3000.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,165.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,959.0,300000.0,30000.0,5000.0,11140.0


At this point, since ther dataset is well-understood and labels are ok, splitting the dataset between test and training sets will be done. After which cleaning and additional data-prep for training will follow...

Split Srategy:: Training -> 60%, Validation -> 20%, testing -> 20%

In [12]:
#use train_test_split from scikit-learn to split the data. First split between 60%-40%, 
#and then the 40% block will be split in two 20% blocks.

from sklearn.model_selection import train_test_split

df_train_, df_test      = train_test_split(df_full, train_size=0.8, test_size=0.2, random_state=42, shuffle=True)
df_train, df_validation = train_test_split(df_train_, train_size=0.75, test_size=0.25)

print(f"The num of records in test set is {len(df_test)}, while the training set has {len(df_train)} records \
and the validation set has {len(df_validation)} records")



The num of records in test set is 891, while the training set has 2672 records and the validation set has 891 records


status == 'default' when the loan was defaulted on. The model is meant to prodict loan defaults. Therefore the label column will have a numeric value of 1 when there is a default (i.e., status == 1, when status == 'default' and status == 0 otherwise).

In [13]:
y_train_ = df_train[df_train['status']=='default']
y_val_   = df_validation[df_validation['status']=='default']

y_train = (df_train.status == 'default').values
y_val = (df_validation.status == 'default').values

y_train
print(y_val)

[False False False False False False False False  True False False False
 False False False  True False False False False False False False  True
 False False False False  True False  True False False False  True  True
 False False False False False False False False False False  True  True
 False  True  True False False False  True  True  True False False False
 False False  True False False False  True False False False False False
 False False  True  True False False  True False  True  True False False
 False  True False False  True  True False False False  True False  True
  True False False False False  True False False False False False False
 False False False False False  True  True False False False  True False
 False False False False False False False False False False  True False
  True False False False False False False False  True False False False
 False False  True False False  True False False False False False False
  True  True False False  True False False  True Fa

In [14]:
#drop the status column to ensure the model does not end up training on this column which is actually the target/label.
del df_train['status']
del df_validation['status']

df_train.head()

Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
2943,5,private,36,32,married,no,fixed,75,101.0,40000.0,0.0,1400,1444
3427,20,owner,24,54,married,no,fixed,35,175.0,11500.0,0.0,500,1080
3628,14,owner,48,48,married,no,fixed,45,112.0,6000.0,0.0,800,1037
3819,1,owner,48,20,single,no,fixed,35,53.0,2000.0,1600.0,1100,1100
3127,0,rent,60,36,married,no,others,47,121.0,0.0,0.0,900,1423


In [15]:
#deal with missing values: fill all NaNs with 0
#the alternative would be to plug in various kinds of cental-placement figures like mead, mode etc.
#I'm choosing to make this 0 instead of placing computed values that don't actually exist in the data.

df_train = df_train.fillna(0)
df_validation = df_validation.fillna(0)

#check how many na-s we have among the numeric columns
num_of_na = 0
for col in ['seniority', 'time', 'expenses', 'income', 'assets', 'debt', 'amount', 'price']:
    num_of_na += df_train[col].isna().sum()

print(f"The total number of NaN in the numeric columns is {num_of_na}")


The total number of NaN in the numeric columns is 0


In [16]:
#turn each row into a dictionary
dict_train = df_train.to_dict(orient='records')
dict_val = df_validation.to_dict(orient='records')

#print out one row/dict to see structure
print(dict_train[0])

{'seniority': 5, 'home': 'private', 'time': 36, 'age': 32, 'marital': 'married', 'records': 'no', 'job': 'fixed', 'expenses': 75, 'income': 101.0, 'assets': 40000.0, 'debt': 0.0, 'amount': 1400, 'price': 1444}


In [17]:
#The above created dictionaries can be fed into sklearn's DictVectorizer
#from sklear documentation:
#  This transformer turns lists of mappings (dict-like objects) of feature names to feature values into Numpy arrays or 
#  scipy.sparse matrices for use with scikit-learn estimators.
#  When feature values are strings, this transformer will do a binary one-hot (aka one-of-K) coding: one boolean-valued feature 
#  is constructed for each of the possible string values that the feature can take on
#  Note that this transformer will only do a binary one-hot encoding when feature values are of type string. If categorical 
#  features are represented as numeric values such as int or iterables of strings, the DictVectorizer can be followed by 
#  OneHotEncoder to complete binary one-hot encoding.

from sklearn.feature_extraction import DictVectorizer

#create DictVectorizer object
dv = DictVectorizer(sparse=False) #setting sparce to true would produce scipy.sparce output, we seek numpy output instead

#make feature 
X_Train = dv.fit_transform(dict_train)
X_val   = dv.transform(dict_val) 

print(X_val)

[[5.900e+01 4.000e+02 4.000e+03 ... 0.000e+00 0.000e+00 1.200e+01]
 [3.500e+01 2.000e+03 1.500e+04 ... 0.000e+00 6.000e+00 4.800e+01]
 [4.900e+01 1.800e+03 2.100e+04 ... 0.000e+00 0.000e+00 1.200e+01]
 ...
 [4.900e+01 1.650e+03 3.572e+03 ... 0.000e+00 1.500e+01 3.600e+01]
 [3.100e+01 1.100e+03 5.000e+03 ... 0.000e+00 0.000e+00 4.800e+01]
 [2.900e+01 4.500e+02 6.000e+03 ... 0.000e+00 5.000e+00 1.800e+01]]


#Decision Tree Section


In [18]:
#Since this is a classification task the exact Decision Tree algo being used is the DesicionTree Classifier
from sklearn.tree import DecisionTreeClassifier

#create the DecisionTreeClassifier objerct and fit the data
dt = DecisionTreeClassifier()
dt.fit(X_Train, y_train)

DecisionTreeClassifier()

In [19]:
#Now that the model has been trained, it's performance is to be evaluated
#AUC (area under the ROC Curve) is being used as it's a well-regarded tool for binary classification tasks
from sklearn.metrics import roc_auc_score

#obtain scores to evaluate with auc
y_pred = dt.predict_proba(X_Train)[:,1]
pred_on_training = roc_auc_score(y_train, y_pred)

y_pred = dt.predict_proba(X_val)[:, 1]
pred_on_validation = roc_auc_score(y_val, y_pred)

print(f"The score against training set is {pred_on_training*100:.4f}%,", 
      f"while the score against validation set is {pred_on_validation*100:.4f}%.")


The score against training set is 100.0000%, while the score against validation set is 73.3210%.


Observation: The great efficiency of the model on the training set, as opposed to the validation set, seems to show overfitting. Thus there seems to be a relative lack of the ability to generalize to unkown data.

In [20]:
#tweak the max_depth parameter of the DecisiosnTreeClassifier to reduce over-fitting
#max_depth controls the complexity of the tree by putting a limit to the number of levels of branches the tree can make
#lesser complexity is hoped to prevent over-fitting of the model to the training set.

dt = DecisionTreeClassifier(max_depth=2)
dt.fit(X_Train, y_train)

DecisionTreeClassifier(max_depth=2)

In [21]:
#Since the tree itself is being controlled, it makes sense to see the tree generated

from sklearn.tree import export_text

tree_as_text = export_text(dt, feature_names=dv.feature_names_)
print(tree_as_text)

|--- seniority <= 2.50
|   |--- records=no <= 0.50
|   |   |--- class: True
|   |--- records=no >  0.50
|   |   |--- class: False
|--- seniority >  2.50
|   |--- records=yes <= 0.50
|   |   |--- class: False
|   |--- records=yes >  0.50
|   |   |--- class: False



In [22]:
#obtain scores to evaluate with auc
y_pred = dt.predict_proba(X_Train)[:,1]
pred_on_training = roc_auc_score(y_train, y_pred)

y_pred = dt.predict_proba(X_val)[:, 1]
pred_on_validation = roc_auc_score(y_val, y_pred)

print(f"The score against training set is {pred_on_training*100:.4f}%,", 
      f"while the score against validation set is {pred_on_validation*100:.4f}%.")


The score against training set is 71.2424%, while the score against validation set is 72.8592%.


The lessened score againsdt the training set and the increased score against the validation set indicates lessening of overfitting due to the max_depth parameter being set to 2.

In [23]:
#To continue the process, the model will be tuned for parameter according to two features: max_depth, min_leaf_size

print('Tune Max_Depth')
#tune max_depth
for depth in list(range(1, 10, 2)):
    dt = DecisionTreeClassifier(max_depth=depth)
    dt.fit(X_Train, y_train)
    y_pred = dt.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)
    print(f"for depth of {depth}, auc is {auc*100:.4f}%")

print('\n\nTune Min_Leaf')
#tune for min_leaf_size
for depth in list(range(1, 10, 2)):
    for min_leaf_size in list(range(1, 10, 2))+[20, 30, 40, 50]:
        dt = DecisionTreeClassifier(max_depth=depth, min_samples_leaf=min_leaf_size)
        dt.fit(X_Train, y_train)
        y_pred = dt.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)
        print(f"for depth of {depth} and min_samples_leaf of {min_leaf_size}, the auc is {auc*100:.4f}%")



Tune Max_Depth
for depth of 1, auc is 63.8137%
for depth of 3, auc is 76.7225%
for depth of 5, auc is 80.0887%
for depth of 7, auc is 78.3574%
for depth of 9, auc is 75.7999%


Tune Min_Leaf
for depth of 1 and min_samples_leaf of 1, the auc is 63.8137%
for depth of 1 and min_samples_leaf of 3, the auc is 63.8137%
for depth of 1 and min_samples_leaf of 5, the auc is 63.8137%
for depth of 1 and min_samples_leaf of 7, the auc is 63.8137%
for depth of 1 and min_samples_leaf of 9, the auc is 63.8137%
for depth of 1 and min_samples_leaf of 20, the auc is 63.8137%
for depth of 1 and min_samples_leaf of 30, the auc is 63.8137%
for depth of 1 and min_samples_leaf of 40, the auc is 63.8137%
for depth of 1 and min_samples_leaf of 50, the auc is 63.8137%
for depth of 3 and min_samples_leaf of 1, the auc is 76.7225%
for depth of 3 and min_samples_leaf of 3, the auc is 76.7225%
for depth of 3 and min_samples_leaf of 5, the auc is 76.7225%
for depth of 3 and min_samples_leaf of 7, the auc is 76.7225%

In [24]:
#The best auc is given by the following combination
#for depth of 9 and min_samples_leaf of 20, the auc is 80.4157%

#train final model with those parameters
dt = DecisionTreeClassifier(max_depth=9, min_samples_leaf=20)
dt.fit(X_Train, y_train)

DecisionTreeClassifier(max_depth=9, min_samples_leaf=20)