In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [57]:
data = pd.read_csv("data/loan_data.csv")
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


Okay, I'll drop the most irrelevant column - *Loan_ID*

In [58]:
data.drop("Loan_ID", axis=1, inplace=True)
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


Now I'll need to encode some textual data - before that let's see the type of data in each column:

In [59]:
data.dtypes

Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

And let's see if there are any *NaN* values:

In [60]:
data.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

## Getting Rid of NaN Values

There are *NaN* values in almost every column. It's a bit problematic because this dataset isn't big.
I'll deal with missing values of non-numerical columns by replacing it with the value which occurs most number of times for that column.

For example:

In [61]:
data["Gender"].value_counts()

Male      489
Female    112
Name: Gender, dtype: int64

There are 489 males in the dataset, so I'll fill *NaN*-s with "Male":

In [62]:
data["Gender"].fillna("Male", inplace=True)

I'll do the same thing with every other column:

In [63]:
data["Married"].value_counts()

Yes    398
No     213
Name: Married, dtype: int64

In [64]:
data["Married"].fillna("Yes", inplace=True)

In [65]:
data["Dependents"].value_counts()

0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64

In [66]:
data["Dependents"].fillna(0, inplace=True)

In [67]:
data["Self_Employed"].value_counts()

No     500
Yes     82
Name: Self_Employed, dtype: int64

In [68]:
data["Self_Employed"].fillna("No", inplace=True)

And now I'll deal with missing numerical data by filling it with the mean of the column:

In [69]:
data["LoanAmount"].fillna(data["LoanAmount"].mean(), inplace=True)
data["Loan_Amount_Term"].fillna(data["Loan_Amount_Term"].mean(), inplace=True)
data["Credit_History"].fillna(data["Credit_History"].mean(), inplace=True)

And if I check the count of missing values now:

In [70]:
data.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

## Dealing with String Data

I'll basically Encode each column of type String which has only 2 possible values:

In [71]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(data["Gender"].astype(str))

data["Gender"] = le.transform(data["Gender"])
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,1,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,1,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,1,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,1,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [72]:
le.fit(data["Married"].astype(str))
data["Married"] = le.transform(data["Married"])

In [73]:
data["Education"].value_counts()

Graduate        480
Not Graduate    134
Name: Education, dtype: int64

In [74]:
le.fit(data["Education"].astype(str))
data["Education"] = le.transform(data["Education"])

In [75]:
le.fit(data["Self_Employed"].astype(str))
data["Self_Employed"] = le.transform(data["Self_Employed"])

## Dealing With Problematic Data

The values of columns "Dependents" and "Property Area" are problematic because they are made of more than 2 categories.

Here is a proof for that:

In [76]:
data["Dependents"].value_counts()

0     345
1     102
2     101
3+     51
0      15
Name: Dependents, dtype: int64

In [77]:
data["Property_Area"].value_counts()

Semiurban    233
Urban        202
Rural        179
Name: Property_Area, dtype: int64

As you can see, the "Dependants" column contains a value "3+" which isn't really good for the ML model. I'll need to replace it with just "3" - in numerical form obviously.

In [78]:
data["Dependents"].replace("3+", 3, inplace=True)

In [79]:
data["Dependents"].value_counts()

0    345
1    102
2    101
3     51
0     15
Name: Dependents, dtype: int64

Okay, that's dealt with. I'll now deal with "Property_Area" column. It's tricky because it has 3 possible string values. For that, I'll need to replace the column with the dummy variables.

Here's what's my end goal. Instead of having this:

In [80]:
data["Property_Area"][:10]

0        Urban
1        Rural
2        Urban
3        Urban
4        Urban
5        Urban
6        Urban
7    Semiurban
8        Urban
9    Semiurban
Name: Property_Area, dtype: object

I want to have this:

In [81]:
pd.get_dummies(data["Property_Area"][:10])

Unnamed: 0,Rural,Semiurban,Urban
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
5,0,0,1
6,0,0,1
7,0,1,0
8,0,0,1
9,0,1,0


In this form ML algorithm can understand the data. **NOTE:** I need to avoid multicollinearity - having one variable to be a perfect predictor of the other - it can mess up the algorithm. Pandas has inbuild parameter just for that called `drop_first`:

In [82]:
propery_area = pd.get_dummies(data["Property_Area"], drop_first=True)
propery_area.head()

Unnamed: 0,Semiurban,Urban
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


If the value of urban is 1 - Urban<br>
If the value of suburban is 1 - Suburban<br>
If both are 0 - Rural

Now I can just concat that DataFrame to the original one:

In [83]:
data = pd.concat([data, propery_area], axis=1)
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Semiurban,Urban
0,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,Urban,Y,0,1
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,Rural,N,0,0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,Urban,Y,0,1
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,Urban,Y,0,1
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,Urban,Y,0,1


As the "Property_Area" column is now encoded, we don't need it in the dataset anymore.

In [84]:
data.drop("Property_Area", axis=1, inplace=True)

And here we have the ready dataset!

In [85]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Semiurban,Urban
0,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,Y,0,1
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,N,0,0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,Y,0,1
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,Y,0,1
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,Y,0,1


## Machine Learning Model

In [86]:
X = data.drop("Loan_Status", axis=1)
y = data["Loan_Status"]  # that's what I'm trying to predict

In [87]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)  # 30% for the training

In [122]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier(criterion="entropy", max_depth=2, min_samples_leaf=1)
dtc.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [123]:
pred = dtc.predict(X_test)

In [124]:
from sklearn.metrics import accuracy_score

print("Decision Tree Accuracy: {:.2f}%".format(accuracy_score(y_test, pred) * 100))

Decision Tree Accuracy: 80.54%


Not the best accuracy. It was around 72% originally, by tweaking the parameters I was able to get it up slightly.