In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals.six import StringIO  
from IPython.display import Image 
from sklearn.tree import export_graphviz
import pydot

In [3]:
file = '..\data\db.csv'
default_borrower = pd.read_csv(file)

### Discretize Annual Income into  [0, 100], (100, 200], (200, )

In [4]:
# Discretize AI into specified bins
bins = pd.IntervalIndex.from_tuples([(0, 100), (101, 200), (200, np.inf)])
default_borrower['Annual Income'] = pd.cut(default_borrower['Annual Income'], bins=bins)
default_borrower

Unnamed: 0,Home Owner,Marital Status,Annual Income,Default Borrower
0,Y,Single,"(101.0, 200.0]",N
1,N,Married,"(0.0, 100.0]",N
2,N,Single,"(0.0, 100.0]",N
3,Y,Married,"(101.0, 200.0]",N
4,N,Divorced,"(0.0, 100.0]",Y
5,N,Married,"(0.0, 100.0]",N
6,Y,Divorced,"(200.0, inf]",N
7,N,Single,"(0.0, 100.0]",Y
8,N,Married,"(0.0, 100.0]",N
9,N,Single,"(0.0, 100.0]",Y


#### Entropy(data) = $$ -\frac{3}{10} \log_2 \frac{3}{10} - \frac{7}{10} \log_2 \frac{7}{10} = 0.8813 $$

### HOME OWNER

#### Entropy(Home Owner=Y) = $$ -\frac{3}{3} \log_2 \frac{3}{3} - \frac{0}{3} \log_2 \frac{0}{3} = 0 $$

#### Entropy(Home Owner=N) = $$ -\frac{3}{7} \log_2 \frac{3}{7} - \frac{4}{7} \log_2 \frac{4}{7} = 0.9852 $$

#### Info Gain(Home Owner) = $$ 0.8813 -(\frac{3}{10} \times 0 + \frac{7}{10} \times 0.9852) = 0.19166 $$


### MARITAL STATUS

#### Entropy(Marital Status=Single) = $$ -\frac{2}{4} \log_2 \frac{2}{4} - \frac{2}{4} \log_2 \frac{2}{4} = 1 $$

#### Entropy(Marital Status=Divorced) = $$ -\frac{1}{2} \log_2 \frac{1}{2} - \frac{1}{2} \log_2 \frac{4}{7} = 1 $$

#### Entropy(Marital Status=Married) = $$ -\frac{0}{4} \log_2 \frac{0}{4} - \frac{4}{4} \log_2 \frac{4}{4} = 0 $$

#### Info Gain(Marital Status) = $$ 0.8813 -(\frac{4}{10} \times 1 + \frac{2}{10} \times 1 + \frac{4}{10} \times 0) = 0.2813 $$


### ANNUAL INC

#### Entropy(Annual Inc=[0, 100]) = $$ -\frac{3}{7} \log_2 \frac{3}{7} - \frac{4}{7} \log_2 \frac{4}{7} = 0.9852 $$

#### Entropy(Annual Inc=(101, 200]) = $$ -\frac{0}{2} \log_2 \frac{0}{2} - \frac{2}{2} \log_2 \frac{2}{2} = 0 $$

#### Entropy(Annual Inc=(201, ]) = $$ -\frac{0}{1} \log_2 \frac{0}{1} - \frac{1}{1} \log_2 \frac{1}{1} = 0 $$

#### Info Gain(Annual Inc) = $$ 0.8813 -(\frac{7}{10} \times 0.9852 + \frac{2}{10} \times 0 + \frac{1}{10} \times 0) = 0.19166 $$

## Split on the attribute with the highest Info Gain: Marital Status. 


![title](img/tree1.png)

## After Splitting on Marital Status, repeat the above steps for all the children of Marital Status
## Current child: Marital Status=Married

In [5]:
display(default_borrower[default_borrower['Marital Status'] == 'Married'])

Unnamed: 0,Home Owner,Marital Status,Annual Income,Default Borrower
1,N,Married,"(0.0, 100.0]",N
3,Y,Married,"(101.0, 200.0]",N
5,N,Married,"(0.0, 100.0]",N
8,N,Married,"(0.0, 100.0]",N


## Entropy(child) = 0
## Since this subset is 100% Pure (DB=N: 4, DB=Y: 0), make this a leaf node

![title](img/tree2.png)

## Current child: Marital Status=Single

In [6]:
display(default_borrower[default_borrower['Marital Status'] == 'Single'])

Unnamed: 0,Home Owner,Marital Status,Annual Income,Default Borrower
0,Y,Single,"(101.0, 200.0]",N
2,N,Single,"(0.0, 100.0]",N
7,N,Single,"(0.0, 100.0]",Y
9,N,Single,"(0.0, 100.0]",Y


#### Entropy(child) = $$ -\frac{2}{4} \log_2 \frac{2}{4} - \frac{2}{4} \log_2 \frac{2}{4} = 1 $$

### HOME OWNER

#### Entropy(Home Owner=Y) = $$ -\frac{0}{1} \log_2 \frac{0}{1} - \frac{1}{1} \log_2 \frac{1}{1} = 0 $$

#### Entropy(Home Owner=N) = $$ -\frac{2}{3} \log_2 \frac{2}{3} - \frac{1}{3} \log_2 \frac{1}{3} = 0.9183 $$

#### Info Gain(Home Owner) = $$ 1 -(\frac{1}{4} \times 0 + \frac{3}{4} \times 0.9183) = 0.3133 $$


### ANNUAL INC

#### Entropy(Annual Inc=[0, 100]) = $$ -\frac{2}{3} \log_2 \frac{3}{3} - \frac{1}{3} \log_2 \frac{1}{3} = 0.9183 $$

#### Entropy(Annual Inc=(101, 200]) = $$ -\frac{0}{1} \log_2 \frac{0}{1} - \frac{1}{1} \log_2 \frac{1}{1} = 0 $$

#### Info Gain(Annual Inc) = $$ 1 -(\frac{3}{4} \times 0.9183 + \frac{1}{4} \times 0) = 0.3133 $$

## Split on the attribute with the highest Info Gain: Annual Income
#### Note: It is okay to select Home Owner as well

![title](img/tree3.png)

## Current child: Marital Status=Single AND Annual Inc=(101, 200]

In [7]:
default_borrower.iloc[[0]]

Unnamed: 0,Home Owner,Marital Status,Annual Income,Default Borrower
0,Y,Single,"(101.0, 200.0]",N


## Entropy(child) = 0
## Since this subset is 100% Pure (DB=N: 1, DB=Y: 0), make this a leaf node

![title](img/tree4.png)

# Keep repeating the above steps until pure subsets are reached, or the size of the subset falls below a threshold