In [18]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals.six import StringIO  
from IPython.display import Image 
from sklearn.tree import export_graphviz
import pydot

In [3]:
file = '..\data\db.csv'
default_borrower = pd.read_csv(file)

### Discretize Annual Income into  [0, 100], (100, 200], (200, )

In [4]:
# Discretize AI into specified bins
bins = pd.IntervalIndex.from_tuples([(0, 100), (101, 200), (200, np.inf)])
default_borrower['Annual Income'] = pd.cut(default_borrower['Annual Income'], bins=bins)
default_borrower

Unnamed: 0,Home Owner,Marital Status,Annual Income,Default Borrower
0,Y,Single,"(101.0, 200.0]",N
1,N,Married,"(0.0, 100.0]",N
2,N,Single,"(0.0, 100.0]",N
3,Y,Married,"(101.0, 200.0]",N
4,N,Divorced,"(0.0, 100.0]",Y
5,N,Married,"(0.0, 100.0]",N
6,Y,Divorced,"(200.0, inf]",N
7,N,Single,"(0.0, 100.0]",Y
8,N,Married,"(0.0, 100.0]",N
9,N,Single,"(0.0, 100.0]",Y


#### Entropy(data) = $$ -\frac{3}{10} \log_2 \frac{3}{10} - \frac{7}{10} \log_2 \frac{7}{10} = 0.8813 $$

### HOME OWNER

#### Entropy(Home Owner=Y) = $$ -\frac{3}{3} \log_2 \frac{3}{3} - \frac{0}{3} \log_2 \frac{0}{3} = 0 $$

#### Entropy(Home Owner=N) = $$ -\frac{3}{7} \log_2 \frac{3}{7} - \frac{4}{7} \log_2 \frac{4}{7} = 0.9852 $$

#### Info Gain(Home Owner) = $$ 0.8813 -(\frac{3}{10} \times 0 + \frac{7}{10} \times 0.9852) = 0.19166 $$


### MARITAL STATUS

#### Entropy(Marital Status=Single) = $$ -\frac{2}{4} \log_2 \frac{2}{4} - \frac{2}{4} \log_2 \frac{2}{4} = 1 $$

#### Entropy(Marital Status=Divorced) = $$ -\frac{1}{2} \log_2 \frac{1}{2} - \frac{1}{2} \log_2 \frac{4}{7} = 1 $$

#### Entropy(Marital Status=Married) = $$ -\frac{0}{4} \log_2 \frac{0}{4} - \frac{4}{4} \log_2 \frac{4}{4} = 0 $$

#### Info Gain(Marital Status) = $$ 0.8813 -(\frac{4}{10} \times 1 + \frac{2}{10} \times 1 + \frac{4}{10} \times 0) = 0.2813 $$


### ANNUAL INC

#### Entropy(Annual Inc=[0, 100]) = $$ -\frac{3}{7} \log_2 \frac{3}{7} - \frac{4}{7} \log_2 \frac{4}{7} = 0.9852 $$

#### Entropy(Annual Inc=(101, 200]) = $$ -\frac{0}{2} \log_2 \frac{0}{2} - \frac{2}{2} \log_2 \frac{2}{2} = 0 $$

#### Entropy(Annual Inc=(201, ]) = $$ -\frac{0}{1} \log_2 \frac{0}{1} - \frac{1}{1} \log_2 \frac{1}{1} = 0 $$

#### Info Gain(Marital Status) = $$ 0.8813 -(\frac{7}{10} \times 0.9852 + \frac{2}{10} \times 0 + \frac{1}{10} \times 0) = 0.19166 $$

## Split on the attribute with the highest Info Gain: Marital Status

In [5]:
display(default_borrower[default_borrower['Marital Status'] == 'Married'])

Unnamed: 0,Home Owner,Marital Status,Annual Income,Default Borrower
1,N,Married,"(0.0, 100.0]",N
3,Y,Married,"(101.0, 200.0]",N
5,N,Married,"(0.0, 100.0]",N
8,N,Married,"(0.0, 100.0]",N


## Current subset: Marital Status=Married
## Entropy(subset) = 0
## Since this subset is 100% Pure (DB=N: 4, DB=Y: 0), make this a leaf node

In [6]:
display(default_borrower[default_borrower['Marital Status'] == 'Single'])

Unnamed: 0,Home Owner,Marital Status,Annual Income,Default Borrower
0,Y,Single,"(101.0, 200.0]",N
2,N,Single,"(0.0, 100.0]",N
7,N,Single,"(0.0, 100.0]",Y
9,N,Single,"(0.0, 100.0]",Y


## Current subset: Marital Status=Single

#### Entropy(subset) = $$ -\frac{2}{4} \log_2 \frac{2}{4} - \frac{2}{4} \log_2 \frac{2}{4} = 1 $$

### HOME OWNER

#### Entropy(Home Owner=Y) = $$ -\frac{0}{1} \log_2 \frac{0}{1} - \frac{1}{1} \log_2 \frac{1}{1} = 0 $$

#### Entropy(Home Owner=N) = $$ -\frac{2}{3} \log_2 \frac{2}{3} - \frac{1}{3} \log_2 \frac{1}{3} = 0.9183 $$

#### Info Gain(Home Owner) = $$ 1 -(\frac{1}{4} \times 0 + \frac{3}{4} \times 0.9183) = 0.3133 $$


### ANNUAL INC

#### Entropy(Annual Inc=[0, 100]) = $$ -\frac{2}{3} \log_2 \frac{3}{3} - \frac{1}{3} \log_2 \frac{1}{3} = 0.9183 $$

#### Entropy(Annual Inc=(101, 200]) = $$ -\frac{0}{1} \log_2 \frac{0}{1} - \frac{1}{1} \log_2 \frac{1}{1} = 0 $$

#### Info Gain(Annual Inc) = $$ 1 -(\frac{3}{4} \times 0.9183 + \frac{1}{4} \times 0) = 0.3133 $$

## Split on the attribute with the highest Info Gain: Annual Income

In [7]:
default_borrower.iloc[[0]]

Unnamed: 0,Home Owner,Marital Status,Annual Income,Default Borrower
0,Y,Single,"(101.0, 200.0]",N


## Current subset: Marital Status=Single AND Annual Inc=(101, 200]
## Entropy(subset) = 0
## Since this subset is 100% Pure (DB=N: 1, DB=Y: 0), make this a leaf node

# Keep repeating the above steps until pure subsets are reached, or the size of the subset falls below a threshold

In [10]:
data_tree = default_borrower.copy()
le = preprocessing.LabelEncoder()
data_tree['Home Owner'] = le.fit_transform(data_tree['Home Owner'])
data_tree['Marital Status'] = le.fit_transform(data_tree['Marital Status'])
data_tree['Annual Income'] = le.fit_transform(data_tree['Annual Income'])
data_tree['Default Borrower'] = le.fit_transform(data_tree['Default Borrower'])

In [11]:
data_tree

Unnamed: 0,Home Owner,Marital Status,Annual Income,Default Borrower
0,1,2,1,0
1,0,1,0,0
2,0,2,0,0
3,1,1,1,0
4,0,0,0,1
5,0,1,0,0
6,1,0,2,0
7,0,2,0,1
8,0,1,0,0
9,0,2,0,1


In [25]:
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'

clf = DecisionTreeClassifier(random_state=0)
clf.fit(data_tree[['Home Owner', 'Marital Status', 'Annual Income']], data_tree['Default Borrower'])

export_graphviz(clf, out_file='tree.dot')
dot_data = StringIO() 
export_graphviz(clf, out_file=dot_data)
(graph, ) = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("iris")

FileNotFoundError: [WinError 2] "dot" not found in path.