In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
ds = pd.read_csv('titanic_x_y_train.csv')
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 668 entries, 0 to 667
Data columns (total 11 columns):
Pclass      668 non-null int64
Name        668 non-null object
Sex         668 non-null object
Age         536 non-null float64
SibSp       668 non-null int64
Parch       668 non-null int64
Ticket      668 non-null object
Fare        668 non-null float64
Cabin       154 non-null object
Embarked    667 non-null object
Survived    668 non-null int64
dtypes: float64(2), int64(4), object(5)
memory usage: 44.4+ KB


In [3]:
cols_to_drop = [
    'Name',
    'Ticket',
    'Cabin',
    'Embarked',
]

df = ds.drop(cols_to_drop, axis=1)
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Survived
0,2,female,29.0,1,0,26.0,1
1,3,male,,0,0,8.05,0
2,2,male,39.0,0,0,26.0,0
3,3,female,29.0,0,4,21.075,0
4,3,male,25.0,0,0,7.05,0


In [4]:
def convert_sex_to_num(s):
    if s=='male':
        return 0
    elif s=='female':
        return 1
    else:
        return s

df.Sex = df.Sex.map(convert_sex_to_num)
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Survived
0,2,1,29.0,1,0,26.0,1
1,3,0,,0,0,8.05,0
2,2,0,39.0,0,0,26.0,0
3,3,1,29.0,0,4,21.075,0
4,3,0,25.0,0,0,7.05,0


In [5]:
data = df.dropna()
data.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Survived
count,536.0,536.0,536.0,536.0,536.0,536.0,536.0
mean,2.227612,0.375,29.70056,0.516791,0.462687,34.080916,0.425373
std,0.831878,0.484575,14.240257,0.90466,0.910226,47.224669,0.494861
min,1.0,0.0,0.67,0.0,0.0,0.0,0.0
25%,1.0,0.0,21.0,0.0,0.0,8.05,0.0
50%,2.0,0.0,29.0,0.0,0.0,15.875,0.0
75%,3.0,1.0,38.25,1.0,1.0,34.375,1.0
max,3.0,1.0,80.0,5.0,6.0,512.3292,1.0


In [6]:
input_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
out_cols = ['Survived']

X = data[input_cols]
y = data[out_cols]

#X.head()
print (X.shape, y.shape)
len(set(y))

(536, 6) (536, 1)


1

In [7]:
data = data.reset_index(drop=True)

In [8]:
def divide_data(x_data, fkey, fval):
    x_right = pd.DataFrame([], columns=x_data.columns)
    x_left = pd.DataFrame([], columns=x_data.columns)
    
    
    for ix in range(x_data.shape[0]):
        # Retrieve the current value for the fkey column
        try:
            val = x_data[fkey].loc[ix]
        except:
            print (x_data[fkey])
            val = x_data[fkey].loc[ix]
        # print val
        
        # Check where the row needs to go
        if val > fval:
            # pass the row to right
            x_right = x_right.append(x_data.loc[ix])
        else:
            # pass the row to left
            x_left = x_left.append(x_data.loc[ix])
    
    # return the divided datasets
    return x_left, x_right


def entropy(col):
    p = []
    p.append(col.mean())
    p.append(1-p[0])
    
    ent = 0.0
    for px in p:
        ent += (-1.0 * px * np.log2(px))
    return ent

def information_gain(xdata, fkey, fval):
    left, right = divide_data(xdata, fkey, fval)
    
    if left.shape[0] == 0 or right.shape[0] == 0:
        return -10000
    
    return 2*entropy(xdata.Survived) - (entropy(left.Survived) + entropy(right.Survived))

In [9]:
for fx in X.columns:
    print (fx),
    print (information_gain(data, fx, data[fx].mean()))

Pclass
0.172352763614
Sex
0.393962059127
Age
0.000512004627284
SibSp
0.0031495197338
Parch
0.0105280402212
Fare
0.0621191419711


In [10]:
def dt(X,y,f):
    if len(set(y)) == 0 :
        print("reached leaf node")
        return
    elif len(f) == 0:
        print("Feature not available")
        return
    else:
        max_gain = 0
        for i in f:
            m = information_gain(X,i,X[i].mean())
            if m >= max_gain:
                max_gain = m
                sf = i
        return max_gain,sf

In [11]:
print(dt(data,y,X.columns))

(0.39396205912727811, 'Sex')


In [12]:
class DecisionTree:
    def __init__(self, depth=0, max_depth=5):
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.max_depth = max_depth
        self.depth = depth
        self.target = None
    
    def train(self, X_train):
        print ("Level:",self.depth, '-'*10)
        # Get the best possible feature and division value
        features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
        gains = []
        for fx in features:
            gains.append(information_gain(X_train, fx, X_train[fx].mean()))
        
        # store the best feature (using min information gain)
        self.fkey = features[np.argmax(gains)]
        self.fval = X_train[self.fkey].mean()
        print("Feature:",self.fkey)
        print("Entropy:",self.fval)
        
        # divide the dataset
        data_left, data_right = divide_data(X_train, self.fkey, self.fval)
        data_left = data_left.reset_index(drop=True)
        data_right = data_right.reset_index(drop=True)
        
        # Check the shapes
        if data_left.shape[0] == 0 or data_right.shape[0] == 0:
            if X_train.Survived.mean() >= 0.5:
                self.target = 'Survived'
            else:
                self.target = 'Dead'
            return
        
        if self.depth >= self.max_depth:
            if X_train.Survived.mean() >= 0.5:
                self.target = 'Survived'
            else:
                self.target = 'Dead'
            return
        
        # branch to right
        self.right = DecisionTree(depth=self.depth+1, max_depth=self.max_depth)
        self.right.train(data_right)
        # branch to left
        self.left = DecisionTree(depth=self.depth+1, max_depth=self.max_depth)
        self.left.train(data_left)
        
        if X_train.Survived.mean() >= 0.5:
            self.target = 'Survived'
        else:
            self.target = 'Dead'
        
        return
    
    def predict(self, test):
        if test[self.fkey] > self.fval:
            # go right
            if self.right is None:
                return self.target
            return self.right.predict(test)
        else:
            # go left
            if self.left is None:
                return self.target
            return self.left.predict(test)

In [13]:
split = int(0.8 * data.shape[0])

training_data = data[:split]
testing_data = data[split:]

In [14]:
dt = DecisionTree()
dt.train(training_data)

Level: 0 ----------
Feature: Sex
Entropy: 0.364485981308
Level: 1 ----------
Feature: Fare
Entropy: 41.5807698718
Level: 2 ----------




Feature: SibSp
Entropy: 0.659090909091
Level: 3 ----------
Feature: Parch
Entropy: 0.833333333333
Level: 4 ----------
Feature: Parch
Entropy: 2.0
Level: 5 ----------
Feature: Pclass
Entropy: 3.0
Level: 5 ----------
Feature: Age
Entropy: 33.5555555556
Level: 4 ----------
Feature: Age
Entropy: 32.4285714286
Level: 5 ----------
Feature: Age
Entropy: 39.4444444444
Level: 5 ----------
Feature: Age
Entropy: 19.8
Level: 3 ----------
Feature: Age
Entropy: 33.2
Level: 4 ----------
Feature: Age
Entropy: 44.5
Level: 5 ----------
Feature: Parch
Entropy: 0.5
Level: 5 ----------
Feature: Age
Entropy: 40.0
Level: 4 ----------
Feature: Age
Entropy: 25.6666666667
Level: 5 ----------
Feature: Age
Entropy: 29.5
Level: 5 ----------
Feature: Age
Entropy: 21.8333333333
Level: 2 ----------
Feature: Pclass
Entropy: 2.5
Level: 3 ----------
Feature: SibSp
Entropy: 0.790322580645
Level: 4 ----------
Feature: Parch
Entropy: 1.17857142857
Level: 5 ----------
Feature: SibSp
Entropy: 3.0
Level: 5 ----------
Feature:

In [15]:
print (dt.fkey, dt.fval)
print (dt.right.fkey, dt.right.fval)
print (dt.left.fkey, dt.left.fval)

print (dt.right.right.fkey, dt.right.right.fval)
print (dt.right.left.fkey, dt.right.left.fval)


print (dt.left.right.fkey, dt.left.right.fval)
print (dt.left.left.fkey, dt.left.left.fval)


Sex 0.364485981308
Fare 41.5807698718
Pclass 2.32352941176
SibSp 0.659090909091
Pclass 2.5
Age 26.9280821918
Pclass 1.53968253968


In [17]:
for ix in testing_data.index[:10]:
    print (dt.predict(testing_data.loc[ix]))

Dead
Dead
Dead
Dead
Survived
Dead
Dead
Dead
Survived
Survived
