In [38]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [15]:
def entropy(sequence):
    """Shannon entropy, in bits, of the empirical distribution of ``sequence``.

    Each distinct value is treated as one class; its probability is its share
    of the total number of elements.
    """
    _, occurrences = np.unique(sequence, return_counts=True)
    share = occurrences / occurrences.sum()
    weighted_logs = share * np.log2(share)
    return -weighted_logs.sum()

In [16]:
def infogain(X,y,label):
    """Information gain obtained by splitting ``y`` on column ``label`` of ``X``.

    The split point is the mean of ``X[label]``: rows strictly below it form
    the left branch, every other row forms the right branch.

    Parameters
    ----------
    X : DataFrame holding the feature columns.
    y : Series of targets, indexed like ``X``.
    label : name of the column of ``X`` to split on.

    Returns
    -------
    ``entropy(y)`` minus the size-weighted entropies of the two branches,
    or ``0.0`` when ``X`` has no rows.
    """
    n_rows = len(X)
    if n_rows == 0:
        # Nothing to split; also avoids a 0/0 in the branch weights below.
        return 0.0

    pivot = X[label].mean()
    goes_left = X[label] < pivot
    # The right branch is the complement of the left one (rather than
    # `>= pivot`) so that rows whose value is NaN still fall into a branch.
    # Otherwise the weights would not sum to 1 and the gain would be inflated.
    left,right = y.loc[goes_left],y.loc[~goes_left]
    pl,pr = len(left)/n_rows,len(right)/n_rows

    return entropy(y) - pl * entropy(left) - pr * entropy(right)



In [51]:
class Node:
    """A single node of the binary decision tree.

    A split node carries ``label`` (the column it tests) and ``pivot`` (the
    threshold); a leaf carries ``result``. ``left`` and ``right`` start as
    ``None`` and are attached by whoever builds the tree.
    """
    def __init__(self,label = None,pivot = None,result = None):
        self.label = label      # column tested at this node
        self.pivot = pivot      # threshold the column value is compared with
        self.result = result    # value stored at a leaf

        self.left  = None
        self.right = None

    def __repr__(self):
        """Return ``"label:pivot"`` for a split node, ``"Dead"``/``"Alive"`` for a leaf.

        A leaf prints ``"Alive"`` when its result is above 0.5 and ``"Dead"``
        otherwise. A node with neither a label nor a result prints ``"Empty"``.
        """
        # Compare with None explicitly: a column label such as 0 or "" is
        # falsy but still a valid label.
        if self.label is not None:
            return "{}:{}".format(self.label,self.pivot)
        if self.result is None:
            # Comparing None with 0.5 would raise inside repr().
            return "Empty"
        return ["Dead","Alive"][int(self.result > 0.5)]

In [84]:
class CustomDT:
    """Binary decision tree that splits every node at the mean of one column.

    At each node the column with the largest ``infogain`` is selected; rows
    whose value is below the column mean go left, all other rows go right.
    A leaf stores the mean of ``y`` over the rows that reached it.
    """
    def __init__(self,max_depth = 5):
        """Remember the depth at which ``fit_rec`` stops splitting."""
        self.max_depth = max_depth

    def fit(self,X,y):
        """Grow the tree from DataFrame ``X`` and Series ``y``; the root is kept on ``self.root``."""
        self.root = self.fit_rec(X,y,0)

    def fit_rec(self,X,y,depth):
        """Build and return the subtree for the rows in ``X``/``y`` at ``depth``.

        A leaf holding ``y.mean()`` is returned when ``depth`` equals
        ``max_depth``, when there is no column to split on, when no column
        yields a positive gain, or when the chosen split would leave one
        side without rows.
        """
        if self.max_depth == depth:
            return Node(result=y.mean())

        gains = [[infogain(X,y,col),col] for col in X.columns]

        if not gains:
            # No feature columns at all: there is nothing to pick a split from.
            return Node(result=y.mean())

        # Largest gain wins; ties fall back to comparing the column labels,
        # the same ordering as sorting the pairs and taking the last one.
        maxgain,col = max(gains)

        if maxgain <=0:
            return Node(result=y.mean())

        pivot = X[col].mean()
        left = X[col]<pivot
        # Complement of the left mask, so rows with a NaN value go right.
        # This matches the `else` branch taken by predict_point for such rows.
        right = ~left

        if not left.any() or not right.any():
            # A one-sided split separates nothing; stop instead of recursing
            # on an empty frame.
            return Node(result=y.mean())

        X_left,X_right = X.loc[left],X.loc[right]
        y_left,y_right = y.loc[left],y.loc[right]

        node = Node(label=col,pivot=pivot)
        node.left = self.fit_rec(X_left,y_left,depth+1)
        node.right = self.fit_rec(X_right,y_right,depth+1)
        return  node

    def display(self,node,indent):
        """Print the subtree under ``node`` in pre-order, adding one tab to ``indent`` per level."""
        if node is None:
            return

        print(indent,node)

        self.display(node.left,indent + "\t")
        self.display(node.right,indent+"\t")

    def predict_point(self,node,row):
        """Descend from ``node`` using the values in ``row`` and return the reached leaf's result."""
        while node.result is None:
            if row[node.label] < node.pivot:
                node = node.left
            else:
                node = node.right
        return node.result

    def predict(self,X):
        """Predict every row of DataFrame ``X`` with the fitted tree.

        Returns a float Series indexed like ``X`` whose entries are the
        ``predict_point`` results starting from ``self.root``.
        """
        values = [self.predict_point(self.root,row) for _,row in X.iterrows()]
        return pd.Series(values,index=X.index,dtype=float)

In [85]:
# Unfitted tree; fit_rec turns any node reached at depth 5 into a leaf.
model = CustomDT(max_depth=5)

In [86]:
# Load the preprocessed table, then separate the target column from the features.
df = pd.read_csv("processed.csv")

y = df["Survived"]
X = df.drop(columns=["Survived"])

In [87]:
# Grow the tree on the whole table; the resulting root node is stored on model.root.
model.fit(X,y)

In [88]:
# Print the fitted tree in pre-order; each level is indented by one more tab.
model.display(model.root,"")

 Sex:0.6475869809203143
	 Pclass:2.159235668789809
		 Pclass:1.4470588235294117
			 Parch:0.4574468085106383
				 Age:36.11737804878049
					 Alive
					 Alive
				 Parch:1.4333333333333333
					 Alive
					 Alive
			 Parch:0.6052631578947368
				 Age:31.88137254901961
					 Alive
					 Alive
				 Age:24.096774193548388
					 Alive
					 Alive
		 Age:21.573984526112184
			 Parch:0.5789473684210527
				 Age:18.04436396559529
					 Alive
					 Alive
				 SibSp:1.896551724137931
					 Alive
					 Dead
			 SibSp:0.8382352941176471
				 Age:28.701136363636362
					 Dead
					 Dead
				 Age:31.22727272727273
					 Dead
					 Dead
	 Pclass:2.389948006932409
		 Pclass:1.4695652173913043
			 Age:41.507962913195385
				 Age:29.735000000000003
					 Alive
					 Alive
				 SibSp:0.2878787878787879
					 Dead
					 Dead
			 Parch:0.2222222222222222
				 SibSp:0.2247191011235955
					 Dead
					 Dead
				 Age:18.57
					 Alive
					 Dead
		 Age:26.581072278914565
			 Parch:0.33557046979865773
				 A

In [95]:
# Walk the tree for row 10 and show the value stored at the leaf it reaches.
# X is the same data that was passed to fit, so this is not a held-out check.
model.predict_point(model.root,X.iloc[10])

0.6923076923076923

In [97]:

# Actual target of row 10, for comparison with the prediction above.
y.iloc[10]

1.0