In [297]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder as LE
import pandas as pd

In [298]:
df=pd.read_csv("titanic_train.csv")

In [299]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [300]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [301]:
selected_cols=['Survived', 'Pclass', 'Sex', 'Age', 'SibSp',
       'Parch']

In [302]:
data=df.loc[:,['Survived', 'Pclass', 'Sex', 'Age', 'SibSp',
       'Parch']]

In [303]:
le=LE()

In [304]:
data["Sex"]=le.fit_transform(data.Sex)

In [305]:
data.Age.fillna(np.mean(data.Age,axis=0),inplace=True)

In [306]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null int32
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
dtypes: float64(1), int32(1), int64(4)
memory usage: 38.4 KB


In [307]:
def entropy(columns):
    value,counts=np.unique(columns,return_counts=True)
    total_entropy=0
    for count in counts:
        p=count/len(columns)
        total_entropy+=p*np.log2(p)
        
    return -total_entropy
        

In [308]:
entropy(data.Sex)

0.9362046432498521

In [309]:
def info_gain(X,y,label):
    pivot=np.mean(X[label])
    left_of_pivot=y[X[label]<pivot]
    right_of_pivot=y[X[label]>=pivot]
    if (len(left_of_pivot)==0 or len(right_of_pivot))==0:
        return -1000
    p_left=len(left_of_pivot)/len(y)
    p_right=len(right_of_pivot)/len(y)
    info_gain=entropy(y)-p_left*entropy(left_of_pivot)-p_right*entropy(right_of_pivot)
    return info_gain

In [310]:
X=data.loc[:,['Pclass', 'Sex', 'Age', 'SibSp',
       'Parch']]
y=data['Survived']

In [311]:
for label in X.columns:
    print(label,info_gain(X,y,label))

Pclass 0.07579362743608165
Sex 0.2176601066606143
Age 0.001158644038169343
SibSp 0.009584541813400127
Parch 0.015380754493137666


In [312]:
# #class Node:
#     def __init__(self,label=None,value=None,result=None):
#         self.label=label
#         self.value=value
#         self.result=result

In [313]:
# class DecisionTree:
#     def __init__(self,max_depth):
#         self.max_depth=max_depth
    
#     def fit(self,X,y):
#         self.root=self.generate(X,y,self.max_depth)
        
#         if depth==1:
#             return Node(result=np.mean(y))
        
#         gains=[]
#         for label in X.columns:
#             gain=info_gain(X,y,label)
#             gains.append((gain,label))
        
#         selected_Node=max(gains)[1]
#         pivot=np.mean(x[selected_Node])
#         node=Node(selected_Node,np.mean(y))
        
#         X_left=X[X[selected_Node]<pivot]
#         X_right=X[X[selected_Node]>=pivot]
        
#         y_left=y[X[selected_Node]<pivot]
#         y_right=y[X[selected_Node]>=pivot]
        
#         if (len(y_left)==0) or (len(y_right)==0):
#             return Node(result=np.mean(y))
        
#         node=Node(selected_Node,pivot)
        
        
        
            
        

In [314]:
class Node:
    
    def __init__(self,label=None,value=None,result=None):
        self.label=label
        self.value=value
        self.result=result

In [427]:
class DeciTree:
    
    def __init__(self,max_depth=5):
        self.max_depth=max_depth
        
    def fit(self,X,y):
        self.X=X
        self.y=y
        self.root=self.generate(self.X,self.y,self.max_depth)
                
    def generate(self,X,y,depth):
        if depth==1:
            return Node(result=np.mean(y))
        
        gains=[]
        for label in X.columns:
            gain=info_gain(X,y,label)
            gains.append((gain,label))
        
        selected=max(gains)[1]
        pivot=np.mean(X[selected])
        
        right_X=X[X[selected]>=pivot]
        left_X=X[X[selected]<pivot]
        
        right_y=y[X[selected]>=pivot]
        left_y=y[X[selected]<pivot]
        
        if(len(left_y)==0) or (len(right_y)==0):
            return Node(result=np.mean(y))
        node=Node(selected,pivot)
        node.left=self.generate(left_X,left_y,depth-1)
        node.right=self.generate(right_X,right_y,depth-1)
        return node
    
    def display(self,node,indent=""):
        if node.label==None:
            if node.result>=0.5:
                print(indent,"Survived")
            else:
                print(indent,"Died")
            return
        
        print(indent,node.label,node.value)
        
        self.display(node.left,indent+"\t")
        self.display(node.right,indent+"\t")
        
    def predict_one(self,item,node):
        if node.label==None:
            return node.result
        pivot=node.value
        if(item[node.label]<pivot):
            return self.predict_one(item,node.left)
        else:
            return self.predict_one(item,node.right)
        
    def predict(self,X):
        y=[]
        for row in X.iterrows():
            results=self.predict_one(row[1],self.root)
            y.append(int(results>=0.5))
        return np.array(y)
        
         
    def score(self,X,y):
        yp=self.predict(X)
        return sum(y.values==yp)/len(y)
        

In [441]:
model=DeciTree(max_depth=5)

In [442]:
model.fit(X,y)

In [443]:
model.display(model.root)

 Sex 0.6475869809203143
	 Pclass 2.159235668789809
		 Pclass 1.4470588235294117
			 Parch 0.4574468085106383
				 Survived
				 Survived
			 Parch 0.6052631578947368
				 Survived
				 Survived
		 Parch 0.7986111111111112
			 Age 26.090266435986155
				 Survived
				 Survived
			 SibSp 1.694915254237288
				 Died
				 Died
	 Pclass 2.389948006932409
		 Pclass 1.4695652173913043
			 Age 39.287716972034715
				 Died
				 Died
			 Parch 0.2222222222222222
				 Died
				 Died
		 Parch 0.22478386167146974
			 Age 29.07305445151033
				 Died
				 Died
			 SibSp 2.607843137254902
				 Died
				 Died


In [431]:
model.predict(X)

array([0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,

In [432]:
model.score(X,y)

0.8035914702581369