In [258]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder as LE

In [259]:
# Load the Titanic training data; the relative path means the CSV must sit in the
# notebook's working directory.
df=pd.read_csv("titanic_train.csv")

In [260]:
# Preview the first 10 rows to see the raw columns and spot missing values.
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [261]:
# List the available column names before choosing which ones to keep.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [262]:
# Columns kept for modelling: the target ('Survived') plus the five feature columns.
selected_cols=[ 'Survived','Pclass', 'Sex', 'Age', 'SibSp','Parch']

In [263]:
# Take an explicit copy of the selected columns: later cells assign into `data`,
# and a copy guarantees those writes land in `data` (never in `df`) without
# pandas raising a SettingWithCopyWarning.
data=df.loc[:,selected_cols].copy()

In [264]:
#data.Sex

In [265]:
# Encoder that maps each distinct category value to an integer code.
le=LE()

In [268]:
# Replace the string categories in 'Sex' with integer codes so the column can be
# compared numerically (the preview below shows female -> 0, male -> 1).
data["Sex"]=le.fit_transform(data.Sex)

In [269]:
# NOTE(review): the saved output below has an extra column named 1 that no visible
# cell creates (execution counts also skip 266-267) — presumably left over from a
# deleted cell; restart the kernel and run all cells to confirm it disappears.
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,1
0,0,3,1,22.0,1,0,1
1,1,1,0,38.0,1,0,0
2,1,3,0,26.0,0,0,0
3,1,1,0,35.0,1,0,0
4,0,3,1,35.0,0,0,1


In [270]:
# Non-null counts per column: any count below the number of rows means missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null int32
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
1           891 non-null int32
dtypes: float64(1), int32(2), int64(4)
memory usage: 41.8 KB


Every column has 891 non-null values except `Age`, which has only 714, so some ages are missing. We fill those missing values with the mean age in the next cell.

In [273]:
# Fill missing ages with the mean age. The result is assigned back to the column
# rather than calling fillna(..., inplace=True) on `data.Age`: that chained form
# mutates a temporary Series and no longer updates `data` under pandas Copy-on-Write.
# Series.mean() skips NaN, so the mean is taken over the known ages only.
data["Age"]=data["Age"].fillna(data["Age"].mean())

In [274]:
# Re-check the non-null counts: 'Age' should now be complete.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null int32
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
1           891 non-null int32
dtypes: float64(1), int32(2), int64(4)
memory usage: 41.8 KB


In [220]:
def entropy(column):
    """Return the Shannon entropy, in bits, of the values in ``column``.

    Every distinct value contributes ``-p*log2(p)``, where ``p`` is the share
    of entries in ``column`` equal to that value.
    """
    total=len(column)
    _,frequencies=np.unique(column,return_counts=True)

    weighted_logs=0
    for frequency in frequencies:
        share=frequency/total
        weighted_logs=weighted_logs+share*np.log2(share)

    # The accumulated sum is <= 0; flip the sign to get the entropy.
    return -weighted_logs

In [276]:
# Entropy of the target over the whole data set — the quantity info_gain subtracts from.
entropy(data.Survived)

0.9607079018756469

In [222]:
def info_gain(X,y,label):
    """Return the information gain of splitting ``y`` on column ``label`` of ``X``.

    The split point is the mean of ``X[label]``: rows whose value is below the
    mean go left, rows at or above it go right. The gain is the entropy of
    ``y`` minus the size-weighted entropies of the two sides.

    Parameters
    ----------
    X : table indexable by column name (e.g. a pandas DataFrame)
    y : targets, indexable by the boolean mask built from ``X[label]``
    label : name of the column to split on

    Returns
    -------
    The information gain, or the sentinel ``-1000`` when the split is
    degenerate (one side receives no rows), so such a column never wins a
    ``max`` over gains.
    """
    feature=X[label]
    pivot=np.mean(feature)

    y_left=y[feature<pivot]
    y_right=y[feature>=pivot]

    # Fixed: the original read `(len(y_left)==0 or len(y_right))==0`, which only
    # detected an empty right side. An empty left side (e.g. a constant column)
    # slipped through, and an empty `y` reached the divisions below.
    if len(y_left)==0 or len(y_right)==0:
        return -1000

    p_left=len(y_left)/len(y)
    p_right=len(y_right)/len(y)
    return entropy(y)-p_left*entropy(y_left)-p_right*entropy(y_right)
        

In [223]:
# Feature matrix: the five predictor columns.
X=data.loc[:,['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']]

In [224]:
# Target vector: the 'Survived' column.
y=data['Survived']

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch'], dtype='object')

In [91]:
# Print the information gain of a mean-threshold split on each feature.
for label in X.columns:
    print(label,info_gain(X,y,label))

Pclass 0.07579362743608165
Sex 0.2176601066606143
Age 0.001158644038169343
SibSp 0.009584541813400127
Parch 0.015380754493137666


# Decision Tree Code For The Above Data

In [92]:
class Node:
    """One node of the decision tree.

    Internal nodes carry the column to test (``label``) and the threshold
    (``value``); leaves have ``label`` set to None and carry ``result``.
    Children are attached by the tree builder through ``left`` and ``right``.
    """

    def __init__(self,label=None,value=None,result=None):
        self.label=label    # column tested at this node; None marks a leaf
        self.value=value    # threshold compared against the column value
        self.result=result  # value stored at a leaf; None on internal nodes
        # Defined up front so every node, leaves included, has both attributes
        # (previously they only existed once the builder assigned them).
        self.left=None
        self.right=None
        

In [293]:
class DecisionTree:
    """Binary decision tree that splits every node at a feature's mean.

    At each internal node the feature with the highest ``info_gain`` is chosen;
    rows whose value is below that feature's mean (computed over the rows that
    reached the node) go left, the rest go right. Leaves store the mean of the
    targets that reached them, and ``predict`` thresholds that mean at 0.5.
    """

    def __init__(self,max_depth=5):
        # Number of levels including the leaf level: a path from the root
        # contains at most max_depth-1 splits.
        self.max_depth=max_depth

    def fit(self,X,y):
        """Build the tree from feature frame ``X`` and targets ``y``.

        ``y`` is indexed with boolean masks derived from ``X``, so the two
        must share the same index.
        """
        self.root=self.generate(X,y,self.max_depth)

    def generate(self,X,y,depth):
        """Recursively build and return the subtree for the rows in ``X``/``y``."""
        # `<=` rather than `==`: with max_depth of 0 or less the old check never
        # fired and the depth limit was silently ignored. A frame with no
        # columns has nothing to split on, so it becomes a leaf as well.
        if depth<=1 or len(X.columns)==0:
            return Node(result=np.mean(y))

        # Tuples compare by gain first, so max() picks the best-gain feature
        # (ties fall back to comparing the labels).
        gains=[(info_gain(X,y,label),label) for label in X.columns]
        selected_label=max(gains)[1]

        column=X[selected_label]
        pivot=np.mean(column)
        # Both masks are computed explicitly (not as complements) so rows that
        # compare False on both sides, e.g. NaN, are treated as before.
        goes_left=column<pivot
        goes_right=column>=pivot

        left_y=y[goes_left]
        right_y=y[goes_right]

        # A split that leaves one side empty separates nothing: stop here.
        if len(left_y)==0 or len(right_y)==0:
            return Node(result=np.mean(y))

        node=Node(selected_label,pivot)
        node.left=self.generate(X[goes_left],left_y,depth-1)
        node.right=self.generate(X[goes_right],right_y,depth-1)

        return node

    def display(self,node,indent=""):
        """Print the subtree under ``node``, one extra tab per level."""
        if node.label is None:
            # Leaf: report the class implied by the stored target mean.
            if node.result<0.5:
                print(indent,"Died")
            else:
                print(indent,"Survived")
            return

        print(indent,node.label,node.value)

        self.display(node.left,indent+"\t")
        self.display(node.right,indent+"\t")

    def predict_one(self,item,node):
        """Return the leaf ``result`` reached by ``item`` starting at ``node``.

        ``item`` is indexed by column name (``item[node.label]``).
        """
        if node.label is None:
            return node.result
        if item[node.label]<node.value:
            return self.predict_one(item,node.left)
        return self.predict_one(item,node.right)

    def predict(self,X):
        """Return an integer array with one 0/1 prediction per row of ``X``."""
        predictions=[
            int(self.predict_one(row,self.root)>=0.5)
            for _,row in X.iterrows()
        ]
        return np.array(predictions,dtype=int)

    def score(self,X,y):
        """Return the fraction of rows of ``X`` whose prediction equals ``y``.

        ``y`` may be a pandas Series, a numpy array or a plain sequence.
        """
        # np.asarray instead of `y.values`, so non-pandas targets work too.
        y_true=np.asarray(y)
        yp=self.predict(X)
        return np.mean(y_true==yp)

In [286]:
from sklearn.model_selection import train_test_split

In [287]:
 # Hold out 33% of the rows for testing; the fixed random_state makes the split repeatable.
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [290]:
# Tree limited to 5 levels: up to 4 splits along any path, then a leaf.
model=DecisionTree(max_depth=5)

In [291]:
# Grow the tree on the training split only.
model.fit(X_train,y_train)

In [292]:
# Print the learned splits as "feature threshold"; each extra tab is one level deeper.
model.display(model.root)

 Sex 0.6543624161073825
	 Pclass 2.1941747572815533
		 SibSp 0.514018691588785
			 Age 32.97252371916508
				 Survived
				 Survived
			 Age 29.644444444444446
				 Survived
				 Survived
		 SibSp 0.9797979797979798
			 Parch 0.5555555555555556
				 Survived
				 Survived
			 Parch 1.1111111111111112
				 Died
				 Died
	 Pclass 2.4128205128205127
		 Parch 0.20915032679738563
			 Pclass 1.5
				 Died
				 Died
			 Age 29.06
				 Survived
				 Died
		 Age 27.647099776619505
			 Parch 0.3548387096774194
				 Died
				 Died
			 Parch 0.14583333333333334
				 Died
				 Died


In [197]:
# Predicted 0/1 class for every held-out row.
model.predict(X_test)

array([0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0])

In [198]:
# True labels for the same held-out rows, to compare with the predictions.
y_test.values

array([1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0], dtype=int64)

In [199]:
# Accuracy on the held-out test set.
# NOTE(review): this cell's execution count (199) is lower than the fit cell's (291),
# so the saved score may come from an earlier model — restart and run all to confirm.
model.score(X_test,y_test)

0.8135593220338984

# Try Sklearn.DecisionTree