In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [32]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [33]:
df=df.drop(['Name','PassengerId','Ticket','Cabin'],axis=1)

In [34]:
df['Embarked']=df['Embarked'].fillna('S')

In [35]:
mean = df["Age"].mean()
df.Age.fillna(mean)

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64

In [36]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64

In [37]:
genders={'male':0,'female':1}
df['Sex']=df['Sex'].map(genders)

ports={"S":0,"C":1,"Q":2}
df['Embarked']=df['Embarked'].map(ports)
df.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,0
1,1,1,1,38.0,1,0,71.2833,1
2,1,3,1,26.0,0,0,7.925,0
3,1,1,1,35.0,1,0,53.1,0
4,0,3,0,35.0,0,0,8.05,0
5,0,3,0,,0,0,8.4583,2
6,0,1,0,54.0,0,0,51.8625,0
7,0,3,0,2.0,3,1,21.075,0
8,1,3,1,27.0,0,2,11.1333,0
9,1,2,1,14.0,1,0,30.0708,1


In [38]:
X=df.drop(columns=['Survived'],axis=1)
y=pd.DataFrame(df['Survived'])
print(X.shape)
print(y.shape)
y

(891, 7)
(891, 1)


Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


In [39]:
# Defining Entropy and info gain
# We calculate for a given feature..

In [40]:
def  entropy(col):
    counts=np.unique(col,return_counts=True)
    N=float(col.shape[0])
    entropy=0.0
    for i in counts[1]:
        p=i/N
        entropy+=(p*np.log2(p))
    return -1*entropy
col=np.array([1,1,1,0,0,0])
print(entropy(col))

1.0


In [41]:
def divide_data(x_data,fkey,fval):
    # Works Only with pandas dataframe
    x_right = pd.DataFrame([],columns=x_data.columns)
    x_left = pd.DataFrame([],columns=x_data.columns)

    for ix in range(x_data.shape[0]):
        val=x_data[fkey].loc[ix]
        if val > fval:
            x_right = x_right.append(x_data.loc[ix])
        else:
            x_left=x_left.append(x_data.loc[ix])
    return x_left,x_right
# left,right=divide_data(y,'Survived',0.5)
# left

In [42]:
def information_gain(x_data,fkey,fval):
    left,right=divide_data(x_data,fkey,fval)
    
    # % of total samples which are on left and right
    l=float(left.shape[0]/x_data.shape[0])
    r=float(right.shape[0]/x_data.shape[0])

    # All examples come on one side
    if left.shape[0] == 0 or right.shape[0]==0:
        return -1000000
    i_gain=entropy(x_data['Survived'])-(l*entropy(left['Survived'])+r*entropy(right['Survived']))
    return i_gain

In [43]:
for colname in X.columns:
    print(colname)
    print(information_gain(df,colname,df[colname].mean()))

Pclass
0.07579362743608165
Sex
0.2176601066606142
Age
0.0008836151229467681
SibSp
0.009584541813400071
Parch
0.015380754493137694
Fare
0.042140692838995464
Embarked
0.015909401384176403


In [44]:
class Decisiontree:
    def __init__(self,depth=0,max_depth=5):
        self.left=None
        self.right=None
        self.fkey=None
        self.fval=None
        self.depth=depth
        self.max_depth=max_depth
        self.target=None
    def train(self,x_train):
        features=['Pclass','Sex','SibSp','Parch','Fare']
        info_gains=[]

        for ix in features:
            i_gain=information_gain(x_train,ix,x_train[ix].mean())
            info_gains.append(i_gain)
        self.fkey = features[np.argmax(info_gains)]
        self.fval = x_train[self.fkey].mean()
        print("Making Tree Features is", self.fkey)

        #Split Data
        data_left,data_right = divide_data(x_train,self.fkey,self.fval)
        data_left = data_left.reset_index(drop=True)
        data_right = data_right.reset_index(drop=True)

        # If pure leaf node is there
        if data_right.shape[0]==0 or data_left.shape[0] == 0:
            if x_train.Survived.mean()>=0.5:
                self.target=1
            else:
                self.target=0
            return 

        # Stop early whern depth reaches the max_depth
        if self.depth>=self.max_depth :
            if x_train.Survived.mean() >= 0.5:
                self.target = 1
            else:
                self.target = 0
            return

        # Recursive case
        self.left=Decisiontree(depth=self.depth+1,max_depth=self.max_depth)
        self.left.train(data_left)
        
        self.right = Decisiontree(depth=self.depth+1,max_depth=self.max_depth+1)
        self.right.train(data_right)

        # Setting the target at every node
        if x_train.Survived.mean() >= 0.5:
            self.target = 1
        else:
            self.target = 0

    def predict(self,test):
        if test[self.fkey] > self.fval :
            if self.right is None:
                return self.target
            return self.right.predict(test)
        else:
            if self.left is None:
                return self.target
            return self.left.predict(test)

# Submitting to Kaggle for testing

In [45]:
x_test=pd.read_csv('test.csv')
genders={"male":0,"female":1}
x_test['Sex']=x_test['Sex'].map(genders)
ports={"S":0,"C":1,"Q":2}
x_test['Embarked']=x_test['Embarked'].map(ports)
x_test
x_test.isnull().sum()
x_test.Age.fillna(mean)
Fare_mean=x_test.Fare.mean()
x_test.Fare.fillna(Fare_mean)
x_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,2
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,0
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,2
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,0


In [46]:
dt=Decisiontree()
dt.train(df)

Making Tree Features is Sex
Making Tree Features is Fare
Making Tree Features is Parch
Making Tree Features is Pclass
Making Tree Features is Pclass
Making Tree Features is Fare
Making Tree Features is SibSp
Making Tree Features is Fare
Making Tree Features is Fare
Making Tree Features is Pclass
Making Tree Features is Fare
Making Tree Features is Pclass
Making Tree Features is Pclass
Making Tree Features is SibSp
Making Tree Features is Fare
Making Tree Features is Fare
Making Tree Features is Fare
Making Tree Features is Fare
Making Tree Features is Fare
Making Tree Features is Fare
Making Tree Features is Fare
Making Tree Features is Pclass
Making Tree Features is Fare
Making Tree Features is Pclass
Making Tree Features is Pclass
Making Tree Features is Fare
Making Tree Features is SibSp
Making Tree Features is Fare
Making Tree Features is Fare
Making Tree Features is Pclass
Making Tree Features is Pclass
Making Tree Features is SibSp
Making Tree Features is Fare
Making Tree Feature

In [47]:
print(dt.fkey,dt.fval,dt.left.fkey,dt.right.fkey)

Sex 0.35241301907968575 Fare Pclass


In [55]:
cols=["PassengerId","Survived"]
y_pred=[]
for ix in range(x_test.shape[0]):
    lst=[]
    lst.append(x_test.loc[ix].PassengerId)
    lst.append(dt.predict(x_test.loc[ix]))
    y_pred.append(lst)
submission=pd.DataFrame(y_pred,columns=cols)

In [57]:
filename = 'kaggle Titanic Predictions 1.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

Saved file: kaggle Titanic Predictions 1.csv


# Testing the predictions locally

In [69]:
new_train_data=df.loc[:680,:]
new_test_data=df.loc[680:,:]

y_actual=new_test_data.Survived.to_numpy()
y_actual.shape

(211,)

In [74]:
desct=Decisiontree()
desct.train(new_train_data)

Making Tree Features is Sex
Making Tree Features is Fare
Making Tree Features is Parch
Making Tree Features is SibSp
Making Tree Features is Fare
Making Tree Features is Pclass
Making Tree Features is Fare
Making Tree Features is Pclass
Making Tree Features is Pclass
Making Tree Features is Pclass
Making Tree Features is Fare
Making Tree Features is Fare
Making Tree Features is Fare
Making Tree Features is Pclass
Making Tree Features is Fare
Making Tree Features is Pclass
Making Tree Features is Pclass
Making Tree Features is Pclass
Making Tree Features is Fare
Making Tree Features is Fare
Making Tree Features is Pclass
Making Tree Features is Fare
Making Tree Features is SibSp
Making Tree Features is Fare
Making Tree Features is Fare
Making Tree Features is Pclass
Making Tree Features is Pclass
Making Tree Features is SibSp
Making Tree Features is Fare
Making Tree Features is Fare
Making Tree Features is Fare
Making Tree Features is Pclass
Making Tree Features is Pclass
Making Tree Fe

In [76]:
lst=[]
for ix in range(new_test_data.shape[0]):
    lst.append(desct.predict(new_test_data.loc[ix+680]))
y_pred=np.asarray(lst)

# Classification measures

In [81]:
acc=np.average(y_actual==y_pred)
acc

0.7962085308056872

In [86]:
from sklearn.metrics import confusion_matrix,classification_report
confusion_matrix(y_actual,y_pred)

array([[117,  17],
       [ 26,  51]], dtype=int64)

In [89]:
x=np.unique(y_pred,return_counts=True)
x

(array([0, 1]), array([143,  68], dtype=int64))

In [87]:
target_names=['Dead',"Survived"]
print(classification_report(y_actual,y_pred,target_names=target_names))

              precision    recall  f1-score   support

        Dead       0.82      0.87      0.84       134
    Survived       0.75      0.66      0.70        77

    accuracy                           0.80       211
   macro avg       0.78      0.77      0.77       211
weighted avg       0.79      0.80      0.79       211

