## Titanic Survivor

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv("Train.csv")

In [4]:
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,3.0,0.0,"O'Donoghue, Ms. Bridget",female,,0.0,0.0,364856,7.75,,Q,,,
1,2.0,0.0,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0.0,0.0,250655,26.0,,S,,,
2,2.0,1.0,"Smith, Miss. Marion Elsie",female,40.0,0.0,0.0,31418,13.0,,S,9,,
3,3.0,1.0,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31.0,1.0,1.0,363291,20.525,,S,C D,,"Strood, Kent, England Detroit, MI"
4,3.0,1.0,"McCoy, Miss. Agnes",female,,2.0,0.0,367226,23.25,,Q,16,,


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 14 columns):
pclass       1009 non-null float64
survived     1009 non-null float64
name         1009 non-null object
sex          1009 non-null object
age          812 non-null float64
sibsp        1009 non-null float64
parch        1009 non-null float64
ticket       1009 non-null object
fare         1008 non-null float64
cabin        229 non-null object
embarked     1008 non-null object
boat         374 non-null object
body         98 non-null float64
home.dest    582 non-null object
dtypes: float64(7), object(7)
memory usage: 110.5+ KB


In [6]:
data.shape

(1009, 14)

In [7]:
columns_to_drop = ["name", "ticket", "cabin", "embarked", "boat", "body", "home.dest"]

In [9]:
data_clean = data.drop(columns_to_drop, axis = 1)

In [10]:
data_clean.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,female,,0.0,0.0,7.75
1,2.0,0.0,male,39.0,0.0,0.0,26.0
2,2.0,1.0,female,40.0,0.0,0.0,13.0
3,3.0,1.0,female,31.0,1.0,1.0,20.525
4,3.0,1.0,female,,2.0,0.0,23.25


In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

data_clean["sex"] = le.fit_transform(data_clean["sex"])

In [12]:
data_clean.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,0,,0.0,0.0,7.75
1,2.0,0.0,1,39.0,0.0,0.0,26.0
2,2.0,1.0,0,40.0,0.0,0.0,13.0
3,3.0,1.0,0,31.0,1.0,1.0,20.525
4,3.0,1.0,0,,2.0,0.0,23.25


In [13]:
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 7 columns):
pclass      1009 non-null float64
survived    1009 non-null float64
sex         1009 non-null int32
age         812 non-null float64
sibsp       1009 non-null float64
parch       1009 non-null float64
fare        1008 non-null float64
dtypes: float64(6), int32(1)
memory usage: 51.4 KB


In [18]:
data_clean["age"] = data_clean.fillna(int(data_clean["age"].median()))
data_clean["fare"] = data_clean.fillna(data_clean["fare"].median())

In [32]:
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 7 columns):
pclass      1009 non-null float64
survived    1009 non-null float64
sex         1009 non-null int32
age         1009 non-null float64
sibsp       1009 non-null float64
parch       1009 non-null float64
fare        1009 non-null float64
dtypes: float64(6), int32(1)
memory usage: 51.4 KB


In [31]:
y_train = data_clean["survived"]
x_train = data_clean.drop("survived", axis = 1)

In [33]:
x_train.columns

Index(['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare'], dtype='object')

In [23]:
print(y_train.head())
print(x_train.head())

0    0.0
1    0.0
2    1.0
3    1.0
4    1.0
Name: survived, dtype: float64
   pclass  sex  age  sibsp  parch  fare
0     3.0    0  3.0    0.0    0.0   3.0
1     2.0    1  2.0    0.0    0.0   2.0
2     2.0    0  2.0    0.0    0.0   2.0
3     3.0    0  3.0    1.0    1.0   3.0
4     3.0    0  3.0    2.0    0.0   3.0


In [24]:
# Define Entropy and Decision Gain

In [59]:
def entropy(col):

    counts = np.unique(col, return_counts = True)
    N = float(col.shape[0])
    
    ent = 0.0
    
    for ix in counts[1]:
        p = ix/N
        ent+=(-1.0*p*np.log2(p))
        
    return ent
        
    
    

In [60]:
def divide_data(x_data, fkey, fval):
    x_right = pd.DataFrame([], columns = x_data.columns)
    x_left = pd.DataFrame([], columns = x_data.columns)
    for ix in range(x_data.shape[0]):
        val = x_data[fkey].loc[ix]
        
        if val>fval:
            x_right = x_right.append(x_data.iloc[ix])
        else:
            x_left = x_left.append(x_data.iloc[ix])
    
    return x_left, x_right

In [61]:
def information_gain(x_data, fkey, fval):
    
    left,right = divide_data(x_data,fkey,fval)
    
    l = float(left.shape[0])/x_data.shape[0]
    r = float(right.shape[0])/x_data.shape[0]
    
    if left.shape[0] == 0 or right.shape[0] == 0:
        return -100000 #Min information gain
    
    i_gain = entropy(x_data.survived)-((l*entropy(left.survived))+(r*entropy(right.survived)))   
    return i_gain
    
    
    

In [81]:
class DecisionTree:
    
    def __init__(self, depth = 0, max_depth=5):
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.max_depth = max_depth
        self.depth = depth
        self.target = None
        
    def train(self, x):
        features = list(x_train.columns)
        info_gains = []
        
        for ix in features:
            i_gain = information_gain(x, ix, x[ix].mean())
            info_gains.append(i_gain)
            
        self.fkey = features[np.argmax(info_gains)]
        self.fval = x[self.fkey].mean()
        print("making a tree feature is ", self.fkey)
            
        #split data
        data_left, data_right = divide_data(x, self.fkey, self.fval)
        data_left = data_left.reset_index(drop = True)
        data_right = data_right.reset_index(drop = True)
        
        #truly a leaf node
        if data_left.shape[0] == 0 or data_right.shape[0] == 0:
            if x.survived.mean()>=0.5:
                self.target = "survive"
                
            else:
                self.target = "dead"
                
            return
        
        #Stop early whe depth >=max_depth
        if(self.depth>=self.max_depth):
            if x.survived.mean()>=0.5:
                self.target = "survive"
                
            else:
                self.target = "dead"
                
            return
        
        #recursive case
        self.left = DecisionTree(depth = self.depth+1, max_depth = self.max_depth)
        self.left.train(data_left)
        
        self.right = DecisionTree(depth = self.depth+1, max_depth = self.max_depth)
        self.right.train(data_right)
        
        #setting the target at every node
        if x.survived.mean()>=0.5:
                self.target = "survive"
                
        else:
                self.target = "dead"
                
        return
    
    def predict(self,test):
        if test[self.fkey]>self.fval:
            if self.right is None:
                return self.target
            return self.right.predict(test)
        else:
            if self.left is None:
                return self.target
            return self.left.predict(test)
        
            
            


        
        
        
        
        

In [82]:
d = DecisionTree()
d.train(data_clean)

making a tree feature is  sex
making a tree feature is  pclass
making a tree feature is  pclass
making a tree feature is  sibsp
making a tree feature is  parch
making a tree feature is  pclass
making a tree feature is  parch
making a tree feature is  parch
making a tree feature is  sibsp
making a tree feature is  parch
making a tree feature is  parch
making a tree feature is  sibsp
making a tree feature is  pclass
making a tree feature is  sibsp
making a tree feature is  parch
making a tree feature is  sibsp
making a tree feature is  sibsp
making a tree feature is  parch
making a tree feature is  sibsp
making a tree feature is  pclass
making a tree feature is  sibsp
making a tree feature is  pclass
making a tree feature is  sibsp
making a tree feature is  parch
making a tree feature is  sibsp
making a tree feature is  sibsp
making a tree feature is  sibsp
making a tree feature is  sibsp
making a tree feature is  parch
making a tree feature is  sibsp
making a tree feature is  parch
maki

In [83]:
print(d.fkey)
print(d.left.fkey)
print(d.right.fkey)

sex
pclass
parch


In [73]:
##prediction

In [94]:
df_test = pd.read_csv('Test.csv')
df_test = df_test.drop(columns_to_drop, axis = 1)
df_test["sex"] = le.fit_transform(df_test["sex"])
df_test["age"] = df_test.fillna(int(data_clean["age"].median()))
df_test["fare"] = df_test.fillna(data_clean["fare"].median())
print(df_test.head())
print(df_test.shape[0])
print(df_test.loc[0]['sex'])


   pclass  sex  age  sibsp  parch  fare
0     1.0    1  1.0    0.0    0.0   1.0
1     3.0    0  3.0    8.0    2.0   3.0
2     1.0    1  1.0    0.0    0.0   1.0
3     2.0    1  2.0    0.0    0.0   2.0
4     2.0    1  2.0    0.0    0.0   2.0
300
1.0


In [95]:
d.predict(df_test.loc[0])

'dead'

In [97]:
y_pred = []
for ix in range(df_test.shape[0]):
    y_pred.append(d.predict(df_test.loc[ix]))
    

In [98]:
print(y_pred)

['dead', 'dead', 'dead', 'dead', 'dead', 'survive', 'dead', 'dead', 'dead', 'dead', 'dead', 'dead', 'dead', 'dead', 'survive', 'dead', 'dead', 'survive', 'survive', 'dead', 'survive', 'dead', 'dead', 'dead', 'dead', 'survive', 'dead', 'survive', 'survive', 'dead', 'dead', 'survive', 'dead', 'survive', 'dead', 'survive', 'dead', 'dead', 'dead', 'dead', 'survive', 'survive', 'survive', 'survive', 'dead', 'dead', 'survive', 'dead', 'dead', 'dead', 'dead', 'survive', 'dead', 'survive', 'dead', 'dead', 'dead', 'dead', 'survive', 'dead', 'survive', 'survive', 'survive', 'dead', 'dead', 'dead', 'dead', 'dead', 'dead', 'survive', 'survive', 'dead', 'dead', 'dead', 'survive', 'dead', 'survive', 'survive', 'dead', 'dead', 'survive', 'dead', 'dead', 'dead', 'survive', 'dead', 'dead', 'dead', 'dead', 'dead', 'survive', 'dead', 'dead', 'dead', 'survive', 'survive', 'dead', 'survive', 'survive', 'dead', 'survive', 'dead', 'dead', 'dead', 'dead', 'dead', 'dead', 'dead', 'survive', 'dead', 'survive', 

In [99]:
predicted = []
for y in y_pred:
    if y == 'dead':
        predicted.append(0.0)
    else :
        predicted.append(1.0)
print(predicted)

[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,

In [102]:
df = pd.DataFrame(predicted, columns = {'survived'})

In [103]:
df.to_csv('predicted.csv', index_label='Id')