In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("train.csv")

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1009 non-null   float64
 1   survived   1009 non-null   float64
 2   name       1009 non-null   object 
 3   sex        1009 non-null   object 
 4   age        812 non-null    float64
 5   sibsp      1009 non-null   float64
 6   parch      1009 non-null   float64
 7   ticket     1009 non-null   object 
 8   fare       1008 non-null   float64
 9   cabin      229 non-null    object 
 10  embarked   1008 non-null   object 
 11  boat       374 non-null    object 
 12  body       98 non-null     float64
 13  home.dest  582 non-null    object 
dtypes: float64(7), object(7)
memory usage: 110.5+ KB


In [4]:
columns_to_drop = ['boat','body','name','ticket','cabin','embarked']
data_clean = data.drop(columns_to_drop,axis=1)

In [5]:
data_clean.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,home.dest
0,3.0,0.0,female,,0.0,0.0,7.75,
1,2.0,0.0,male,39.0,0.0,0.0,26.0,
2,2.0,1.0,female,40.0,0.0,0.0,13.0,
3,3.0,1.0,female,31.0,1.0,1.0,20.525,"Strood, Kent, England Detroit, MI"
4,3.0,1.0,female,,2.0,0.0,23.25,


In [6]:
data_clean = data_clean.iloc[:, :-1]

In [7]:
data_clean.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,female,,0.0,0.0,7.75
1,2.0,0.0,male,39.0,0.0,0.0,26.0
2,2.0,1.0,female,40.0,0.0,0.0,13.0
3,3.0,1.0,female,31.0,1.0,1.0,20.525
4,3.0,1.0,female,,2.0,0.0,23.25


In [8]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

data_clean["sex"] = le.fit_transform(data_clean["sex"])

In [9]:
data_clean.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,0,,0.0,0.0,7.75
1,2.0,0.0,1,39.0,0.0,0.0,26.0
2,2.0,1.0,0,40.0,0.0,0.0,13.0
3,3.0,1.0,0,31.0,1.0,1.0,20.525
4,3.0,1.0,0,,2.0,0.0,23.25


In [10]:
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1009 non-null   float64
 1   survived  1009 non-null   float64
 2   sex       1009 non-null   int32  
 3   age       812 non-null    float64
 4   sibsp     1009 non-null   float64
 5   parch     1009 non-null   float64
 6   fare      1008 non-null   float64
dtypes: float64(6), int32(1)
memory usage: 51.4 KB


In [11]:
avg_age = data_clean["age"].mean()
print(avg_age)

29.838977832512317


In [12]:
data_clean = data_clean.fillna(avg_age)

In [13]:
data_clean.describe()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
count,1009.0,1009.0,1009.0,1009.0,1009.0,1009.0,1009.0
mean,2.295342,0.378593,0.64222,29.838978,0.521308,0.39445,33.643638
std,0.835704,0.485277,0.479585,13.04223,1.077269,0.888087,51.426734
min,1.0,0.0,0.0,0.3333,0.0,0.0,0.0
25%,2.0,0.0,0.0,22.0,0.0,0.0,7.8958
50%,3.0,0.0,1.0,29.838978,0.0,0.0,14.4583
75%,3.0,1.0,1.0,35.0,1.0,0.0,31.3875
max,3.0,1.0,1.0,80.0,8.0,9.0,512.3292


In [14]:
input_cols = ['pclass',"sex","age","sibsp","parch","fare"]
output_cols = ["survived"]

X = data_clean[input_cols]
Y = data_clean[output_cols]

print(X.shape,Y.shape)
print(type(X))

(1009, 6) (1009, 1)
<class 'pandas.core.frame.DataFrame'>


In [15]:
def entropy(col):
    
    counts = np.unique(col,return_counts=True)
    N = float(col.shape[0])
    
    ent = 0.0
    
    for ix in counts[1]:
        p  = ix/N
        ent += (-1.0*p*np.log2(p))
    
    return ent

In [16]:
def divide_data(x_data,fkey,fval):
    #Work with Pandas Data Frames- create two empty data frames
    x_right = pd.DataFrame([],columns=x_data.columns)
    x_left = pd.DataFrame([],columns=x_data.columns)
    
    #copy data to those empty dataframes acc to condition
    for ix in range(x_data.shape[0]):
        val = x_data[fkey].loc[ix]
        
        if val > fval:
            x_right = x_right.append(x_data.loc[ix])
        else:
            x_left = x_left.append(x_data.loc[ix])
            
    return x_left,x_right

In [17]:
def information_gain(x_data,fkey,fval):
    
    left,right = divide_data(x_data,fkey,fval)
    
    #% of total samples are on left and right
    l = float(left.shape[0])/x_data.shape[0]
    r = float(right.shape[0])/x_data.shape[0]
    
    #All examples come to one side!
    if left.shape[0] == 0 or right.shape[0] ==0:
        return -1000000 #Min Information Gain
    
    i_gain = entropy(x_data.survived) - (l*entropy(left.survived)+r*entropy(right.survived))
    return i_gain

In [18]:
# Test our function
for fx in X.columns:
    print(fx)
    print(information_gain(data_clean,fx,data_clean[fx].mean()))

pclass
0.055456910002982474
sex
0.19274737190850932
age
0.0010525742338489685
sibsp
0.006492394392888956
parch
0.01975608012294816
fare
0.04242793401428169


In [24]:
class DecisionTree:
    
    #Constructor
    def __init__(self,depth=0,max_depth=5):
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.max_depth = max_depth
        self.depth = depth
        self.target = None
        
    def train(self,X_train):
        
        features = ['pclass','sex','age','sibsp', 'parch', 'fare']
        info_gains = []
        
        for ix in features:
            i_gain = information_gain(X_train,ix,X_train[ix].mean())
            info_gains.append(i_gain)
            
        self.fkey = features[np.argmax(info_gains)]
        self.fval = X_train[self.fkey].mean()
        print("Making Tree Features is",self.fkey)
        
        #Split Data
        data_left,data_right = divide_data(X_train,self.fkey,self.fval)
        data_left = data_left.reset_index(drop=True)
        data_right = data_right.reset_index(drop=True)
         
        #Truly a left node
        if data_left.shape[0]  == 0 or data_right.shape[0] ==0:
            if X_train.survived.mean() >= 0.5:
                self.target = "survive"
            else:
                self.target = "dead"
            return
        #Stop earyly when depth >=max depth
        if(self.depth>=self.max_depth):
            if X_train.survived.mean() >= 0.5:
                self.target = "survive"
            else:
                self.target = "dead"
            return
        
        #Recursive Case
        self.left = DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.left.train(data_left)
        
        self.right = DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.right.train(data_right)
        
        #You can set the target at every node
        if X_train.survived.mean() >= 0.5:
            self.target = "survive"
        else:
            self.target = "dead"
        return
    
    def predict(self,test):
        if test[self.fkey]>self.fval:
            #go to right
            if self.right is None:
                return self.target
            return self.right.predict(test)
        else:
            if self.left is None:
                return self.target
            return self.left.predict(test)

In [38]:
split = int(0.7*data_clean.shape[0])
train_data = data_clean[:split]
test_data = data_clean[split:]
test_data = test_data.reset_index(drop=True)

In [39]:
print(train_data.shape,test_data.shape)

(706, 7) (303, 7)


In [40]:
dt = DecisionTree()

In [41]:
dt.train(train_data)

Making Tree Features is sex
Making Tree Features is pclass
Making Tree Features is parch
Making Tree Features is fare
Making Tree Features is fare
Making Tree Features is fare
Making Tree Features is fare
Making Tree Features is age
Making Tree Features is age
Making Tree Features is age
Making Tree Features is fare
Making Tree Features is pclass
Making Tree Features is age
Making Tree Features is age
Making Tree Features is age
Making Tree Features is age
Making Tree Features is age
Making Tree Features is sibsp
Making Tree Features is fare
Making Tree Features is fare
Making Tree Features is parch
Making Tree Features is age
Making Tree Features is age
Making Tree Features is age
Making Tree Features is parch
Making Tree Features is fare
Making Tree Features is parch
Making Tree Features is age
Making Tree Features is fare
Making Tree Features is fare
Making Tree Features is age
Making Tree Features is age
Making Tree Features is fare
Making Tree Features is parch
Making Tree Feature

In [42]:
print(dt.fkey)
print(dt.fval)
print(dt.left.fkey)
print(dt.right.fkey)

sex
0.6543909348441926
pclass
fare


In [43]:
y_pred = []
for ix in range(test_data.shape[0]):
    y_pred.append(dt.predict(test_data.loc[ix]))

In [44]:
y_pred

['dead',
 'survive',
 'dead',
 'dead',
 'dead',
 'dead',
 'survive',
 'survive',
 'dead',
 'dead',
 'dead',
 'dead',
 'survive',
 'dead',
 'dead',
 'dead',
 'survive',
 'dead',
 'dead',
 'dead',
 'dead',
 'dead',
 'survive',
 'dead',
 'dead',
 'dead',
 'dead',
 'dead',
 'survive',
 'survive',
 'dead',
 'survive',
 'dead',
 'dead',
 'dead',
 'dead',
 'survive',
 'survive',
 'dead',
 'survive',
 'survive',
 'dead',
 'survive',
 'survive',
 'dead',
 'dead',
 'dead',
 'dead',
 'dead',
 'dead',
 'dead',
 'survive',
 'dead',
 'dead',
 'dead',
 'dead',
 'survive',
 'survive',
 'dead',
 'dead',
 'dead',
 'dead',
 'dead',
 'survive',
 'survive',
 'dead',
 'dead',
 'dead',
 'dead',
 'dead',
 'survive',
 'dead',
 'dead',
 'dead',
 'survive',
 'survive',
 'dead',
 'dead',
 'survive',
 'survive',
 'survive',
 'dead',
 'dead',
 'dead',
 'dead',
 'dead',
 'dead',
 'dead',
 'dead',
 'survive',
 'survive',
 'dead',
 'dead',
 'dead',
 'dead',
 'dead',
 'dead',
 'dead',
 'survive',
 'dead',
 'dead',
 'su

In [45]:
y_actual = test_data[output_cols]

In [46]:
le = LabelEncoder()
y_pred = le.fit_transform(y_pred)

In [47]:
y_pred = np.array(y_pred).reshape((-1,1))

In [48]:
acc = np.sum(y_pred==y_actual)/y_pred.shape[0]

In [49]:
acc = np.sum(np.array(y_pred)==np.array(y_actual))/y_pred.shape[0]

In [50]:
print(acc)

0.7557755775577558


# Using Sklearn

In [51]:
from sklearn.tree import DecisionTreeClassifier

In [53]:
sk_tree = DecisionTreeClassifier(criterion='gini',max_depth=5)

In [54]:
sk_tree.fit(train_data[input_cols],train_data[output_cols])

DecisionTreeClassifier(max_depth=5)

In [55]:
sk_tree.score(test_data[input_cols],test_data[output_cols])

0.7755775577557755

# Random Forests

In [59]:
X_train = train_data[input_cols]
Y_train = np.array(train_data[output_cols]).reshape((-1,))
X_test = test_data[input_cols]
Y_test = np.array(test_data[output_cols]).reshape((-1,))

In [60]:
sk_tree = DecisionTreeClassifier(criterion='entropy',max_depth=5)
sk_tree.fit(X_train,Y_train)
sk_tree.score(X_train,Y_train)

0.8526912181303116

In [61]:
sk_tree.score(X_test,Y_test)#overfitting

0.7953795379537953

In [62]:
from sklearn.ensemble import RandomForestClassifier

In [63]:
rf = RandomForestClassifier(n_estimators=10,criterion='entropy',max_depth=5)

In [64]:
rf.fit(X_train,Y_train)

RandomForestClassifier(criterion='entropy', max_depth=5, n_estimators=10)

In [65]:
rf.score(X_train,Y_train)

0.8427762039660056

In [66]:
rf.score(X_test,Y_test)

0.7854785478547854

In [67]:
rf = RandomForestClassifier(n_estimators=22,max_depth=5,criterion='entropy')
rf.fit(X_train,Y_train)

RandomForestClassifier(criterion='entropy', max_depth=5, n_estimators=22)

In [68]:
rf.score(X_train,Y_train)

0.8512747875354107

In [69]:
rf.score(X_test,Y_test)

0.7755775577557755