### Titanic Survival
- Using Decision Trees for predicting survival on Titanic

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from math import log2

plt.style.use("dark_background")

In [30]:
df = pd.read_csv("Titanic Dataset/train.csv")
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### Data Preprocessing

#### Data Cleaning
- Dropping out non useful features
- By observation we do not need
  - PassengerID 
  - Name
  - Ticket
  - Cabin 
  - Embarked

> We are keeping fare, as it is possible higher the fare, higher the safety

In [32]:
columns_to_drop = ["PassengerId", "Name", "Ticket", "Cabin", "Embarked"]
data = df.drop(columns=columns_to_drop)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


`Sex` is a string data, hence we pre process it to numeric
- Using `LabelEncoder`

In [33]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

In [34]:
data["Sex"] = le.fit_transform(data["Sex"])
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.25
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.925
3,1,1,0,35.0,1,0,53.1
4,0,3,1,35.0,0,0,8.05


We see that `Age` has 177 null values, we need to treat them, this can be done by
- Data Imputation
  - Filling the missing values (mean, mode, etc)
- Dropping the rows having null values

For data imputation, we use `fillna` function provided by pandas itself

In [35]:
data = data.fillna(int(data["Age"].mean()))
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int64  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
dtypes: float64(2), int64(5)
memory usage: 48.9 KB


Now we have a cleaned up, label encoded and imputed dataset ready to work on

#### Dividing in X and Y

In [36]:
X = data[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]]
y = data[["Survived"]]

X.shape, y.shape

((891, 6), (891, 1))

### Starting with Decision Tree

In [37]:
# Defining Entropy
def entropy(col):
    
    values, counts = np.unique(col, return_counts=True)
    N = col.shape[0]

    ent = 0.0

    for c in counts:
        p = c/N

        ent += (-1 * p * log2(p))

    return ent

In [45]:
def divide_data(x_data: pd.DataFrame, fkey, fval):
    """
    x_data : The Data to Divide
    fkey : The feature we are dividing on
    fval : The value on which to split
    """

    # Essentially does a binary split

    left = x_data[x_data[fkey] < fval]
    right = x_data[x_data[fkey] >= fval]

    return left, right

In [42]:
def information_gain(x_data, fkey, fval):

    left, right = divide_data(x_data, fkey, fval)

    # Ratio of samples
    l = left.shape[0]/x_data.shape[0]
    r = right.shape[0]/x_data.shape[0]

    # If all come in one class, best classification
    if left.shape[0] == 0 or right.shape[0] == 0:
        return -1000000

    i_gain = entropy(x_data["Survived"]) - l * entropy(left["Survived"]) - r * entropy(right["Survived"])

    return i_gain

In [44]:
# Testing function
# Getting the max information gain

for f in data.columns:
    print(f, information_gain(data, f, data[f].mean()))

Survived 0.9607079018756469
Pclass 0.07579362743608165
Sex 0.2176601066606143
Age 0.0008836151229467681
SibSp 0.009584541813400127
Parch 0.015380754493137666
Fare 0.04214069283899541


It is clear, that division on `Sex` yields the best result, hence it is eligible for being the root node

### Creating classifier

In [51]:
class DecisionTree:

    def __init__(self, depth = 0, max_depth = 5):
        # Max depth is kept, because if depth goes too much
        # There is a risk of overfitting
        # The model will classify even to extent of having single element

        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.max_depth = max_depth
        self.depth = depth
        self.target = None

    def train(self, x_data):

        features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]
        info_gains = []

        for f in features:
            info_gain = information_gain(x_data, f, x_data[f].mean())
            info_gains.append(info_gain)

        # Setting the node as per the info gain
        self.fkey = features[np.argmax(info_gains)]
        self.fval = x_data[self.fkey].mean()

        print("Making tree, current feature:", self.fkey)

        # Split data
        data_left, data_right = divide_data(x_data, self.fkey, self.fval)

        # Reorder index
        data_left.reset_index(drop=True)
        data_right.reset_index(drop=True)

        # If leaf node
        if data_left.shape[0] == 0 or data_right.shape[0] == 0:
            self.target = "Survive" if x_data["Survived"].mean() >= 0.6 else "Dead"
            return

        # Stop if more than max depth
        if self.depth >= self.max_depth:
            self.target = "Survive" if x_data["Survived"].mean() >= 0.6 else "Dead"
            return

        
        # Else recursive case
        self.left = DecisionTree(depth=self.depth + 1, max_depth = self.max_depth)
        self.left.train(data_left)

        self.right = DecisionTree(depth=self.depth + 1, max_depth=self.max_depth)
        self.right.train(data_right)

        # Setting target on every node for visualisation
        self.target = "Survive" if x_data["Survived"].mean() >= 0.6 else "Dead"

    def predict(self, x_test):

        if x_test[self.fkey] >= self.fval:
            if self.right is None:
                return self.target
            else:
                return self.right.predict(x_test)

        else:
            if self.left is None:
                return self.target
            else:
                return self.left.predict(x_test)


In [52]:
# Testing
dt = DecisionTree()
dt.train(data)

Making tree, current feature: Sex
Making tree, current feature: Pclass
Making tree, current feature: Pclass
Making tree, current feature: Parch
Making tree, current feature: Age
Making tree, current feature: Age
Making tree, current feature: Age
Making tree, current feature: Parch
Making tree, current feature: Age
Making tree, current feature: Fare
Making tree, current feature: Parch
Making tree, current feature: Age
Making tree, current feature: Age
Making tree, current feature: Age
Making tree, current feature: Age
Making tree, current feature: Age
Making tree, current feature: Age
Making tree, current feature: Fare
Making tree, current feature: SibSp
Making tree, current feature: Fare
Making tree, current feature: Fare
Making tree, current feature: Age
Making tree, current feature: SibSp
Making tree, current feature: Parch
Making tree, current feature: Age
Making tree, current feature: SibSp
Making tree, current feature: Fare
Making tree, current feature: Parch
Making tree, current 

As We can see the division began on `sex`, our model is working

In [58]:
dt.predict(data.loc[3]), data.loc[3]["Survived"]

('Survive', 1.0)

Our model works!

#### Testing

In [61]:
split = int(data.shape[0] * 0.7)

train_data = data[:split]
test_data = data[split:]

test_data = test_data.reset_index(drop=True)

In [62]:
train_data.shape, test_data.shape

((623, 7), (268, 7))

In [63]:
dt = DecisionTree()
dt.train(train_data)

Making tree, current feature: Sex
Making tree, current feature: Pclass
Making tree, current feature: Age
Making tree, current feature: SibSp
Making tree, current feature: Pclass
Making tree, current feature: Age
Making tree, current feature: Age
Making tree, current feature: SibSp
Making tree, current feature: Parch
Making tree, current feature: Pclass
Making tree, current feature: SibSp
Making tree, current feature: Fare
Making tree, current feature: Parch
Making tree, current feature: Age
Making tree, current feature: Pclass
Making tree, current feature: Age
Making tree, current feature: Age
Making tree, current feature: Parch
Making tree, current feature: Age
Making tree, current feature: Fare
Making tree, current feature: Fare
Making tree, current feature: SibSp
Making tree, current feature: Age
Making tree, current feature: Age
Making tree, current feature: Fare
Making tree, current feature: Fare
Making tree, current feature: Age
Making tree, current feature: Parch
Making tree, cu

In [64]:
preds = []
for i in range(test_data.shape[0]):

    preds.append(dt.predict(test_data.loc[i]))


In [65]:
preds

['Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Survive',
 'Dead',
 'Survive',
 'Survive',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survive',
 'Survive',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Survive',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Survive',
 'Dead',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Survive',
 'Dead',
 'Survive',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survive',
 'Survive',
 'Dead',
 'Dead',
 'Survive',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Su

`Label Encode` these to get comparable data

In [66]:
preds = le.fit_transform(preds)
preds

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0])

In [68]:
y_actual = test_data["Survived"].values
y_actual

array([0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0])

In [69]:
accuracy = np.sum(preds == y_actual)/preds.shape[0]
accuracy

0.8171641791044776

Our model gives us a 81% accuracy

### Random Forests
Decision Trees have some problems 
- High Variance
  - A small change in the data can cause huge changes in the model
  - A single change can change the root node

- Overfitting
  - Common to get high accuracy on train, but not on test

So to overcome this, `Random Forests` were implemented

- It is basically a **collection** of trees (hence a forest)
- We tune and create different trees for the data
- And we return the class that maximum trees classify
- This acts like cross validating the result

In [None]:
from sklearn.ensemble import RandomForestClassifier