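### Titanic Survival

Trains a hand-written decision tree (entropy / information-gain splits on feature means) on `Dataset/train.csv`, predicts survival for `Dataset/test.csv`, and writes the 0/1 predictions to `output.csv`.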

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from math import log2

plt.style.use("dark_background")

In [2]:
# Load the training CSV; the bare `df` on the last line renders a preview of the frame.
df = pd.read_csv("Dataset/train.csv")
df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,3.0,0.0,"O'Donoghue, Ms. Bridget",female,,0.0,0.0,364856,7.7500,,Q,,,
1,2.0,0.0,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0.0,0.0,250655,26.0000,,S,,,
2,2.0,1.0,"Smith, Miss. Marion Elsie",female,40.0,0.0,0.0,31418,13.0000,,S,9,,
3,3.0,1.0,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31.0,1.0,1.0,363291,20.5250,,S,C D,,"Strood, Kent, England Detroit, MI"
4,3.0,1.0,"McCoy, Miss. Agnes",female,,2.0,0.0,367226,23.2500,,Q,16,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1004,1.0,1.0,"Blank, Mr. Henry",male,40.0,0.0,0.0,112277,31.0000,A31,C,7,,"Glen Ridge, NJ"
1005,3.0,0.0,"Laitinen, Miss. Kristina Sofia",female,37.0,0.0,0.0,4135,9.5875,,S,,,
1006,1.0,1.0,"Newell, Miss. Marjorie",female,23.0,1.0,0.0,35273,113.2750,D36,C,6,,"Lexington, MA"
1007,3.0,1.0,"Nicola-Yarred, Master. Elias",male,12.0,1.0,0.0,2651,11.2417,,C,C,,


In [3]:
# Per-column dtypes and non-null counts, to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1009 non-null   float64
 1   survived   1009 non-null   float64
 2   name       1009 non-null   object 
 3   sex        1009 non-null   object 
 4   age        812 non-null    float64
 5   sibsp      1009 non-null   float64
 6   parch      1009 non-null   float64
 7   ticket     1009 non-null   object 
 8   fare       1008 non-null   float64
 9   cabin      229 non-null    object 
 10  embarked   1008 non-null   object 
 11  boat       374 non-null    object 
 12  body       98 non-null     float64
 13  home.dest  582 non-null    object 
dtypes: float64(7), object(7)
memory usage: 110.5+ KB


In [6]:
# Discard the columns that are not used as model inputs, leaving pclass, survived,
# sex, age, sibsp, parch and fare. `df` itself is left untouched.
data = df.drop(
    ["name", "ticket", "embarked", "boat", "body", "home.dest", "cabin"],
    axis="columns",
)
data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,female,,0.0,0.0,7.75
1,2.0,0.0,male,39.0,0.0,0.0,26.0
2,2.0,1.0,female,40.0,0.0,0.0,13.0
3,3.0,1.0,female,31.0,1.0,1.0,20.525
4,3.0,1.0,female,,2.0,0.0,23.25


In [7]:
# Impute missing numeric values with the mean of *their own* column.
# Passing a single scalar (the age mean) to fillna would also write that value into
# every other column that has gaps -- `fare` has one missing entry in this frame.
data = data.fillna(data.mean(numeric_only=True))
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1009 non-null   float64
 1   survived  1009 non-null   float64
 2   sex       1009 non-null   object 
 3   age       1009 non-null   float64
 4   sibsp     1009 non-null   float64
 5   parch     1009 non-null   float64
 6   fare      1009 non-null   float64
dtypes: float64(6), object(1)
memory usage: 55.3+ KB


In [9]:
from sklearn.preprocessing import LabelEncoder

# Learn the string -> integer mapping from the training `sex` column, then
# overwrite the column with the resulting integer codes. `le` is kept around so
# later cells can reuse it.
le = LabelEncoder().fit(data["sex"])
data["sex"] = le.transform(data["sex"])
data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,0,29.838978,0.0,0.0,7.75
1,2.0,0.0,1,39.0,0.0,0.0,26.0
2,2.0,1.0,0,40.0,0.0,0.0,13.0
3,3.0,1.0,0,31.0,1.0,1.0,20.525
4,3.0,1.0,0,29.838978,2.0,0.0,23.25


In [14]:
class DecisionTree:
    """Binary decision tree that always splits a feature at its mean.

    Every node evaluates each column in ``FEATURES``, splitting at that column's
    mean over the rows that reached the node, and keeps the column with the
    highest information gain measured on ``TARGET_COLUMN``. Growth stops when
    ``max_depth`` is reached or when the chosen split leaves one side empty.

    Instance fields:
        left, right : child ``DecisionTree`` nodes, ``None`` on a leaf
        fkey, fval  : column name and threshold this node splits on
        depth       : depth of this node (the root defaults to 0)
        max_depth   : depth at which no further children are created; capping it
                      limits overfitting, since an unbounded tree keeps splitting
                      until nodes hold a single row
        target      : "Survive" or "Dead", set on every node by ``train``
    """

    # Candidate split columns and the label column of the training frame.
    FEATURES = ["pclass", "sex", "age", "sibsp", "parch", "fare"]
    TARGET_COLUMN = "survived"

    # A node is labelled "Survive" only if at least this fraction of its rows survived.
    SURVIVE_THRESHOLD = 0.6

    # Gain reported for a split that sends every row to one side, so that such a
    # split is never preferred over one that actually separates the rows.
    DEGENERATE_GAIN = -1000000

    def __init__(self, depth = 0, max_depth = 5):
        """Create an untrained node at ``depth`` that may grow down to ``max_depth``."""
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.max_depth = max_depth
        self.depth = depth
        self.target = None

    def entropy(self, col):
        """Return the Shannon entropy (base 2) of the values in ``col``.

        ``col`` needs a ``shape`` attribute and must be accepted by ``np.unique``.
        An empty ``col`` yields 0.0 because there are no classes to sum over.
        """
        _, counts = np.unique(col, return_counts=True)
        total = col.shape[0]

        ent = 0.0
        for count in counts:
            p = count / total
            ent += (-1 * p * log2(p))

        return ent

    def divide_data(self, x_data: pd.DataFrame, fkey, fval):
        """Split ``x_data`` in two on column ``fkey`` at threshold ``fval``.

        x_data : the data to divide
        fkey   : the feature (column name) to divide on
        fval   : the threshold value

        Returns ``(left, right)`` where ``left`` holds rows with
        ``x_data[fkey] < fval`` and ``right`` holds rows with ``>= fval``.
        """
        left = x_data[x_data[fkey] < fval]
        right = x_data[x_data[fkey] >= fval]

        return left, right

    def information_gain(self, x_data, fkey, fval):
        """Return the information gain of splitting ``x_data`` on ``fkey`` at ``fval``.

        Returns ``DEGENERATE_GAIN`` when either side of the split is empty.
        """
        left, right = self.divide_data(x_data, fkey, fval)

        # Check for a one-sided split *before* computing ratios: an empty
        # ``x_data`` always lands here, which avoids dividing by zero below.
        if left.shape[0] == 0 or right.shape[0] == 0:
            return self.DEGENERATE_GAIN

        # Fraction of the node's rows that fall on each side.
        total = x_data.shape[0]
        left_ratio = left.shape[0] / total
        right_ratio = right.shape[0] / total

        target = self.TARGET_COLUMN
        return (
            self.entropy(x_data[target])
            - left_ratio * self.entropy(left[target])
            - right_ratio * self.entropy(right[target])
        )

    def _label_for(self, x_data):
        """Return "Survive" if the survival rate in ``x_data`` meets the threshold, else "Dead"."""
        rate = x_data[self.TARGET_COLUMN].mean()
        return "Survive" if rate >= self.SURVIVE_THRESHOLD else "Dead"

    def train(self, x_data):
        """Fit this node on ``x_data`` and recursively grow its children.

        ``x_data`` must contain every column in ``FEATURES`` plus
        ``TARGET_COLUMN``. Prints one progress line per node visited.
        """
        # Drop children left over from a previous fit so that a node which turns
        # into a leaf this time does not keep routing to stale subtrees.
        self.left = None
        self.right = None

        features = self.FEATURES
        info_gains = [
            self.information_gain(x_data, f, x_data[f].mean()) for f in features
        ]

        # Split on the feature with the highest gain, at that feature's mean.
        self.fkey = features[np.argmax(info_gains)]
        self.fval = x_data[self.fkey].mean()

        print("Making tree, current feature:", self.fkey)

        # Every node gets a label, not only leaves, which is handy for visualisation.
        self.target = self._label_for(x_data)

        data_left, data_right = self.divide_data(x_data, self.fkey, self.fval)

        # reset_index returns a new frame; the result has to be kept for the
        # children to actually receive a fresh 0..n-1 index.
        data_left = data_left.reset_index(drop=True)
        data_right = data_right.reset_index(drop=True)

        # Leaf: the best available split does not separate the rows at all.
        if data_left.shape[0] == 0 or data_right.shape[0] == 0:
            return

        # Leaf: depth budget exhausted.
        if self.depth >= self.max_depth:
            return

        self.left = DecisionTree(depth=self.depth + 1, max_depth=self.max_depth)
        self.left.train(data_left)

        self.right = DecisionTree(depth=self.depth + 1, max_depth=self.max_depth)
        self.right.train(data_right)

    def predict(self, x_test):
        """Return "Survive" or "Dead" for a single row ``x_test``.

        ``x_test`` must support ``x_test[column_name]`` lookups (e.g. a pandas
        Series for one row). The row is routed right when its value for this
        node's feature is ``>= fval`` and left otherwise; when there is no child
        on that side, this node's own label is returned.
        """
        branch = self.right if x_test[self.fkey] >= self.fval else self.left
        if branch is None:
            return self.target
        return branch.predict(x_test)


In [15]:
# Build a tree with the constructor defaults (depth=0, max_depth=5) and fit it on
# the preprocessed training frame. train() prints one line per node it visits.
dt = DecisionTree()
dt.train(data)

Making tree, current feature: sex
Making tree, current feature: pclass
Making tree, current feature: pclass
Making tree, current feature: fare
Making tree, current feature: sibsp
Making tree, current feature: age
Making tree, current feature: age
Making tree, current feature: sibsp
Making tree, current feature: age
Making tree, current feature: sibsp
Making tree, current feature: parch
Making tree, current feature: fare
Making tree, current feature: sibsp
Making tree, current feature: fare
Making tree, current feature: age
Making tree, current feature: age
Making tree, current feature: parch
Making tree, current feature: parch
Making tree, current feature: sibsp
Making tree, current feature: fare
Making tree, current feature: fare
Making tree, current feature: fare
Making tree, current feature: age
Making tree, current feature: sibsp
Making tree, current feature: sibsp
Making tree, current feature: fare
Making tree, current feature: fare
Making tree, current feature: age
Making tree, c

In [16]:
# Load the test CSV. NOTE: this rebinds `df`, which held the training frame until now.
df = pd.read_csv("Dataset/test.csv")
df.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,"Flynn, Mr. John Irwin (""Irving"")",male,36.0,0.0,0.0,PC 17474,26.3875,E25,S,5.0,,"Brooklyn, NY"
1,3.0,"Sage, Miss. Constance Gladys",female,,8.0,2.0,CA. 2343,69.55,,S,,,
2,1.0,"Rood, Mr. Hugh Roscoe",male,,0.0,0.0,113767,50.0,A32,S,,,"Seattle, WA"
3,2.0,"Gillespie, Mr. William Henry",male,34.0,0.0,0.0,12233,13.0,,S,,,"Vancouver, BC"
4,2.0,"Collander, Mr. Erik Gustaf",male,28.0,0.0,0.0,248740,13.0,,S,,,"Helsinki, Finland Ashtabula, Ohio"


In [18]:
# Per-column dtypes and non-null counts for the test frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     300 non-null    float64
 1   name       300 non-null    object 
 2   sex        300 non-null    object 
 3   age        234 non-null    float64
 4   sibsp      300 non-null    float64
 5   parch      300 non-null    float64
 6   ticket     300 non-null    object 
 7   fare       300 non-null    float64
 8   cabin      66 non-null     object 
 9   embarked   299 non-null    object 
 10  boat       112 non-null    object 
 11  body       23 non-null     float64
 12  home.dest  163 non-null    object 
dtypes: float64(6), object(7)
memory usage: 30.6+ KB


In [19]:
# Apply the same column selection as for training. NOTE: this rebinds `data`,
# which held the training frame until now.
data = df.drop(columns=["name", "ticket", "embarked", "boat", "body", "home.dest", "cabin"])

# Fill gaps in each numeric column with that column's own mean, rather than
# writing the age mean into every column that happens to contain a NaN.
data = data.fillna(data.mean(numeric_only=True))

In [20]:
# Reuse the mapping learned on the training data instead of re-fitting on the test
# set: a re-fit derives the codes from whatever labels the test column contains,
# which is not guaranteed to match what the tree was trained on.
# Requires `le` to still be the encoder fitted on the training `sex` column.
data["sex"] = le.transform(data["sex"])

In [21]:
# Predict row by row. Rows are addressed by position (.iloc) because the counter
# is positional; label-based .loc[i] only works while the index is exactly 0..n-1.
preds = np.array([dt.predict(data.iloc[i]) for i in range(data.shape[0])])
preds

array(['Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Survive', 'Dead', 'Dead',
       'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Survive', 'Dead',
       'Dead', 'Survive', 'Dead', 'Dead', 'Survive', 'Dead', 'Dead',
       'Dead', 'Dead', 'Survive', 'Dead', 'Dead', 'Survive', 'Dead',
       'Dead', 'Survive', 'Dead', 'Dead', 'Dead', 'Survive', 'Dead',
       'Dead', 'Dead', 'Dead', 'Survive', 'Survive', 'Dead', 'Dead',
       'Dead', 'Dead', 'Survive', 'Dead', 'Dead', 'Dead', 'Dead',
       'Survive', 'Dead', 'Survive', 'Dead', 'Dead', 'Dead', 'Dead',
       'Survive', 'Dead', 'Survive', 'Survive', 'Dead', 'Dead', 'Dead',
       'Dead', 'Dead', 'Dead', 'Dead', 'Survive', 'Survive', 'Dead',
       'Dead', 'Dead', 'Survive', 'Dead', 'Survive', 'Survive', 'Dead',
       'Dead', 'Survive', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead',
       'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Survive',
       'Survive', 'Dead', 'Dead', 'Survive', 'Dead', 'Dead', 'Dead',
       'Dea

In [22]:
# Map the string labels to integers explicitly. Re-fitting the label encoder here
# would derive the codes from whichever labels happen to occur in `preds` (an
# all-"Survive" array would come out as 0) and would also overwrite the mapping the
# encoder learned for `sex`.
# Values that are not one of the two labels pass through unchanged, so re-running
# this cell on already-encoded integers is harmless, while any other string makes
# the integer conversion fail loudly.
label_codes = {"Dead": 0, "Survive": 1}
preds = np.array([label_codes.get(label, label) for label in preds], dtype=int)

In [23]:
# Display the integer-encoded predictions.
preds

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])

In [25]:
# Wrap the encoded predictions in a one-column frame ready for export.
out_df = pd.DataFrame({"Survived": preds})
out_df

Unnamed: 0,Survived
0,0
1,0
2,0
3,0
4,0
...,...
295,0
296,1
297,0
298,0


In [27]:
# Write the predictions to output.csv; the frame's index is written as a column labelled "Id".
out_df.to_csv("output.csv", index_label="Id")