# Analysing the Titanic dataset

1. Basic data cleaning and exploration
2. Exploratory data analysis
3. Model Experimentation

In [20]:
# import relevant packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import display, HTML

## Basic data exploration

1. Import the data
2. Look at summary statistics
3. Evaluate null values if any


In [21]:
# Load the train and test splits from CSV files in the working directory
train_df = pd.read_csv(filepath_or_buffer='train.csv')
test_df = pd.read_csv(filepath_or_buffer='test.csv')

In [22]:
# Function to create scrollable table within a small window
def create_scrollable_table(df, table_id, title):
    """Render ``df`` as an HTML table inside a fixed-height, scrollable container.

    Args:
        df: DataFrame to render; converted with ``df.to_html()``.
        table_id: Value used for the ``id`` attribute of the wrapping ``<div>``.
        title: Text placed in an ``<h3>`` heading above the table.

    Returns:
        str: The heading followed by the table wrapped in a 200px-high
        ``<div>`` with ``overflow:auto``. ``title`` and ``table_id`` are
        inserted as-is (no HTML escaping is applied).
    """
    # Collect the fragments in document order, then join them once.
    fragments = [
        f'<h3>{title}</h3>',
        f'<div id="{table_id}" style="height:200px; overflow:auto;">',
        df.to_html(),
        '</div>',
    ]
    return ''.join(fragments)

In [23]:
# Preview the first rows of the training data inside a scrollable table
df_head = train_df.head()
html_df_head = create_scrollable_table(
    df=df_head,
    table_id='df_head',
    title='View the structure of data in the dataframe ',
)

display(HTML(html_df_head))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [24]:
# Inspect the dtype pandas inferred for each column
train_df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

### Now check for NULL values

In [25]:
# count of missing values per column
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [29]:
# percentage of null values per column, rounded to 2 decimal places
train_df.isna().sum().div(len(train_df)).mul(100).round(2)

PassengerId     0.00
Survived        0.00
Pclass          0.00
Name            0.00
Sex             0.00
Age            19.87
SibSp           0.00
Parch           0.00
Ticket          0.00
Fare            0.00
Cabin          77.10
Embarked        0.22
dtype: float64

We will not use the `Age` and `Cabin` columns for this analysis because a large proportion of their values is null (19.87% and 77.10%, respectively).

In [26]:
# (rows, columns) of the training data
train_df.shape 

(891, 12)

## What questions do I want to ask of the data?

1. How passenger class correlates with survival rate
2. How port of embarkation correlates with survival rate
3. Distribution of male and female passengers
4. How age affected survival rate


In [31]:
# List the column names available for analysis
train_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [None]:
# TODO: How passenger class correlates with survival rate

## Data cleaning

1. Set the right data type for each column
2.