### Exploring the Titanic dataset

In [34]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import Imputer

#### Get the data

In [24]:
# Read the Titanic training set from disk and preview its first rows
train_csv = "TitanicTrain.csv"
train_df = pd.read_csv(train_csv)

train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### Clean the Data
- filling in or dropping missing values.

In [25]:
# Size of the training frame as (rows, columns)
train_df.shape

(891, 12)

In [26]:
# Count the missing entries in every column
missing_mask = train_df.isna()
missing_mask.sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [27]:
# What columns do we have? (lists the column labels of the training frame)
train_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [28]:
# What data types? (one dtype per column; `object` columns hold non-numeric values)
train_df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [29]:
# Remove the Cabin column (687 of the 891 rows have no value for it)
train_df = train_df.drop(columns=['Cabin'])

In [30]:
# Look at the age distribution: occurrences per age (NaN included), oldest first
age_counts = train_df['Age'].value_counts(dropna=False)
age_counts.sort_index(ascending=False)

NaN       177
 80.00      1
 74.00      1
 71.00      2
 70.50      1
 70.00      2
 66.00      1
 65.00      3
 64.00      2
 63.00      2
 62.00      4
 61.00      3
 60.00      4
 59.00      2
 58.00      5
 57.00      2
 56.00      4
 55.50      1
 55.00      2
 54.00      8
 53.00      1
 52.00      6
 51.00      7
 50.00     10
 49.00      6
 48.00      9
 47.00      9
 46.00      3
 45.50      2
 45.00     12
         ... 
 23.00     15
 22.00     27
 21.00     24
 20.50      1
 20.00     15
 19.00     25
 18.00     26
 17.00     13
 16.00     17
 15.00      5
 14.50      1
 14.00      6
 13.00      2
 12.00      1
 11.00      4
 10.00      2
 9.00       8
 8.00       4
 7.00       3
 6.00       3
 5.00       4
 4.00      10
 3.00       6
 2.00      10
 1.00       7
 0.92       1
 0.83       2
 0.75       2
 0.67       1
 0.42       1
Name: Age, dtype: int64

In [32]:
# Fill the Age feature w/ the average age
# fillna expects a value (scalar, dict, Series or DataFrame), not a callable:
# passing `np.mean` itself would not compute anything, so the mean is
# computed first and the resulting number is used as the fill value.
mean_age = train_df['Age'].mean()  # Series.mean skips NaN by default
train_df['Age'] = train_df['Age'].fillna(mean_age)

In [33]:
# Re-count missing entries per column after filling Age
remaining_missing = train_df.isna()
remaining_missing.sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
dtype: int64

In [37]:
# Occurrences of each embarkation value, keeping NaN as its own row
train_df['Embarked'].value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64

In [38]:
# Fill the Embarked feature w/ the most frequently occurring embarked location ("S")
embarked_filled = train_df['Embarked'].fillna("S")
train_df['Embarked'] = embarked_filled

In [39]:
# Final check: every column should now report zero missing values
final_missing = train_df.isna()
final_missing.sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

#### Explore the Data
- Are there any relationships we can find?
- What visuals and conclusions can we draw?

In [40]:
# Number of male and female passengers in the training set
train_df['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [56]:
# On a % basis?
# Look each share up by its label instead of by position: value_counts orders
# rows by frequency, so position 0 is "male" only while men are the majority,
# and integer keys on a string-labelled Series rely on a deprecated positional
# fallback.
sex_share = train_df['Sex'].value_counts(normalize=True)
male_pct = sex_share['male']
female_pct = sex_share['female']

print("{:.1f}% of men aboard; {:.1f}% of women aboard.".format(100*male_pct, 100*female_pct))

64.8% of men aboard; 35.2% of women aboard.


Let's look at the relationship between sex and survivorship...

In [101]:
# Passenger count for every (Sex, Survived) combination
by_sex_and_outcome = train_df.groupby(['Sex', 'Survived'])
by_sex_and_outcome.size()

Sex     Survived
female  0            81
        1           233
male    0           468
        1           109
dtype: int64

In [103]:
# Pivot the counts into a table: one row per sex, one column per Survived value
pair_counts = train_df.groupby(['Sex', 'Survived']).size()
sex_survival = pair_counts.unstack(level='Survived')
sex_survival

Survived,0,1
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,81,233
male,468,109


In [104]:
# Get the female survivorship (out of all/total survivors)
# Write only the 'female' row: assigning the scalar to the whole column would
# broadcast the female ratio onto the 'male' row as well.
total_survivors = sex_survival.loc['male', 1] + sex_survival.loc['female', 1]
sex_survival.loc['female', 'survive_ratio'] = sex_survival.loc['female', 1] / total_survivors

In [105]:
# Male share of all survivors: male survivors / (male + female survivors)
male_survivors = sex_survival.loc['male', 1]
female_survivors = sex_survival.loc['female', 1]
sex_survival.loc['male', 'survive_ratio'] = male_survivors / (male_survivors + female_survivors)

In [106]:
# Check: display the table with the new survive_ratio column
sex_survival

Survived,0,1,survive_ratio
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,81,233,0.681287
male,468,109,0.318713


So we can conclude that although women make up 35.2% of the passengers in our training set, they represent 68.1% of those who survived.