## Titanic Data Preprocessing

### Feature extraction, NaN removal, and categorical variable manipulation

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import re

In [2]:
cd Dropbox/Portfolio/DataScience-Portfolio/Titanic

/Users/Capgemini/Dropbox/Portfolio/DataScience-Portfolio/Titanic


In [3]:
# load data and test set
titanic = pd.read_csv('train.csv')
titanic_test = pd.read_csv('test.csv')

### Remove features not useful for predictions

* Cabin: too many NaN
* Ticket: too many different values

In [4]:
# from previous analysis we know that deck cabin contains mostly NaN
titanic.drop('Cabin', axis = 1, inplace = True)
titanic_test.drop('Cabin', axis = 1, inplace = True)

In [5]:
# Ticket has too many different values, I doubt that it will have predictive power
titanic.drop('Ticket', axis = 1, inplace = True)
titanic_test.drop('Ticket', axis = 1, inplace = True)

### Make additional column with the title of the person (Mr, Mrs, Miss, etc)

* Impute the column age with the mean for each title

In [6]:
# function to extract title from Name feature
def get_title(passenger):
    line = passenger
    if re.search('Mrs', line):
        return 'Mrs'
    elif re.search('Mr', line):
        return 'Mr'
    elif re.search('Miss', line):
        return 'Miss'
    elif re.search('Master', line):
        return 'Master'
    else:
        return 'Other'

In [7]:
# extract title  
titanic['Title'] = titanic['Name'].apply(get_title)
titanic_test['Title'] = titanic_test['Name'].apply(get_title)

In [8]:
# fill missing age, with median from title segregation: funtion
def fill_age(passenger):
    
    # determine age by group 
    temp = titanic.groupby(titanic.Title).median()
    
    age, title = passenger
    
    if age == age:
        return age
    else:
        if title == 'Mr':
            return temp.Age['Mr']
        elif title == 'Miss':
            return temp.Age['Miss']
        elif title == ['Mrs']:
            return temp.Age['Mrs']
        elif title == 'Master':
            return temp.Age['Master']
        else:
            return temp.Age['Other']        

In [9]:
# fill age
titanic['Age'] = titanic[['Age', 'Title']].apply(fill_age, axis = 1)
titanic_test['Age'] = titanic_test[['Age', 'Title']].apply(fill_age, axis = 1)

In [10]:
# Remove column Name, it is not useful for predictions and we extracted the title already
titanic.drop('Name', axis = 1, inplace = True)
titanic_test.drop('Name', axis = 1, inplace = True)

In [11]:
# Remove column Title, it is not useful for predictions and we imputed the age already
titanic.drop('Title', axis = 1, inplace = True)
titanic_test.drop('Title', axis = 1, inplace = True)

### Impute Embarked with the most frequent port (S)

In [12]:
titanic["Embarked"].fillna("S", inplace = True)
titanic_test['Embarked'].fillna("S", inplace = True)

### Check that all NaN are removed

In [13]:
titanic.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [14]:
titanic_test.isnull().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           1
Embarked       0
dtype: int64

In [15]:
# there is a null value in Fare for titanic_test, I will replace it with the mean fare for titanic train
titanic_test['Fare'].fillna(titanic.Fare.mean(), inplace = True)

### Remove PassengerId

* Removed from train set
* Keep the passengerID column from test set for submission later

In [16]:
titanic.drop('PassengerId', axis = 1, inplace = True)
titanic_test.drop('PassengerId', axis = 1, inplace = True)

In [17]:
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22,1,0,7.25,S
1,1,1,female,38,1,0,71.2833,C
2,1,3,female,26,0,0,7.925,S
3,1,1,female,35,1,0,53.1,S
4,0,3,male,35,0,0,8.05,S


In [18]:
titanic_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


### Make dummy variables from categorical variables

* Sex
* Embarked
* Person

In [19]:
# Sex is binary so I code male = 1, female = 0
titanic['Sex'].loc[titanic['Sex'] == 'male'] = 1
titanic['Sex'].loc[titanic['Sex'] == 'female'] = 0

titanic_test['Sex'].loc[titanic_test['Sex'] == 'male'] = 1
titanic_test['Sex'].loc[titanic_test['Sex'] == 'female'] = 0

In [20]:
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22,1,0,7.25,S
1,1,1,0,38,1,0,71.2833,C
2,1,3,0,26,0,0,7.925,S
3,1,1,0,35,1,0,53.1,S
4,0,3,1,35,0,0,8.05,S


In [21]:
# Sex is binary so I code male = 1, female = 0
titanic['Embarked'].loc[titanic['Embarked'] != 'S'] = 1
titanic['Embarked'].loc[titanic['Embarked'] == 'S'] = 0

titanic_test['Embarked'].loc[titanic_test['Embarked'] != 'S'] = 1
titanic_test['Embarked'].loc[titanic_test['Embarked'] == 'S'] = 0

In [22]:
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22,1,0,7.25,0
1,1,1,0,38,1,0,71.2833,1
2,1,3,0,26,0,0,7.925,0
3,1,1,0,35,1,0,53.1,0
4,0,3,1,35,0,0,8.05,0


In [23]:
titanic_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,34.5,0,0,7.8292,1
1,3,0,47.0,1,0,7.0,0
2,2,1,62.0,0,0,9.6875,1
3,3,1,27.0,0,0,8.6625,0
4,3,0,22.0,1,1,12.2875,0


In [24]:
# save to csv to use in following notebook
titanic.to_csv('titanic_train_ready2.csv', header = True, index = False)
titanic_test.to_csv('titanic_test_ready2.csv', header = True, index = False)

In [25]:
titanic.sum()

Survived      342.0000
Pclass       2057.0000
Sex           577.0000
Age         26337.1700
SibSp         466.0000
Parch         340.0000
Fare        28693.9493
Embarked      245.0000
dtype: float64