In [1]:
# Load in the Libraries
import pickle
import pandas as pd
import numpy as np
import re
import os
import warnings
warnings.filterwarnings('ignore')

## Feature Exploration, Engineering and Cleaning


In [2]:
# Base directory that the data file paths are joined onto. It is a relative
# path, so it resolves against the kernel's current working directory.
root = '../..'

In [3]:
# Read the train and test splits from the data folder under `root`
train_csv = os.path.join(root, 'data/train.csv')
test_csv = os.path.join(root, 'data/test.csv')
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)

# Keep a handle on the test-set passenger IDs; the column is dropped from
# `test` during feature selection
PassengerId = test['PassengerId']

train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


### Feature Engineering

In [4]:
# Both frames in one list so each feature-engineering loop can apply the same
# step to train and test. The list holds the DataFrame objects themselves (not
# copies), so column assignments made through it show up on `train` and `test`.
full_data = [train, test]

In [5]:
# Extra hand-made feature: Name_length is the number of characters in Name
for frame in (train, test):
    frame['Name_length'] = frame['Name'].map(len)

In [6]:
# Has_Cabin: 1 when a Cabin value is recorded for the passenger, 0 when it is
# missing. notna() tests for missing values directly instead of assuming that
# NaN is the only float that can ever appear in the column.
train['Has_Cabin'] = train['Cabin'].notna().astype(int)
test['Has_Cabin'] = test['Cabin'].notna().astype(int)

In [7]:
# Feature engineering step taken from Sina:
# FamilySize is SibSp + Parch + 1
for frame in full_data:
    frame['FamilySize'] = frame['SibSp'].add(frame['Parch']).add(1)

In [8]:
# IsAlone is 1 for rows whose FamilySize is exactly 1 and 0 for every other row
for frame in full_data:
    frame['IsAlone'] = frame['FamilySize'].eq(1).astype('int64')

In [9]:
# Replace every missing Embarked value with 'S'
for frame in full_data:
    missing_port = frame['Embarked'].isna()
    frame['Embarked'] = frame['Embarked'].mask(missing_port, 'S')

In [10]:
# Fill missing Fare values in both frames with the training-set median, then
# add CategoricalFare (Fare cut into 4 quantile bins) to the training frame
train_fare_median = train['Fare'].median()
for frame in full_data:
    frame['Fare'] = frame['Fare'].fillna(train_fare_median)
train['CategoricalFare'] = pd.qcut(train['Fare'], q=4)

In [11]:
# Impute missing Age values and create a new feature CategoricalAge.
# Each missing Age is replaced by a random integer drawn from
# [mean - std, mean + std) of that frame's observed ages.

# Dedicated, seeded generator so the imputed ages (and everything derived from
# them) are identical on every Restart & Run All.
AGE_IMPUTE_SEED = 0
age_rng = np.random.RandomState(AGE_IMPUTE_SEED)

for dataset in full_data:
    age_avg = dataset['Age'].mean()
    print(age_avg)
    age_std = dataset['Age'].std()
    print(age_std)
    age_is_null = dataset['Age'].isnull()
    age_null_count = int(age_is_null.sum())
    print(age_null_count)
    # randint needs integer bounds; truncate the float bounds explicitly
    age_null_random_list = age_rng.randint(int(age_avg - age_std),
                                           int(age_avg + age_std),
                                           size=age_null_count)
    # Single .loc assignment: chained indexing (df['Age'][mask] = ...) may
    # write to a temporary copy and leave the frame unchanged.
    dataset.loc[age_is_null, 'Age'] = age_null_random_list
    # Ages are stored as whole numbers from here on (fractions are truncated)
    dataset['Age'] = dataset['Age'].astype(int)

# Cut the training ages into 5 equal-width intervals
train['CategoricalAge'] = pd.cut(train['Age'], 5)

29.69911764705882
14.526497332334042
177
[15 29 23 39 39 18 41 32 34 31 34 38 29 33 35 40 33 21 19 41 20 27 41 24
 38 43 24 41 16 29 17 26 28 21 19 25 16 41 27 35 35 29 38 16 26 31 21 35
 20 32 39 23 25 33 15 38 41 26 38 15 33 41 17 33 35 18 15 20 35 23 17 28
 42 30 23 15 16 18 15 16 41 22 22 27 22 40 41 20 32 38 41 43 25 36 29 39
 34 37 38 36 32 38 27 35 31 36 35 41 22 20 43 15 37 40 29 40 33 41 43 31
 35 39 33 34 33 24 18 34 40 27 20 21 41 32 24 34 33 32 29 33 30 36 34 34
 17 43 20 33 36 41 40 28 39 33 36 20 31 26 30 20 17 26 37 19 33 16 28 15
 34 19 40 43 40 29 20 35 30]
30.272590361445783
14.181209235624422
86
[26 43 27 35 25 34 39 16 32 25 23 35 30 27 25 32 34 31 26 20 32 16 25 29
 35 17 41 39 37 17 33 43 28 16 19 24 37 31 23 34 36 34 21 35 18 21 32 28
 34 29 26 31 32 26 17 20 24 26 37 39 22 16 22 26 27 33 18 30 16 23 31 36
 31 39 38 28 21 26 18 33 41 18 31 39 20 36]


In [12]:
# Define function to extract titles from passenger names
def get_title(name):
    """Return the title found in a passenger name, or "" if there is none.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. "Mr" in
    "Braund, Mr. Owen Harris".
    """
    # Raw string: '\.' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

In [13]:
# Add a Title column by running get_title over every passenger name
for frame in full_data:
    frame['Title'] = frame['Name'].map(get_title)

In [14]:
# Collapse the uncommon titles into a single "Rare" group and fold the
# alternative spellings Mlle/Ms into Miss and Mme into Mrs
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
               'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = dict.fromkeys(rare_titles, 'Rare')
title_aliases.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

for frame in full_data:
    frame['Title'] = frame['Title'].replace(title_aliases)

In [15]:
# Mapping variables to integer category codes.
# The lookup tables and bin edges are defined once, outside the loop.
sex_mapping = {'female': 0, 'male': 1}
# NOTE(review): 'Mr' shares code 0 with the fillna(0) fallback used for titles
# that are not in this mapping, and code 3 is unused. Kept unchanged so the
# saved features keep their existing encoding - confirm this is intended.
title_mapping = {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 4, 'Rare': 5}
embarked_mapping = {'S': 0, 'C': 1, 'Q': 2}
# Right-closed bins: (-inf, 7.91] -> 0, (7.91, 14.454] -> 1,
# (14.454, 31] -> 2, (31, inf) -> 3
fare_bins = [-np.inf, 7.91, 14.454, 31, np.inf]
# Right-closed bins: (-inf, 16] -> 0, (16, 32] -> 1, (32, 48] -> 2,
# (48, 64] -> 3, (64, inf) -> 4
age_bins = [-np.inf, 16, 32, 48, 64, np.inf]

for dataset in full_data:
    # Sex
    dataset['Sex'] = dataset['Sex'].map(sex_mapping).astype(int)

    # Titles: unmapped titles fall back to 0; the cast keeps the column
    # integer-typed even when the fallback was needed
    dataset['Title'] = dataset['Title'].map(title_mapping).fillna(0).astype(int)

    # Embarked
    dataset['Embarked'] = dataset['Embarked'].map(embarked_mapping).astype(int)

    # Fare and Age: bin in a single pass on the original values instead of
    # overwriting the column step by step while it is still being read
    dataset['Fare'] = pd.cut(dataset['Fare'], bins=fare_bins,
                             labels=False).astype(int)
    dataset['Age'] = pd.cut(dataset['Age'], bins=age_bins,
                            labels=False).astype(int)

In [16]:
# Preview the first rows of the training frame after the encoding step
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_length,Has_Cabin,FamilySize,IsAlone,CategoricalFare,CategoricalAge,Title
0,1,0,3,"Braund, Mr. Owen Harris",1,1,1,0,A/5 21171,0,,0,23,0,2,0,"(-0.001, 7.91]","(16.0, 32.0]",0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,PC 17599,3,C85,1,51,1,2,0,"(31.0, 512.329]","(32.0, 48.0]",2
2,3,1,3,"Heikkinen, Miss. Laina",0,1,0,0,STON/O2. 3101282,1,,0,22,0,1,1,"(7.91, 14.454]","(16.0, 32.0]",1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,2,1,0,113803,3,C123,0,44,1,2,0,"(31.0, 512.329]","(32.0, 48.0]",2
4,5,0,3,"Allen, Mr. William Henry",1,2,0,0,373450,1,,0,24,0,1,1,"(7.91, 14.454]","(32.0, 48.0]",0


In [17]:
# Feature selection: remove the listed columns from both frames, plus the two
# interval helper columns that were only ever added to the training frame
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
train_only_helpers = ['CategoricalAge', 'CategoricalFare']
train = train.drop(columns=drop_elements + train_only_helpers)
test = test.drop(columns=drop_elements)

In [18]:
# Persist the processed frames and the test passenger IDs as pickle files
# under ./ckpt. The folder is created first so open() does not fail with
# FileNotFoundError when it does not exist yet.
os.makedirs('./ckpt', exist_ok=True)
for ckpt_path, obj in (
    ('./ckpt/train_proc.pkl', train),
    ('./ckpt/test_proc.pkl', test),
    ('./ckpt/test_passengerid.pkl', PassengerId),
):
    with open(ckpt_path, mode='wb') as f:
        pickle.dump(obj, f)