In [1]:
# Load in the Libraries
import pickle
import pandas as pd
import numpy as np
import re
import os
import warnings
warnings.filterwarnings('ignore')

## Feature Exploration, Engineering and Cleaning


In [2]:
# Relative path prefix; it is joined with 'data/train.csv' and
# 'data/test.csv' when the CSV files are loaded.
root = '../..'

In [3]:
# Read the train and test CSV files located under <root>/data
train_path = os.path.join(root, 'data/train.csv')
test_path = os.path.join(root, 'data/test.csv')
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Keep the test-set passenger IDs in their own variable; the
# 'PassengerId' column itself is dropped from both frames later on
PassengerId = test['PassengerId']

# Preview the first three training rows
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


### Feature Engineering

In [4]:
# Both frames in one list so the following cells can apply the same
# transformation to each of them in a loop.
full_data = [train, test]

In [5]:
# Custom feature: Name_length holds the number of characters in each
# passenger's Name, added to both the train and the test frame
for frame in (train, test):
    frame['Name_length'] = frame['Name'].apply(len)

In [6]:
# Has_Cabin tells whether a passenger had a cabin on the Titanic:
# 0 when the Cabin entry is a float (presumably the NaN pandas uses for
# a missing value -- confirm), 1 for anything else
for frame in (train, test):
    frame['Has_Cabin'] = frame['Cabin'].apply(
        lambda cabin: int(type(cabin) != float))

In [7]:
# Feature engineering steps taken from Sina
# FamilySize is the sum of SibSp and Parch, plus 1
for frame in full_data:
    relatives = frame['SibSp'] + frame['Parch']
    frame['FamilySize'] = relatives + 1

In [8]:
# IsAlone is 1 where FamilySize equals 1 and 0 everywhere else
for frame in full_data:
    frame['IsAlone'] = (frame['FamilySize'] == 1).astype('int64')

In [9]:
# Replace every missing Embarked entry with 'S'
for frame in full_data:
    embarked_missing = frame['Embarked'].isnull()
    frame.loc[embarked_missing, 'Embarked'] = 'S'

In [10]:
# Fill missing Fare values in both frames with the median Fare of the
# training set, then create CategoricalFare by cutting the training
# fares into 4 quantile bins
train_fare_median = train['Fare'].median()
for frame in full_data:
    frame['Fare'] = frame['Fare'].fillna(train_fare_median)
train['CategoricalFare'] = pd.qcut(train['Fare'], q=4)

In [11]:
# Impute missing ages, then create a new feature CategoricalAge.
# Every missing Age in a frame is replaced by a random integer drawn
# between (mean - std) and (mean + std) of that frame's known ages.
# NOTE(review): no random seed is set in the visible cells, so the imputed
# ages (and the arrays printed below) differ from run to run -- consider
# seeding NumPy before this cell if reproducibility matters.
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    print(age_avg)
    age_std = dataset['Age'].std()
    print(age_std)
    age_is_missing = dataset['Age'].isnull()
    age_null_count = age_is_missing.sum()
    print(age_null_count)
    age_null_random_list = np.random.randint(age_avg - age_std,
                                             age_avg + age_std,
                                             size=age_null_count)
    print(age_null_random_list)
    # Write through a single .loc call. The chained form
    # dataset['Age'][mask] = ... assigns into an intermediate Series, which
    # raises SettingWithCopyWarning and does not update the frame at all
    # when pandas Copy-on-Write is active.
    dataset.loc[age_is_missing, 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)
# Split the (now complete) training ages into 5 equal-width bins
train['CategoricalAge'] = pd.cut(train['Age'], 5)

29.69911764705882
14.526497332334042
177
[35 27 27 36 36 35 42 23 43 23 21 30 20 27 40 22 28 28 41 43 34 29 28 32
 15 40 39 17 21 31 25 16 29 29 39 32 22 15 39 31 29 19 21 41 25 37 23 41
 21 22 26 20 19 15 15 16 21 16 26 28 25 37 20 38 42 19 36 37 21 28 15 16
 22 22 36 39 17 32 33 21 38 24 38 27 22 20 25 36 22 25 23 18 33 37 33 43
 29 18 25 20 25 16 29 23 21 43 25 25 34 37 38 30 19 38 16 19 21 39 24 33
 39 34 28 22 18 17 16 37 24 30 23 35 28 35 21 17 31 29 25 25 22 16 20 21
 33 18 24 38 42 28 22 19 15 37 21 19 25 41 40 35 21 20 40 34 41 18 27 21
 25 26 30 29 22 24 20 39 31]
30.272590361445783
14.181209235624422
86
[33 26 40 32 41 19 36 17 32 27 20 33 40 16 20 29 16 39 31 16 39 35 28 22
 37 25 38 30 20 20 38 28 26 19 29 29 42 37 41 33 25 36 41 34 24 24 41 25
 36 42 26 30 40 26 25 29 38 24 39 42 33 41 30 28 19 22 17 18 36 41 40 17
 21 32 35 38 29 25 20 41 31 41 26 25 23 35]


In [12]:
# Define function to extract titles from passenger names
def get_title(name):
    """Return the title embedded in a passenger name.

    The title is the first run of ASCII letters that is preceded by a
    space and immediately followed by a period, e.g. "Mr" in
    "Braund, Mr. Owen Harris".

    Parameters
    ----------
    name : str
        Passenger name to search.

    Returns
    -------
    str
        The matched title without the trailing period, or "" when the
        name contains no such pattern.
    """
    # Raw string so that `\.` reaches the regex engine as an escaped dot
    # instead of being an invalid string escape sequence.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

In [13]:
# Add a Title column to each frame by running get_title over every Name
for frame in full_data:
    passenger_names = frame['Name']
    frame['Title'] = passenger_names.apply(get_title)

In [14]:
# Collapse all non-common titles into the single group "Rare" and fold
# the variants Mlle/Ms into Miss and Mme into Mrs
rare_titles = [
    'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
    'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = {rare: 'Rare' for rare in rare_titles}
title_aliases.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

for frame in full_data:
    frame['Title'] = frame['Title'].replace(title_aliases)

In [15]:
# Convert the remaining features to integer codes
sex_codes = {'female': 0, 'male': 1}
# NOTE(review): code 3 is unused and titles missing from this mapping end
# up as 0, the same code as 'Mr' -- confirm this is intended.
title_codes = {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 4, 'Rare': 5}
port_codes = {'S': 0, 'C': 1, 'Q': 2}

for frame in full_data:
    # Sex
    frame['Sex'] = frame['Sex'].map(sex_codes).astype(int)

    # Title: anything not present in title_codes is filled with 0
    frame['Title'] = frame['Title'].map(title_codes).fillna(0)

    # Embarked
    frame['Embarked'] = frame['Embarked'].map(port_codes).astype(int)

    # Fare: every band mask is derived from a snapshot of the original
    # fares, then the band codes are written back one band at a time
    raw_fare = frame['Fare'].copy()
    fare_bands = [
        (raw_fare <= 7.91, 0),
        ((raw_fare > 7.91) & (raw_fare <= 14.454), 1),
        ((raw_fare > 14.454) & (raw_fare <= 31), 2),
        (raw_fare > 31, 3),
    ]
    for in_band, code in fare_bands:
        frame.loc[in_band, 'Fare'] = code
    frame['Fare'] = frame['Fare'].astype(int)

    # Age: same approach, five bands with upper edges 16 / 32 / 48 / 64
    raw_age = frame['Age'].copy()
    age_bands = [
        (raw_age <= 16, 0),
        ((raw_age > 16) & (raw_age <= 32), 1),
        ((raw_age > 32) & (raw_age <= 48), 2),
        ((raw_age > 48) & (raw_age <= 64), 3),
        (raw_age > 64, 4),
    ]
    for in_band, code in age_bands:
        frame.loc[in_band, 'Age'] = code

In [16]:
# Feature selection: drop the listed columns from both frames; the two
# Categorical* helper columns exist only on the training frame
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
train_only_columns = ['CategoricalAge', 'CategoricalFare']
train = train.drop(columns=drop_elements + train_only_columns)
test = test.drop(columns=drop_elements)

In [17]:
# Persist the processed frames and the test passenger IDs as pickles.
# Create the output directory first: open(..., mode='wb') does not create
# missing parent directories and would raise FileNotFoundError.
os.makedirs('./ckpt', exist_ok=True)
checkpoints = [
    ('./ckpt/train_proc.pkl', train),
    ('./ckpt/test_proc.pkl', test),
    ('./ckpt/test_passengerid.pkl', PassengerId),
]
for ckpt_path, obj in checkpoints:
    with open(ckpt_path, mode='wb') as f:
        pickle.dump(obj, f)