# Feature Engineering

We attempt to create some additional features from the Name and Ticket columns of the data.

We also do one final cleaning pass, dropping the columns from which we could not extract any useful information.

# Load Data and Import libs

In [1]:
import os

# The project root is taken to be the parent of the current working directory.
project_root = os.path.join(os.getcwd(), os.pardir)
# Input CSVs are read from <project_root>/data/clean.
clean_data_dir = os.path.join(project_root, 'data', 'clean')

In [2]:
import pandas as pd

In [3]:
# Read the training split from the clean-data folder.
train_csv_path = os.path.join(clean_data_dir, 'train.csv')
train_df = pd.read_csv(train_csv_path)

In [4]:
# Read the test split from the clean-data folder.
test_csv_path = os.path.join(clean_data_dir, 'test.csv')
test_df = pd.read_csv(test_csv_path)

In [5]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,male,Q,S,2,3
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,1,0,1,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,0,0,0,0,0
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,0,0,1,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,0,0,1,0,0
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,1,0,1,0,1


In [6]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,male,Q,S,2,3
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,29.258328,0.523008,0.381594,32.204208,0.647587,0.08642,0.722783,0.20651,0.551066
std,257.353842,0.486592,13.546307,1.102743,0.806057,49.693429,0.47799,0.281141,0.447876,0.405028,0.497665
min,1.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,223.5,0.0,21.0,0.0,0.0,7.9104,0.0,0.0,0.0,0.0,0.0
50%,446.0,0.0,27.0,0.0,0.0,14.4542,1.0,0.0,1.0,0.0,1.0
75%,668.5,1.0,36.0,1.0,0.0,31.0,1.0,0.0,1.0,0.0,1.0
max,891.0,1.0,80.0,8.0,6.0,512.3292,1.0,1.0,1.0,1.0,1.0


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Name           891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
male           891 non-null int64
Q              891 non-null int64
S              891 non-null int64
2              891 non-null int64
3              891 non-null int64
dtypes: float64(2), int64(9), object(2)
memory usage: 90.6+ KB


# Feature Engineer Name

Extract the courtesy title from the name, e.g. Mr, Ms, etc.

In [8]:
train_df['Name'].head(5)

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object

In [9]:
def courtesy_title(name):
    """Return the courtesy title embedded in a passenger name.

    The text after the last comma is taken, cut at its first period and
    stripped of surrounding whitespace, so 'Braund, Mr. Owen Harris'
    yields 'Mr'. A name without a comma is treated as a whole.
    """
    after_comma = name.rsplit(',', 1)[-1]
    title, _, _ = after_comma.partition('.')
    return title.strip()

In [10]:
# Derive the title for every training passenger, element by element.
train_df['Courtesy Title'] = train_df['Name'].map(courtesy_title)

In [11]:
train_df['Courtesy Title'].value_counts()

Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Col               2
Major             2
the Countess      1
Sir               1
Don               1
Capt              1
Jonkheer          1
Mme               1
Lady              1
Ms                1
Name: Courtesy Title, dtype: int64

In [12]:
# Derive the title for every test passenger, element by element.
test_df['Courtesy Title'] = test_df['Name'].map(courtesy_title)

In [13]:
test_df['Courtesy Title'].value_counts()

Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Dr          1
Ms          1
Dona        1
Name: Courtesy Title, dtype: int64

# Group Titles

Titles that occur more than 5 times in the training set are kept as is.

Titles that occur 5 times or fewer are grouped into 'Other'.

In [14]:
def group_courtesy_title(title):
    """Collapse courtesy titles outside the keep-list into 'Other'.

    Titles found in the keep-list are returned unchanged; any other
    value is replaced by the string 'Other'.
    """
    keep_titles = ('Mr', 'Miss', 'Mrs', 'Master', 'Dr', 'Rev')
    return title if title in keep_titles else 'Other'

In [15]:
# Replace every title outside the keep-list with 'Other' in the training set.
train_df['Courtesy Title'] = train_df['Courtesy Title'].map(group_courtesy_title)

In [16]:
train_df['Courtesy Title'].value_counts()

Mr        517
Miss      182
Mrs       125
Master     40
Other      14
Dr          7
Rev         6
Name: Courtesy Title, dtype: int64

In [17]:
# Replace every title outside the keep-list with 'Other' in the test set.
test_df['Courtesy Title'] = test_df['Courtesy Title'].map(group_courtesy_title)

In [18]:
test_df['Courtesy Title'].value_counts()

Mr        240
Miss       78
Mrs        72
Master     21
Other       4
Rev         2
Dr          1
Name: Courtesy Title, dtype: int64

# No other useful things to engineer

Drop the PassengerId, Name and Ticket columns.

We also create one indicator (dummy) column per courtesy title category, leaving out the first category ('Dr') as the baseline.

In [19]:
# Columns removed from both splits before encoding.
drop_cols = ['Name', 'Ticket', 'PassengerId']

# Same in-place drop applied to each frame in turn.
for frame in (train_df, test_df):
    frame.drop(drop_cols, axis=1, inplace=True)

In [20]:
# One-hot encode the courtesy title against a fixed column list instead of
# relying on drop_first=True. With drop_first the resulting columns (and the
# dropped baseline) depend on which titles happen to occur in this frame, so
# train and test could silently end up with different feature columns.
# 'Dr' is deliberately absent from the list: it is the baseline category.
title_columns = ['Master', 'Miss', 'Mr', 'Mrs', 'Other', 'Rev']
ct = (
    pd.get_dummies(train_df['Courtesy Title'])
    # Drop the baseline, and add an all-zero column for any absent title.
    .reindex(columns=title_columns, fill_value=0)
    # Newer pandas returns bool dummies; keep 0/1 integers in the saved CSV.
    .astype(int)
)
train_df = pd.concat([train_df, ct], axis=1)

In [21]:
train_df.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,male,Q,S,2,3,Courtesy Title,Master,Miss,Mr,Mrs,Other,Rev
0,0,22.0,1,0,7.25,1,0,1,0,1,Mr,0,0,1,0,0,0
1,1,38.0,1,0,71.2833,0,0,0,0,0,Mrs,0,0,0,1,0,0
2,1,26.0,0,0,7.925,0,0,1,0,1,Miss,0,1,0,0,0,0
3,1,35.0,1,0,53.1,0,0,1,0,0,Mrs,0,0,0,1,0,0
4,0,35.0,0,0,8.05,1,0,1,0,1,Mr,0,0,1,0,0,0


In [22]:
# One-hot encode the courtesy title against a fixed column list instead of
# relying on drop_first=True. With drop_first the resulting columns (and the
# dropped baseline) depend on which titles happen to occur in this frame, so
# train and test could silently end up with different feature columns.
# 'Dr' is deliberately absent from the list: it is the baseline category.
title_columns = ['Master', 'Miss', 'Mr', 'Mrs', 'Other', 'Rev']
ct = (
    pd.get_dummies(test_df['Courtesy Title'])
    # Drop the baseline, and add an all-zero column for any absent title.
    .reindex(columns=title_columns, fill_value=0)
    # Newer pandas returns bool dummies; keep 0/1 integers in the saved CSV.
    .astype(int)
)
test_df = pd.concat([test_df, ct], axis=1)

In [23]:
test_df.head()

Unnamed: 0,Age,SibSp,Parch,Fare,male,Q,S,2,3,Courtesy Title,Master,Miss,Mr,Mrs,Other,Rev
0,34.5,0,0,7.8292,1,1,0,0,1,Mr,0,0,1,0,0,0
1,47.0,1,0,7.0,0,0,1,0,1,Mrs,0,0,0,1,0,0
2,62.0,0,0,9.6875,1,1,0,1,0,Mr,0,0,1,0,0,0
3,27.0,0,0,8.6625,1,0,1,0,1,Mr,0,0,1,0,0,0
4,22.0,1,1,12.2875,0,0,1,0,1,Mrs,0,0,0,1,0,0


In [24]:
# The text column has been replaced by its indicator columns; remove it
# in place from both splits.
for frame in (train_df, test_df):
    frame.drop(['Courtesy Title'], axis=1, inplace=True)

# Save Data

In [25]:
processed_data_dir = os.path.join(project_root, 'data', 'processed')

# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs(processed_data_dir, exist_ok=True)

# index=False keeps the row index out of the written files.
train_df.to_csv(os.path.join(processed_data_dir, 'train.csv'), index=False)
test_df.to_csv(os.path.join(processed_data_dir, 'test.csv'), index=False)