In [59]:
import pandas as pd
import numpy as np
import random as rnd

In [60]:
# Read the training and test splits from the working directory.
train_data, test_data = [pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')]
# Keep both frames in one list so later cells can apply the same transform to each.
complete_data = [train_data, test_data]
# Preview the first rows of the training split (swap in test_data to preview the test split).
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [61]:
# Print per-column dtypes and non-null counts to spot columns with missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [62]:
# Drop 'Cabin', 'Ticket' and 'PassengerId' from both splits.
columns_to_drop = ['Cabin', 'Ticket', 'PassengerId']
train_data, test_data = [
    frame.drop(columns_to_drop, axis=1) for frame in (train_data, test_data)
]
# drop() returned new frames, so rebuild the shared list to point at them.
complete_data = [train_data, test_data]

In [63]:
# Create a numeric 'Title' feature from the honorific embedded in each name.
# Uncommon titles are pooled into 'Rare' and spelling variants are folded into
# 'Miss' / 'Mrs' before the final label -> code mapping.
# Note: any title that is not a key of map_titles becomes NaN after .map().
map_titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for dataset in complete_data:
    # Raw string: `\.` must reach the regex engine as an escaped dot instead of
    # being parsed as an (invalid) Python string escape.
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace(['Mlle', 'Ms'], 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    dataset['Title'] = dataset['Title'].map(map_titles)
# 'Name' cannot be dropped with `dataset = dataset.drop(...)` inside the loop:
# that only rebinds the loop variable to a new frame and leaves train_data /
# test_data untouched. Drop it on the named frames and rebuild the list instead.
train_data = train_data.drop(['Name'], axis=1)
test_data = test_data.drop(['Name'], axis=1)
complete_data = [train_data, test_data]
print(train_data.shape)
print(test_data.shape)

(891, 9)
(418, 8)


In [64]:
# 'Age' attribute: replace missing ages with the mean age of the same split.
for frame in complete_data:
    mean_age = frame['Age'].mean()
    frame['Age'] = frame['Age'].fillna(mean_age)
# Bucket the integer-truncated training ages into 5 equal-width bands and
# display the mean of 'Survived' for each band.
whole_years = train_data['Age'].astype(int)
train_data['AgeBand'] = pd.cut(whole_years, 5)
age_band_view = train_data[['AgeBand', 'Survived']]
age_band_view.groupby('AgeBand', as_index=False).mean()

Unnamed: 0,AgeBand,Survived
0,"(-0.08, 16]",0.55
1,"(16, 32]",0.344762
2,"(32, 48]",0.403226
3,"(48, 64]",0.434783
4,"(64, 80]",0.090909


In [65]:
# Convert 'Age' into an ordinal code 0-4.
# Bands (closed on the right): <=16 -> 0, (16, 32] -> 1, (32, 48] -> 2,
# (48, 64] -> 3, >64 -> 4.
# A single pd.cut bins every value from the original ages in one pass; the
# previous chain of in-place overwrites was only correct because the codes
# 0-3 all happen to be <= the first edge.
# NOTE(review): the AgeBand summary earlier binned integer-truncated ages,
# while this encoding bins the raw float ages, so fractional ages just above
# an edge (e.g. 32.5) fall in different bands — confirm which is intended.
# This cell is not idempotent: re-running it would re-bin the already encoded
# codes (and the 'AgeBand' drop below would raise, since the column is gone).
age_edges = [float('-inf'), 16, 32, 48, 64, float('inf')]
complete_dataset = [train_data, test_data]
for ds in complete_dataset:
    # labels=False returns the integer index of the bin each age falls into.
    ds['Age'] = pd.cut(ds['Age'], bins=age_edges, labels=False).astype(int)
# 'AgeBand' was only needed for the survival-by-band summary.
train_data = train_data.drop('AgeBand', axis=1)
# drop() returned a new frame; refresh the shared lists so they are not stale.
complete_dataset = complete_data = [train_data, test_data]

In [66]:
# Show the first five training rows after the 'Age' encoding.
train_data.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,male,1,1,0,7.25,S,1
1,1,1,female,2,1,0,71.2833,C,3
2,1,3,female,1,0,0,7.925,S,2
3,1,1,female,2,1,0,53.1,S,3
4,0,3,male,2,0,0,8.05,S,1


In [67]:
# Turn 'SibSp' and 'Parch' into 0/1 flags (meaning 'hasSibSp' and 'hasParch').
def mapping(x):
    """Return 0 when x equals 0, otherwise 1."""
    if x == 0:
        return 0
    return 1


complete_data = [train_data, test_data]
for frame in complete_data:
    for column in ('SibSp', 'Parch'):
        frame[column] = frame[column].map(mapping)

In [68]:
# 'Embarked' attribute: fill missing values in both splits with the most
# frequent value of the training split, then display the mean of 'Survived'
# for each embarkation code.
mode = train_data['Embarked'].mode().iloc[0]
complete_data = [train_data, test_data]
for frame in complete_data:
    filled = frame['Embarked'].fillna(mode)
    frame['Embarked'] = filled
embarked_view = train_data[['Embarked', 'Survived']]
embarked_view.groupby('Embarked', as_index=False).mean()

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.339009


In [69]:
# Encode 'Embarked' as an integer code: C -> 0, Q -> 1, S -> 2.
mapping = {'C': 0, 'Q': 1, 'S': 2}
for frame in complete_data:
    encoded = frame['Embarked'].map(mapping)
    frame['Embarked'] = encoded.astype(int)
# Preview the encoded column of the last frame in complete_data.
complete_data[-1]['Embarked'].head()

0    1
1    2
2    1
3    2
4    2
Name: Embarked, dtype: int64

In [70]:
# Optional check: uncomment to inspect the remaining null counts in each split.
# train_data.info()
# test_data.info()

In [71]:
# 'Fare' attribute: fill missing fares with the median fare of the same split.
for ds in complete_data:
    ds['Fare'] = ds['Fare'].fillna(ds['Fare'].median())
# Split the training fares into four quantile bands and display the mean of
# 'Survived' for each band. (A per-fare groupby that was computed here and
# immediately discarded has been removed; only the last expression of a cell
# is displayed.)
train_data['FareBand'] = pd.qcut(train_data['Fare'], 4)
train_data[['FareBand', 'Survived']].groupby('FareBand', as_index=False).mean()

Unnamed: 0,FareBand,Survived
0,"[0, 7.91]",0.197309
1,"(7.91, 14.454]",0.303571
2,"(14.454, 31]",0.454955
3,"(31, 512.329]",0.581081


In [72]:
# Convert 'Fare' into an ordinal code 0-3 using the band edges shown in the
# FareBand summary: <=7.91 -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, >31 -> 3.
# A single pd.cut bins every value from the original fares in one pass; the
# previous chain of in-place overwrites was only correct because the codes
# 0-2 all happen to be <= the first edge.
# This cell is not idempotent: re-running it would re-bin the encoded codes.
fare_edges = [float('-inf'), 7.91, 14.454, 31, float('inf')]
for dataset in complete_data:
    # labels=False returns the integer index of the bin each fare falls into.
    dataset['Fare'] = pd.cut(dataset['Fare'], bins=fare_edges, labels=False).astype(int)
# 'FareBand' was only needed for the survival-by-band summary.
train_data = train_data.drop(['FareBand'], axis=1)
# drop() returned a new frame; refresh the shared list so it is not stale.
complete_data = [train_data, test_data]

In [73]:
# Show the last five training rows after the 'Fare' encoding.
train_data.tail(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
886,0,2,male,1,0,0,1,2,5
887,1,1,female,1,0,0,2,2,2
888,0,3,female,1,1,1,2,2,2
889,1,1,male,1,0,0,2,0,1
890,0,3,male,1,0,0,0,1,1


In [74]:
# Gender: encode 'Sex' as an integer, male -> 0 and female -> 1.
sex_codes = (('male', 0), ('female', 1))
complete_data = [train_data, test_data]
for frame in complete_data:
    for label, code in sex_codes:
        is_label = frame['Sex'] == label
        frame.loc[is_label, 'Sex'] = code
    # The column still has object dtype after the replacements; make it integer.
    frame['Sex'] = frame['Sex'].astype(int)

In [75]:
# Show the first five rows of the fully encoded training frame.
train_data.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,1,1,0,0,2,1
1,1,1,1,2,1,0,3,0,3
2,1,3,1,1,0,0,1,2,2
3,1,1,1,2,1,0,3,2,3
4,0,3,0,2,0,0,1,2,1


In [78]:
# List the columns of the processed test frame.
test_data.columns

Index([u'Pclass', u'Sex', u'Age', u'SibSp', u'Parch', u'Fare', u'Embarked',
       u'Title'],
      dtype='object')

In [80]:
# Persist the processed training frame; the row index is written as the first (unnamed) column.
train_data.to_csv('train_processed.csv', index=True)

In [81]:
# Persist the processed test frame; the row index is written as the first (unnamed) column.
test_data.to_csv('test_processed.csv', index=True)