In [51]:
import re

import numpy as np
import pandas as pd
import sklearn.linear_model as lm
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
%matplotlib inline

In [52]:
# Load the training CSV into the `train` DataFrame used by every cell below.
# NOTE(review): this is an absolute, machine-specific path, so the notebook will
# not run on another machine as-is; consider a configurable data directory.
train = pd.read_csv('/Users/yan/Documents/652C-Hu Yifan/final project/Titanic/train.csv')

In [53]:
# Preview the first five rows of the freshly loaded frame.
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Data Cleaning

#### how many missing values

In [55]:
# Per-column count of missing values.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

#### replace the missing values of 'Age' with median

In [62]:
# Keep known ages; substitute the column median wherever Age is missing.
train['Age'] = train['Age'].where(train['Age'].notna(), train['Age'].median())

In [64]:
# Fill missing embarkation ports with 'C'.
# NOTE(review): the reason for choosing 'C' is not shown in this notebook — confirm.
train.loc[train['Embarked'].isna(), 'Embarked'] = 'C'

In [65]:
# Mark passengers without a recorded cabin with the placeholder 'U0'
# (a letter plus the number 0), so later cells that parse a letter and a number
# out of Cabin handle these rows like any other.
train.loc[train['Cabin'].isna(), 'Cabin'] = 'U0'

In [66]:
# Preview the first five rows after filling the missing values.
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,U0,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,U0,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,U0,S


# Feature Engineering

#### create a new feature--Group_num to represent the size of family

In [67]:
# Group head-count: SibSp + Parch, plus one for the passenger themselves.
train['Group_num'] = train['SibSp'].add(train['Parch']).add(1)

#### create a new feature--Group_size to represent the group size

In [68]:
# Bucket Group_num into three size classes:
#   'S' -> Group_num == 1
#   'L' -> Group_num > 4
#   'M' -> everything else (the default)
# DataFrame.set_value was removed in pandas 1.0 (and only ever addressed a single
# cell), so the buckets are written with boolean-mask .loc assignment instead.
train['Group_size'] = 'M'
train.loc[train.Group_num > 4, 'Group_size'] = 'L'
train.loc[train.Group_num == 1, 'Group_size'] = 'S'

#### In the training data, group size M has more survivors than the other two sizes, so we assume that passengers in groups of 2 to 4 people were more likely to survive

In [45]:
# Number of rows with Survived == 0 and Group_size == 'S'.
len(train.query("Survived == 0 and Group_size == 'S'"))

374

In [46]:
# Number of rows with Survived == 1 and Group_size == 'S'.
len(train.query("Survived == 1 and Group_size == 'S'"))

163

In [47]:
# Number of rows with Survived == 0 and Group_size == 'M'.
len(train.query("Survived == 0 and Group_size == 'M'"))

123

In [48]:
# Number of rows with Survived == 1 and Group_size == 'M'.
len(train.query("Survived == 1 and Group_size == 'M'"))

169

In [49]:
# Number of rows with Survived == 0 and Group_size == 'L'.
len(train.query("Survived == 0 and Group_size == 'L'"))

52

In [50]:
# Number of rows with Survived == 1 and Group_size == 'L'.
len(train.query("Survived == 1 and Group_size == 'L'"))

10

#### encoding feature 'Sex'

In [69]:
# Encode Sex as an integer flag: 'female' -> 0, any other value -> 1.
# Running this cell a second time would turn every row into 1, because the
# column no longer contains the string 'female'.
train['Sex'] = (train['Sex'] != 'female').astype(int)

#### Create a feature--Title

In [71]:
# 'Names': number of space-separated words in the passenger's full name
# (a word count, not a character length).
train['Names'] = train.Name.str.split(' ').str.len()

# 'Title': the text between the first ", " and the following "." in Name,
# e.g. "Braund, Mr. Owen Harris" -> "Mr". A name that does not match the pattern
# yields NaN instead of raising IndexError.
# Variant spellings and rare titles are folded into a few canonical ones;
# titles not listed here (e.g. "Mr", "Master") are left unchanged.
# Columns are assigned directly because DataFrame.set_value was removed in
# pandas 1.0.
title = train.Name.str.extract(r', (.*?)\.', expand=False)
title = title.replace({
    'Mme': 'Mrs',
    'Ms': 'Miss', 'Mlle': 'Miss',
    'Don': 'Sir', 'Jonkheer': 'Sir',
    'Dona': 'Lady', 'the Countess': 'Lady',
    'Capt': 'Officer', 'Col': 'Officer', 'Major': 'Officer',
    'Dr': 'Officer', 'Rev': 'Officer',
})
train['Title'] = title
del title

#### Create a feature--Deck to represent the socioeconomic status.

In [72]:
# 'Deck': integer code for the first run of letters in the Cabin string
# (e.g. "C85" -> "C"). pd.factorize numbers the distinct letters in order of
# first appearance; a cabin that is missing or has no letters gets code -1.
# The letters are extracted over the whole column (no row filtering) so the
# result always lines up with train's index, and the column is assigned
# directly because DataFrame.set_value was removed in pandas 1.0.
deck = train.Cabin.str.extract(r'([a-zA-Z]+)', expand=False)
train['Deck'] = pd.factorize(deck)[0]
del deck

#### Create a feature--Room to represent the geo location

In [73]:
# Matches the first run of decimal digits in a cabin string.
checker = re.compile("([0-9]+)")
def roomNum(x):
    """Return the first number found in cabin string `x`, plus one.

    Strings that contain no digits return 1, the same value produced by a
    cabin whose number is 0 (such as the 'U0' placeholder).
    """
    found = checker.search(x)
    return int(found.group()) + 1 if found else 1
# 'Room': each passenger's room number (as computed by roomNum) divided by the
# sum of all room numbers, so the column sums to 1.
# The column is written once, by direct assignment; DataFrame.set_value was
# removed in pandas 1.0. `rooms` keeps the un-normalised numbers.
rooms = train.Cabin.map(roomNum)
del checker, roomNum  # helpers are only needed for the map above
train['Room'] = rooms / rooms.sum()

In [75]:
# Preview the first five rows with the engineered feature columns added.
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Group_num,Group_size,Names,Title,Deck,Room
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,U0,S,2,M,4,Mr,0,9.1e-05
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C,2,M,7,Mrs,1,0.007826
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,U0,S,1,S,3,Miss,0,9.1e-05
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S,2,M,7,Mrs,1,0.011284
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,U0,S,1,S,4,Mr,0,9.1e-05
