## Load Liabraries

In [44]:
import numpy as np
import pandas as pd
import os
import re

from sklearn.preprocessing import LabelEncoder
from sklearn import cross_validation
from sklearn.naive_bayes import GaussianNB
from sklearn import svm, grid_search, datasets
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

project_path = "%s" % os.getcwd()
print os.getcwd()

C:\Users\yi.chen\GitHub\training_prework_titanic


## Load Data

In [2]:
train = pd.read_csv("%s/data/train.csv" % project_path, sep=",")
test = pd.read_csv("%s/data/test.csv" % project_path, sep=",")
print "====== train dataset :"
print train.shape
print train.columns.values
print "====== test dataset :"
print test.shape
print test.columns.values

# create Survived column for test dataset
test['Survived'] = 0
data = pd.concat([train, test], ignore_index=True)
data.shape

(891, 12)
['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']
(418, 11)
['PassengerId' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch' 'Ticket' 'Fare'
 'Cabin' 'Embarked']


(1309, 12)

## Feature Engineering

#### Create new feature - Salutation

In [3]:
data['Salutation'] = data.apply(lambda row: row.Name.split(',')[1].split('.')[0].strip(), axis=1)
data.Salutation.value_counts(dropna=False)

# group certain usual titles
data.loc[data.Salutation.isin(['Dona', 'the Countess', 'Jonkheer']), 'Salutation'] = 'Lady'
data.loc[data.Salutation.isin(['Capt', 'Don', 'Major']), 'Salutation'] = 'Sir'

data.Salutation.value_counts(dropna=False)

Mr        757
Miss      260
Mrs       197
Master     61
Dr          8
Rev         8
Sir         5
Col         4
Lady        4
Ms          2
Mlle        2
Mme         1
Name: Salutation, dtype: int64

#### Create new feature - Surname

In [4]:
data['Surname'] = data.apply(lambda row: row.Name.split(',')[0].strip(), axis=1)
counts = data['Surname'].value_counts(dropna=False)

# set surname to a tag 'small' for those whose occurence is less than 3
surname_small_list = counts[counts <= 2].index.tolist()
data.loc[data.Surname.isin(surname_small_list), 'Surname'] = 'Small'
data['Surname'].value_counts(dropna=False)

Small         903
Andersson      11
Sage           11
Goodwin         8
Asplund         8
Davies          7
Skoog           6
Fortune         6
Rice            6
Brown           6
Ford            6
Panula          6
Johnson         6
Carter          6
Smith           6
Palsson         5
Lefebre         5
Ryerson         5
Williams        5
Kelly           5
Thomas          5
Goldsmith       4
Herman          4
Hocking         4
Baclini         4
Olsen           4
Johansson       4
Dean            4
West            4
Hart            4
             ... 
Hays            3
Cor             3
Sandstrom       3
Compton         3
Chapman         3
Keane           3
Peacock         3
Navratil        3
Moran           3
Thayer          3
Newell          3
Collyer         3
Frauenthal      3
Jensen          3
Spedden         3
Widener         3
Oreskovic       3
Karlsson        3
Crosby          3
O'Brien         3
Hickman         3
Moubarek        3
Peter           3
Touma           3
Danbom    

#### Fill missing values in column Age

In [5]:
# get the mean of age
age_mean = data.Age.mean()
# fill missing ages by the mean
data.loc[data.Age.isnull(), 'Age'] = age_mean

#### Create a new feature - AgeRange

In [6]:
def get_age_range(age):
    if age < 20:
        return 'young'
    elif age < 50:
        return 'middle'
    elif age >= 50:
        return 'aged'
    else:
        return age
    
data['AgeRange'] = data.apply(lambda row: get_age_range(row.Age), axis=1)
data.AgeRange.value_counts(dropna=False)

middle    974
young     225
aged      110
Name: AgeRange, dtype: int64

#### Create a new feature - FamilySize

In [7]:
data['FamilySize'] = data['SibSp'] + data['Parch']
data.FamilySize.value_counts(dropna=False)

# for those whose family size is less than 3, set the tag 'Small'
#data.loc[data.FamilySize <=2, 'FamilySize'] = 'Small'
#data.FamilySize.value_counts(dropna=False)

0     790
1     235
2     159
3      43
5      25
4      22
6      16
10     11
7       8
Name: FamilySize, dtype: int64

#### Create a new feature - TicketCategory

In [8]:
def get_ticket_category(ticket):
    if ticket.isdigit(): # contain only digital numbers
        return 'Digital'
    elif ticket.isalpha(): # contain only letters
        return ticket
    else:
        return ticket.split()[0]

data['TicketCategory'] = data.apply(lambda row: get_ticket_category(row.Ticket), axis=1)
#print data.TicketCategory.value_counts(dropna=False)

# replace certain similar letters by a base one
data = data.replace({
    'TicketCategory': {
            'A./5.' : 'A.5.',
            'A.5' : 'A.5.',
            'A/5' : 'A.5.',
            'A/5.' : 'A.5.',
            'A/S' : 'A.5.',
            'A/4' : 'A.4.',
            'A/4.' : 'A.4.',
            'A4.' : 'A.4.',
            'AQ/4': 'A.Q.',
            'AQ/3.': 'A.Q.',
            'C.A./SOTON' : 'C.A.',
            'CA' : 'C.A.',
            'CA.' : 'C.A.',
            'C.A./SOTON' : 'C.A.',
            'S.C./A.4.': 'S.C.',
            'S.C./PARIS': 'S.C.',
            'S.O./P.P.': 'S.O.',
            'S.O.C.': 'S.O.',
            'S.O.P.': 'S.O.',
            'SO/C': 'S.O.',
            'SC': 'S.C.',
            'SC/AH': 'S.C.',
            'SC/AH': 'S.C.',
            'SC/PARIS': 'S.C.',
            'SC/Paris': 'S.C.',
            'SC/A4': 'S.C.',
            'SC/A.3': 'S.C.',
            'SCO/W': 'S.C.',
            'SOTON/O.Q.': 'SOTON/O',
            'SOTON/O2': 'SOTON/O',
            'SOTON/OQ': 'SOTON/O',
            'STON/O2.': 'STON/O',
            'STON/OQ.': 'STON/O',
            'S.W./PP': 'S.W.',
            'SW/PP': 'S.W.',
            'W./C.' : 'W.C.',
            'W/C' : 'W.C.',
            'WE/P' : 'W.E.P.',
    }
})

data.TicketCategory.value_counts(dropna=False)

Digital    957
PC          92
C.A.        69
S.C.        30
A.5.        29
SOTON/O     27
STON/O      22
S.O.        16
W.C.        15
A.4.        10
F.C.C.       9
C            8
PP           4
W.E.P.       4
LINE         4
F.C.         3
P/PP         2
A.Q.         2
S.W.         2
LP           1
A.           1
Fa           1
S.P.         1
Name: TicketCategory, dtype: int64

#### Create a new feature - TicketFirstLetter

In [9]:
data['TicketFirstLetter'] = data.apply(lambda row: row.TicketCategory[0], axis=1)
data.TicketFirstLetter.value_counts(dropna=False)

D    957
S     98
P     98
C     77
A     42
W     19
F     13
L      5
Name: TicketFirstLetter, dtype: int64

#### Create a new feature - TicketNumberLength

In [10]:
def get_ticket_number_length(ticket):
    if ticket.isdigit(): # contain only digital numbers
        return len(ticket)
    elif ticket.isalpha(): # contain only letters
        return 0
    else:
        return len(ticket.split()[1])
    
data['TicketNumberLength'] = data.apply(lambda row: get_ticket_number_length(row.Ticket), axis=1)
data.TicketNumberLength.value_counts(dropna=False)

6    606
5    377
4    245
7     46
2     15
3     12
1      4
0      4
Name: TicketNumberLength, dtype: int64

#### Fill missing values in Fare

In [71]:
data.loc[data.Fare.isnull(), 'Fare'] = np.mean(data['Fare'])

#### Create a new feature - FareLevel

In [72]:
def get_fare_level(fare):
    if fare < 8:
        return '[0-8)'
    elif fare < 16:
        return '[8-16)'
    elif fare < 32:
        return '[16-32)'
    else:
        return '32+'

data['FareLevel'] =  data.apply(lambda row: get_fare_level(row.Fare), axis=1)
data.FareLevel.value_counts(dropna=False)

[0-8)      360
[8-16)     354
32+        312
[16-32)    283
Name: FareLevel, dtype: int64

#### Create a new feature - CabinCategory

In [12]:
def get_letters(s):
    
    try:
        match = re.search('[A-Z]+', s)
        if match:
            return match.group()
        else:
            return s
    except Exception as e:
        return 'N'

data['CabinCategory'] = data.apply(lambda row: get_letters(row.Cabin), axis=1)
data.CabinCategory.value_counts(dropna=False)

N    1014
C      94
B      65
D      46
E      41
A      22
F      21
G       5
T       1
Name: CabinCategory, dtype: int64

#### Fill missing values in column Cabin

In [21]:
data.loc[data.Cabin.isnull(), 'Cabin'] = 'N'

### Transform labels data to numeric data

In [73]:
le = LabelEncoder()
data_transformed = data.copy()

# Cabin
data_transformed['Cabin'] = le.fit_transform(data_transformed['Cabin'])
# Embarked
data_transformed['Embarked'] = le.fit_transform(data_transformed['Embarked'])
# Name
data_transformed['Name'] = le.fit_transform(data_transformed['Name'])
# Sex
data_transformed['Sex'] = le.fit_transform(data_transformed['Sex'])
# Ticket
data_transformed['Ticket'] = le.fit_transform(data_transformed['Ticket'])
# Salutation
data_transformed['Salutation'] = le.fit_transform(data_transformed['Salutation'])
# Surname
data_transformed['Surname'] = le.fit_transform(data_transformed['Surname'])
# AgeRange
data_transformed['AgeRange'] = le.fit_transform(data_transformed['AgeRange'])
# TicketCategory
data_transformed['TicketCategory'] = le.fit_transform(data_transformed['TicketCategory'])
# TicketFirstLetter
data_transformed['TicketFirstLetter'] = le.fit_transform(data_transformed['TicketFirstLetter'])
# FareLevel
data_transformed['FareLevel'] = le.fit_transform(data_transformed['FareLevel'])
# CabinCategory
data_transformed['CabinCategory'] = le.fit_transform(data_transformed['CabinCategory'])

## Split train & test data from train.csv

In [74]:
# get train data
train_data = data_transformed.head(len(train))

# split features and target
features_list = train_data.columns.values.tolist()
features_list.remove('Survived')
features_list.remove('PassengerId')
features = train_data[features_list]
labels = train_data['Survived']

# split train and test data
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.1, random_state=42)

## Try classification models on train data

#### Naive bayes

In [75]:
clf = GaussianNB()
clf = clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

# compute scores
acc = accuracy_score(pred, labels_test)
precision = precision_score(pred, labels_test)
recall = recall_score(pred, labels_test)
print "Scores - Accuracy: %.5f, Precision: %.5f, Recall: %.5f" % (acc, precision, recall)

Scores - Accuracy: 0.81111, Precision: 0.86111, Recall: 0.72093


## Predict on test.csv

In [86]:
# get test data
test_data = data_transformed.tail(len(test))
features_test = test_data[features_list]

# predict the test data
pred = clf.predict(features_test)
passenger_id = np.arange(len(train) + 1, len(data_transformed) + 1).tolist()

# construct result data and write to csv
result = pd.DataFrame({'PassengerId' : passenger_id, 'Survived' : pred})
result.to_csv("%s/data/prediction.csv" % project_path, index=False)