## Titanic - Machine Learning from Disaster

### model selection & test

In [65]:
import pandas as pd 
# Load the Titanic training and test CSV files (paths are relative to the
# notebook's working directory).
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [66]:
# Preview the first 10 rows of the training frame.
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [67]:
# Column dtypes and non-null counts; shows which columns contain missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


## Feature engineering
### Name
- 성별, 사회적 지위를 알 수 있는 최소한의 word로 mapping
- Title column 생성 및 분류 후 Name column 삭제

In [68]:
# Keep both frames in one list so every transform below is applied to the
# train and the test dataset alike.
train_test_data = [train, test]

# The title is the run of letters right before the first '.' in Name
# (e.g. "Braund, Mr. Owen Harris" -> "Mr"). The pattern is a raw string so that
# '\.' reaches the regex engine as an escaped dot instead of being an invalid
# Python string escape.
for dataset in train_test_data:
    dataset['Title'] = dataset['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Ordinal codes: Mr=0, Miss=1, Mrs=2, every other title=3.
title_mapping = {'Mr' : 0, 'Miss':1, "Mrs":2,
                    'Master':3, "Dr":3, "Rev":3, 'Col':3, 'Major':3, 'Mlle':3, 'Countess':3, 'Ms':3, 'Lady':3, 
                    'Jonkheer':3, 'Don':3, 'Dona':3, 'Mme':3, 'Capt':3, 'Sir':3}
other_title_code = 3
for dataset in train_test_data:
    # A title missing from the mapping (or a name with no title at all) maps to
    # NaN; fold those into the "other" bucket so no NaN reaches the model.
    dataset['Title'] = (
        dataset['Title'].map(title_mapping).fillna(other_title_code).astype('int64')
    )

# Name is no longer needed once Title is extracted. The drop is done in place on
# purpose: train_test_data holds references to these exact frame objects, and
# rebinding train/test to new frames here would make later loops over
# train_test_data modify stale copies.
for dataset in train_test_data:
    dataset.drop('Name', axis=1, inplace=True)

### Sex
- 중요 feature
- mapping

In [69]:
# Encode Sex numerically: male -> 0, female -> 1.
sex_mapping = dict(male=0, female=1)
for dataset in train_test_data:
    encoded_sex = dataset['Sex'].map(sex_mapping)
    dataset['Sex'] = encoded_sex

### Age
- null 값 존재
- title 의 중간값으로 fillna
- 구간별로 0 - 4 값 부여

In [70]:
# Fill missing ages with the median age of the passenger's title group
# (Mr, Miss, Mrs, Others). The medians are computed from the training set and
# looked up through each row's own Title, so test rows are filled by title
# rather than by whichever training row happens to share their index label.
title_median_age = train.groupby("Title")["Age"].median()
for dataset in train_test_data:
    # Plain assignment instead of fillna(inplace=True) on a column selection,
    # which is chained assignment and is not guaranteed to update the frame.
    dataset["Age"] = dataset["Age"].fillna(dataset["Title"].map(title_median_age))

# Bin age into ordinal codes 0-4 with right-closed intervals:
#   <=16 -> 0, (16, 26] -> 1, (26, 36] -> 2, (36, 62] -> 3, >62 -> 4
age_bin_edges = [float("-inf"), 16, 26, 36, 62, float("inf")]
for dataset in train_test_data:
    # labels=False yields the integer bin index; cast to float so the column
    # keeps the float dtype it had before binning.
    dataset["Age"] = pd.cut(dataset["Age"], bins=age_bin_edges, labels=False).astype(float)
    

### Embarked
- null 존재
- count 비율 고려 S로 fillna
- mapping

In [71]:
# Missing ports are filled with 'S', then the port letters are encoded as
# S -> 0, C -> 1, Q -> 2.
embarked_mapping = dict(S=0, C=1, Q=2)
for dataset in train_test_data:
    port = dataset['Embarked'].fillna('S')
    dataset['Embarked'] = port.map(embarked_mapping)

### Fare
- null 존재
- Pclass의 Fare의 중간값으로 fillna
- 구간별로 0 - 3 값 부여

In [72]:
# Bin edges for the ordinal fare code (right-closed intervals):
#   <=17 -> 0, (17, 30] -> 1, (30, 100] -> 2, >100 -> 3
fare_bin_edges = [float("-inf"), 17, 30, 100, float("inf")]

for dataset in train_test_data:
    # Fill a missing fare with the median fare of the same Pclass, computed
    # within the frame being filled (train from train, test from test).
    # Plain assignment instead of fillna(inplace=True) on a column selection,
    # which is chained assignment and is not guaranteed to update the frame.
    pclass_median_fare = dataset.groupby("Pclass")["Fare"].transform("median")
    dataset["Fare"] = dataset["Fare"].fillna(pclass_median_fare)

    # labels=False yields the integer bin index; cast to float so the column
    # keeps its float dtype.
    dataset["Fare"] = pd.cut(dataset["Fare"], bins=fare_bin_edges, labels=False).astype(float)

### Cabin
- null 존재
- 숫자 제외 문자열 첫문자로 분류 후 mapping
- Pclass별 Cabin의 중간값으로 fillna

In [73]:
# Numeric code per cabin letter (first character of the Cabin string), spaced 0.4 apart.
cabin_mapping = {"A":0, "B":0.4, "C":0.8, "D":1.2, "E":1.6, "F":2.0, "G":2.4, "T":2.8}

for dataset in train_test_data:
    # Keep only the leading letter of Cabin and encode it; missing cabins stay NaN.
    dataset['Cabin'] = dataset['Cabin'].str[:1].map(cabin_mapping)

    # Fill a missing Cabin code with the median Cabin code of the same Pclass,
    # computed within the frame being filled. Plain assignment instead of
    # fillna(inplace=True) on a column selection, which is chained assignment
    # and is not guaranteed to update the frame.
    pclass_median_cabin = dataset.groupby("Pclass")["Cabin"].transform("median")
    dataset['Cabin'] = dataset['Cabin'].fillna(pclass_median_cabin)


### FamilySize
- FamilySize = SibSp + Parch + 1(본인)
- 구간별로 0 - 4 값 mapping 

In [74]:
# Family size = siblings/spouses + parents/children + the passenger,
# then rescaled through a lookup table (1 -> 0 up to 11 -> 4).
family_mapping = {1:0, 2:0.4, 3:0.8, 4:1.2, 5:1.6, 6:2.0, 7:2.4, 8:2.8, 9:3.2, 10:3.6, 11:4}
for dataset in train_test_data:
    head_count = dataset["SibSp"] + dataset["Parch"] + 1
    dataset['FamilySize'] = head_count.map(family_mapping)


### PassengerId & Ticket
- 유의미한 내용 추출하기 어려워 삭제
- 그외 불필요 컬럼 삭제

In [75]:
# Columns removed from both frames; PassengerId is additionally removed from
# train only (test keeps it).
# Note: drop() without inplace returns new frames, so from here on `train` and
# `test` are different objects from the ones stored in train_test_data.
features_drop = ['Ticket', 'SibSp', 'Parch']
test = test.drop(features_drop, axis=1)
train = train.drop(features_drop + ['PassengerId'], axis=1)

### Survived
- 결과값으로 target에 분리 저장

In [76]:
# Separate the label column from the feature matrix.
target = train['Survived']
train_data = train.drop(['Survived'], axis=1)

# Evaluated but not displayed: only a cell's last expression is rendered.
train_data.shape, target.shape
train_data.head(n=10)

Unnamed: 0,Pclass,Sex,Age,Fare,Cabin,Embarked,Title,FamilySize
0,3,0,1.0,0.0,2.0,0,0,0.4
1,1,1,3.0,2.0,0.8,1,2,0.4
2,3,1,1.0,0.0,2.0,0,1,0.0
3,1,1,2.0,2.0,0.8,0,2,0.4
4,3,0,2.0,0.0,2.0,0,0,0.0
5,3,0,2.0,0.0,2.0,2,0,0.0
6,1,0,3.0,2.0,1.6,0,0,0.0
7,3,0,0.0,1.0,2.0,0,3,1.6
8,3,1,2.0,0.0,2.0,0,2,0.8
9,2,1,0.0,2.0,1.8,1,2,0.4


## Model Selection


In [77]:
# Importing Classifier Modules
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneOut

from sklearn.neighbors import KNeighborsClassifier

import numpy as np

# Shared by every validation cell below: the metric name passed to
# cross_val_score and a k-nearest-neighbours classifier with k = 13.
scoring = 'accuracy'
clf = KNeighborsClassifier(n_neighbors=13)

### Stratified sampling - Stratified K-fold cross validation

In [78]:
StratifiedKFold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
%time score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
round(np.mean(score)*100, 2)

Wall time: 222 ms


82.6

### Cross Validation - K-fold

In [79]:
# K-fold(10)
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
%time score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
round(np.mean(score)*100, 2)

Wall time: 220 ms


82.6

In [80]:
# K-fold(5)
k_fold = KFold(n_splits=5, shuffle=True, random_state=0)
%time score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
round(np.mean(score)*100, 2)


Wall time: 149 ms


81.82

In [81]:
# K-fold(20)
k_fold = KFold(n_splits=20, shuffle=True, random_state=0)
%time score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
round(np.mean(score)*100, 2)

Wall time: 247 ms


82.15

### Cross Validation - LOOCV

In [82]:
loo = LeaveOneOut()
%time score = cross_val_score(clf, train_data, target, cv=loo, n_jobs=1, scoring=scoring)
round(np.mean(score)*100, 2)   

Wall time: 6.64 s


83.73

### Bootstrapping