In [1]:
import pandas as pd

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [2]:
train_test_data = [train, test]
for dataset in train_test_data:
    dataset['Title'] = dataset['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# Title
title_mapping={
    "Mr": 0, "Miss": 1, "Mrs": 2
}
def title_mapper(title):
    try:
        return title_mapping[title]
    except:
        return 3
for dataset in train_test_data:
    dataset['Title'] = dataset['Title'].map(title_mapper)

# Drop Name Field
for dataset in train_test_data:
    dataset.drop('Name', axis=1, inplace=True)

# Sex
sex_mapping = {"male": 0, "female": 0}
for dataset in train_test_data:
    dataset['Sex'] = dataset['Sex'].map(sex_mapping) 

# fill missing age
for dataset in train_test_data:
    dataset['Age'].fillna(dataset.groupby("Title")['Age'].transform("median"), inplace=True)

def age_binning(age):
    if age <=16: return 0
    elif age <=26: return 1
    elif age <= 36: return 2
    elif age <= 62: return 3
    else: return 4

for dataset in train_test_data:
    dataset['Age'] = dataset['Age'].map(age_binning)


# Embarked - fill Na and mapping
embarked_mapping = {
    'S': 0, 'C': 1, 'Q': 2
}
for dataset in train_test_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S').map(embarked_mapping)


# fill missing fare : median of same pclass
for dataset in train_test_data:
    dataset['Fare'].fillna(dataset.groupby("Pclass")['Fare'].transform("median"), inplace=True)

def fare_binning(fare):
    if fare <=17: return 0
    elif fare <=30: return 1
    elif fare <= 100: return 2
    else: return 3

for dataset in train_test_data:
    dataset['Fare'] = dataset['Fare'].map(fare_binning)

# Get first Character
for dataset in train_test_data:
    dataset['Cabin'] = dataset['Cabin'].str[:1]

# cabin mapping
# scale: 0.3
cabin_mapping = {
    'A': 0, 'B': 0.3, 'C': 0.6,
    'D': 0.9, 'E': 1.2, 'F': 1.5,
    'G': 1.8, 'T': 2.1
}

for dataset in train_test_data:
    dataset['Cabin'] = dataset['Cabin'].map(cabin_mapping)

for dataset in train_test_data:
    dataset['Cabin'].fillna(dataset.groupby('Pclass')['Cabin'].transform("median"), inplace=True)

# Family Size
for dataset in train_test_data:
    dataset['FamilySize'] = dataset['Parch'] + dataset['SibSp'] + 1

# Family Size scaling
scale = 0.5
for dataset in train_test_data:
    dataset['FamilySize'] = (dataset['FamilySize']-1)*scale

# Drop Fields : Parch, SibSp, Ticket
features_drop = ['Parch', 'SibSp', 'Ticket']
train = train.drop(features_drop, axis=1)
test = test.drop(features_drop, axis=1)

train = train.drop('PassengerId', axis=1)

# Extract target(Survived) data
train_data = train.drop('Survived', axis=1)
train_target = train['Survived']

train_data.shape, train_target.shape

((891, 8), (891,))

In [3]:
train_data.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Cabin,Embarked,Title,FamilySize
0,3,0,1,0,1.5,0,0,0.5
1,1,0,3,2,0.6,1,2,0.5
2,3,0,1,0,1.5,0,1,0.0
3,1,0,2,2,0.6,0,2,0.5
4,3,0,2,0,1.5,0,0,0.0


In [4]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

scoring='accuracy'
classifier = KNeighborsClassifier(n_neighbors=13)

 # Cross Validation (K-fold)

# K-fold (n=10)

In [17]:
from sklearn.model_selection import cross_val_score, KFold

k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
%time score = cross_val_score(classifier, train_data, train_target, cv=k_fold, n_jobs=1)
print('Average Score: ', round(100*sum(score)/len(score), 2))


CPU times: user 80.8 ms, sys: 2.24 ms, total: 83.1 ms
Wall time: 81.9 ms
Average Score:  80.92


# K-fold (n=5)

In [18]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=0)
%time score = cross_val_score(classifier, train_data, train_target, cv=k_fold, n_jobs=1)
print('Average Score: ', round(100*sum(score)/len(score), 2))

CPU times: user 57 ms, sys: 2.36 ms, total: 59.4 ms
Wall time: 57.9 ms
Average Score:  80.36


# K-fold (n=20)

In [19]:
k_fold = KFold(n_splits=20, shuffle=True, random_state=0)
%time score = cross_val_score(classifier, train_data, train_target, cv=k_fold, n_jobs=1)
print('Average Score: ', round(100*sum(score)/len(score), 2))

CPU times: user 122 ms, sys: 2.21 ms, total: 124 ms
Wall time: 123 ms
Average Score:  79.91


# K-fold (n=10) : Not Shuffled

In [20]:
k_fold = KFold(n_splits=10, shuffle=False)
%time score = cross_val_score(classifier, train_data, train_target, cv=k_fold, n_jobs=1)
print('Average Score: ', round(100*sum(score)/len(score), 2))

CPU times: user 78.8 ms, sys: 2.12 ms, total: 80.9 ms
Wall time: 79.6 ms
Average Score:  79.92


# LOOCV
## Leave-One-Out Cross Validation

In [21]:
from sklearn.model_selection import LeaveOneOut

loocv = LeaveOneOut()
%time score = cross_val_score(classifier, train_data, train_target, cv=loocv)
print('Average Score: ', round(100*sum(score)/len(score), 2))

CPU times: user 3.54 s, sys: 36.1 ms, total: 3.57 s
Wall time: 3.58 s
Average Score:  81.03


In [101]:
k_fold = KFold(n_splits=891, shuffle=False)
%time score = cross_val_score(classifier, train_data, train_target, cv=k_fold, n_jobs=1)
print('Average Score: ', round(100*sum(score)/len(score), 2))

CPU times: user 3.56 s, sys: 36.3 ms, total: 3.6 s
Wall time: 3.61 s
Average Score:  81.03


#  Stratified k-fold Cross Validation

In [99]:
from sklearn.model_selection import StratifiedKFold

sk_fold = StratifiedKFold(n_splits=10, shuffle=True)
%time score = cross_val_score(classifier, train_data, train_target, cv=sk_fold)
print('Average Score: ', round(100*sum(score)/len(score), 2))

CPU times: user 80.7 ms, sys: 2.04 ms, total: 82.7 ms
Wall time: 81.7 ms
Average Score:  81.03


# Shuffle Split Cross Validation
## 임의 분할 교차 검증

In [57]:
from sklearn.model_selection import ShuffleSplit

shuffle_split = ShuffleSplit(test_size=0.5, train_size=0.5, n_splits=10)
%time score = cross_val_score(classifier, train_data, train_target, cv=shuffle_split)
print('Average Score: ', round(100*sum(score)/len(score), 2))

CPU times: user 172 ms, sys: 2.23 ms, total: 174 ms
Wall time: 174 ms
Average Score:  78.39
