In [97]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns # seaborn 또한 matplot이 베이스

# ---- personal plot settings ----
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8' and
# 3.8 removed the old 'seaborn' alias, so use whichever name is available.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)
sns.set(font_scale=2.5)

import missingno as msno

# ignore warnings
import warnings
warnings.filterwarnings('ignore')
# 새 창으로 matplot이 열리지 않게 
%matplotlib inline

# Intro

In [98]:
# Load the Titanic training and test CSV files into DataFrames.
df_train = pd.read_csv('../input/titanic/train.csv')
df_test = pd.read_csv('../input/titanic/test.csv') # df_test is a DataFrame object, so it exposes methods such as head()

In [99]:
# Preview the first rows of the training data.
df_train.head()

1. Pclass = 카테고리(분류), ordinal(순서가 있음)
2. Sex = 바이너리 (두가지)(수치화시켜서 바꿀 것)
3. Age = continuous
4. Fare = continuous 이어져있는 거
5. Embarked = S,C,Q 카테고리(분류) , 순서가 없음
* 카테고리는 나중에 원핫인코딩해서 데이터 처리
* 데이터 타입마다 처리가 달라지므로 주의

In [100]:
# (rows, columns) of the training data.
df_train.shape

In [101]:
df_train.describe() # quick summary statistics of the numeric columns

Age의 count가 전체 행 수보다 적으므로, Age에는 결측치(null)가 있는 것으로 보인다.

In [102]:
# Same summary statistics for the test data.
df_test.describe()

In [103]:
# Print the percentage of missing values for every training column.
train_rows = df_train.shape[0]
for col in df_train.columns:
    nan_pct = 100 * (df_train[col].isnull().sum() / train_rows)
    print(f'column: {col:>10}\t Percent of NaN value: {nan_pct:.2f}%')

In [104]:
# Print the percentage of missing values for every test column.
test_rows = df_test.shape[0]
for col in df_test.columns:
    nan_pct = 100 * (df_test[col].isnull().sum() / test_rows)
    print(f'column: {col:>10}\t Percent of NaN value: {nan_pct:.2f}%')

In [105]:
# Nullity matrix: blank cells are nulls, so the plot shows where the
# missing values sit and how they are distributed.
msno.matrix(df=df_train, figsize=(8, 8), color=(0.8, 0.5, 0.2))

In [106]:
# Nullity bar chart: shows the proportion of values present in each column.
msno.bar(df=df_train, figsize=(8, 8), color=(0.8, 0.5, 0.2))

In [107]:
# Overall survival balance: pie chart (shares) next to a count plot (absolute numbers).
f, ax = plt.subplots(1, 2, figsize=(18, 8))

df_train['Survived'].value_counts().plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Pie plot - Survived')
ax[0].set_ylabel('')

# Name the column with x=...: in seaborn >= 0.12 the only positional
# parameter is `data`, so the positional form raises a TypeError there.
sns.countplot(x='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Count plot - Survived')
plt.show()

# EDA
어떤 열이 중요한지, 열 간에 어떤 상관관계가 있는지 확인. -> 실제 모델에 어떤 열을 쓰면 좋을지에 대한 중요한 인사이트를 얻을 수 있음.

In [108]:
# Look at the columns again before exploring each feature.
df_train.head()

## 2.1 Pclass

In [109]:
# Number of passengers (non-null Survived values) in each Pclass.
df_train.groupby('Pclass')[['Survived']].count()

In [110]:
# Mean of the 0/1 Survived flag per Pclass, i.e. the survival rate.
df_train.groupby('Pclass')[['Survived']].mean()

In [111]:
# Pclass x Survived contingency table with row/column totals, shaded by value.
pd.crosstab(df_train['Pclass'], df_train['Survived'], margins=True).style.background_gradient(cmap='cool')

In [112]:
# Bar chart of the survival rate per Pclass, highest rate first.
pclass_survival = df_train.groupby('Pclass')[['Survived']].mean()
pclass_survival.sort_values(by='Survived', ascending=False).plot.bar()

클래스가 높을수록(Pclass 숫자가 작을수록) 생존률이 높다

In [113]:
# Passenger counts per Pclass (left) and survived/dead split per Pclass (right).
y_position = 1.02  # lift the titles slightly above the axes
f, ax = plt.subplots(1, 2, figsize=(18, 8))

df_train['Pclass'].value_counts().plot.bar(color=['#CD7F31', '#FFDF00', '#D3D3D3'], ax=ax[0])
ax[0].set_title('Number of passengers By Pclass', y=y_position)
ax[0].set_ylabel('Count')

# Use x=...: seaborn >= 0.12 rejects the column name as a positional argument.
sns.countplot(x='Pclass', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Pclass: Survived vs Dead', y=y_position)
plt.show()

## 2.2 Sex

In [114]:
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# Survival rate by sex (left) and survived/dead counts by sex (right).
df_train[['Sex', 'Survived']].groupby(['Sex'], as_index=True).mean().plot.bar(ax=ax[0])
ax[0].set_title('Survived vs Sex')

# Use x=...: seaborn >= 0.12 rejects the column name as a positional argument.
sns.countplot(x='Sex', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Sex: Survived vs Dead')
plt.show()

In [115]:
# Sex x Survived contingency table with totals, shaded by value.
pd.crosstab(df_train['Sex'], df_train['Survived'], margins=True).style.background_gradient(cmap='summer_r')

## 2.3 Both Sex and Pclass 

In [116]:
# Survival rate per Pclass, one line per sex.
# factorplot was renamed to catplot (point plot is kind='point') and its
# `size` argument to `height`; the old names are gone in current seaborn.
sns.catplot(x='Pclass', y='Survived', hue='Sex', data=df_train, kind='point', height=6, aspect=1.5)

- Lady first
- Money brings survival?

In [117]:
# Survival rate per sex, one line per Pclass.
# factorplot was renamed to catplot (point plot is kind='point') and its
# `size` argument to `height`; the old names are gone in current seaborn.
sns.catplot(x='Sex', y='Survived', hue='Pclass', data=df_train, kind='point', height=6, aspect=1.5)

## 2.4 Age 

In [118]:
# Print the oldest, youngest and mean passenger age (NaN ages are skipped by pandas).
age_column = df_train['Age']
age_lines = [
    ('제일 나이 많은 탑승객 : {:.1f} years', age_column.max()),
    ('제일 나이 어린 탑승객 : {:.1f} years', age_column.min()),
    ('탑승객 평균 나이 : {:.1f} years', age_column.mean()),
]
for template, value in age_lines:
    print(template.format(value))

In [119]:
# kdeplot = kernel density estimate, i.e. a smoothed version of a histogram.
# Draw the age density of survivors first, then of non-survivors.
f, ax = plt.subplots(1, 1, figsize=(9,5))
for outcome in (1, 0):
    sns.kdeplot(df_train.loc[df_train['Survived'] == outcome, 'Age'], ax=ax)
plt.legend(['Survived == 1', 'Survived == 0'])
plt.show()

나이가 어릴수록 생존확률이 높아진다

In [120]:
# Same kind of plot as before, but drawn through the pyplot interface instead
# of an explicit Axes object (common when there is only one chart).

# Plain age distribution per class, independent of survival.
# The curves extend below age 0 because a KDE is a smoothed approximation.
plt.figure(figsize=(8, 6))
for pclass in (1, 2, 3):
    df_train.loc[df_train['Pclass'] == pclass, 'Age'].plot(kind='kde')

plt.xlabel('Age')
plt.title('Age Distribution within classes')
plt.legend(['1st Class', '2nd Class', '3rd Class'])

In [121]:
# Age density of survivors vs non-survivors, one figure per Pclass (here: 1st class).
plt.figure(figsize=(9, 5))
in_first_class = df_train['Pclass'] == 1
for outcome in (1, 0):
    df_train.loc[(df_train['Survived'] == outcome) & in_first_class, 'Age'].plot(kind='kde')

plt.title('1st Class')
plt.legend(['Survived == 1', 'Survived == 0'])
plt.show()

In [122]:
# Age density of survivors vs non-survivors within 2nd class.
plt.figure(figsize=(9, 5))
in_second_class = df_train['Pclass'] == 2
for outcome in (1, 0):
    df_train.loc[(df_train['Survived'] == outcome) & in_second_class, 'Age'].plot(kind='kde')

plt.title('2nd Class')
plt.legend(['Survived == 1', 'Survived == 0'])
plt.show()

In [123]:
# Age density of survivors vs non-survivors within 3rd class.
plt.figure(figsize=(9, 5))
in_third_class = df_train['Pclass'] == 3
for outcome in (1, 0):
    df_train.loc[(df_train['Survived'] == outcome) & in_third_class, 'Age'].plot(kind='kde')

plt.title('3rd Class')
plt.legend(['Survived == 1', 'Survived == 0'])
plt.show()

In [124]:
# Survival rate among passengers younger than each age limit (cumulative view).
age_limits = list(range(1, 80))
change_age_range_survived_ratio = []

for i in age_limits:
    # Select the group once instead of rebuilding the same mask twice.
    survived_below = df_train.loc[df_train['Age'] < i, 'Survived']
    change_age_range_survived_ratio.append(survived_below.sum() / len(survived_below))

plt.figure(figsize=(7, 7))
# Plot against the real age limits; plotting the bare list would use
# positions 0..78 and shift every point one year to the left.
plt.plot(age_limits, change_age_range_survived_ratio)
plt.title('Survival rate change depending on range of Age', y=1.02)
plt.ylabel('Survival rate')
plt.xlabel('Range of Age(0~x)')
plt.show()

## 2.5 Fare

In [125]:
# Skewness measures how asymmetric a distribution is. A positive value means
# the mass sits on the left with a long tail to the right.
fig, ax = plt.subplots(1, 1, figsize=(8,8))
g = sns.distplot(df_train['Fare'], color='b', label='Skewness: {:.2f}'.format(df_train['Fare'].skew()), ax=ax)
g = g.legend(loc='best')

In [126]:
# Log-transform Fare to reduce its skew (non-positive fares map to 0).
# The test set must get the same transform; otherwise the models are trained
# on log-fares but asked to predict from raw fares. Missing test fares are
# filled with the raw training mean first, because NaN would otherwise fall
# into the `else 0` branch of the lambda.
# NOTE: this cell is not idempotent - re-running it applies the log twice.
df_test['Fare'] = df_test['Fare'].fillna(df_train['Fare'].mean())
df_train['Fare'] = df_train['Fare'].map(lambda i: np.log(i) if i > 0 else 0)
df_test['Fare'] = df_test['Fare'].map(lambda i: np.log(i) if i > 0 else 0)

In [127]:
# After the log transform the skewness moves closer to 0.
fig, ax = plt.subplots(1, 1, figsize=(8,8))
g = sns.distplot(df_train['Fare'], color='b', label='Skewness: {:.2f}'.format(df_train['Fare'].skew()), ax=ax)
g = g.legend(loc='best')

데이터 엔지니어링 - 모델의 성능을 좋게하기 위해서 데이터를 만지작거리는 것
* Cabin(객실)은 null이 많아서 명확한 정보를 얻기 힘들기 때문에 제외하겠다

## 2.6 Ticket

In [128]:
# How often each ticket string occurs.
df_train['Ticket'].value_counts()

일단 제외

# Feature engineering

### 3.1 Name을 이용해 Age 결측치 대체 (평균값)

In [129]:
# Regular expression: capture the run of letters right before a '.' in Name
# (the honorific such as Mr / Mrs / Miss).
# The raw string avoids the invalid '\.' escape warning, and expand=False
# returns a Series, so a single column is assigned rather than a one-column DataFrame.
df_train['Initial'] = df_train['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
df_test['Initial'] = df_test['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

In [130]:
# Check that the new Initial column was added.
df_train.head()

In [131]:
# Count of each extracted Initial per sex (transposed so that Initials are the columns).
pd.crosstab(df_train['Initial'], df_train['Sex']).T.style.background_gradient(cmap='cool')

In [132]:
# Replace rare titles with one of the common groups (Mr / Mrs / Miss / Other).
# The two lists are positional pairs: rare_initials[k] -> merged_initials[k].
rare_initials = ['Capt','Col','Countess','Don','Dr','Jonkheer','Lady','Major','Mlle','Mme','Ms','Rev','Sir','Dona']
merged_initials = ['Mr','Other','Mrs','Mr','Mr','Other','Mrs','Mr','Miss','Miss','Miss','Other','Mr','Mr']

# Assign the result back instead of calling replace(inplace=True) on the
# selected column: that chained in-place form warns on pandas 2.x and does
# not update the frame once Copy-on-Write is enabled.
df_train['Initial'] = df_train['Initial'].replace(rare_initials, merged_initials)
df_test['Initial'] = df_test['Initial'].replace(rare_initials, merged_initials)

In [133]:
# Per-Initial means of the numeric columns. numeric_only=True is required on
# pandas >= 2.0, where averaging the string columns (Name, Ticket, ...) raises.
df_train.groupby('Initial').mean(numeric_only=True)

In [134]:
# Survival rate per Initial as a bar chart.
df_train.groupby('Initial')['Survived'].mean().plot.bar()

In [135]:
# Stack train and test rows so statistics can be computed over both sets.
df_all = pd.concat([df_train, df_test])

In [136]:
# Per-Initial means over train + test. numeric_only=True is required on
# pandas >= 2.0, where averaging the string columns raises.
df_all.groupby('Initial').mean(numeric_only=True)

In [137]:
# Fill missing ages with a fixed value per Initial group, for both frames.
age_by_initial = {'Master': 5, 'Miss': 22, 'Mr': 33, 'Mrs': 37, 'Other': 45}

for frame in (df_train, df_test):
    for initial, fill_age in age_by_initial.items():
        missing_in_group = frame['Age'].isnull() & (frame['Initial'] == initial)
        frame.loc[missing_in_group, 'Age'] = fill_age

### 3.2 Embarked 결측치 대체 (최빈값)

In [138]:
# Fill missing Embarked values with 'S'.
# Assign back instead of fillna(inplace=True) on the selected column: the
# chained in-place form warns on pandas 2.x and does not update the frame
# once Copy-on-Write is enabled.
df_train['Embarked'] = df_train['Embarked'].fillna('S')

### 3.3 Age 범주화

In [139]:
# Create the Age_cat column with a default of 0; the real categories are assigned below.
df_train['Age_cat'] = 0

In [140]:
# Confirm the new Age_cat column exists.
df_train.head()

1. 하드코딩

In [141]:
# Bucket training ages into decades: <10 -> 0, [10, 20) -> 1, ..., >=70 -> 7.
train_age = df_train['Age']
df_train.loc[train_age < 10, 'Age_cat'] = 0
for decade in range(1, 7):
    in_decade = (decade * 10 <= train_age) & (train_age < (decade + 1) * 10)
    df_train.loc[in_decade, 'Age_cat'] = decade
df_train.loc[train_age >= 70, 'Age_cat'] = 7

In [142]:
# Bucket test ages into the same decade categories as the training frame:
# <10 -> 0, [10, 20) -> 1, ..., >=70 -> 7.
# Create the column with the same default of 0 that the training frame uses.
# Without it, the first .loc assignment creates Age_cat as a float column that
# is NaN for any row no condition matches.
df_test['Age_cat'] = 0
test_age = df_test['Age']
df_test.loc[test_age < 10, 'Age_cat'] = 0
for decade in range(1, 7):
    in_decade = (decade * 10 <= test_age) & (test_age < (decade + 1) * 10)
    df_test.loc[in_decade, 'Age_cat'] = decade
df_test.loc[test_age >= 70, 'Age_cat'] = 7

In [143]:
# Inspect the assigned Age_cat values.
df_train.head()

2. 함수

In [144]:
def category_age(x):
    """Map an age to its decade category.

    Returns 0 for ages below 10, 1 for [10, 20), ... 6 for [60, 70) and 7 for
    everything else. Values that compare False against every bound (such as
    NaN) also end up in category 7.
    """
    for category, upper_bound in enumerate(range(10, 80, 10)):
        if x < upper_bound:
            return category
    return 7

In [145]:
# Same categorisation as the hard-coded version, computed through category_age.
df_train['Age_cat_2'] = df_train['Age'].apply(category_age)

In [146]:
# True when both categorisation approaches agree on every row.
(df_train['Age_cat'] == df_train['Age_cat_2']).all()

In [147]:
# Age is now represented by Age_cat, so drop the raw column (and the duplicate check column).
df_train.drop(columns=['Age', 'Age_cat_2'], inplace=True)
df_test.drop(columns=['Age'], inplace=True)

### 3.4 Initial 매핑

In [148]:
# List the distinct Initial values that need a numeric code.
df_train.Initial.unique()

In [149]:
# Encode Initial as integers, using one shared lookup table for both frames.
initial_codes = {'Master': 0, 'Miss': 1, 'Mr': 2, 'Mrs': 3, 'Other': 4}
df_train['Initial'] = df_train['Initial'].map(initial_codes)
df_test['Initial'] = df_test['Initial'].map(initial_codes)

### 3.5 Embarked 매핑

In [150]:
# Count passengers per embarkation port (also shows which values need a code).
df_train['Embarked'].value_counts()

In [151]:
# Encode the embarkation port as integers, using one shared lookup table.
embarked_codes = {'C': 0, 'Q': 1, 'S': 2}
df_train['Embarked'] = df_train['Embarked'].map(embarked_codes)
df_test['Embarked'] = df_test['Embarked'].map(embarked_codes)

### 3.6 Sex 매핑

In [152]:
# Encode Sex as a binary integer, using one shared lookup table.
sex_codes = {'female': 0, 'male': 1}
df_train['Sex'] = df_train['Sex'].map(sex_codes)
df_test['Sex'] = df_test['Sex'].map(sex_codes)

### 3.7 피어슨 상관분석

In [153]:
# Check the frame after the numeric encodings.
df_train.head()

In [154]:
# Columns to include in the correlation heatmap.
heatmap_data = df_train[['Survived', 'Pclass', 'Sex', 'Fare', 'Embarked', 'SibSp', 'Initial', 'Age_cat']]

In [155]:
# Annotated heatmap of the pairwise Pearson correlations (DataFrame.corr default).
colormap = plt.cm.BuGn
plt.figure(figsize=(12,10))
# Title typo fixed ('Correalation' -> 'Correlation'); it is rendered on the figure.
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(heatmap_data.astype(float).corr(), linewidths=1, vmax=1,
           square=True, cmap=colormap, linecolor='Blue', annot=True, annot_kws={'size':16}, fmt='.2f')

feature 간의 상관관계가 1일 경우, 하나는 불필요

### 3.8 Initial 원핫인코딩

In [156]:
# One-hot encode Initial into Initial_<code> indicator columns.
# NOTE(review): train and test are encoded separately, so both must contain the
# same set of Initial codes for their columns to line up - confirm.
df_train = pd.get_dummies(df_train, columns=['Initial'], prefix='Initial')
df_test = pd.get_dummies(df_test, columns=['Initial'], prefix='Initial')

In [157]:
# Show a preview rather than dumping the whole frame; the first rows are
# enough to check the new indicator columns.
df_train.head()

### 3.9 Embarked 원핫인코딩

In [158]:
# One-hot encode Embarked into Embarked_<code> indicator columns.
# NOTE(review): train and test are encoded separately, so both must contain the
# same set of Embarked codes for their columns to line up - confirm.
df_train = pd.get_dummies(df_train, columns=['Embarked'], prefix='Embarked')
df_test = pd.get_dummies(df_test, columns=['Embarked'], prefix='Embarked')

### 3.10 Drop

In [159]:
# Remove the columns that are not used as model features, from both frames.
unused_columns = ['PassengerId', 'Name', 'SibSp', 'Ticket', 'Cabin']
for frame in (df_train, df_test):
    frame.drop(columns=unused_columns, inplace=True)

In [160]:
# Final feature frame that goes into the models.
df_train.head()

In [161]:
# Replace missing test Fare values with the mean of the training Fare column.
# NOTE(review): df_train['Fare'] was log-transformed earlier in the notebook, so
# this mean is on the log scale - verify df_test['Fare'] is on the same scale.
df_test.loc[df_test['Fare'].isnull(), 'Fare'] = df_train['Fare'].mean()

# Modeling

## 4.1 Randomforest

지도학습 - 목적이 되는 레이블을 줌

In [162]:
from sklearn.ensemble import RandomForestClassifier # 이진분류. 의사결정트리를 랜덤화하고 앙상블 (averaging)
from sklearn import metrics # 모델 평가 함수들
from sklearn.model_selection import train_test_split # train set, Validation set 나누는 거

In [163]:
# Convert the frames to plain arrays: features, target labels, and test features.
X_train = df_train.drop('Survived', axis=1).values
target_label = df_train['Survived'].values
X_test = df_test.values

In [164]:
# Hold out a validation set with a fixed seed so the split is reproducible.
X_tr, X_vld, y_tr, y_vld = train_test_split(X_train, target_label, test_size=0.3, random_state=2018) # 70% train, 30% validation

In [165]:
# Fix the seed: a random forest is stochastic, so without random_state the
# validation accuracy and feature importances change on every run.
random_forest = RandomForestClassifier(random_state=2018)
random_forest.fit(X_tr, y_tr)  # train on the 70% training split

In [166]:
# Predict survival for the held-out validation features.
prediction = random_forest.predict(X_vld)
prediction # predictions for X_vld

In [167]:
# Compare the predictions with the true validation labels.
# accuracy_score takes (y_true, y_pred); accuracy is symmetric, so the number
# is unchanged, but the documented order avoids mistakes if the metric is swapped later.
print('총 {}명 중 {:.2f}% 정확도로 생존을 맞추었습니다.'.format(y_vld.shape[0], 100 * metrics.accuracy_score(y_vld, prediction)))

## 4.2 kNN

In [168]:
from sklearn.neighbors import KNeighborsClassifier

In [169]:
## Cross validation (K-fold), CV
# Cross-validation guards against overfitting to a single validation split.

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

In [170]:
# Cross-validated accuracy of a 13-nearest-neighbours classifier.
knn = KNeighborsClassifier(n_neighbors=13)
scoring = 'accuracy'
score = cross_val_score(
    estimator=knn,
    X=X_train,
    y=target_label,
    scoring=scoring,
    cv=k_fold,
    n_jobs=1,
)
print(score)

In [171]:
# kNN score: mean fold accuracy as a percentage, rounded to two decimals.
round(np.mean(score)*100, 2)

## 4.3 Decision Tree

In [172]:
from sklearn.tree import DecisionTreeClassifier

In [173]:
# Cross-validated accuracy of a decision tree.
# Fix the seed: ties between equally good splits are broken randomly, so an
# unseeded tree can score differently from run to run.
DT = DecisionTreeClassifier(random_state=0)
scoring = 'accuracy'
score = cross_val_score(DT, X_train, target_label, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

In [174]:
# Decision tree score: mean fold accuracy as a percentage, rounded to two decimals.
round(np.mean(score)*100, 2)

## 4.4 Naive Bayes

In [175]:
from sklearn.naive_bayes import GaussianNB

In [176]:
# Cross-validated accuracy of Gaussian Naive Bayes.
NB = GaussianNB()
scoring = 'accuracy'
score = cross_val_score(
    estimator=NB,
    X=X_train,
    y=target_label,
    scoring=scoring,
    cv=k_fold,
    n_jobs=1,
)
print(score)

In [177]:
# Naive Bayes score: mean fold accuracy as a percentage, rounded to two decimals.
round(np.mean(score)*100, 2)

## 4.5 SVM

In [178]:
from sklearn.svm import SVC

In [179]:
# Cross-validated accuracy of a support vector classifier with default settings.
svm = SVC()
scoring = 'accuracy'
score = cross_val_score(
    estimator=svm,
    X=X_train,
    y=target_label,
    scoring=scoring,
    cv=k_fold,
    n_jobs=1,
)
print(score)

In [180]:
# SVM score: mean fold accuracy as a percentage, rounded to two decimals.
round(np.mean(score)*100, 2)

In [181]:
# Refit the SVM on the full training set and predict the test set.
svm.fit(X_train, target_label)
prediction_svm = svm.predict(X_test)
prediction_svm

# Feature importance

어떤 Feature가 가장 영향을 크게 주는가

In [182]:
# Importance of each feature according to the fitted random forest.
random_forest.feature_importances_

In [183]:
# Reminder of the feature columns the importances refer to.
df_train.head()

In [184]:
from pandas import Series

In [185]:
# Label each importance with the feature it belongs to.
# The names come from the training features the forest was actually fit on
# (df_train without the target), not from df_test, so the labels stay
# correct even if the two frames ever differ in columns or column order.
feature_importance = random_forest.feature_importances_
feature_names = df_train.drop('Survived', axis=1).columns
Series_feat_imp = Series(feature_importance, index=feature_names)

In [186]:
# Horizontal bar chart of the importances, sorted ascending so the most
# important feature is drawn at the top.
plt.figure(figsize=(8, 8))
ranked_importance = Series_feat_imp.sort_values(ascending=True)
ranked_importance.plot.barh()
plt.xlabel('Feature importance')
plt.ylabel('Feature')
plt.show()

이를 토대로 가설을 세울 수 있고,
 불필요한 feature을 제거할 수도 있음 

---

In [187]:
# Load the sample submission file, used as the template for the output.
submission = pd.read_csv('../input/titanic/gender_submission.csv')

In [188]:
# Check the columns the submission file expects.
submission.head()

In [189]:
# prediction = model.predict(X_test)

In [190]:
# Write the SVM test-set predictions into the submission frame.
submission['Survived'] = prediction_svm

In [191]:
# Show a preview rather than dumping the whole frame.
submission.head()

In [192]:
# Save the submission file without the index column.
submission.to_csv('./submission.csv', index=False)