In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno


# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old names, so pick whichever one is available.
_SEABORN_STYLE = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_SEABORN_STYLE)
# Apply seaborn's default theme with enlarged fonts to every later figure.
sns.set(font_scale=2.5)

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

## Note

pclass: A proxy for socio-economic status (SES)
> 1st = Upper  
> 2nd = Middle  
> 3rd = Lower  

age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5

sibsp: The dataset defines family relations in this way...
> Sibling = brother, sister, stepbrother, stepsister  
> Spouse = husband, wife (mistresses and fiancés were ignored)  


parch: The dataset defines family relations in this way...
> Parent = mother, father  
> Child = daughter, son, stepdaughter, stepson  
> Some children travelled only with a nanny, therefore parch=0 for them.

In [None]:
# Load the Titanic train and test CSV files; the paths are relative to the
# notebook's working directory.
df_train = pd.read_csv('../input/titanic/train.csv')
df_test = pd.read_csv('../input/titanic/test.csv')

In [None]:
# 분석용 차트
import matplotlib.pyplot as plt

# Draw survival pie charts for one column by combining survived_crosstab and pie_chart
def show_pie(df, cols):
    """Cross-tabulate ``df[cols]`` against ``Survived`` and plot the table as pie charts.

    The table is built by ``survived_crosstab`` and rendered by ``pie_chart``.
    """
    pie_chart(survived_crosstab(df, cols))

# Build the contingency table between `cols` and Survived
def survived_crosstab(df, cols):
    """Return a crosstab of ``df[cols]`` (rows) against ``df['Survived']`` (columns).

    The column labels 0 and 1 are relabelled to 'Dead' and 'Survived'.
    """
    outcome_names = {0: 'Dead', 1: 'Survived'}
    counts = pd.crosstab(df[cols], df['Survived'])
    counts.columns = counts.columns.map(outcome_names)
    return counts


# Draw the pie charts
def pie_chart(df):
    """Plot one pie chart per row of ``df`` on a grid that is three charts wide.

    Each row's values become the wedges of its pie and the column labels become
    the wedge labels; the row's index label is used in the subplot title.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose rows are plotted (e.g. the output of ``survived_crosstab``).
    """
    frows = df.shape[0]
    if frows == 0:
        # An empty table has no rows to plot, so there is no figure to create.
        return

    pcol = 3
    # Ceiling division: plt.subplot needs an integer row count large enough to
    # hold every chart (the former `frows/pcol + frows%pcol` produced a float
    # that was also not the number of grid rows required).
    prow = (frows + pcol - 1) // pcol
    plot_height = prow * 2.5
    plt.figure(figsize=(8, plot_height))

    for position in range(frows):
        plt.subplot(prow, pcol, position + 1)

        # Positional access keeps each row a Series even if index labels repeat.
        counts = df.iloc[position]
        plt.pie(counts, labels=counts.index, autopct='%1.1f%%')
        plt.title("{}' survived".format(df.index[position]))

    plt.show()
    

In [None]:
# PassengerId : index
# Survived : Dead / Alive
# Pclass :  1st = Upper, 2nd = Middle, 3rd = Lower
show_pie(df_train, 'Pclass')

# Upper class survived: 63.0%
# Middle class survived: 47.3%
# Lower class survived: 24.2%

In [None]:
# Name -> Title: capture the word that sits between a space and a period
# (e.g. "Mr", "Mrs"). The raw string keeps `\.` a regex escape instead of an
# invalid Python string escape, and expand=False makes extract return a Series
# that can be assigned directly as a column.
TITLE_PATTERN = r' ([A-Za-z]+)\.'

df_train['Title'] = df_train.Name.str.extract(TITLE_PATTERN, expand=False)
df_test['Title'] = df_test.Name.str.extract(TITLE_PATTERN, expand=False)

# Only the final expression of a cell is rendered.
df_test.Title.head()

In [None]:
df_train.Title.unique()
df_test.Title.unique()

In [None]:
# Inspect Sex and Age of the training rows carrying each rare title, to decide
# which common title it should be merged into. The trailing comments record what
# the author observed; only the last expression of the cell is rendered.
df_train[df_train['Title']== 'Mme'][['Sex','Age']] # Mme : female / 24
df_train[df_train['Title']=='Mlle'][['Sex','Age']] # Mlle : female / 24
df_train[df_train['Title']=='Sir'][['Sex','Age']] # Sir : male / 49
df_train[df_train['Title']=='Jonkheer'][['Sex','Age']] # Jonkheer : male / 38
df_train[df_train['Title']=='Lady'][['Sex','Age']] # Lady : female/48
df_train[df_train['Title']=='Capt'][['Sex','Age']] # Capt : male / 70
df_train[df_train['Title']=='Don'][['Sex','Age']] # Don : male / 40
df_train[df_train['Title']=='Ms'][['Sex','Age']] # Ms : female / 28
df_train[df_train['Title']=='Countess'][['Sex','Age']] # Countess : female / 33
df_train[df_train['Title']== 'Major'][['Sex','Age']] # Major449 : male / 52   Major536 : male / 45
df_train[df_train['Title']== 'Col'][['Sex','Age']] # Col647 : male / 56   Col694 : male / 60

# Dr / Rev => grouped as "Others"




In [None]:
# Same rare-title inspection, run against the test set.
# NOTE(review): the trailing comments are identical to the ones written for
# df_train (including row labels such as Major449), so they presumably describe
# training rows rather than df_test — verify before relying on them.
df_test[df_test['Title']== 'Mme'][['Sex','Age']] # Mme : female / 24
df_test[df_test['Title']=='Mlle'][['Sex','Age']] # Mlle : female / 24
df_test[df_test['Title']=='Sir'][['Sex','Age']] # Sir : male / 49
df_test[df_test['Title']=='Jonkheer'][['Sex','Age']] # Jonkheer : male / 38
df_test[df_test['Title']=='Lady'][['Sex','Age']] # Lady : female/48
df_test[df_test['Title']=='Capt'][['Sex','Age']] # Capt : male / 70
df_test[df_test['Title']=='Don'][['Sex','Age']] # Don : male / 40
df_test[df_test['Title']=='Ms'][['Sex','Age']] # Ms : female / 28
df_test[df_test['Title']=='Countess'][['Sex','Age']] # Countess : female / 33
df_test[df_test['Title']== 'Major'][['Sex','Age']] # Major449 : male / 52   Major536 : male / 45
df_test[df_test['Title']== 'Col'][['Sex','Age']] # Col647 : male / 56   Col694 : male / 60

# Dr / Rev => grouped as "Others"




In [None]:
# Reference: mean age of the four common titles in the training set
# (the trailing numbers are the values the author observed).
df_train.loc[df_train['Title'] == 'Mr', 'Age'].mean()      # 32.4
df_train.loc[df_train['Title'] == 'Miss', 'Age'].mean()    # 21.8
df_train.loc[df_train['Title'] == 'Mrs', 'Age'].mean()     # 35.9
df_train.loc[df_train['Title'] == 'Master', 'Age'].mean()  # 4.6

# Fold every rare title into one of Mr / Mrs / Miss / Others in a single pass.
TRAIN_TITLE_ALIASES = {
    'Mme': 'Miss', 'Mlle': 'Miss', 'Ms': 'Miss',
    'Sir': 'Mr', 'Jonkheer': 'Mr', 'Capt': 'Mr', 'Don': 'Mr', 'Major': 'Mr', 'Col': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs',
    'Dr': 'Others', 'Rev': 'Others',
}
df_train['Title'] = df_train['Title'].replace(TRAIN_TITLE_ALIASES)


In [None]:
# Reference: mean age of the four common titles in the test set
# (the trailing numbers are the values the author observed).
df_test.loc[df_test['Title'] == 'Mr', 'Age'].mean()      # 32.2
df_test.loc[df_test['Title'] == 'Miss', 'Age'].mean()    # 21.8
df_test.loc[df_test['Title'] == 'Mrs', 'Age'].mean()     # 38.9
df_test.loc[df_test['Title'] == 'Master', 'Age'].mean()  # 7.4

# Fold every rare title into one of Mr / Mrs / Miss / Others in a single pass.
# 'Dona' is handled here in addition to the titles merged for the training set.
TEST_TITLE_ALIASES = {
    'Mme': 'Miss', 'Mlle': 'Miss', 'Ms': 'Miss',
    'Sir': 'Mr', 'Jonkheer': 'Mr', 'Capt': 'Mr', 'Don': 'Mr', 'Major': 'Mr', 'Col': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs', 'Dona': 'Mrs',
    'Dr': 'Others', 'Rev': 'Others',
}
df_test['Title'] = df_test['Title'].replace(TEST_TITLE_ALIASES)


In [None]:
# Age : Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5

# Fill each missing Age with the mean Age of the training passengers that share
# the same Title (e.g. a 'Master' without an Age gets the mean Age of all
# 'Master' rows). transform('mean') yields one value per row aligned with
# df_train, replacing the former iterrows loop and its positional `row[0]`
# lookup (integer keys on a label-indexed Series are deprecated in recent pandas).
title_mean_age = df_train.groupby('Title')['Age'].transform('mean')
df_train['Age'] = df_train['Age'].fillna(title_mean_age)

# Split Age into 8 equal-frequency bins labelled 1..8 and store them as integers.
df_train['AgeCategory'] = pd.qcut(df_train.Age, 8, labels=range(1, 9))
df_train.AgeCategory = df_train.AgeCategory.astype(int)


In [None]:
# Age : Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5

# Fill each missing Age with the mean Age of the test passengers that share the
# same Title. transform('mean') yields one value per row aligned with df_test,
# replacing the former iterrows loop and its positional `row[0]` lookup
# (integer keys on a label-indexed Series are deprecated in recent pandas).
title_mean_age = df_test.groupby('Title')['Age'].transform('mean')
df_test['Age'] = df_test['Age'].fillna(title_mean_age)

# Split Age into 8 equal-frequency bins labelled 1..8 and store them as integers.
df_test['AgeCategory'] = pd.qcut(df_test.Age, 8, labels=range(1, 9))
df_test.AgeCategory = df_test.AgeCategory.astype(int)


In [None]:
# Encode Sex as a single integer code (a binary label encoding, not one-hot):
# female = 0, male = 1
SEX_CODES = {'female': 0, 'male': 1}
for frame in (df_train, df_test):
    frame['Sex'] = frame['Sex'].map(SEX_CODES)

df_test

In [None]:
# SibSp: The dataset defines family relations in this way...
# Sibling = brother, sister, stepbrother, stepsister
# Spouse = husband, wife (mistresses and fiancés were ignored)



show_pie(df_train, 'SibSp')

# Author's observation: having 1-2 siblings or a spouse aboard seems to go with better survival

In [None]:
# Parch: survival pies per Parch value, followed by the count of each value

show_pie(df_train, 'Parch')

df_train.Parch.value_counts()

# Author's observation: with a Parch of 1-2, i.e. when travelling with parents, the survival rate is somewhat higher

## Family = Sibsp + Parch


In [None]:
# FamilySize = siblings/spouses aboard plus parents/children aboard.
# The passenger is not counted, so a row with SibSp == 0 and Parch == 0 gets 0.
df_train['FamilySize'] = df_train['SibSp'] + df_train['Parch']
df_test['FamilySize'] = df_test['SibSp'] + df_test['Parch']


In [None]:
print('Maximum size of Family: ', df_train['FamilySize'].max())
print('Minimum size of Family: ', df_train['FamilySize'].min())

In [None]:
# Three side-by-side views of FamilySize: passenger count, passenger count split
# by survival, and mean survival rate per family size.
f, ax = plt.subplots(1, 3, figsize=(40, 10))
# The column is passed as x=...: recent seaborn releases accept only `data`
# positionally, so a bare 'FamilySize' would collide with data=df_train.
sns.countplot(x='FamilySize', data=df_train, ax=ax[0])
ax[0].set_title('(1)No. Of Passenger Boarded', y=1.02)

sns.countplot(x='FamilySize', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('(2)Survived countplot depending on FamilySize', y=1.02)

# Mean of the 0/1 Survived flag per FamilySize, sorted from highest to lowest.
df_train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=True).mean().sort_values(by='Survived', ascending=False).plot.bar(ax=ax[2])
ax[2].set_title('(3)Survived rate  depending on FamilySize', y=1.02)

plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()

In [None]:
# ticket

df_train.Ticket.value_counts()
df_train.Ticket.isna().sum() # 0

In [None]:
# Fare: value counts and null count (the author noted 0 nulls), then the
# distribution plot with the skewness shown in the legend.
df_train['Fare'].value_counts()
df_train['Fare'].isna().sum() # 0

fare_skew_label = "Skewness: {:.2f}".format(df_train['Fare'].skew())
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
g = sns.distplot(df_train['Fare'], color="b", label=fare_skew_label, ax=ax)
g = g.legend(loc='best')

# 

In [None]:
# Fare: fill the missing test-set fare with the mean test-set fare.
# The 'Fare' column label is required in .loc — selecting by rows only would
# overwrite every column of the matching row(s) with the mean fare.
# A log transform to reduce Fare's skewness is left disabled below.
# (Feature engineering = reshaping features to try to improve model performance.)

# df_train['Fare'] = df_train['Fare'].map(lambda i:np.log(i) if i > 0 else 0)
df_test.loc[df_test['Fare'].isna(), 'Fare'] = df_test['Fare'].mean()
# df_test['Fare'] = df_test['Fare'].map(lambda i:np.log(i) if i > 0 else 0)



In [None]:
# Cabin
df_train.Cabin.value_counts()
df_train.Cabin.isna().sum() # 687

df_train.Cabin.unique()

In [None]:
# Embarked
df_train.Embarked.value_counts()
df_train.Embarked.isna().sum() # 2


show_pie(df_train, 'Embarked')


In [None]:
df_train.groupby('Title')['Survived'].mean().plot.bar()

## Feature Engineering

In [None]:
df_all = pd.concat([df_train, df_test])
df_all

In [None]:
# reset_index returns a new frame, so assign it back; otherwise df_all keeps the
# repeated row labels inherited from df_train and df_test.
df_all = df_all.reset_index(drop=True)
df_all

In [None]:
# Per-title means of the numeric columns only; text columns such as Name or
# Ticket cannot be averaged and make mean() raise in pandas 2.x without this flag.
df_all.groupby(['Title']).mean(numeric_only=True)

In [None]:
# Indexing syntax demo
# .loc selects by row label and column label
df_all.loc[:,:]
df_all.loc[1,:]


# df_train.loc[(df_train['Age'].isnull()) & (df_train['Title']=='Mr'), 'Age']

In [None]:
df_all.loc[(df_all['Age'].isna()) & (df_all['Title'] == 'Mr'), 'Age']

In [None]:
# Fill Null in Embarked
df_train.Embarked.isna().sum()
df_test.Embarked.isna().sum()

In [None]:
# Fill the missing Embarked values with 'S'. Assigning the result back avoids
# calling fillna(inplace=True) on a column selection, which pandas may apply to
# a temporary copy instead of df_train.
df_train['Embarked'] = df_train['Embarked'].fillna('S')

In [None]:
df_train.Embarked.isna().sum()

In [None]:
df_train['Age_Cat'] = 0

In [None]:
# Categorize Age into decade buckets:
# 0 = under 10, 1 = 10-19, ..., 6 = 60-69, 7 = 70 and over

train_age = df_train['Age']
df_train.loc[train_age < 10, 'Age_Cat'] = 0
for bucket in range(1, 7):
    lower, upper = bucket * 10, (bucket + 1) * 10
    df_train.loc[(train_age >= lower) & (train_age < upper), 'Age_Cat'] = bucket
df_train.loc[train_age >= 70, 'Age_Cat'] = 7



In [None]:
# Categorize the test-set Age into decade buckets:
# 0 = under 10, 1 = 10-19, ..., 6 = 60-69, 7 = 70 and over
test_age = df_test['Age']
df_test.loc[test_age < 10, 'Age_Cat'] = 0
for bucket in range(1, 7):
    lower, upper = bucket * 10, (bucket + 1) * 10
    df_test.loc[(test_age >= lower) & (test_age < upper), 'Age_Cat'] = bucket
df_test.loc[test_age >= 70, 'Age_Cat'] = 7

In [None]:
def category_age(x):
    """Map an age to its decade bucket.

    Returns 0 for ages below 10, 1 for ages below 20, and so on up to 6 for
    ages below 70. Anything that is not below 70 — including a value for
    which every comparison is false — yields 7.
    """
    # Upper bounds 10, 20, ..., 70 pair with buckets 0..6.
    for bucket, upper_bound in enumerate(range(10, 80, 10)):
        if x < upper_bound:
            return bucket
    return 7

In [None]:
df_train['Age_cat2'] = df_train['Age'].apply(category_age)

In [None]:
# any : True if at least one element is True
# all : True only when every element is True
# Checks that the .loc-based Age_Cat and the apply-based Age_cat2 agree on every row.
(df_train['Age_Cat'] == df_train['Age_cat2']).all() 

In [None]:
df_train.drop(['Age_cat2', 'Age','AgeCategory'], axis=1, inplace=True)
df_test.drop(['Age','AgeCategory'], axis=1, inplace=True)



In [None]:
df_train.head()
df_test.head()

In [None]:
# str to num
# Title, Sex, Embarked etc

df_train.Title.unique()

In [None]:
df_train['Title'] = df_train['Title'].map({'Master':0, 'Miss':1, 'Mr':2, 'Mrs':3, 'Others': 4})


In [None]:
df_test['Title'] = df_test['Title'].map({'Master':0, 'Miss':1, 'Mr':2, 'Mrs':3, 'Others': 4})

In [None]:
df_test['Title'].unique()

In [None]:
df_train.Embarked.unique()

In [None]:
df_train['Embarked'] = df_train['Embarked'].map({'C':0, 'Q':1, 'S':2})
df_test['Embarked'] = df_test['Embarked'].map({'C':0, 'Q':1, 'S':2})

In [None]:
df_train['Sex'].unique()
# df_train['Sex'].map({'female':0, 'male':1})
# df_test['Sex'].map({'female':0, 'male':1})

In [None]:
df_train.drop(['Name','Ticket','Cabin'], axis=1, inplace=True)

In [None]:
df_test.drop(['Name','Ticket','Cabin'], axis=1, inplace=True)

In [None]:
heatmap_data = df_train[['Survived','Pclass','Sex','Fare','Embarked','FamilySize','Title','Age_Cat']]

In [None]:
colormap = plt.cm.RdBu
plt.figure(figsize=(10, 8))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(heatmap_data.astype(float).corr(), linewidths=0.1, vmax=1.0,
           square=True, cmap=colormap, linecolor='white', annot=True, annot_kws={'size': 16}, fmt='.2f')

In [None]:
df_train = pd.get_dummies(df_train, columns=['Title'], prefix='Title')
df_test = pd.get_dummies(df_test, columns=['Title'], prefix='Title')

In [None]:
df_train = pd.get_dummies(df_train, columns=['Embarked'], prefix='Embarked')
df_test = pd.get_dummies(df_test, columns=['Embarked'], prefix='Embarked')

In [None]:
df_train.head()

In [None]:
df_test.head()

In [None]:
df_train.drop(['PassengerId','SibSp','Parch'], axis=1, inplace=True)
df_test.drop(['PassengerId','SibSp','Parch'], axis=1, inplace=True)

In [None]:
df_train.head()

In [None]:
df_test.head()

In [None]:
# Machine Learning Modeling

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split



In [None]:
X_train = df_train.drop(['Survived'], axis=1).values
target_label = df_train['Survived'].values
X_test = df_test.values

In [None]:
X_tr, X_vd, y_tr, y_vd = train_test_split(X_train, target_label, test_size=0.3, random_state=2018)


In [None]:
# Seed the forest so the fitted trees, the validation accuracy and the final
# predictions are reproducible across runs (2018 matches the seed passed to
# train_test_split).
model = RandomForestClassifier(random_state=2018)
model.fit(X_tr, y_tr)

In [None]:
prediction = model.predict(X_vd)

In [None]:
# Report validation accuracy. The message reads roughly:
# "Out of {N} people in total, survival was predicted with {accuracy}% accuracy".
print('총 {} 명 중 {:.2f}% 정확도로 생존 맞춤'.format(y_vd.shape[0], 100 * metrics.accuracy_score(prediction, y_vd)))

## Feature importance

- 학습된 모델은 feature importance를 가지게 됩니다. 우리는 이것을 확인하여 지금 만든 모델이 어떤 feature에 영향을 많이 받는지 확인할 수 있습니다.


In [None]:
model.feature_importances_

In [None]:
from pandas import Series

In [None]:
feature_importance = model.feature_importances_
Series_feat_imp = Series(feature_importance, index=df_train.drop(['Survived'], axis=1).columns)


In [None]:
plt.figure(figsize=(10, 10))
Series_feat_imp.sort_values(ascending=True).plot.barh()
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.show()

In [None]:
submission = pd.read_csv('../input/titanic/gender_submission.csv')
submission.head()

In [None]:
prediction = model.predict(X_test)

In [None]:
submission['Survived'] = prediction

In [None]:
submission.to_csv('./my_first_submission3.csv', index=False)