In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the labelled training set and the unlabelled test set.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/titanic/train.csv', '/kaggle/input/titanic/test.csv')
)
print(f'shape of train: {train.shape}, shape of test: {test.shape}')
train.head()

In [None]:
# Hold out 30% of the labelled rows for validation, stratified on the target so
# that both parts keep the survival ratio of the full training set.
# - random_state pins the shuffle, so a fresh "Restart & Run All" reproduces the
#   same split (and therefore the same downstream statistics and scores).
# - .copy() detaches both parts from `train`; later cells add and overwrite
#   columns on them, which would otherwise act on slices of the parent frame.
train1, valid = (
    part.copy()
    for part in train_test_split(
        train, test_size=0.3, stratify=train['Survived'], random_state=42
    )
)
train1.head()
# To check the stratification, compare train['Survived'].mean() with
# train1['Survived'].mean() and valid['Survived'].mean().

### missing value
* train1: age, cabin, embarked
* valid: age, cabin
* test: age, fare, cabin

### data type
* 7 float/int features
* 5 object(string) features


In [None]:
# Print dtype / non-null summaries for the three frames, separated by a rule.
# train1: 623 rows; columns with nulls: age, cabin
for position, frame in enumerate((train1, valid, test)):
    if position:
        print('-'*30)
    frame.info()

### Numeric data
* 約38%的人存活
* 多數人(>75%)沒有和parents, children一起
* 票價分佈範圍很廣，有少數的人買很貴的票
* 有少數年紀很大的人

In [None]:
# Summary statistics of the training split (describe() with default arguments).
train1.describe()
# Fare contains some exceptionally high ticket prices

### Categorical data  
* 性別大部分是male(410/623)
* ticket和cabin有些重複
* embarked: S港口最多人

In [None]:
# Summary of the object-dtype (string) columns of the training split.
train1.describe(include = ['O'])
# Ticket and Cabin contain some repeated values
# Cabin has too many missing values, so it will not be used in the model

Pclass, Sex, SibSp, Parch都沒有missing value，可以看各個類別對應Survived的關係。  
只能對categorical, ordinal, discrete且沒有missing value的feature做。(why?)

In [None]:
# Mean survival rate per passenger class, highest rate first.
(
    train1
    .groupby('Pclass', as_index=False)['Survived']
    .mean()
    .sort_values('Survived', ascending=False)
)

In [None]:
# Bar plot of the mean of Survived for each passenger class.
sns.catplot(data=train1, x='Pclass', y='Survived', kind='bar')

In [None]:
# Bar plot of the mean of Survived for each sex.
sns.catplot(data=train1, x='Sex', y='Survived', kind='bar')

In [None]:
# Bar plot of the mean of Survived for each SibSp value.
sns.catplot(data=train1, x='SibSp', y='Survived', kind='bar')

baby存活率高，80歲的有存活

## Correlating numerical features

In [None]:
# Age histograms (20 bins), one panel per value of Survived.
g = sns.FacetGrid(data=train1, col='Survived')
g.map(plt.hist, 'Age', bins=20)


## Correlating numerical and ordinal features

In [None]:
# Age histograms (20 bins) on a grid: one row per Pclass, one column per Survived.
g = sns.FacetGrid(data=train1, row='Pclass', col='Survived')
g.map(plt.hist, 'Age', bins=20)

## Correlating categorical features

In [None]:
# Survival rate by Pclass, split by Sex, with one panel per port of embarkation.
# catplot is used instead of FacetGrid.map(sns.pointplot, ...): mapping a
# categorical plotting function without an explicit order / hue_order lets each
# facet infer its own category order, so classes and sexes can end up misaligned
# between panels. catplot fixes the category order across facets and also adds
# the hue legend itself.
g = sns.catplot(
    data=train1,
    x='Pclass',
    y='Survived',
    hue='Sex',
    col='Embarked',
    kind='point',
    height=3,  # same panel height as FacetGrid's default
)
g

## Correlating categorical and numerical features  
Fare越高，較有可能存活  
將Fare分成區間

In [None]:
# Mean Fare per Sex on a grid: one row per Embarked port, one column per Survived.
# catplot is used instead of FacetGrid.map(sns.barplot, ...): without an explicit
# order each facet derives its own category order, so the male/female bars are
# not guaranteed to sit in the same position in every panel.
g = sns.catplot(
    data=train1,
    x='Sex',
    y='Fare',
    row='Embarked',
    col='Survived',
    kind='bar',
    height=3,  # same panel height as FacetGrid's default
)
g

In [None]:
# Number of survivors (1) and non-survivors (0) in the training split
train1['Survived'].value_counts()

In [None]:
# Encode Sex as an integer: male -> 0, female -> 1.
sex_dict = {'male': 0, 'female': 1}
for df in [train1, valid, test]:
    # Series.map turns every value that is not a key of the mapping into NaN, so
    # mapping an already-encoded column (e.g. when this cell is run twice) would
    # wipe it out. Only map frames that still contain the raw string labels.
    if df['Sex'].isin(list(sex_dict)).any():
        df['Sex'] = df['Sex'].map(sex_dict)

## Fill missing value in feature Age
Pclass與Sex的排列組合中Age的分佈  
(why choose these features?)

In [None]:
# Age histograms on a grid: one row per Pclass, one column per Sex.
g = sns.FacetGrid(data=train1, col='Sex', row='Pclass')
g.map(plt.hist, 'Age')

In [None]:
# Number of rows in the training split without an Age value.
train1['Age'].isna().sum()

In [None]:
# Fill missing Age values with the median age of the passenger's (Pclass, Sex)
# group. The medians come from the training split only, so nothing from the
# validation or test rows leaks into the imputation.
overall_age_median = train1['Age'].median()  # computed before any filling
group_age_medians = train1.groupby(['Pclass', 'Sex'])['Age'].median()  # median() skips NaN

for df in [train1, valid, test]:
    for (pclass, sex), age_median in group_age_medians.items():
        missing_in_group = (
            df['Age'].isnull() & (df['Pclass'] == pclass) & (df['Sex'] == sex)
        )
        df.loc[missing_in_group, 'Age'] = age_median
    # Rows whose (Pclass, Sex) pair never occurs in train1 (or whose group has no
    # known ages at all) are still NaN here; fall back to the overall training
    # median so the integer cast applied to Age afterwards cannot fail.
    df['Age'] = df['Age'].fillna(overall_age_median)

In [None]:
# Cast Age to integer in all three frames (fractional ages are truncated).
for df in (train1, valid, test):
    df['Age'] = df['Age'].astype(int)

In [None]:
# Split Age into 5 equal-width bins; the result is kept as interval categories.
# Alternative that stores integer labels 1..5 instead of intervals:
# train1['Age_band'] = pd.cut(train1['Age'], bins = 5, labels = range(1, 6))
train1['Age_band'] = pd.cut(train1['Age'], bins = 5)

In [None]:
# Inspect the dtype of the binned column produced by pd.cut.
train1['Age_band'].dtype

In [None]:
# Mean survival rate per age band, ordered from the youngest band upwards.
(
    train1
    .groupby('Age_band', as_index=False)['Survived']
    .mean()
    .sort_values('Age_band')
)

In [None]:
# Ordinal age code 1..5 using fixed 16-year-wide bands:
#   1: Age <= 16, 2: (16, 32], 3: (32, 48], 4: (48, 64], 5: Age > 64
# The edges are fixed constants so that train, validation and test rows are all
# coded identically. The result is cast to int, like Fare_code, so the feature
# is not left as a float column.
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]
for df in [train1, valid, test]:
    df['Age_code'] = pd.cut(
        df['Age'], bins=age_edges, labels=[1, 2, 3, 4, 5]  # right-closed intervals
    ).astype(int)

In [None]:
# Bar plot of the mean of Survived for each age code (wide 2:1 panel).
sns.catplot(data=train1, x='Age_code', y='Survived', kind='bar', height=4, aspect=2)

In [None]:
# Re-check dtypes and non-null counts of the training split at this point.
train1.info()

In [None]:
# age_median = train1['Age'].median()

# for df in [train1, valid, test]:
#     df['Age'].fillna(age_median, inplace = True)
# sns.boxplot(x='Age', data=train1)

In [None]:
# Fill missing Embarked values with the most frequent port of the training split.
# mode() returns a Series (there may be ties), so take its first entry.
embarked_mode = train1['Embarked'].mode()[0]

for df in [train1, valid, test]:
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df['Embarked']: the in-place call acts on a temporary Series, which pandas
    # warns about and which leaves `df` unchanged when Copy-on-Write is enabled.
    df['Embarked'] = df['Embarked'].fillna(embarked_mode)

In [None]:
# Fraction of training rows without a Cabin value.
train1['Cabin'].isna().mean()
# Cabin has too many missing values, so it is dropped from the features

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Dense output is required so the encoded matrix can be wrapped in a DataFrame.
# Newer scikit-learn releases spell the option `sparse_output`; older ones only
# know `sparse`. Try the new keyword first and fall back to the old one.
try:
    onehotencoder = OneHotEncoder(handle_unknown = 'ignore', sparse_output = False)
except TypeError:
    onehotencoder = OneHotEncoder(handle_unknown = 'ignore', sparse = False)
# The categories are learned from the training split only.
onehotencoder.fit(train1['Embarked'].values.reshape(-1, 1))

def embarked_ohe_concat(df):
    """Return a copy of ``df`` with one-hot ``Embarked_*`` columns appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``Embarked`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` with its index reset to 0..n-1, followed by one column per
        category known to the fitted ``onehotencoder``. Categories unseen
        during fitting are encoded as all zeros (``handle_unknown='ignore'``).
    """
    encoded = onehotencoder.transform(df['Embarked'].values.reshape(-1,1))
    # get_feature_names_out replaces get_feature_names in newer scikit-learn;
    # use whichever the installed version provides.
    names_fn = getattr(onehotencoder, 'get_feature_names_out', None)
    if names_fn is None:
        names_fn = onehotencoder.get_feature_names
    encoded_columns = list(names_fn(['Embarked']))
    encoded = pd.DataFrame(encoded, columns = encoded_columns)
    # Drop dummy columns left by a previous run of this cell so that re-running
    # it does not produce duplicate Embarked_* columns.
    base = df.drop(columns = encoded_columns, errors = 'ignore').reset_index(drop = True)
    return pd.concat([base, encoded], axis = 1)

train1 = embarked_ohe_concat(train1)
valid = embarked_ohe_concat(valid)
test = embarked_ohe_concat(test)

In [None]:
# train1 = pd.get_dummies(train1, columns = ['Embarked'], drop_first = True)
# onehotencoder = OneHotEncoder(handle_unknown = 'ignore')
# train1_embarked_encoded = onehotencoder.fit_transform(train1['Embarked'].values.reshape(-1,1)).toarray()
# valid_embarked_encoded = onehotencoder.transform(valid['Embarked'].values.reshape(-1,1)).toarray()
# train1_embarked_encoded = pd.DataFrame(train1_embarked_encoded, columns = encoder.get_feature_names(['Embarked']))
# valid_embarked_encoded = pd.DataFrame(valid_embarked_encoded, columns = encoder.get_feature_names(['Embarked']))
# train1 = pd.concat([train1.reset_index(drop = True), train1_embarked_encoded], axis = 1)
# valid = pd.concat([valid.reset_index(drop = True), valid_embarked_encoded], axis = 1)

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
f, ax = plt.subplots(figsize=(9, 7))
# train1 still holds string columns (Name, Ticket, Cabin, Embarked) and the
# categorical Age_band; recent pandas versions raise when corr() meets them
# instead of silently skipping them, so restrict the frame to numeric dtypes.
numeric_corr = train1.select_dtypes(include='number').corr()
ax = sns.heatmap(numeric_corr, annot = True, cmap='Blues', ax = ax)

In [None]:
# Family size = siblings/spouses + parents/children + the passenger themself.
for df in (train1, valid, test):
    df['Family'] = df['SibSp'].add(df['Parch']).add(1)

In [None]:
# Distribution of Fare in the training split.
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot/displot once the minimum seaborn version is confirmed.
sns.distplot(train1[['Fare']])
# The fare distribution is wide and skewed: there are very low and very high fares

In [None]:
# Box plot of Fare in the training split.
sns.boxplot(x = 'Fare', data=train1)
# Next step: apply a log transformation to Fare

In [None]:
# Fill the missing Fare in the test set with the median fare of the training split.
# The filled column is assigned back instead of using fillna(inplace=True) on
# test['Fare']: the in-place call acts on a temporary Series, which pandas warns
# about and which leaves `test` unchanged when Copy-on-Write is enabled.
test['Fare'] = test['Fare'].fillna(train1['Fare'].median())

# log(1 + Fare) compresses the long right tail and is defined for a fare of 0.
for df in [train1, valid, test]:
    df['Fare_Log'] = np.log1p(df['Fare'])

sns.distplot(train1[['Fare_Log']])

In [None]:
# Quartile bins of Fare and the mean fare inside each bin.
train1['Fare_band'] = pd.qcut(train1['Fare'], q=4)
(
    train1
    .groupby('Fare_band', as_index=False)['Fare']
    .mean()
    .sort_values('Fare_band')
)

In [None]:
# Ordinal fare code 1..4 with fixed edges:
#   1: Fare <= 7.925, 2: (7.925, 14.454], 3: (14.454, 31.0], 4: Fare > 31.0
fare_edges = [-np.inf, 7.925, 14.454, 31.0, np.inf]
for df in (train1, valid, test):
    fare_bins = pd.cut(df['Fare'], bins=fare_edges, labels=[1, 2, 3, 4])
    df['Fare_code'] = fare_bins.astype(int)

In [None]:
# Peek at the first few fare codes of the training split.
train1['Fare_code'].head()

In [None]:
# Side-by-side vertical box plots of the numeric features.
numeric = ['Age', 'SibSp', 'Parch', 'Fare_Log']

# Set the style before any axes exist; styling only affects axes created
# afterwards, so setting it inside the loop left the first panel unstyled.
sns.set_style('whitegrid')
fig, axes = plt.subplots(1, len(numeric))
for ax, column in zip(axes, numeric):
    # Pass the data as y= explicitly: the meaning of the first positional
    # argument of sns.boxplot differs between seaborn versions.
    sns.boxplot(y=train1[column], color='green', ax=ax)
fig.tight_layout()

In [None]:
# Side-by-side distribution plots of the numeric features.
numeric = ['Age', 'SibSp', 'Fare_Log']

# Style is set once, before the axes are created, and each plot is drawn on an
# explicit axis instead of relying on pyplot's "current axes" state.
sns.set_style('whitegrid')
fig, axes = plt.subplots(1, len(numeric))
for ax, column in zip(axes, numeric):
    sns.distplot(train1[column], ax=ax)
fig.tight_layout()

In [None]:
# Passenger counts per category value, split by Survived, one panel per feature.
cat = ['Pclass', 'Sex','SibSp', 'Parch']

# sns.set is called once before the axes are created; calling it inside the loop
# restyled the notebook after the first panel had already been drawn.
sns.set(style = 'whitegrid')
fig, axes = plt.subplots(1, len(cat))
for ax, column in zip(axes, cat):
    sns.countplot(x = column, hue = 'Survived', data = train1, ax = ax)
fig.tight_layout()

In [None]:
# List the columns now available in the training split.
train1.columns

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Feature set: binned Age / Fare codes, one-hot Embarked, and family size.
# Alternative with the raw Age and log-transformed Fare instead of the codes:
# col = ['Pclass', 'Sex', 'Age', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Fare_Log', 'Family']
col = ['Pclass', 'Sex', 'Age_code', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Fare_code', 'Family']

# oob_score=True keeps an out-of-bag accuracy estimate on the fitted model.
# random_state fixes the bootstrap samples and feature sub-sampling, so the OOB
# score, the validation accuracy and the submission are the same on every run.
clf = RandomForestClassifier(n_estimators=100, oob_score = True, random_state = 42)
clf.fit(train1[col], train1['Survived'])

In [None]:
# Out-of-bag score of the fitted forest (available because oob_score=True).
clf.oob_score_

In [None]:
# Predict Survived for the hold-out validation split using the selected features.
predict = clf.predict(valid[col])

In [None]:
from sklearn.metrics import accuracy_score

# Share of validation passengers whose predicted label matches the true one.
accuracy_score(y_true=valid['Survived'], y_pred=predict)

In [None]:
# Check dtypes and non-null counts of the test set before predicting on it.
test.info()

In [None]:
# Summary statistics of the test set.
test.describe()

In [None]:
# Predict Survived for the test set with the fitted forest.
result = clf.predict(test[col])

In [None]:
# Build the submission table (PassengerId, Survived) and write it without the index.
output = test[['PassengerId']].assign(Survived=result)
output.to_csv('submission_2.csv', index=False)