In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import(
    KFold,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
    GridSearchCV,
    learning_curve
)
from sklearn.ensemble import(
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier,
    VotingClassifier
)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from sklearn import tree
from sklearn.metrics import classification_report, confusion_matrix

#import mglearn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

sns.set_style('darkgrid')

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the three Titanic CSV files from the Kaggle input directory.
# `train` carries the `Survived` label used throughout the analysis below;
# `gender` (gender_submission.csv) is not referenced again in the visible cells.
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')
gender = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')

学習データとテストデータの分布を調べるため、データをマージ  
* 学習データとテストデータを区別できるようにWhatIsDataカラムを追加  
* テストデータにSurvivedカラムを追加

In [None]:

# Stack train and test into one frame so their feature distributions can be compared.
# Each row is tagged with its origin in `WhatIsData` ('Train' / 'Test').
# `assign` returns new frames, so `train` and `test` themselves do not gain the
# marker column (aliasing them with `train_1 = train` and then adding columns
# would have modified the originals).
# Rows coming from `test` presumably hold NaN in `Survived` after the concat,
# since that column is only expected in the training file.
alldata = pd.concat(
    [
        train.assign(WhatIsData='Train'),
        test.assign(WhatIsData='Test'),
    ],
    axis=0,
    ignore_index=True,  # same effect as .reset_index(drop=True)
)

In [None]:
# Mean of the Survived column over all training rows (overall survival rate).
train['Survived'].mean()

PclassのSurvived平均（生存率）を算出

In [None]:
# Survival rate per passenger class: mean of Survived within each Pclass value.
train.groupby('Pclass')['Survived'].mean()

In [None]:
# Passenger counts per class, split by survival outcome.
# Columns are passed by name with `data=`: recent seaborn releases treat the
# first positional argument as `data`, so passing the Series positionally no
# longer selects the x variable.
sns.countplot(x='Pclass', hue='Survived', data=train)

*グラフからわかること*  
・クラスによる生存者数はおおむね横ばい  
・死亡者数はPclass=3で突出している

In [None]:
#pd.crosstab(alldata['Pclass'], alldata['WhatIsData'], normalize='columns')

**=Name (honorific)=**

In [None]:
# Preview the raw Name strings before extracting the title from them.
train['Name'].head()

乗客名から敬称(honorific)を抽出し、その数をカウント

In [None]:
# Extract the title from each name: take the part after the first ", " and keep
# what precedes the first ". " in it.
train['honorific'] = [
    full_name.split(', ')[1].split('. ')[0] for full_name in train['Name']
]
# Frequency of each extracted title.
train['honorific'].value_counts()

In [None]:
# alldata['honorific'] = alldata['Name'].map(lambda x: x.split(', ')[1].split('. ')[0])
#pd.crosstab(alldata['honorific'], alldata['WhatIsData'])

学習データのみに現れる敬称を除外する

In [None]:
# Titles listed as appearing only in the training data.
# NOTE(review): 'Mile' is probably a typo for 'Mlle'. As written it presumably
# matches no extracted title, and 'Mlle' is folded into 'Miss' in a later cell
# instead of being excluded here. Confirm which behaviour is intended.
only_train_honorific = ['Capt', 'Don', 'Jonkheer', 'Lady', 'Major', 'Mile', 'Mme', 'Sir', 'the Countess']

# Keep only passengers whose title is not in that list, renumbering rows from 0.
is_train_only_title = train['honorific'].isin(only_train_honorific)
train_omit1 = train.loc[~is_train_only_title].reset_index(drop=True)

# Survival mean and row count for each remaining title.
train_omit1.groupby('honorific')['Survived'].agg(['mean', 'count'])

mean:生存割合  
count:該当する乗客数（件数）

In [None]:
# Collapse the infrequent titles into 'Rare' and fold the 'Miss' variants together.
# The result is assigned back to the column: `frame['col'].replace(..., inplace=True)`
# works on an intermediate Series and is not guaranteed to update the frame
# (it does nothing under pandas copy-on-write).
train_omit1['honorific'] = train_omit1['honorific'].replace(
    {'Col': 'Rare', 'Dr': 'Rare', 'Rev': 'Rare', 'Mlle': 'Miss', 'Ms': 'Miss'}
)

# Survival mean and row count per grouped title.
train_omit1['Survived'].groupby(train_omit1['honorific']).agg(['mean', 'count'])

**=Fare=**

In [None]:
# Overlay 10-bin Fare histograms (no KDE, no rug) for each survival outcome.
for outcome, legend_name in ((1, 'Survived'), (0, 'Death')):
    fares = train.loc[train['Survived'] == outcome, 'Fare']
    sns.distplot(fares, kde=False, rug=False, bins=10, label=legend_name)
plt.legend()

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Report the skewness of the raw Fare column (motivates the log transform below).
print("Skewness of Fare:", train['Fare'].skew())

対数変換をして分布修正

In [None]:
# Same histograms as above, but on log(1 + Fare) to reduce the skew.
for outcome, legend_name in ((1, 'Survived'), (0, 'Dead')):
    log_fares = np.log1p(train.loc[train['Survived'] == outcome, 'Fare'])
    sns.distplot(log_fares, kde=False, rug=False, bins=10, label=legend_name)
plt.legend()

**=Age=**

In [None]:
# Survival rate for rows with a missing Age (True) versus a recorded Age (False).
train.groupby(train['Age'].isnull())['Survived'].mean()

欠損値を除いた分布を確認

In [None]:
# Work only with rows that have a recorded Age.
train_age_omit = train.loc[train['Age'].notna()]

# Overlay 10-bin Age histograms with KDE curves for each survival outcome.
for outcome, legend_name in ((1, 'Survived'), (0, 'Dead')):
    ages = train_age_omit.loc[train_age_omit['Survived'] == outcome, 'Age']
    sns.distplot(ages, kde=True, rug=False, bins=10, label=legend_name)
plt.legend()

・20代以降、生存死亡の分布は似通っている  
・0~20歳では生存分布が高い  
 ->子供の生存確率が高い

In [None]:
# Age distribution per passenger class on a 12x7 inch canvas.
fig, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(x='Pclass', y='Age', data=train, palette='winter', ax=ax)

**=FamilySize=**

In [None]:
# Family size = parents/children + siblings/spouses + the passenger themself.
train['FamilySize'] = train['Parch'] + train['SibSp'] + 1
alldata['FamilySize'] = alldata['Parch'] + alldata['SibSp'] + 1

# Passenger counts per family size, split by survival outcome.
# Columns are passed by name with `data=`: recent seaborn releases treat the
# first positional argument as `data`, so passing the Series positionally no
# longer selects the x variable.
sns.countplot(x='FamilySize', hue='Survived', data=train)

FamilySizeをビン分割（グルーピング）する

In [None]:
# Bucket FamilySize into four labels: 1 -> 'alone', 2-4 -> 'small',
# 5-7 -> 'medium', anything else -> 'large'.
family_size = train['FamilySize']
train['FamilySize_bin'] = np.select(
    [family_size == 1, family_size.between(2, 4), family_size.between(5, 7)],
    ['alone', 'small', 'medium'],
    default='large',
)

# Survival rate within each bucket.
train.groupby('FamilySize_bin')['Survived'].mean()

In [None]:
# Row count per manual family-size bucket. Only the last expression of a cell
# is rendered, so this value is computed but not displayed.
train['Survived'].groupby(train['FamilySize_bin']).count()

# The same bucketing expressed with pd.cut, for comparison with the manual bins.
# pd.cut intervals are right-closed by default, so the edges must start at 0
# for FamilySize == 1 to land in the first interval:
#   (0, 1] alone, (1, 4] small, (4, 7] medium, (7, inf) large.
# An open upper edge mirrors the manual rule that everything above 7 is 'large'.
fam_cat_list = [0, 1, 4, 7, np.inf]
fam_cat_name = ['alone', 'small', 'medium', 'large']
fam_cat = pd.cut(train['FamilySize'], bins=fam_cat_list, labels=fam_cat_name)

# Side-by-side view of the raw size and the category assigned by pd.cut.
pd.DataFrame({'FamilySize': train['FamilySize'], 'family_category': fam_cat})

**=Cabin=**

Cabinの頭文字は部屋がどの階層にあったかを示している  

In [None]:
# Tag each row with the first character of its Cabin value. Values go through
# str() first, so a missing cabin (NaN -> "nan") is tagged 'n'.
for frame in (train, alldata):
    frame['Cabin_ini'] = frame['Cabin'].map(lambda cabin: str(cabin)[0])

# Survival mean and row count per cabin initial.
train.groupby('Cabin_ini')['Survived'].agg(['mean', 'count'])

In [None]:
#pd.crosstab(alldata['Cabin_ini'], alldata['WhatIsData'])

**=Ticket=**

In [None]:
# Preview the raw Ticket strings.
train['Ticket'].head()

チケット記号の頭文字に着目する

In [None]:
# Group by the first character of the ticket identifier and report the
# survival mean and row count of each group.
ticket_prefix = train['Ticket'].map(lambda ticket: str(ticket)[0])
train.groupby(ticket_prefix)['Survived'].agg(['mean', 'count'])

学習データとテストデータの分布を確認

In [None]:
#pd.crosstab(alldata['Ticket'].map(lambda x: str(x)[0]), alldata['WhatIsData'])

In [None]:
# Preview the training frame, including the columns added during exploration.
train.head()

In [None]:
# Summary statistics of the training frame.
train.describe()

In [None]:
# Summary statistics of the test frame.
test.describe()

In [None]:
# Record the (rows, columns) shape of each frame and print test first, then train.
test_shape, train_shape = test.shape, train.shape
print(test_shape, train_shape, sep='\n')

**==データ処理==**  
=Pclass=

・Pclassはすべて埋まっている＆数値データのためそのまま使用

=Sex=  
・male=0, female=1として数値変換

In [None]:
# Encode Sex as integers in both frames: male -> 0, female -> 1.
# `.loc[mask, column] = value` writes into the frame itself; the chained form
# `frame[column][mask] = value` may assign to a temporary copy and leave the
# frame unchanged (and is ignored under pandas copy-on-write).
for frame in (train, test):
    for label, code in (("male", 0), ("female", 1)):
        frame.loc[frame["Sex"] == label, "Sex"] = code

In [None]:
# Inspect the first 10 training rows after encoding Sex.
train.head(10)

In [None]:
# Summary statistics of the test frame after encoding Sex.
test.describe()

In [None]:
def kesson_table(df):
    """Summarise missing values per column of ``df``.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame indexed by the columns of ``df`` with two columns:
        '欠損数' (number of null entries) and '%' (null entries as a
        percentage of ``len(df)``).
    """
    missing_count = df.isnull().sum()
    missing_percent = 100 * missing_count / len(df)
    # Concatenating two unnamed Series side by side yields columns 0 and 1,
    # which are then given their display names.
    summary = pd.concat([missing_count, missing_percent], axis=1)
    summary.columns = ['欠損数', '%']
    return summary
    
# Missing-value summary for the training frame.
kesson_table(train)

In [None]:
# Missing-value summary for the test frame.
kesson_table(test)

**=Age=**  
・train,testの欠損データに中央値を補完

In [None]:
# Fill missing ages in both frames with the median age of the training data.
# Each frame fills its OWN Age column: assigning a Series derived from
# train["Age"] to test["Age"] would overwrite every test age with the
# index-aligned training value instead of imputing the gaps.
age_median = train["Age"].median()
train["Age"] = train["Age"].fillna(age_median)
test["Age"] = test["Age"].fillna(age_median)

In [None]:
# Re-check missing values in the test frame after imputing Age.
kesson_table(test)

**=Honorific=**  
・'test'にも'honorific'カラムを追加  
・敬称を数値に置換

In [None]:
# Extract the title from each test-set name: take the part after the first ", "
# and keep what precedes the first ". " in it.
test['honorific'] = [
    full_name.split(', ')[1].split('. ')[0] for full_name in test['Name']
]
# Frequency of each extracted title.
test['honorific'].value_counts()

In [None]:
# Collapse the infrequent titles into 'Rare' and fold the 'Miss' variants together.
# The result is assigned back to the column: `frame['col'].replace(..., inplace=True)`
# works on an intermediate Series and is not guaranteed to update the frame
# (it does nothing under pandas copy-on-write).
test['honorific'] = test['honorific'].replace(
    {
        'Col': 'Rare', 'Dr': 'Rare', 'Rev': 'Rare', 'Dona': 'Rare',
        'Mlle': 'Miss', 'Ms': 'Miss',
    }
)

# Frequency of each grouped title.
test['honorific'].value_counts()

In [None]:
# Titles listed as appearing only in the training data.
# NOTE(review): 'Mile' is probably a typo for 'Mlle'. As written it presumably
# matches no extracted title, and 'Mlle' is folded into 'Miss' in the next cell
# instead of being excluded here. Confirm which behaviour is intended.
only_train_honorific = ['Capt', 'Don', 'Jonkheer', 'Lady', 'Major', 'Mile', 'Mme', 'Sir', 'the Countess']

# Remove those passengers from `train` itself and renumber rows from 0.
has_train_only_title = train['honorific'].isin(only_train_honorific)
train = train.loc[~has_train_only_title].reset_index(drop=True)

# Frequency of the titles that remain.
train['honorific'].value_counts()

In [None]:
# Collapse the infrequent titles into 'Rare' and fold the 'Miss' variants together.
# The result is assigned back to the column: `frame['col'].replace(..., inplace=True)`
# works on an intermediate Series and is not guaranteed to update the frame
# (it does nothing under pandas copy-on-write).
train['honorific'] = train['honorific'].replace(
    {'Col': 'Rare', 'Dr': 'Rare', 'Rev': 'Rare', 'Mlle': 'Miss', 'Ms': 'Miss'}
)

# Frequency of each grouped title.
train['honorific'].value_counts()

敬称を数値データに置換

In [None]:
# Encode the grouped titles as integers in both frames:
# Mr -> 0, Miss -> 1, Mrs -> 2, Master -> 3, Rare -> 4.
# `.loc[mask, column] = value` writes into the frame itself; the chained form
# `frame[column][mask] = value` may assign to a temporary copy and leave the
# frame unchanged (and is ignored under pandas copy-on-write).
honorific_codes = {"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Rare": 4}
for frame in (train, test):
    for title, code in honorific_codes.items():
        frame.loc[frame["honorific"] == title, "honorific"] = code

# Inspect the first 10 training rows after the encoding.
train.head(10)

**=Fare=**  
・"test"内の欠損値を中央値で補完

In [None]:
# Fill missing Fare values in the test frame with the test-set median fare.
# fillna locates the missing rows itself, so no row label is hard-coded, and
# assigning the result back avoids chained assignment through `test.Fare[...]`.
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

**=Family Size=**  
・'test'に'train'と同様の処理  
・FamilySize = 'Parch' + 'SibSp' + 1  
・FamilySizeを4つにグルーピング(alone, small, medium, large)


In [None]:
# Family size for the test frame: Parch + SibSp + 1 (the passenger themself),
# the same formula applied to `train` earlier.
test['FamilySize'] = test['Parch'] + test['SibSp'] + 1

In [None]:
# Bucket FamilySize into four labels: 1 -> 'alone', 2-4 -> 'small',
# 5-7 -> 'medium', anything else -> 'large'.
test_family_size = test['FamilySize']
test['FamilySize_bin'] = np.select(
    [
        test_family_size == 1,
        test_family_size.between(2, 4),
        test_family_size.between(5, 7),
    ],
    ['alone', 'small', 'medium'],
    default='large',
)

# Row count within each bucket.
test.groupby('FamilySize_bin')['FamilySize'].count()

In [None]:
# Convert FamilySize_bin to integers in both frames:
# alone -> 0, small -> 1, medium -> 2, large -> 3.
# `.loc[mask, column] = value` writes into the frame itself; the chained form
# `frame[column][mask] = value` may assign to a temporary copy and leave the
# frame unchanged (and is ignored under pandas copy-on-write).
family_bin_codes = {'alone': 0, 'small': 1, 'medium': 2, 'large': 3}
for frame in (train, test):
    for bin_label, code in family_bin_codes.items():
        frame.loc[frame["FamilySize_bin"] == bin_label, "FamilySize_bin"] = code

**=Embarked=**

In [None]:
# Frequency of each embarkation port code in the training frame.
train['Embarked'].value_counts()

In [None]:
# Fill missing Embarked values in the training frame with "S".
# NOTE(review): presumably "S" is the most frequent port per the counts above — confirm.
train["Embarked"] = train["Embarked"].fillna("S")

In [None]:
# Convert Embarked to integers in both frames: S -> 0, C -> 1, Q -> 2.
# `.loc[mask, column] = value` writes into the frame itself; the chained form
# `frame[column][mask] = value` may assign to a temporary copy and leave the
# frame unchanged (and is ignored under pandas copy-on-write).
embarked_codes = {"S": 0, "C": 1, "Q": 2}
for frame in (train, test):
    for port, code in embarked_codes.items():
        frame.loc[frame["Embarked"] == port, "Embarked"] = code

trainデータを”学習”、”検証”用に分割(train_test_split)

In [None]:

# Build the feature matrix and label vector from the prepared training frame.
feature_columns = ["Pclass", "Sex", "Age", "Fare", "FamilySize_bin", "Embarked"]
features = train[feature_columns].values
target = train["Survived"].values

# Hold out 20% of the labelled rows for validation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.20, random_state=1
)

# Decision tree with a depth cap and a minimum split size.
max_depth = 10
min_samples_split = 5
my_tree = DecisionTreeClassifier(
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    random_state=1,
).fit(X_train, y_train)

# Evaluate on the held-out rows.
check_prediction = my_tree.predict(X_test)
print(classification_report(y_test, check_prediction))
check_prediction.shape

テストデータで推論を行う

In [None]:
# Select the same six feature columns, in the same order, that the tree was fitted on.
test_features = test[["Pclass", "Sex", "Age", "Fare", "FamilySize_bin", "Embarked"]].values

# Predict Survived for every row of the test frame with the fitted tree.
test_prediction = my_tree.predict(test_features)

In [None]:
# Passenger ids as an integer array; they become the index of the output table.
PassengerId = test["PassengerId"].to_numpy().astype(int)

# One-column table of predictions keyed by passenger id.
my_solution = pd.DataFrame({"Survived": test_prediction}, index=PassengerId)

# Write the CSV, labelling the index column as PassengerId.
my_solution.to_csv("my_solution.csv", index_label=["PassengerId"])