# In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression  # as LR
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# sklearn.grid_search was removed in scikit-learn 0.20; prefer the
# model_selection implementation and keep the legacy import as a fallback.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# Load the training and test data sets
train = pd.read_csv("~/train.csv")
test = pd.read_csv("~/test.csv")

# Feature engineering: the same two family-based features are derived for
# both data sets. 'Family_group' is created before 'Family' so the column
# order of the frames stays the same as before.
for frame in (train, test):
    # Number of accompanying relatives recorded for the passenger
    relatives = frame['SibSp'] + frame['Parch']

    # Dummy flag: 1 = travels with at least one relative (group traveller),
    # 0 = travels alone (solo traveller)
    frame['Family_group'] = np.where(relatives >= 1, 1, 0)

    # Family head-count including the passenger
    frame['Family'] = relatives + 1

# Solo travellers are assumed to board an escape boat for certain, so their
# probability is fixed to 1.
train_is_solo = train['Family_group'] == 0
test_is_solo = test['Family_group'] == 0

# Fallback boarding probability for group travellers whose name carries none
# of the handled titles: 2 divided by the mean family size of the group
# travellers. Every solo traveller has Family == 1, so subtracting the solo
# head-count from the total leaves the summed family sizes of the group
# travellers only.
escape_boarding_probability_average_train = 2 / (
    (train['Family'].sum() - train_is_solo.sum()) / (~train_is_solo).sum()
)
escape_boarding_probability_average_test = 2 / (
    (test['Family'].sum() - test_is_solo.sum()) / (~test_is_solo).sum()
)

# Initialise the probability column: 1 for solo travellers, NaN (to be filled
# in later) for everybody else. This is vectorised over the whole frame, so
# the first row (index 0) is handled too, no row count is hard-coded, and no
# chained assignment is needed.
train['escape_boarding_probability_train'] = np.where(train_is_solo, 1.0, np.nan)
test['escape_boarding_probability_test'] = np.where(test_is_solo, 1.0, np.nan)

# Boolean masks of the passengers carrying each title in their name.
# regex=False makes the '.' a literal full stop instead of the regex
# wildcard "any character".
train_mr_index = train['Name'].str.contains(' Mr. ', regex=False)
train_miss_index = train['Name'].str.contains(' Miss. ', regex=False)
train_mrs_index = train['Name'].str.contains(' Mrs. ', regex=False)
train_master_index = train['Name'].str.contains(' Master. ', regex=False)
test_mr_index = test['Name'].str.contains(' Mr. ', regex=False)
test_miss_index = test['Name'].str.contains(' Miss. ', regex=False)
test_mrs_index = test['Name'].str.contains(' Mrs. ', regex=False)
test_master_index = test['Name'].str.contains(' Master. ', regex=False)

# Title-specific boarding probability:
#   Mr / Mrs      -> 1 / Family
#   Miss / Master -> (Family - 1) / Family
# The second formula used to be written as `Family - 1 / Family`, which
# evaluates to Family - (1 / Family) and exceeds 1 for every family of two
# or more, so the parentheses are now explicit.
# NOTE(review): a solo traveller with one of these titles is overwritten
# here (a solo Miss/Master ends up with 0), which seems to contradict the
# "solo travellers get probability 1" rule - confirm the intended behaviour.
train_one_share = 1 / train['Family']
train_all_but_one = (train['Family'] - 1) / train['Family']
train.loc[train_mr_index, 'escape_boarding_probability_train'] = train_one_share[train_mr_index]
train.loc[train_miss_index, 'escape_boarding_probability_train'] = train_all_but_one[train_miss_index]
train.loc[train_mrs_index, 'escape_boarding_probability_train'] = train_one_share[train_mrs_index]
train.loc[train_master_index, 'escape_boarding_probability_train'] = train_all_but_one[train_master_index]
# Group travellers without a handled title receive the data-set average
train['escape_boarding_probability_train'] = train['escape_boarding_probability_train'].fillna(escape_boarding_probability_average_train)

test_one_share = 1 / test['Family']
test_all_but_one = (test['Family'] - 1) / test['Family']
test.loc[test_mr_index, 'escape_boarding_probability_test'] = test_one_share[test_mr_index]
test.loc[test_miss_index, 'escape_boarding_probability_test'] = test_all_but_one[test_miss_index]
test.loc[test_mrs_index, 'escape_boarding_probability_test'] = test_one_share[test_mrs_index]
test.loc[test_master_index, 'escape_boarding_probability_test'] = test_all_but_one[test_master_index]
test['escape_boarding_probability_test'] = test['escape_boarding_probability_test'].fillna(escape_boarding_probability_average_test)

# The masks and assignments above used to be repeated a second time
# verbatim; that pass recomputed identical values and has been removed.



# Missing-value imputation

# Split each data set into the passengers carrying a given title
train_names = train['Name']
test_names = test['Name']

train_mr = train[train_names.str.contains(' Mr. ')]
train_miss = train[train_names.str.contains(' Miss. ')]
train_mrs = train[train_names.str.contains(' Mrs. ')]
train_master = train[train_names.str.contains(' Master. ')]
test_mr = test[test_names.str.contains(' Mr. ')]
test_miss = test[test_names.str.contains(' Miss. ')]
test_mrs = test[test_names.str.contains(' Mrs. ')]
test_master = test[test_names.str.contains(' Master. ')]

# Mean age per title and median age of the whole data set; mean() and
# median() skip missing ages by default.
train_mr_num = train_mr['Age'].mean()
train_miss_num = train_miss['Age'].mean()
train_mrs_num = train_mrs['Age'].mean()
train_master_num = train_master['Age'].mean()
train_all_num = train['Age'].median()

test_mr_num = test_mr['Age'].mean()
test_miss_num = test_miss['Age'].mean()
test_mrs_num = test_mrs['Age'].mean()
test_master_num = test_master['Age'].mean()
test_all_num = test['Age'].median()

# Report the age statistics of every group
for label, value in (
    ("trainデータの敬称'Mr'の平均値 = ", train_mr_num),
    ("trainデータの敬称'Miss'の平均値 = ", train_miss_num),
    ("trainデータの敬称'Mrs'の平均値 = ", train_mrs_num),
    ("trainデータの敬称'Master'の平均値 = ", train_master_num),
):
    print(label + str(value))
# The extra '\n' argument leaves a blank line between the two reports
print("trainデータの中央値 = " + str(train_all_num), '\n')

for label, value in (
    ("testデータの敬称'Mr'の平均値 = ", test_mr_num),
    ("testデータの敬称'Miss'の平均値 = ", test_miss_num),
    ("testデータの敬称'Mrs'の平均値 = ", test_mrs_num),
    ("testデータの敬称'Master'の平均値 = ", test_master_num),
    ("testデータの中央値 = ", test_all_num),
):
    print(label + str(value))

#以下、新しいノートに記述---------------------------------------------------------------------------------

# Impute missing "Age" values per title. The constants are presumably the
# rounded per-title mean ages printed earlier - verify against that output.
# Each step selects the rows of one title with .loc and fills only their
# missing ages, so known ages are never touched. The 'Miss' rows used to be
# assigned from the 'Master' subset, which, through index alignment, wiped
# every known 'Miss' age.
for title_mask, title_age in (
    (train_mr_index, 32),
    (train_miss_index, 22),
    (train_mrs_index, 36),
    (train_master_index, 5),
):
    train.loc[title_mask, 'Age'] = train.loc[title_mask, 'Age'].fillna(title_age)
# Passengers with any other title fall back to a data-set wide value
train['Age'] = train['Age'].fillna(28)

for title_mask, title_age in (
    (test_mr_index, 32),
    (test_miss_index, 22),
    (test_mrs_index, 39),
    (test_master_index, 7),
):
    test.loc[title_mask, 'Age'] = test.loc[title_mask, 'Age'].fillna(title_age)
test['Age'] = test['Age'].fillna(27)

# Remaining missing values: 'Embarked' in train gets the port 'S',
# 'Fare' in test gets the mean fare of the test set.
train['Embarked'] = train['Embarked'].fillna('S')
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

# One-hot encode sex and port of embarkation
categorical_columns = ['Sex', 'Embarked']
dummy_train = pd.get_dummies(train[categorical_columns])
dummy_test = pd.get_dummies(test[categorical_columns])

# Swap the raw categorical columns for their dummy encoding
train_two = pd.concat([train.drop(categorical_columns, axis=1), dummy_train], axis=1)
test_two = pd.concat([test.drop(categorical_columns, axis=1), dummy_test], axis=1)


# Drop the columns that are not used as features
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Parch', 'SibSp']
train_three = train_two.drop(unused_columns, axis=1)
x_test = test_two.drop(unused_columns, axis=1)


# Target variable
y_train = train_three['Survived']


# Training feature matrix (everything except the target); both names refer
# to the same DataFrame object
x_train_df = train_three.drop(['Survived'], axis=1)
x_train = x_train_df


# Model building
#
# Stage 1: a shallow decision tree partitions the passengers into leaves.
# Stage 2: inside every leaf a logistic regression on the remaining
#          (non-dummy) features estimates the survival probability, which is
#          then appended to the feature matrices as one extra column.

# The engineered probability column is named differently in the train and
# test frames. Recent scikit-learn releases validate that the column names
# seen at predict time match those seen at fit time, so work on a copy of
# the test features that uses the training name. x_test itself is untouched.
x_test_aligned = x_test.rename(
    columns={'escape_boarding_probability_test': 'escape_boarding_probability_train'}
)

# Fit the decision tree
depth = 4
clf = tree.DecisionTreeClassifier(max_depth = depth)
clf.fit(x_train_df, y_train)

# apply() returns, for every row, the index of the leaf it ends up in
x_train_leaf_no = clf.apply(x_train_df)
x_test_leaf_no = clf.apply(x_test_aligned)

# Per-row survival probability, filled in leaf by leaf
x_train_proba = np.zeros(x_train.shape[0])
x_test_proba = np.zeros(x_test.shape[0])

# Distinct leaf indices, sorted so the processing order is deterministic
unique_leaf_no = sorted(set(x_train_leaf_no))

# Hyper-parameter grid for the per-leaf logistic regression
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

# Dummy / engineered columns that are excluded from the per-leaf regression
leaf_excluded_columns = ['Family_group', 'Pclass', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_S', 'Embarked_Q', 'escape_boarding_probability_train']

for i in unique_leaf_no :
    # Show which leaf is being processed
    print('leaf no:', i)

    train_in_leaf = x_train_leaf_no == i
    test_in_leaf = x_test_leaf_no == i

    # Rows of the train / test data that fall into this leaf
    leaf_data_train_x = x_train[train_in_leaf]
    leaf_data_train_y = y_train[train_in_leaf]
    leaf_data_test_x = x_test_aligned[test_in_leaf]

    # Keep only the columns used by the per-leaf regression
    leaf_data_train_x_drop = leaf_data_train_x.drop(leaf_excluded_columns, axis = 1)
    leaf_data_test_x = leaf_data_test_x.drop(leaf_excluded_columns, axis = 1)

    # The leaf contains both survivors and non-survivors
    if len(set(leaf_data_train_y)) > 1:

        # Tune C by grid search. Only ValueError is treated as "grid search
        # not possible for this leaf" (presumably too few samples of one
        # class for 5-fold CV); programming errors such as NameError or
        # TypeError are no longer swallowed.
        # NOTE(review): C is tuned for an unweighted model but the final
        # model uses class_weight="balanced" - confirm this is intended.
        try:
            grid_search = GridSearchCV(LogisticRegression(), param_grid, cv = 5, scoring = 'roc_auc')
            grid_search.fit(leaf_data_train_x_drop, leaf_data_train_y)
            leaf_model = LogisticRegression(C=grid_search.best_params_['C'],class_weight="balanced")
        except ValueError:
            leaf_model = LogisticRegression()

        # A separate name keeps the fitted decision tree available in clf
        leaf_model.fit(leaf_data_train_x_drop, leaf_data_train_y)

        # Column 1 of predict_proba holds the second class, i.e. the larger
        # of the two labels present in this leaf
        x_train_proba[train_in_leaf] = leaf_model.predict_proba(leaf_data_train_x_drop)[:, 1]

        # predict_proba cannot be called on an empty frame
        if len(leaf_data_test_x) > 0:
            x_test_proba[test_in_leaf] = leaf_model.predict_proba(leaf_data_test_x)[:, 1]

    # The leaf is pure: every training row has the same label
    else:
        # Use that single label as a scalar; assigning it through an
        # all-False mask is a no-op, so no emptiness check is needed.
        leaf_label = leaf_data_train_y.iloc[0]
        x_train_proba[train_in_leaf] = leaf_label
        x_test_proba[test_in_leaf] = leaf_label

# Signal that the loop has finished
print("for loop end")

# Append the per-leaf probability as a named column. A string name is used
# because a frame mixing string and integer column names is rejected by
# recent scikit-learn releases; train and test share identical column names.
train_data = x_train.copy()
train_data['leaf_survival_proba'] = x_train_proba
test_data = x_test_aligned.copy()
test_data['leaf_survival_proba'] = x_test_proba

# Hyper-parameter grid for the gradient boosting model (tree depth only)
param_grid = {'max_depth': [3,5,8,13,21,34]}

# 5-fold grid search scored by ROC AUC
grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(n_estimators=100),
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
)
grid_search.fit(train_data, y_train)

# Train a gradient boosting model with the selected depth and predict the
# test set
best_depth = grid_search.best_params_['max_depth']
model = GradientBoostingClassifier(n_estimators=100, max_depth=best_depth)
model.fit(train_data, y_train)
output = model.predict(test_data).astype(int)


# Write the predictions as CSV, one row per test passenger
leaf_data_test = test[["PassengerId"]].copy()
leaf_data_test["Survived"] = output

leaf_data_test.to_csv('training_camp06.csv', index = False) 