# Kaggleの「Titanic: Mchine Learning from Disaster」をやってみる  
[Kaggleの「Titanic: Mchine Learning from Disaster」ページはこちら](https://www.kaggle.com/c/titanic "titanic")

In [1]:
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train_x = train.drop(['Survived'], axis=1)
train_y =train['Survived']

In [4]:
# テストデータは特徴量のみなので、そのままで良い
test_x = test.copy()

## 特徴量の作成

In [5]:
train_x = train_x.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)
test_x = test_x.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)

In [6]:
from sklearn.preprocessing import LabelEncoder

# それぞれのカテゴリ変数にlabel encodingを適用する
for c in ['Sex', 'Embarked']:
    le = LabelEncoder()
    le.fit(train_x[c].fillna('NA'))
    train_x[c] = le.transform(train_x[c].fillna('NA'))
    test_x[c] = le.transform(test_x[c].fillna('NA'))

## モデルの作成

In [7]:
from xgboost import XGBClassifier

# モデルの作成および学習データを与えての学習
model = XGBClassifier(n_estimators=20, random=71)
model.fit(train_x, train_y)

Parameters: { random } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=20, n_jobs=0, num_parallel_tree=1, random=71,
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [8]:
# 予測値を確率で出力
pred = model.predict_proba(test_x)[:, 1]

# 予測値を2値に変換
pred_label = np.where(pred > 0.5, 1, 0)

In [9]:
# 提出用ファイルの作成
submission = pd.DataFrame({'PassengerId':test['PassengerId'], 'Survived':pred_label})
submission.to_csv('submission_first.csv', index=False)

## モデルの評価

In [10]:
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import KFold

In [11]:
# 各foldのスコアを保存するリスト
scores_accuracy =[]
scores_logloss = []

In [12]:
# クロスバリデーション
# 学習データを４分割して１つをバリデーションデータとすることを繰り返す
kf = KFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_x):
    # 学習データを学習データとバリデーションデータに分ける
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
    
    # モデルの学習
    model = XGBClassifier(n_estimators=20, random=71)
    model.fit(tr_x, tr_y)
    
    # 予測値を確率で出力
    va_pred = model.predict_proba(va_x)[:, 1]
    
    # バリデーションでのスコアを計算
    logloss = log_loss(va_y, va_pred)
    accuracy = accuracy_score(va_y, va_pred > 0.5)
    
    # foldのスコアを保存する
    scores_logloss.append(logloss)
    scores_accuracy.append(accuracy)

# 各foldのスコアの平均を出力する
logloss = np.mean(scores_logloss)
accuracy = np.mean(scores_accuracy)
print(f'logloss: {logloss:.4f}, accuracy: {accuracy:.4f}')

Parameters: { random } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { random } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { random } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { random } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

## モデルのチューニング

In [13]:
import itertools

# チューニング候補とするパラメータを準備
param_space = {
    'max_depth':[3, 5, 7],
    'min_child_weight':[1.0, 2.0, 4.0]
}

# 探索するハイパーパラメータの組み合わせ
param_combinations = itertools.product(param_space['max_depth'], param_space["min_child_weight"])

In [14]:
# 各パラメータの組み合わせとスコアを保存するリスト
params = []
scores = []

# 各パラメータの組み合わせごとにクロスバリデーション
for max_depth, min_child_weight in param_combinations:
    score_folds=[]
    
    # クロスバリデーション
    # 学習データを４分割して１つをバリデーションデータとすることを繰り返す
    kf = KFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_x):
    # 学習データを学習データとバリデーションデータに分ける
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
    
    # モデルの学習
    model = XGBClassifier(n_estimators=20, random=71,
                          max_depth=max_depth, min_child_weight=min_child_weight)
    model.fit(tr_x, tr_y)
    
    # 予測値を確率で出力
    va_pred = model.predict_proba(va_x)[:, 1]
    
    # バリデーションでのスコアを計算
    logloss = log_loss(va_y, va_pred)
    
    # foldのスコアを保存する
    score_folds.append(logloss)
    
    # 各foldのスコアを平均する
    score_mean = np.mean(score_folds)
    
    # パラメータの組み合わせとスコアを保存
    params.append((max_depth, min_child_weight))
    scores.append(score_mean)
    
# 最もスコアが良いものをベストなパラメータとする
best_idx = np.argsort(scores)[0]
best_param = params[best_idx]
print(f'max_depth: {best_param[0]}, min_child_weight: {best_param[1]}')

# max_depth: 7, min_child_weight: 4.0 がベスト

Parameters: { random } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { random } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { random } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { random } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

## アンサンブル

In [15]:
from sklearn.linear_model import LogisticRegression

# xgboostモデル
model_xgb = XGBClassifier(n_estimators=20, random=71)
model_xgb.fit(train_x, train_y)
pred_xgb = model_xgb.predict_proba(test_x)[:, 1]

# ロジスティック回帰モデル
# xgboostとは異なる特徴量が必要なので別途train_x2, test_x2があることになっている
# model_lr.fit(train_x2, train_y)
# pred_lr = model_lr.predict_proba(test_x2)[:, 1]

# 予測値の加重平均をとる
# pred = pred_xgb * 0.8 + pred_lr * 0.2
# pred_label = np.label = np.where(pred > 0.5, 1, 0)

Parameters: { random } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


