## 1.5　上位を目指すためのポイント

In [86]:
import numpy as np
import pandas as pd
import itertools
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [87]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print(train.shape)
train.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 1.5.1　タスクと評価指標

In [88]:
train_x =train.drop(['Survived'], axis=1)
train_y = train['Survived']

test_x = test.copy()

モデルや特徴量を作る上でまず優先すべきなのが、**データ理解**（EDA：Exploratory Data Analysis／探索的データ分析）である。<br>
**カラムの種類・型・値の分布・欠損値・外れ値・目的変数との相関や関係性**を掴むことで、やるべきことが分かってくる。

### 1.5.2　特徴量の作成

In [89]:
train_x = train_x.drop(['PassengerId'], axis=1)
test_x = test_x.drop(['PassengerId'], axis=1)

In [90]:
train_x = train_x.drop(['Name', 'Ticket', 'Cabin'], axis=1)
test_x = test_x.drop(['Name', 'Ticket', 'Cabin'], axis=1)

In [91]:
for c in ['Sex', 'Embarked']:
    le = LabelEncoder()
    le.fit(train_x[c].fillna('NA'))
    
    train_x[c] = le.transform(train_x[c].fillna('NA'))
    test_x[c] = le.transform(test_x[c].fillna('NA'))
    
print(train_x.isnull().sum())
train_x.head()

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,3
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.925,3
3,1,0,35.0,1,0,53.1,3
4,3,1,35.0,0,0,8.05,3


if文で場合分けして数値を割り振るよりも、簡単かつ早く数値変換ができている。

### 1.5.3　モデルの作成

コンペでは最初に、安定して高い精度が期待できるGBDTを試すことが多い。

In [92]:
model = XGBClassifier(n_estimators=20, random_state=71)
model.fit(train_x, train_y)

pred = model.predict_proba(test_x)[:, 1]
print(pred[:5])

pred_label = np.where(pred > 0.5, 1, 0)
print(pred_label[:5])

[0.04495935 0.08867919 0.17907852 0.1694976  0.4144685 ]
[0 0 0 0 0]


In [93]:
submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': pred_label})
submission.to_csv('submission_first.csv', index=False)

submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


### 1.5.4　モデルの評価

loglossは予測確率が外れているほど高いペナルティが与えられる評価指標である。

In [94]:
scores_accuracy = []
scores_logloss = []

In [95]:
kf = KFold(n_splits=4, shuffle=True, random_state=71)

for tr_idx, va_idx, in kf.split(train_x):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
    
    model = XGBClassifier(n_estimators=20, random_state=71)
    model.fit(tr_x, tr_y)
    
    va_pred = model.predict_proba(va_x)[:, 1]
    
    logloss = log_loss(va_y, va_pred)
    accuracy = accuracy_score(va_y, va_pred > 0.5)
    
    scores_logloss.append(logloss)
    scores_accuracy.append(accuracy)
    
logloss = np.mean(scores_logloss)
accuracy = np.mean(scores_accuracy)
    
print(f'logloss: {logloss:.4f}, accuracy: {accuracy:.4f}')

logloss: 0.4384, accuracy: 0.8182


accuracyだけであれば、cross_validation_scoreの方が簡単に算出できる。

In [96]:
score = cross_val_score(model, train_x, train_y, cv=kf)

print('score:', score)
print('score_mean: {:.4f}'.format(score.mean()))

score: [0.82511211 0.80717489 0.81165919 0.82882883]
score_mean: 0.8182


### 1.5.5　モデルのチューニング

In [97]:
param_space = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1.0, 2.0, 4.0]
}

param_combinations = itertools.product(param_space['max_depth'], param_space['min_child_weight'])

params = []
scores =[]

for max_depth, min_child_weight in param_combinations:
    score_folds = []
    
    kf = KFold(n_splits=4, shuffle=True, random_state=123456)
    for tr_idx, va_idx in kf.split(train_x):
        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
        
        model = XGBClassifier(n_estimators=20, random_state=71, max_depth=max_depth, min_child_weight=min_child_weight)
        model.fit(tr_x, tr_y)
        
        va_pred = model.predict_proba(va_x)[:, 1]
        logloss = log_loss(va_y, va_pred)
        score_folds.append(logloss)
        
    score_mean = np.mean(score_folds)
    scores.append(score_mean)
    params.append((max_depth, min_child_weight))
    
    # for max_depth, min_child_weight...に戻る

best_idx = np.argsort(scores)[0]
best_param = params[best_idx]

各パラメータの組み合わせとスコアの対応は、以下の通りである。

In [98]:
print('param:', params)
print('scores:', scores)

param: [(3, 1.0), (3, 2.0), (3, 4.0), (5, 1.0), (5, 2.0), (5, 4.0), (7, 1.0), (7, 2.0), (7, 4.0)]
scores: [0.42429454725813276, 0.42155422863628145, 0.42260688796491164, 0.43863006075171157, 0.44170160768985545, 0.429008473282794, 0.46079977973604264, 0.440218414870769, 0.4361315505093361]


ソートして、loglossが最も低いデータを調べ、元のインデックス番号を調べる。

In [99]:
print('sort scores:', np.sort(scores))
print('sort index:', np.argsort(scores))

sort scores: [0.42155423 0.42260689 0.42429455 0.42900847 0.43613155 0.43863006
 0.44021841 0.44170161 0.46079978]
sort index: [1 2 0 5 8 3 7 4 6]


これで、loglossが最も低いときのパラメータを調べることができた。

In [100]:
print('best_index:', np.argsort(scores)[0])
print('best_param:', params[best_idx])

best_index: 1
best_param: (3, 2.0)


補足

np.argsortは、ソートした値の元のインデックスを返す。

In [101]:
a = np.array([100, 50, 10, 500, 1])

sort = np.sort(a)
print('sort:', sort)

argsort = np.argsort(a)
print('argsort:', argsort)

sort: [  1  10  50 100 500]
argsort: [4 2 1 0 3]


### 1.5.6　アンサンブル

xgboostの方が精度が高いため、大きな重みをかけた加重平均を取る。

In [107]:
model_xgb = XGBClassifier(n_estimators=20, random_state=71)
model_xgb.fit(train_x, train_y)
pred_xgb = model_xgb.predict_proba(test_x)[:, 1]

print(pred_xgb[:5])

[0.05172093 0.13485578 0.14574982 0.12685756 0.38975474]


In [108]:
train_x = train_x.fillna(train_x.mean())
test_x = test_x.fillna(test_x.mean())

std = StandardScaler()
train_x2 = std.fit_transform(train_x)
test_x2 = std.fit_transform(test_x)

model_lr = LogisticRegression(solver='lbfgs', max_iter=300)
model_lr.fit(train_x2, train_y)
pred_lr = model_lr.predict_proba(test_x2)[:, 1]

print(pred_lr[:5])

[0.08461409 0.32853238 0.08192019 0.09809377 0.55020534]


In [111]:
pred = pred_xgb * 0.8 + pred_lr * 0.2
pred_label = np.where(pred > 0.5, 1, 0)

print(pred_label[:10])

[0 0 0 0 0 0 1 0 1 0]
