- ライブラリ

In [1]:
# lightGBMのModelを構築

import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, train_test_split

import optuna
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


- 読み込み

In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

X = train.drop(['gameId', 'blueWins'], axis=1)
y = train['blueWins']

test = test.drop('gameId', axis=1)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

- 

In [4]:
# modelのパラメーター
params = {
    'task' : 'train',
    'boosting_type' : 'gbdt',
    'objective' : 'binary',
    'seed' : 71,
    'verbose' : 0,
    'metric' : 'binary-logloss'
}

# スコア、モデル保存用の配列
scores = []
models = []

# 訓練データをK-Foldにより4分割
kf = KFold(n_splits=4, shuffle=True, random_state=71)

# 学習を実施
for tr_idx, va_idx in kf.split(X_train):
  # 学習データ、評価データに分割
  tr_x, va_x = X_train.iloc[tr_idx], X_train.iloc[va_idx]
  tr_y, va_y = y_train.iloc[tr_idx], y_train.iloc[va_idx]

  # lightGBMデータ構造に変換
  lgb_train = lgb.Dataset(tr_x, tr_y)
  lgb_eval = lgb.Dataset(va_x, va_y, reference=lgb_train)

  model_gbm = lgb.train(
      params,
      lgb_train,
      num_boost_round=500,
      valid_sets=lgb_eval
  )

  # スコアの確認
  pred_y = model_gbm.predict(va_x)
  pred_y_label = np.where(pred_y>0.5, 1, 0)
  score = accuracy_score(pred_y_label, va_y)

  # 結果を格納
  scores.append(score)
  models.append(model_gbm)

# 予測実行関数
def pred(models, X_test):
  # 予測結果サマリ
  pred_y_summary = []

  # model分ループ
  for i in range(len(models)):
    # 予測を実行
    pred_y = models[i].predict(X_test)
    # 結果を格納
    pred_y_summary.append(pred_y)

  # 各モデルの予測結果の平均値を作成
  pred_y_mean = np.mean(pred_y_summary, axis=0)
  return pred_y_mean

# 予測を実行（Mean）
pred_y = pred(models, test)
pred_y_label = np.where(pred_y>0.5, 1, 0)

In [5]:
# 早期停止コールバックを設定
callbacks = [lgb.early_stopping(50)]

# モデルのトレーニング
model_gbm_tuned = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_eval], num_boost_round=1000, callbacks=callbacks)

# スコアの確認
pred_y = model_gbm_tuned.predict(va_x)
pred_y_label = np.where(pred_y > 0.5, 1, 0)

ValueError: For early stopping, at least one dataset and eval metric is required for evaluation

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

names = ['LogisticRegression', 'NearestNeighbors', 'RandomForest', 'DecisionTree','AdaBoost', 'NaiveBayes']

classifiers = [
    LogisticRegression(random_state=123),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=123),
    DecisionTreeClassifier(random_state=123),
    AdaBoostClassifier(random_state=123),
    GaussianNB()]

for name, model in zip(names, classifiers):
  model.fit(tr_x, tr_y)
  pred_y = model.predict(va_x)
  score = accuracy_score(pred_y, va_y)
  print(name, ' Accuracy : ', score)

LogisticRegression  Accuracy :  0.688125
NearestNeighbors  Accuracy :  0.74625
RandomForest  Accuracy :  0.740625
DecisionTree  Accuracy :  0.708125
AdaBoost  Accuracy :  0.76375
NaiveBayes  Accuracy :  0.705


In [7]:
# lightGBMのModelを構築

from tqdm import tqdm
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# グリッドサーチの条件設定
grid = {RandomForestClassifier(random_state=123): {'n_estimators' : [i for i in range(1, 30)],
                                                      'criterion' : ['gini', 'entropy'],
                                                      'max_depth' : [i for i in range(1, 10)]}}

# ベストスコア
best_score = 0

# 予測
for model, param in tqdm(grid.items()):
  # Model構築
  clf = GridSearchCV(model, param)
  clf.fit(tr_x, tr_y)
  # スコアの確認
  pred_y = clf.predict(va_x)
  score = accuracy_score(pred_y, va_y)
  # 判定
  if best_score < score:
    best_score = score
    best_param = clf.best_params_
    best_model = model.__class__.__name__


100%|██████████| 1/1 [00:59<00:00, 59.44s/it]


In [8]:
print(best_param)

{'criterion': 'gini', 'max_depth': 9, 'n_estimators': 25}

{'criterion': 'gini', 'max_depth': 8, 'n_estimators': 19}


{'criterion': 'gini', 'max_depth': 9, 'n_estimators': 25}

In [None]:
def predict(X_train, y_train, X_test, mode):
  # 結果格納用の配列
  preds = []
  preds_test = []
  idxes = []
  # クロスバリデーションで予測を実行
  kf = KFold(n_splits=4, shuffle=True, random_state=71)
  for tr_idx, va_idx in kf.split(X_train):
    # 学習データ、評価データに分割
    tr_x, va_x = X_train.iloc[tr_idx], X_train.iloc[va_idx]
    tr_y, va_y = y_train.iloc[tr_idx], y_train.iloc[va_idx]
    # modelを構築
    if mode == 'LightGBM':
      params = {'bagging_fraction': 0.4600347572555584,
                'bagging_freq': 5,
                'boosting_type': 'gbdt',
                'feature_fraction': 0.7,
                'feature_pre_filter': False,
                'lambda_l1': 0.004418000666138604,
                'lambda_l2': 8.039538280454251e-06,
                'min_child_samples': 20,
                'num_leaves': 4,
                'objective': 'binary',
                'seed': 71,
                'task': 'train',
                'verbose': 0}
      lgb_train = lgb.Dataset(tr_x, tr_y)
      lgb_eval = lgb.Dataset(tr_x, tr_y, reference=lgb_train)
      model = lgb.train(params, lgb_train, num_boost_round=1000, early_stopping_rounds=50, valid_sets=lgb_eval)
    elif mode == 'RandomForest':
      model = RandomForestClassifier(random_state=123, n_estimators=9, criterion='gini', max_depth=25)
      model = model.fit(tr_x, tr_y)
    elif mode == 'SVM':
      model = svm.LinearSVC()
      model = model.fit(tr_x, tr_y)
    # 予測値を算出
    pred = model.predict(va_x)
    preds.append(pred)
    pred_test = model.predict(X_test)
    preds_test.append(pred_test)
    idxes.append(idx)

  # バリデーションデータに対する予測値を連結、その後元の順序に直す
  idxes = np.concatenate(idxes)
  preds = np.concatenate(preds, axis=0)
  pred_train = preds[np.argsort(idxes)]

  # テストデータに対する平均値を取得
  preds_test = np.mean(preds_test, axis=0)
  return pred_train, preds_test
