### ライブラリのインポート

In [1]:
from itertools import cycle
import re

import warnings
import datetime
from collections import defaultdict

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,auc,log_loss,roc_curve,roc_auc_score

from xfeat import TargetEncoder
import lightgbm as lgbm
from sklearn.metrics import roc_auc_score

In [2]:
# Notebook-wide display and plotting configuration.
# NOTE: this silences *all* warnings for the rest of the session, including
# deprecation notices emitted by the libraries used below.
warnings.filterwarnings("ignore")
# Use the fully qualified option key: the bare 'max_columns' pattern matches
# more than one option in recent pandas releases and raises OptionError.
pd.set_option('display.max_columns', 50)
plt.style.use('bmh')
# Colours of the active style, both as a list and as an endless iterator.
color_pal = plt.rcParams['axes.prop_cycle'].by_key()['color']
color_cycle = cycle(color_pal)


### 学習データのロード

In [3]:
# Input CSV files, given relative to the working directory.
train_path = 'train.csv'
test_path = 'test.csv'

# Read both splits with the same reader call.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Stack train on top of test (the original row labels are kept) and report
# the shape of each split as a quick sanity check.
df = pd.concat([train_df, test_df])
print(train_df.shape, test_df.shape)

# Parse the arrival date of the combined frame into datetime values.
df['arrival_date'] = pd.to_datetime(df['arrival_date'])


(108785, 27) (10099, 26)


### 前処理

In [4]:
# Keep the numeric and the string (categorical) column names in separate
# lists; `features` is the numeric columns followed by the categorical ones.

num_features = [
    "lead_time",
    "stays_in_weekend_nights",
    "stays_in_week_nights",
    "adults",
    "children",
    "babies",
    "is_repeated_guest",
    "previous_cancellations",
    "previous_bookings_not_canceled",
    "agent",
    "company",
    "required_car_parking_spaces",
    "total_of_special_requests",
    "adr",
]

cat_features = [
    "hotel",
    "meal",
    "market_segment",
    "distribution_channel",
    "reserved_room_type",
    "customer_type",
]

features = [*num_features, *cat_features]

In [5]:
# Separate the target from the explanatory variables.
train_y = train_df["is_canceled"]
train_X = train_df.drop(columns=["is_canceled", "arrival_date"]).loc[:, features]

# The test frame only needs the date column removed before selecting features.
test_X = test_df.drop(columns=["arrival_date"]).loc[:, features]

In [6]:
# Shuffled 5-fold splitter with a fixed seed.
fold = KFold(n_splits=5, shuffle=True, random_state=71)
# split() yields a generator; materialise it as a list so the same
# (train, valid) index pairs can be iterated more than once.
cv = [(idx_train, idx_valid) for idx_train, idx_valid in fold.split(train_X, train_y)]

In [7]:
def target_encoding_train(x_train, y_train, input_cols, fold):
    """Target-encode the training rows with xfeat's ``TargetEncoder``.

    Args:
        x_train: Feature frame containing every column in ``input_cols``.
        y_train: Named target Series; its ``name`` is used as the target column.
        input_cols: Names of the columns to encode.
        fold: Splitter handed to ``TargetEncoder`` (presumably used to compute
            the encodings out-of-fold -- confirm against the xfeat docs).

    Returns:
        A ``(features, target)`` pair: the transformed frame without the target
        and without the raw ``input_cols``, and the target column itself. The
        encoder is configured with ``output_suffix="_te"``.
    """
    # Take the target name from the Series instead of a hard-coded literal so
    # the function stays consistent with the ``target_col`` given to the encoder.
    target_col = y_train.name
    train = pd.concat([x_train, y_train], axis=1)
    encoder = TargetEncoder(
        input_cols=input_cols,
        target_col=target_col,
        fold=fold,
        output_suffix="_te",
    )
    train = encoder.fit_transform(train)
    # list(...) lets callers pass a tuple or other sequence of column names.
    encoded = train.drop([target_col] + list(input_cols), axis=1)
    return encoded, train[target_col]

def target_encoding_valid(x_train, y_train, x_valid, input_cols, fold):
    """Replace categorical columns of ``x_valid`` with training target means.

    For each column in ``input_cols`` the mean of ``y_train`` per category is
    computed on the training rows and mapped onto ``x_valid``. Categories that
    do not occur in ``x_train`` become NaN.

    Args:
        x_train: Training features containing every column in ``input_cols``.
        y_train: Training target, aligned with ``x_train`` by index.
        x_valid: Frame to encode; it is NOT modified.
        input_cols: Names of the columns to encode.
        fold: Unused; kept so the signature mirrors ``target_encoding_train``.

    Returns:
        A copy of ``x_valid`` whose ``input_cols`` hold the encoded values
        under their original column names.
    """
    # Work on a copy: writing into the caller's frame (often a slice of a
    # larger frame) mutates shared data and triggers SettingWithCopy issues.
    encoded = x_valid.copy()
    for col in input_cols:
        data_tmp = pd.DataFrame({col: x_train[col], 'target': y_train})
        target_mean = data_tmp.groupby(col)['target'].mean()
        # Assign the whole column so it takes the numeric dtype of the means
        # rather than keeping the original string/object dtype.
        encoded[col] = encoded[col].map(target_mean)
    return encoded

def target_encoding_cv(X, y,cat_features,fold, cv):
    """Build target-encoded train/valid arrays for every CV split.

    Args:
        X: Full training feature frame.
        y: Full training target Series.
        cat_features: Names of the categorical columns to encode.
        fold: Splitter forwarded to the two encoding helpers.
        cv: Iterable of ``(idx_train, idx_valid)`` positional index pairs.

    Returns:
        Four lists, one entry per split and in split order: encoded training
        features, encoded validation features, training targets and
        validation targets, each stored as ``.values`` arrays.
    """
    x_train_list, x_valid_list = [], []
    y_train_list, y_valid_list = [], []

    for idx_train, idx_valid in cv:
        # Slice this split's rows out of the full training data.
        x_tr_raw = X.iloc[idx_train]
        y_tr_raw = y.iloc[idx_train]
        x_va_raw = X.iloc[idx_valid]
        y_va = y.iloc[idx_valid]

        # The validation rows are encoded from the raw (still categorical)
        # training rows, then the training rows are encoded themselves.
        x_va = target_encoding_valid(x_tr_raw, y_tr_raw, x_va_raw, cat_features, fold)
        x_tr, y_tr = target_encoding_train(x_tr_raw, y_tr_raw, cat_features, fold)

        x_train_list.append(x_tr.values)
        x_valid_list.append(x_va.values)
        y_train_list.append(y_tr.values)
        y_valid_list.append(y_va.values)

    return x_train_list, x_valid_list, y_train_list, y_valid_list

In [8]:
# Pre-compute the encoded train/valid arrays for every CV split.
x_train_list, x_valid_list, y_train_list, y_valid_list = target_encoding_cv(
    X=train_X,
    y=train_y,
    cat_features=cat_features,
    fold=fold,
    cv=cv,
)

## 学習
5foldのクロスバリデーションを行う。

In [9]:
def fit_lgbm(x_train_list, x_valid_list, y_train_list, y_valid_list, y, cv, params: dict=None, verbose=100):
    """Train one LightGBM classifier per CV split and collect out-of-fold predictions.

    Args:
        x_train_list: Per-split training feature arrays.
        x_valid_list: Per-split validation feature arrays.
        y_train_list: Per-split training targets.
        y_valid_list: Per-split validation targets.
        y: Full target vector; it sizes the out-of-fold array and is the
            ground truth for the final score.
        cv: ``(idx_train, idx_valid)`` index pairs in the same order as the
            per-split lists; only ``idx_valid`` is used here.
        params: Keyword arguments for ``lgbm.LGBMClassifier``; ``None`` means
            an empty dict.
        verbose: Forwarded to ``fit`` as its ``verbose`` argument.

    Returns:
        ``(oof_pred, models)``: the positive-class probability of every row,
        predicted by the model that did not train on it, and the list of
        fitted classifiers in split order.
    """
    # Substitute an empty dict when no parameters were given.
    if params is None:
        params = {}

    models = []
    # ``np.float`` was removed in NumPy 1.24; the builtin ``float`` is the
    # same float64 dtype and works on every NumPy version.
    oof_pred = np.zeros_like(y, dtype=float)

    folds = zip(x_train_list, x_valid_list, y_train_list, y_valid_list, cv)
    for x_train, x_valid, y_train, y_valid, (_, idx_valid) in folds:
        clf = lgbm.LGBMClassifier(**params)
        # NOTE(review): ``early_stopping_rounds`` / ``verbose`` are fit()
        # arguments of LightGBM < 4.0; newer releases expect the
        # ``early_stopping`` / ``log_evaluation`` callbacks instead -- confirm
        # the installed version before upgrading.
        clf.fit(x_train, y_train,
                eval_set=[(x_valid, y_valid)],
                early_stopping_rounds=100,
                verbose=verbose)

        # Column 1 of predict_proba is the probability of the positive class.
        oof_pred[idx_valid] = clf.predict_proba(x_valid)[:, 1]
        models.append(clf)

    score = roc_auc_score(y, oof_pred)
    # The backslash is escaped explicitly ('\ ' is an invalid escape sequence);
    # the printed text is unchanged.
    print('FINISHED \\ whole score: {:.4f}'.format(score))
    return oof_pred, models


In [10]:
# Hyper-parameters passed to LGBMClassifier for every fold.
params = dict(
    objective='binary',
    learning_rate=0.05,
    reg_lambda=1.0,
    n_estimators=10000,
    metric='auc',
    colsample_bytree=0.7,
)

In [11]:
# Train one model per fold; `oof` holds the out-of-fold predictions.
oof, models = fit_lgbm(
    x_train_list=x_train_list,
    x_valid_list=x_valid_list,
    y_train_list=y_train_list,
    y_valid_list=y_valid_list,
    y=train_y,
    cv=cv,
    params=params,
)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.900391
[200]	valid_0's auc: 0.907805
[300]	valid_0's auc: 0.911917
[400]	valid_0's auc: 0.914379
[500]	valid_0's auc: 0.916587
[600]	valid_0's auc: 0.918104
[700]	valid_0's auc: 0.919066
[800]	valid_0's auc: 0.920051
[900]	valid_0's auc: 0.920917
[1000]	valid_0's auc: 0.921628
[1100]	valid_0's auc: 0.922309
[1200]	valid_0's auc: 0.922803
[1300]	valid_0's auc: 0.92317
[1400]	valid_0's auc: 0.923485
[1500]	valid_0's auc: 0.923813
[1600]	valid_0's auc: 0.924319
[1700]	valid_0's auc: 0.924782
[1800]	valid_0's auc: 0.924933
[1900]	valid_0's auc: 0.92521
[2000]	valid_0's auc: 0.925342
[2100]	valid_0's auc: 0.925619
[2200]	valid_0's auc: 0.925817
[2300]	valid_0's auc: 0.92593
[2400]	valid_0's auc: 0.926028
[2500]	valid_0's auc: 0.926178
[2600]	valid_0's auc: 0.926356
[2700]	valid_0's auc: 0.926543
[2800]	valid_0's auc: 0.926669
[2900]	valid_0's auc: 0.926738
[3000]	valid_0's auc: 0.926759
[3100]	valid_0's auc

### 提出用CSVの出力

In [12]:
# Encode the test set once per fold, using the target means of that fold's
# training rows, so each model receives features built from the same rows it
# was trained on. A fresh copy of test_X is encoded every time.
x_test_list = [
    target_encoding_valid(
        train_X.iloc[idx_train],
        train_y.iloc[idx_train],
        test_X.copy(),
        cat_features,
        fold,
    )
    for idx_train, _ in cv
]

# Positive-class probability from each fold's model, averaged across folds.
fold_preds = [
    model.predict_proba(x_test.values)[:, 1]
    for x_test, model in zip(x_test_list, models)
]
pred = np.mean(np.array(fold_preds), axis=0)

In [13]:
# Show the fold-averaged test predictions (long arrays are printed abbreviated).
print(pred)

[1.44596374e-01 3.91636574e-01 1.68665258e-05 ... 4.32188330e-01
 5.58882379e-01 4.44079328e-01]


In [14]:
# Attach the predictions to a copy of the test frame, then write only the
# id / preds columns as the submission file.
submission = test_df.assign(preds=pred)
submission.loc[:, ['id', 'preds']].to_csv('submission.csv', index=False)