In [1]:
import gc
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import feather
import datetime
from matplotlib_venn import venn2
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import LatentDirichletAllocation as LDA
pd.set_option('display.max_Columns', 100)

In [2]:
# Load the input tables (paths are relative to the notebook's working directory).
cartlog = pd.read_feather('../inputs/cartlog.f')
product_master = pd.read_feather('../inputs/product_master.f')
meta = pd.read_feather('../inputs/meta.f')
user_master = pd.read_feather('../inputs/user_master.f')
test = pd.read_csv('../inputs/test.csv')
display_action_id = pd.read_csv('../inputs/display_action_id.csv')

# JAN is used as a merge key against cartlog's value_1 column later on, so keep it as str.
product_master['JAN'] = product_master['JAN'].astype(str)

In [3]:
# Session ids that appear in the test file, and the cart-log rows belonging to them.
test_sessions = test["session_id"].unique()
print(len(test_sessions))
test_input_log = cartlog[cartlog["session_id"].isin(test_sessions)]

56486


In [4]:
# Category ids that are predicted (one binary target per category).
target_category = [
    38,  # ice cream: novelty
    110,  # snacks & candy: gum
    113,  # snacks & candy: cereal
    114,  # snacks & candy: snacks
    134,  # chocolate & biscuits/crackers: chocolate
    171,  # beer: RTD (ready-to-drink)
    172,  # beer: non-alcoholic
    173,  # beer: beer
    376,  # Japanese sweets: rice crackers
    435,  # large PET bottle: unsweetened tea (large PET)
    467,  # small PET bottle: coffee (small PET)
    537,  # water / sparkling water: large PET (sparkling water)
    539,  # water / sparkling water: small PET (sparkling water)
    629,  # canned drinks: coffee (can)
    768,  # noodles: cup noodles
]

In [5]:
# Restrict the training data to sessions with 2020-01-01 <= date < 2020-08-01 that
# contain a payment and have at least one log row at spend_time >= 600.
tmp_sessions = meta[(meta['date'] >= '2020-01-01')&(meta['date'] < '2020-08-01')]['session_id'].unique()
tmp_log = cartlog[cartlog["session_id"].isin(tmp_sessions)]
print('2020-01-01以降、2020-08-01以前: ', len(tmp_sessions))

# Sessions in which a payment occurred
payment_sessions = set(tmp_log[tmp_log['is_payment']==1]['session_id'].unique())
print('購買が発生: ', len(payment_sessions))
# Sessions with a log row at spend_time >= 600 (i.e. 10 minutes, spend_time being compared in seconds elsewhere)
over10min_sessions = set(tmp_log[tmp_log['spend_time']>=600]['session_id'].unique())
print('10分以上: ', len(over10min_sessions))
# Intersection of the two session sets
all_train_sessions = payment_sessions & over10min_sessions
print('積集合: ', len(all_train_sessions))

# Full log of every selected training session
all_train_log = tmp_log[tmp_log["session_id"].isin(all_train_sessions)]
print('全trainのログ: ', len(all_train_log))

2020-01-01以降、2020-08-01以前:  234103
購買が発生:  229959
10分以上:  174412
積集合:  172868
全trainのログ:  4548534


In [6]:
def agg_payment(cartlog) -> pd.DataFrame:
    """Sum the purchased item counts per session and product.

    Only rows whose ``kind_1`` is "商品" (product) are used. ``value_1`` holds the
    JAN code; it is matched against the module-level ``product_master`` to attach
    ``category_id`` (products missing from the master are dropped by the inner join).

    Returns a frame with columns ``session_id``, ``n_items`` and ``category_id``,
    one row per (session, product).
    """
    is_product = cartlog["kind_1"] == "商品"
    per_product = (
        cartlog.loc[is_product]
        .groupby(["session_id", "value_1"])["n_items"]
        .sum()
        .reset_index()
        .rename(columns={"value_1": "JAN"})
    )
    jan_to_category = product_master[["JAN", "category_id"]]
    with_category = pd.merge(per_product, jan_to_category, on="JAN", how="inner")
    return with_category.drop(columns=['JAN'])

In [7]:
class RetailDataset:
    """Split each training session's log at a cut-off time into inputs and targets.

    Rows with ``spend_time <= cut-off`` are the model inputs; purchases logged after
    the cut-off define the binary targets (one per id in ``target_category``).
    Reads the module-level ``all_train_log``, ``agg_payment`` and ``target_category``.

    Parameters
    ----------
    thres_sec : cut-off in seconds, used for sessions whose ``time_elapsed`` is null.
    meta : session meta table with ``session_id`` and ``time_elapsed`` columns
        (``time_elapsed`` is multiplied by 60 to be compared with ``spend_time``).
    """

    def __init__(self, thres_sec, meta):
        self.thres_sec = thres_sec
        self.meta = meta.copy()
        self.meta['time_elapsed_sec'] = self.meta['time_elapsed'] * 60
        self.meta.loc[self.meta['time_elapsed_sec'].isnull(), 'time_elapsed_sec'] = thres_sec

        # Keep only the log rows at or before each session's cut-off.
        merge_train = self._merge_cutoff()
        self.public_train_log = merge_train[merge_train['spend_time'] <= merge_train['time_elapsed_sec']]
        self.train_sessions = self.public_train_log["session_id"].unique()

    def _merge_cutoff(self) -> pd.DataFrame:
        """Attach each session's cut-off (``time_elapsed_sec``) to the training log."""
        return pd.merge(
            all_train_log,
            self.meta[["session_id", "time_elapsed_sec"]],
            on=["session_id"],
            how="inner",
        )

    def get_train_input_log(self) -> pd.DataFrame:
        """Return the log rows at or before the cut-off (the model inputs)."""
        return self.public_train_log

    def get_train_target(self) -> pd.DataFrame:
        """Return the binary targets of the training sessions.

        One row per session in ``train_sessions``; one column per id in
        ``target_category`` holding 1 if the category was bought after the cut-off
        and 0 otherwise.
        """
        train_target = pd.DataFrame(
            index=self.train_sessions,
        )
        train_target.index.name = "session_id"

        # Count purchases from the rows strictly after the cut-off.
        merge_train = self._merge_cutoff()
        after_elapsed_log = merge_train[merge_train['spend_time'] > merge_train['time_elapsed_sec']]

        train_item_num = agg_payment(after_elapsed_log)
        train_item_num = train_item_num[train_item_num['category_id'].isin(target_category)]
        counts = train_item_num.groupby(["session_id", "category_id"])["n_items"].sum().unstack().fillna(0).astype(int)
        train_target_pos = (counts > 0).astype(int)
        # A category nobody bought after the cut-off has no column after unstack();
        # reindex so the target always has one column per target category.
        train_target_pos = train_target_pos.reindex(columns=target_category, fill_value=0)

        return train_target.join(train_target_pos).fillna(0).reset_index()

In [8]:
def get_train_log(elapsed_min):
    """Build the training input log and targets for a cut-off given in minutes.

    Returns ``(input_log, targets)`` as produced by ``RetailDataset`` with a
    threshold of ``elapsed_min * 60`` seconds, and prints the target shape.
    """
    threshold_sec = elapsed_min*60
    retail = RetailDataset(threshold_sec, meta)
    input_log = retail.get_train_input_log()
    targets = retail.get_train_target()
    print('train_session', targets.shape)
    return input_log, targets

In [9]:
# Cut-offs in minutes of elapsed session time; one dataset / model set is built per value.
ELAPSED_MIN = [0, 3, 5, 10]
# ELAPSED_MIN = [5]

In [10]:
# Version tag embedded in the file names of the cached train logs and LDA topics.
LOG_VER = 4

In [11]:
def save_train_log(ver):
    """Build and cache the train log and targets for every cut-off in ELAPSED_MIN.

    Files are written as feather to ``../inputs/train{ver}_log_{min}.f`` and
    ``../inputs/train{ver}_y_{min}.f``. Target column names are converted to str
    before writing.
    """
    for minutes in ELAPSED_MIN:
        log_df, y_df = get_train_log(minutes)
        log_df = log_df.reset_index(drop=True)
        y_df = y_df.reset_index(drop=True)
        y_df.columns = list(map(str, y_df.columns))
        log_path = '../inputs/train{}_log_{}.f'.format(ver, minutes)
        y_path = '../inputs/train{}_y_{}.f'.format(ver, minutes)
        log_df.to_feather(log_path)
        y_df.to_feather(y_path)

In [12]:
def load_train_log(ver):
    """Load the cached train logs and targets written by ``save_train_log``.

    Returns two dicts keyed by cut-off minute (the values of ELAPSED_MIN):
    ``(logs, targets)``.
    """
    logs, targets = {}, {}
    for minutes in ELAPSED_MIN:
        logs[minutes] = pd.read_feather('../inputs/train{}_log_{}.f'.format(ver, minutes))
        targets[minutes] = pd.read_feather('../inputs/train{}_y_{}.f'.format(ver, minutes))
    return logs, targets

In [13]:
# save_train_log(LOG_VER)

#### ver.4: 2020-01-01以降、2020-08-01以前
- 0, train_session (170752, 16)
- 3, train_session (172654, 16)
- 5, train_session (172794, 16)
- 10, train_session (172835, 16)

#### ver.3: 2020-04-01以降、2020-08-01以前
- 0, train_session (101562, 16)
- 3, train_session (102348, 16)
- 5, train_session (102394, 16)
- 10, train_session (102410, 16)

#### ver.2: 2020-08-01以前
- 0, train_session (378594, 16)
- 3, train_session (389649, 16)
- 5, train_session (390621, 16)
- 10, train_session (391074, 16)

### 過去のログデータ

In [14]:
# Per-session, per-product purchase counts (with category_id) over the full training log.
payed_item = agg_payment(all_train_log)

In [15]:
def LDA_topic(df_input, topic, index, prefix, random_state=0):
    """Fit LDA on a count table and return the per-row topic weights.

    Parameters
    ----------
    df_input : frame holding the key column ``index`` plus non-negative feature columns.
    topic : number of LDA components.
    index : name of the key column; it is excluded from the fit and restored in the output.
    prefix : prefix added to the topic column names (``{prefix}0``, ``{prefix}1``, ...).
    random_state : seed passed to the LDA estimator so repeated runs give the same
        topics (the fit is stochastic; the original call was unseeded).

    Returns a frame with the ``index`` column followed by one column per topic.
    """
    features = df_input.set_index(index)
    lda = LDA(n_components=topic, random_state=random_state)
    topic_weights = lda.fit_transform(features)
    lda_out = pd.DataFrame(topic_weights, index=features.index).add_prefix(prefix)
    return lda_out.reset_index()

### ユーザ情報

In [16]:
# One row per session with the attributes of the user who owns it.
user_features = pd.merge(meta[["session_id", "user_id"]], user_master, on="user_id", how="left")
# Treat ages outside [10, 80) and gender codes above 1 as missing.
# np.nan is used instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
implausible_age = (user_features['age'] >= 80) | (user_features['age'] < 10)
user_features.loc[implausible_age, 'age'] = np.nan
user_features.loc[user_features['gender'] > 1, 'gender'] = np.nan

In [17]:
def get_user_item(payed_item):
    """Build per-user purchase-count features per category.

    ``payed_item`` is the output of ``agg_payment`` (``session_id``, ``n_items``,
    ``category_id``). Counts are summed per user and category, restricted to
    categories whose total over all users exceeds 5000, and target categories are
    excluded. Users of ``user_master`` with no purchase history get the per-category
    mean instead.

    Returns a frame with ``user_id`` and one float32 ``user_pay_{category_id}``
    column per retained category, one row per row of ``user_master``.
    """
    # Items bought in the training sessions, attributed to their user.
    user_payed_item = pd.merge(payed_item, meta[['session_id', 'user_id']], on='session_id', how='left')
    group_user_item = user_payed_item.groupby(['user_id', 'category_id'])[['n_items']].sum().reset_index()
    pivot_user_item = group_user_item.pivot_table(index='user_id', columns='category_id', values='n_items')

    # Keep only categories whose total purchase count exceeds 5000.
    sum_user_item = pivot_user_item.sum()
    user_item_index = sum_user_item[sum_user_item > 5000].index
    pivot_user_item = pivot_user_item[user_item_index].fillna(0)

    # Fill value for users absent from the training data. The mean is taken while
    # user_id is still the index, so the key column is never part of the reduction
    # (mean() over a frame that contains a string key raises on pandas >= 2.0, and a
    # numeric key would end up among the dropped columns).
    user_item_mean = pivot_user_item.mean()
    pivot_user_item = pivot_user_item.reset_index()

    # Attach to every known user.
    all_user_item = pd.merge(user_master[['user_id']], pivot_user_item, on='user_id', how='left')

    # Build the feature columns, skipping target categories.
    feature_cols = {}
    for col in user_item_mean.index:
        if col in target_category:
            continue
        filled = all_user_item[col].fillna(user_item_mean[col]).astype('float32')
        feature_cols['user_pay_{}'.format(col)] = filled.clip(lower=0)

    features = pd.DataFrame(feature_cols, index=all_user_item.index)
    return pd.concat([all_user_item[['user_id']], features], axis=1)

In [18]:
# Per-user purchase-count features; the printed shape is (users, 1 + feature columns).
all_user_item = get_user_item(payed_item)
print(all_user_item.shape)

(40350, 153)


In [19]:
def save_item_lda(ver, all_user_item):
    """Fit a 10-topic LDA on the per-user features and cache it as feather.

    The result is written to ``../inputs/user_lda_{ver}.f``.
    """
    out_path = '../inputs/user_lda_{}.f'.format(ver)
    LDA_topic(all_user_item, 10, 'user_id', 'LDA_').to_feather(out_path)

In [20]:
def load_item_lda(ver):
    """Load the cached per-user LDA topics written by ``save_item_lda``."""
    return pd.read_feather('../inputs/user_lda_{}.f'.format(ver))

In [21]:
# %%time
# save_item_lda(LOG_VER, all_user_item)

In [22]:
# Load the cached LDA topics and append them to the per-user purchase features.
user_lda = load_item_lda(LOG_VER)
all_user_item = pd.merge(all_user_item, user_lda, on='user_id', how='left')
print(all_user_item.shape)

(40350, 163)


In [23]:
# NOTE(review): this rebinding discards the merged frame built in the previous cell, so
# merge_features() only sees the LDA topic columns — confirm this is intentional.
all_user_item = user_lda

### メタ情報

In [24]:
def get_meta_features(meta):
    """Derive calendar and visit-history features from the session meta table.

    ``meta`` must provide ``session_id``, ``user_id``, ``date`` (datetime), ``hour``,
    ``time_elapsed`` and ``date_str``. Added columns:

    - ``year`` / ``month`` / ``day`` / ``dow``: parts of ``date``
    - ``week_time``: ``dow * 24 + hour``
    - ``userid``: label-encoded ``user_id``
    - ``timezone_count``: number of sessions sharing the same ``week_time``
    - ``date_diff``: days between this session and the same user's previous one
    - ``date_rank``: rank of ``date`` within the user

    ``user_id``, ``date``, ``time_elapsed`` and ``date_str`` are dropped from the result.
    """
    meta_features = meta.copy()
    meta_features['year'] = meta_features['date'].dt.year
    meta_features['month'] = meta_features['date'].dt.month
    meta_features['day'] = meta_features['date'].dt.day
    meta_features['dow'] = meta_features['date'].dt.dayofweek
#     meta_features['doy'] = meta_features['date'].dt.dayofyear
    meta_features['week_time'] = meta_features['dow'] * 24 + meta_features['hour']
    le = preprocessing.LabelEncoder()
    meta_features['userid'] = le.fit_transform(meta_features['user_id'])

    # Number of sessions per (day of week x hour) slot.
    df_tz = meta_features.groupby(['week_time']).size().rename('timezone_count').reset_index()
    meta_features = pd.merge(meta_features, df_tz, on='week_time', how='left')

    # Days since the same user's previous session.
    # NOTE(review): "previous" is defined by session_id order within a user; this assumes
    # session ids increase chronologically — confirm.
    visits = (
        meta_features.groupby(['session_id', 'user_id', 'date'])
        .first()
        .reset_index()
        .sort_values(['user_id', 'session_id'])
    )
    visits['date_diff'] = visits['date'].diff(1)
    visits['user_diff'] = visits['user_id'].shift(1)
    # The first session of each user has no predecessor.
    visits.loc[visits['user_diff'] != visits['user_id'], 'date_diff'] = pd.NaT
    visits['date_diff'] = visits['date_diff'].dt.days
    # Join on the session key rather than on the positional index: the index of
    # `visits` follows the groupby sort order, which only matches the rows of
    # `meta_features` when `meta` is already sorted by unique session_id.
    visits = visits.drop_duplicates('session_id')
    meta_features = pd.merge(meta_features, visits[['session_id', 'date_diff']], on='session_id', how='left')

    # Visit order per user.
    meta_features['date_rank'] = meta_features.groupby(['user_id'])['date'].rank(ascending=True)

    return meta_features.drop(columns=['user_id', 'date', 'time_elapsed', 'date_str'])

In [25]:
# Session-level calendar / visit-history features, keyed by session_id.
meta_features = get_meta_features(meta)

### ディスプレイアクション

In [26]:
# Map every display name / action name to a stable feature column name
# ('disp_cnt_<i>' / 'act_cnt_<i>', numbered in order of first appearance).
disp_name_dic = {
    disp: 'disp_cnt_{}'.format(i)
    for i, disp in enumerate(display_action_id['display_name'].unique())
}

act_name_dic = {
    action: 'act_cnt_{}'.format(i)
    for i, action in enumerate(display_action_id['action_name'].unique())
}

In [27]:
def get_display_name_feature(input_log):
    """Count log rows per session and display name.

    The log is joined with ``display_action_id`` to obtain ``display_name``.
    The result has ``session_id`` plus one ``disp_cnt_<i>`` column for every entry
    of ``disp_name_dic``; names a session never showed are 0.
    """
    with_names = pd.merge(input_log, display_action_id, on='display_action_id', how='left')
    counts = (
        with_names.groupby(['session_id', 'display_name'])
        .size()
        .reset_index()
        .rename(columns={0:'disp_name_count'})
    )
    wide = counts.pivot_table(index='session_id', columns='display_name', values='disp_name_count', aggfunc='sum')
    wide = wide.reset_index().fillna(0).rename(columns=disp_name_dic)

    # Start from an all-zero column per known display name so the output schema
    # does not depend on which names occur in this log.
    zero_columns = {name: 0 for name in disp_name_dic.values()}
    disp_out = wide[['session_id']].assign(**zero_columns)

    # Overwrite with the counts that were actually observed.
    for col in wide.columns:
        if col != 'session_id':
            disp_out[col] = wide[col]

    return disp_out

In [28]:
def get_action_name_feature(input_log):
    """Count log rows per session and action name.

    The log is joined with ``display_action_id`` to obtain ``action_name``.
    The result has ``session_id`` plus one ``act_cnt_<i>`` column for every entry
    of ``act_name_dic``; actions a session never performed are 0.
    """
    with_names = pd.merge(input_log, display_action_id, on='display_action_id', how='left')
    counts = (
        with_names.groupby(['session_id', 'action_name'])
        .size()
        .reset_index()
        .rename(columns={0:'act_name_count'})
    )
    wide = counts.pivot_table(index='session_id', columns='action_name', values='act_name_count', aggfunc='sum')
    wide = wide.reset_index().fillna(0).rename(columns=act_name_dic)

    # Start from an all-zero column per known action name so the output schema
    # does not depend on which actions occur in this log.
    zero_columns = {name: 0 for name in act_name_dic.values()}
    act_out = wide[['session_id']].assign(**zero_columns)

    # Overwrite with the counts that were actually observed.
    for col in wide.columns:
        if col != 'session_id':
            act_out[col] = wide[col]

    return act_out

### セッション単位の特徴量

In [29]:
# def get_coupon_info(input_log):
#     session_coupon = input_log[input_log["kind_1"] == "クーポン"][['session_id', 'value_1']].rename(columns={'value_1':'coupon'})
#     session_coupon = pd.merge(session_coupon, lda_coupon, on='coupon', how='left').drop(columns=['coupon'])
#     session_coupon = session_coupon.groupby(['session_id']).max().reset_index()
#     return session_coupon

In [30]:
# def get_coupon_info(input_log):
#     session_coupon = input_log[input_log["kind_1"] == "クーポン"][['session_id', 'name_1']].rename(columns = {'name_1':'coupon'})
#     session_coupon = pd.merge(session_coupon, df_coupon_cat, on='coupon', how='left').drop(columns=['coupon'])
    
#     for cat in df_coupon_cat['coup_cat'].unique():
#         new_col = 'coup_cat_{}'.format(cat)
#         session_coupon[new_col] = 0
#         session_coupon.loc[session_coupon['coup_cat'] == cat, new_col] = 1
    
#     session_coupon.drop(columns=['coup_cat'], inplace=True)
#     return session_coupon.groupby('session_id').sum().reset_index()

In [31]:
def get_pre_payment_item(input_log):
    """Count, per session, the items of each target category already present in the log.

    Returns a frame with ``session_id`` and one ``pre_target_{category_id}`` column per
    entry of ``target_category``; sessions without such items get 0.
    """
    session_unique = input_log['session_id'].unique()
    # Item count per (session, product); value_1 holds the JAN code of product rows.
    agg = input_log.loc[input_log["kind_1"] == "商品"].groupby(["session_id", "value_1"])["n_items"].sum().reset_index()
    agg = agg.rename(columns={"value_1": "JAN"})
    agg = pd.merge(agg, product_master[["JAN", "category_id"]], on="JAN", how="inner")
    # Keep target categories only and re-aggregate per (session, category).
    agg = agg[agg['category_id'].isin(target_category)]
    agg = agg.groupby(["session_id", "category_id"])["n_items"].sum().reset_index()
    
    sesi = np.zeros(len(target_category))
    cate = [ct for ct in target_category]
    
    # Dummy rows (session_id 0, n_items 0), one per target category, so that the pivot
    # below always has a column for every target category.
    # NOTE(review): this relies on no real session having id 0 — confirm.
    dummy = pd.DataFrame({'session_id':sesi, 'category_id':cate, 'n_items':sesi})
    agg = pd.concat([agg, dummy])
    
    agg = agg.pivot_table(index='session_id', columns='category_id', values='n_items').fillna(0)
    # Temporary 'x_<category>' names, copied into the final columns below.
    src_columns = ['x_{}'.format(c) for c in agg.columns]
    agg.columns = src_columns
    
    col = ['pre_target_{}'.format(c) for c in target_category]
    # One row per session of the input log; the dummy session is not in this index,
    # so the left join drops it.
    df_out = pd.DataFrame(index=session_unique, columns=col)
    df_out.index.name = "session_id"
    df_out = df_out.join(agg)
    for ct in target_category:
        src = 'x_{}'.format(ct)
        dst = 'pre_target_{}'.format(ct)
        df_out[dst] = df_out[src]
    
    return df_out.drop(columns=src_columns).fillna(0).reset_index()

In [32]:
def get_session_kind_group(input_log):
    """Count log rows per session for every ``kind_1`` value.

    Returns a frame with ``session_id``, ``group_count_item`` (rows of kind "商品")
    and one ``group_count_<suffix>`` column per kind listed below. A session with
    no row of a given kind has NaN in that column.
    """
    kind_to_suffix = {
        'クーポン': 'coupon',
        '会計': 'kaikei',
        'キー': 'key',
        'カテゴリ': 'categry',
        'バーコードスキャン': 'barcode',
        'UUID': 'uuid',
        '使用ポイント': 'usedpoint',
        '確認': 'confirm',
        'ブランドスイッチ': 'bland',
        'レシピ': 'recipe',
        'スマホスキャン': 'smartphone',
        '磁気スキャン': 'magnetic',
        'レコメンド': 'recommend',
        '倍率ポイント': 'point',
    }

    def _count_rows(kind, suffix):
        # Number of rows of one kind per session, as a named Series.
        rows = input_log[input_log['kind_1'] == kind]
        return rows.groupby(["session_id"]).size().rename('group_count_' + suffix)

    counts = _count_rows('商品', 'item')
    for kind, suffix in kind_to_suffix.items():
        counts = pd.concat([counts, _count_rows(kind, suffix)], axis=1)

    return counts.reset_index()

In [33]:
def get_session_item_info(input_log):
    """Summarise the product rows of each session.

    Only rows of kind "商品" whose JAN code (``value_1``) exists in
    ``product_master`` are used. Output columns: ``session_id``, ``total``
    (sum of n_items * unit_price), ``cart_item_cnt`` (sum of number_1),
    ``total_item_cnt`` (sum of n_items), ``item_nunique`` (distinct name_1),
    ``cat_nunique`` (distinct category_id) and four ratios derived from them.
    """
    is_product = input_log['kind_1'] == '商品'
    item_log = input_log[is_product].rename(columns={"value_1": "JAN"})
    item_log = item_log.merge(product_master[["JAN", "category_id"]], on="JAN", how="inner")
    item_log['total'] = item_log['n_items'] * item_log['unit_price']

    session_item = (
        item_log.groupby(['session_id'])
        .agg(
            total=('total', 'sum'),
            cart_item_cnt=('number_1', 'sum'),
            total_item_cnt=('n_items', 'sum'),
            item_nunique=('name_1', 'nunique'),
            cat_nunique=('category_id', 'nunique'),
        )
        .reset_index()
    )

    total = session_item['total']
    n_items = session_item['total_item_cnt']
    n_products = session_item['item_nunique']
    n_categories = session_item['cat_nunique']
    session_item['mean_price'] = total / n_items
    session_item['item_cnt_per_nuniq'] = n_items / n_products
    session_item['item_nuniq_per_cat_nuniq'] = n_products / n_categories
    session_item['mean_price_per_cat'] = total / n_categories
    return session_item

In [34]:
def get_session_info(input_log):
    """Basic per-session statistics of the log.

    Output columns: ``session_id``, ``n_actions`` (row count), ``spend_time``
    (mean spend_time), ``uniq_user`` (distinct user_id) and the products
    ``n_actions_user`` / ``spend_time_user`` of the first two with ``uniq_user``.
    """
    per_session = input_log.groupby(["session_id"])
    session_features = pd.DataFrame({
        "n_actions": per_session.size(),                 # number of log rows
        "spend_time": per_session["spend_time"].mean(),  # mean elapsed time
        "uniq_user": per_session['user_id'].nunique(),   # distinct users
    })

    n_users = session_features['uniq_user']
    session_features['n_actions_user'] = session_features['n_actions'] * n_users
    session_features['spend_time_user'] = session_features['spend_time'] * n_users

    return session_features.reset_index()

### セッション単位で集計

In [35]:
def get_session_features(input_log):
    """Combine every session-level feature table into one frame keyed by session_id.

    The individual tables are outer-joined, so a session present in any of them
    appears in the result.
    """
    feature_frames = (
        get_session_info(input_log),
        get_session_kind_group(input_log),
        get_display_name_feature(input_log),
        get_action_name_feature(input_log),
        get_pre_payment_item(input_log),
        # get_coupon_info(input_log),  (disabled)
        get_session_item_info(input_log),
    )

    merged = pd.DataFrame(columns=['session_id'])
    for frame in feature_frames:
        merged = merged.merge(frame, on='session_id', how='outer')

    return merged

### 特徴量を集約する

In [36]:
def merge_features(input_log, session):
    """Assemble the full feature matrix for the given sessions.

    Session features computed from ``input_log``, the module-level
    ``user_features`` and ``meta_features`` are left-joined onto ``session`` by
    ``session_id``; ``all_user_item`` is then joined by ``user_id``, which is
    dropped afterwards. The result has exactly one row per entry of ``session``.
    """
    out = pd.DataFrame({"session_id": session})
    for feat in (get_session_features(input_log), user_features, meta_features):
        out = out.merge(feat, on="session_id", how="left")

    # Per-user features
    out = out.merge(all_user_item, on='user_id', how='left').drop(columns='user_id')

    assert len(session) == len(out)
    return out

In [37]:
def get_train_all_features(elapsed_min, train_log_list, train_y_list):
    """Return ``(features, targets)`` for one cut-off.

    ``train_log_list`` and ``train_y_list`` are the dicts returned by
    ``load_train_log``, keyed by cut-off minute.
    """
    y_train = train_y_list[elapsed_min]
    features = merge_features(train_log_list[elapsed_min], y_train['session_id'])
    print('train_features', features.shape)
    return features, y_train

In [38]:
def get_test_all_feature(elapsed_min):
    """Build the feature matrix of the test sessions whose ``time_elapsed`` equals ``elapsed_min``.

    Returns one row per matching test session, with the same columns as
    ``merge_features`` produces for the training data.
    """
    is_selected = meta['session_id'].isin(test_sessions) & (meta['time_elapsed'] == elapsed_min)
    test_meta = meta[is_selected]

    # Restrict the log to the selected sessions before computing features. The
    # previous left merge against a session_id-only frame filtered nothing, so the
    # session features were computed for every test session at every cut-off and
    # then discarded by merge_features.
    in_selected = test_input_log['session_id'].isin(test_meta['session_id'])
    test_input_elapsed = test_input_log[in_selected]

    test_features = merge_features(test_input_elapsed, test_meta['session_id'])
    print('test_features', test_features.shape)
    return test_features

In [39]:
# Number of stratified cross-validation folds used by train_lgbm.
n_fold = 4

In [40]:
# LightGBM parameters shared by every per-category model:
# binary objective, evaluated by AUC, with a fixed seed.
lgbm_param = {
    'objective' : 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'seed' : 0,
    'learning_rate':  0.1,
#   'max_depth': 6,
    # Column / row subsampling; rows are re-sampled every 5 iterations.
    'feature_fraction': 0.6,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1
}

In [41]:
def train_lgbm(X, y, params=lgbm_param):
    """Train one LightGBM model per stratified CV fold for a binary target.

    Parameters
    ----------
    X : feature frame (rows aligned positionally with ``y``).
    y : binary target Series.
    params : LightGBM parameter dict.

    Returns
    -------
    oof_pred : out-of-fold predictions, one per row of ``X``.
    models : list of the ``n_fold`` trained boosters.
    score : ROC AUC of ``oof_pred`` against ``y``.
    """
    fold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=0)
    cv = fold.split(X, y)

    models = []
    # Builtin float instead of np.float: the alias was removed in NumPy 1.24.
    oof_pred = np.zeros_like(y, dtype=float)

    cat_feat = ['age', 'gender', 'dow', 'register_number']

    for i, (idx_train, idx_valid) in enumerate(cv):
        x_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        x_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        lgbm_train = lgbm.Dataset(x_train, y_train, categorical_feature = cat_feat)
        lgbm_eval = lgbm.Dataset(x_valid, y_valid, reference=lgbm_train, categorical_feature = cat_feat)

        # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments of
        # older LightGBM releases; newer releases expect callbacks instead — confirm
        # against the installed version before upgrading.
        lgbm_model = lgbm.train(params,
                                lgbm_train,
                                valid_sets=lgbm_eval,
                                categorical_feature = cat_feat,
                                num_boost_round=10000,
                                early_stopping_rounds=100,
                                verbose_eval=-1)
        y_pred = lgbm_model.predict(x_valid, num_iteration=lgbm_model.best_iteration)

        # idx_valid holds positional indices, matching the plain ndarray.
        oof_pred[idx_valid] = y_pred
        models.append(lgbm_model)

    score = roc_auc_score(y, oof_pred)
    # The backslash is escaped explicitly; the printed text is unchanged.
    print('--- FINISHED \\ whole score: {:.4f} ---'.format(score))
    return oof_pred, models, score

In [42]:
def predict(models, feature):
    """Average the predictions of all fold models.

    Each model predicts ``feature`` at its own ``best_iteration``; the
    element-wise mean over the models is returned.
    """
    fold_preds = [
        model.predict(feature, num_iteration=model.best_iteration)
        for model in models
    ]
    return np.mean(fold_preds, axis=0)

In [43]:
# Cached train logs / targets, as dicts keyed by cut-off minute.
train_log_list, train_y_list = load_train_log(LOG_VER)

In [44]:
# Run a garbage-collection pass before the training loop.
gc.collect()

80

In [45]:
%%time
# Train one model set per cut-off and target category, and collect the test
# predictions (df_pred_all), CV scores (df_score_all) and models (models_list_list).
df_pred_all = pd.DataFrame()
df_score_all = pd.DataFrame(index=ELAPSED_MIN)
models_list_list = []

for elapsed_min in ELAPSED_MIN:
    print(f'===== {elapsed_min} =====')
    train_features, y_train = get_train_all_features(elapsed_min, train_log_list, train_y_list)
    test_features = get_test_all_feature(elapsed_min)

    # Predictions are indexed by test session; the id itself is not a feature.
    df_pred = pd.DataFrame(index=test_features['session_id'])
    train_features = train_features.drop(columns=['session_id'])
    test_features = test_features.drop(columns=['session_id'])

    models_list = []
    for target in (c for c in y_train.columns if c != 'session_id'):
        print(f"---- id = {target} -----")
        oof, models, score = train_lgbm(train_features, y_train[target])
        models_list.append(models)

        pred = predict(models, test_features)
        df_pred[target] = pred
        df_score_all.loc[elapsed_min, target] = score

    models_list_list.append(models_list)
    df_pred_all = pd.concat([df_pred_all, df_pred])
    print(len(df_pred_all))

===== 0 =====
train_features (170752, 114)
test_features (14277, 114)
---- id = 38 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[775]	valid_0's auc: 0.757351
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[807]	valid_0's auc: 0.752881
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[808]	valid_0's auc: 0.757698
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[809]	valid_0's auc: 0.765337
--- FINISHED \ whole score: 0.7583 ---
---- id = 110 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[512]	valid_0's auc: 0.716088
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[550]	valid_0's auc: 0.731233
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[784]	valid_0's auc: 0.71178
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[819]	valid_0's auc: 0.739057
--- FINISHED \ whole score: 0.7184 ---
---- id = 113 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1014]	valid_0's auc: 0.73094
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1110]	valid_0's auc: 0.722286
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[544]	valid_0's auc: 0.726355
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[803]	valid_0's auc: 0.716833
--- FINISHED \ whole score: 0.7217 ---
---- id = 114 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1055]	valid_0's auc: 0.710342
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1142]	valid_0's auc: 0.714647
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1023]	valid_0's auc: 0.712239
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1069]	valid_0's auc: 0.711379
--- FINISHED \ whole score: 0.7121 ---
---- id = 134 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[829]	valid_0's auc: 0.693628
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[794]	valid_0's auc: 0.690768
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[978]	valid_0's auc: 0.698341
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[807]	valid_0's auc: 0.687659
--- FINISHED \ whole score: 0.6926 ---
---- id = 171 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1293]	valid_0's auc: 0.807308
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1313]	valid_0's auc: 0.812645
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1128]	valid_0's auc: 0.812416
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1475]	valid_0's auc: 0.813074
--- FINISHED \ whole score: 0.8112 ---
---- id = 172 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[855]	valid_0's auc: 0.813678
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[610]	valid_0's auc: 0.802329
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1423]	valid_0's auc: 0.827675
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[776]	valid_0's auc: 0.811979
--- FINISHED \ whole score: 0.8057 ---
---- id = 173 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1424]	valid_0's auc: 0.798707
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1187]	valid_0's auc: 0.804969
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1236]	valid_0's auc: 0.809919
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1216]	valid_0's auc: 0.802032
--- FINISHED \ whole score: 0.8037 ---
---- id = 376 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[807]	valid_0's auc: 0.685445
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[799]	valid_0's auc: 0.693467
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[799]	valid_0's auc: 0.689896
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1069]	valid_0's auc: 0.68721
--- FINISHED \ whole score: 0.6887 ---
---- id = 435 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1237]	valid_0's auc: 0.824117
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1298]	valid_0's auc: 0.822453
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[766]	valid_0's auc: 0.818645
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[766]	valid_0's auc: 0.819626
--- FINISHED \ whole score: 0.8197 ---
---- id = 467 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[551]	valid_0's auc: 0.810128
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[675]	valid_0's auc: 0.818326
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[805]	valid_0's auc: 0.822639
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[807]	valid_0's auc: 0.823681
--- FINISHED \ whole score: 0.8178 ---
---- id = 537 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[993]	valid_0's auc: 0.84781
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[829]	valid_0's auc: 0.845496
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1020]	valid_0's auc: 0.847131
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[996]	valid_0's auc: 0.84506
--- FINISHED \ whole score: 0.8459 ---
---- id = 539 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[886]	valid_0's auc: 0.784295
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[800]	valid_0's auc: 0.801385
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[988]	valid_0's auc: 0.795928
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[831]	valid_0's auc: 0.786185
--- FINISHED \ whole score: 0.7917 ---
---- id = 629 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[688]	valid_0's auc: 0.85426
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[257]	valid_0's auc: 0.836213
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[530]	valid_0's auc: 0.859681
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[418]	valid_0's auc: 0.844911
--- FINISHED \ whole score: 0.8390 ---
---- id = 768 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[784]	valid_0's auc: 0.684854
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[978]	valid_0's auc: 0.687024
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[536]	valid_0's auc: 0.680417
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1050]	valid_0's auc: 0.689203
--- FINISHED \ whole score: 0.6851 ---
14277
===== 3 =====
train_features (172654, 114)
test_features (11304, 114)
---- id = 38 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[488]	valid_0's auc: 0.746257
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[409]	valid_0's auc: 0.741393
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[691]	valid_0's auc: 0.742774
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[431]	valid_0's auc: 0.735746
--- FINISHED \ whole score: 0.7412 ---
---- id = 110 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[416]	valid_0's auc: 0.708436
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[373]	valid_0's auc: 0.707252
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[127]	valid_0's auc: 0.697264
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[369]	valid_0's auc: 0.679908
--- FINISHED \ whole score: 0.6888 ---
---- id = 113 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[651]	valid_0's auc: 0.702909
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[818]	valid_0's auc: 0.712401
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[955]	valid_0's auc: 0.710397
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[177]	valid_0's auc: 0.696092
--- FINISHED \ whole score: 0.7000 ---
---- id = 114 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[673]	valid_0's auc: 0.700456
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[428]	valid_0's auc: 0.701305
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[502]	valid_0's auc: 0.698708
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[969]	valid_0's auc: 0.696417
--- FINISHED \ whole score: 0.6989 ---
---- id = 134 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[473]	valid_0's auc: 0.680594
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[374]	valid_0's auc: 0.668973
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[351]	valid_0's auc: 0.682168
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[464]	valid_0's auc: 0.682131
--- FINISHED \ whole score: 0.6785 ---
---- id = 171 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[502]	valid_0's auc: 0.783129
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1228]	valid_0's auc: 0.792565
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1215]	valid_0's auc: 0.797627
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[965]	valid_0's auc: 0.791832
--- FINISHED \ whole score: 0.7901 ---
---- id = 172 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[451]	valid_0's auc: 0.776118
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1419]	valid_0's auc: 0.77922
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[602]	valid_0's auc: 0.787794
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[508]	valid_0's auc: 0.783058
--- FINISHED \ whole score: 0.7600 ---
---- id = 173 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[894]	valid_0's auc: 0.779256
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[918]	valid_0's auc: 0.776784
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1012]	valid_0's auc: 0.777045
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1379]	valid_0's auc: 0.787156
--- FINISHED \ whole score: 0.7797 ---
---- id = 376 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[469]	valid_0's auc: 0.6749
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[499]	valid_0's auc: 0.676245
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[431]	valid_0's auc: 0.671715
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[489]	valid_0's auc: 0.666959
--- FINISHED \ whole score: 0.6724 ---
---- id = 435 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[486]	valid_0's auc: 0.778947
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[830]	valid_0's auc: 0.792125
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[888]	valid_0's auc: 0.794512
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1336]	valid_0's auc: 0.803217
--- FINISHED \ whole score: 0.7890 ---
---- id = 467 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[366]	valid_0's auc: 0.789353
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[258]	valid_0's auc: 0.791308
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[404]	valid_0's auc: 0.781562
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[502]	valid_0's auc: 0.789008
--- FINISHED \ whole score: 0.7863 ---
---- id = 537 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1017]	valid_0's auc: 0.823737
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1401]	valid_0's auc: 0.825307
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[515]	valid_0's auc: 0.815348
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[811]	valid_0's auc: 0.827215
--- FINISHED \ whole score: 0.8144 ---
---- id = 539 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1459]	valid_0's auc: 0.766386
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[683]	valid_0's auc: 0.765139
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[684]	valid_0's auc: 0.763669
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[906]	valid_0's auc: 0.763019
--- FINISHED \ whole score: 0.7590 ---
---- id = 629 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[741]	valid_0's auc: 0.830122
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[490]	valid_0's auc: 0.820347
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[371]	valid_0's auc: 0.814644
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[311]	valid_0's auc: 0.826894
--- FINISHED \ whole score: 0.8109 ---
---- id = 768 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[489]	valid_0's auc: 0.658124
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[565]	valid_0's auc: 0.664555
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[488]	valid_0's auc: 0.658992
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[513]	valid_0's auc: 0.659206
--- FINISHED \ whole score: 0.6602 ---
25581
===== 5 =====
train_features (172794, 114)
test_features (14072, 114)
---- id = 38 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[607]	valid_0's auc: 0.740717
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[499]	valid_0's auc: 0.738729
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[918]	valid_0's auc: 0.745479
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[507]	valid_0's auc: 0.734681
--- FINISHED \ whole score: 0.7395 ---
---- id = 110 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[345]	valid_0's auc: 0.699462
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[412]	valid_0's auc: 0.691537
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[334]	valid_0's auc: 0.66965
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[651]	valid_0's auc: 0.681382
--- FINISHED \ whole score: 0.6767 ---
---- id = 113 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[314]	valid_0's auc: 0.696269
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[498]	valid_0's auc: 0.706646
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[448]	valid_0's auc: 0.692886
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[461]	valid_0's auc: 0.703399
--- FINISHED \ whole score: 0.6995 ---
---- id = 114 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[881]	valid_0's auc: 0.702064
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[794]	valid_0's auc: 0.693337
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[517]	valid_0's auc: 0.692154
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[487]	valid_0's auc: 0.689899
--- FINISHED \ whole score: 0.6943 ---
---- id = 134 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[439]	valid_0's auc: 0.679114
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[424]	valid_0's auc: 0.671963
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[487]	valid_0's auc: 0.670601
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[313]	valid_0's auc: 0.673617
--- FINISHED \ whole score: 0.6738 ---
---- id = 171 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1037]	valid_0's auc: 0.778747
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1008]	valid_0's auc: 0.769515
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1004]	valid_0's auc: 0.777015
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1056]	valid_0's auc: 0.778342
--- FINISHED \ whole score: 0.7759 ---
---- id = 172 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[538]	valid_0's auc: 0.752446
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[927]	valid_0's auc: 0.777448
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[691]	valid_0's auc: 0.766082
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[380]	valid_0's auc: 0.76221
--- FINISHED \ whole score: 0.7548 ---
---- id = 173 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1507]	valid_0's auc: 0.768944
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[971]	valid_0's auc: 0.765105
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[929]	valid_0's auc: 0.765993
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1083]	valid_0's auc: 0.774397
--- FINISHED \ whole score: 0.7679 ---
---- id = 376 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[647]	valid_0's auc: 0.672379
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[478]	valid_0's auc: 0.673689
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[447]	valid_0's auc: 0.663355
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[747]	valid_0's auc: 0.666758
--- FINISHED \ whole score: 0.6688 ---
---- id = 435 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1090]	valid_0's auc: 0.780955
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[888]	valid_0's auc: 0.770059
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[732]	valid_0's auc: 0.772041
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[683]	valid_0's auc: 0.773933
--- FINISHED \ whole score: 0.7726 ---
---- id = 467 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[445]	valid_0's auc: 0.766195
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[530]	valid_0's auc: 0.778798
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[146]	valid_0's auc: 0.770645
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[233]	valid_0's auc: 0.799748
--- FINISHED \ whole score: 0.7718 ---
---- id = 537 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[681]	valid_0's auc: 0.799272
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[673]	valid_0's auc: 0.817285
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[894]	valid_0's auc: 0.811124
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[749]	valid_0's auc: 0.813377
--- FINISHED \ whole score: 0.8093 ---
---- id = 539 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[765]	valid_0's auc: 0.754271
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[517]	valid_0's auc: 0.74171
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[507]	valid_0's auc: 0.751203
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[470]	valid_0's auc: 0.756458
--- FINISHED \ whole score: 0.7493 ---
---- id = 629 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[378]	valid_0's auc: 0.793155
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1364]	valid_0's auc: 0.813376
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[334]	valid_0's auc: 0.808927
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[467]	valid_0's auc: 0.809985
--- FINISHED \ whole score: 0.7584 ---
---- id = 768 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[386]	valid_0's auc: 0.642466
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[448]	valid_0's auc: 0.653811
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[467]	valid_0's auc: 0.647601
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[347]	valid_0's auc: 0.648975
--- FINISHED \ whole score: 0.6481 ---
39653
===== 10 =====
train_features (172835, 114)
test_features (16833, 114)
---- id = 38 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[640]	valid_0's auc: 0.727641
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[443]	valid_0's auc: 0.723801
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[797]	valid_0's auc: 0.741187
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[597]	valid_0's auc: 0.737585
--- FINISHED \ whole score: 0.7324 ---
---- id = 110 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[491]	valid_0's auc: 0.694661
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[147]	valid_0's auc: 0.676935
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[246]	valid_0's auc: 0.666697
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[454]	valid_0's auc: 0.702557
--- FINISHED \ whole score: 0.6652 ---
---- id = 113 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[500]	valid_0's auc: 0.70201
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[237]	valid_0's auc: 0.702152
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[165]	valid_0's auc: 0.714982
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[395]	valid_0's auc: 0.686755
--- FINISHED \ whole score: 0.6971 ---
---- id = 114 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[498]	valid_0's auc: 0.69482
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[558]	valid_0's auc: 0.70228
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[591]	valid_0's auc: 0.702511
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[353]	valid_0's auc: 0.69788
--- FINISHED \ whole score: 0.6993 ---
---- id = 134 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[244]	valid_0's auc: 0.694296
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[139]	valid_0's auc: 0.693529
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[384]	valid_0's auc: 0.684654
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[171]	valid_0's auc: 0.685256
--- FINISHED \ whole score: 0.6891 ---
---- id = 171 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[850]	valid_0's auc: 0.757459
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[776]	valid_0's auc: 0.763754
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[982]	valid_0's auc: 0.740586
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1180]	valid_0's auc: 0.749581
--- FINISHED \ whole score: 0.7515 ---
---- id = 172 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[863]	valid_0's auc: 0.740985
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[858]	valid_0's auc: 0.738019
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[532]	valid_0's auc: 0.758639
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[364]	valid_0's auc: 0.765628
--- FINISHED \ whole score: 0.7297 ---
---- id = 173 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[838]	valid_0's auc: 0.755284
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[998]	valid_0's auc: 0.755879
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[686]	valid_0's auc: 0.753264
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1192]	valid_0's auc: 0.761526
--- FINISHED \ whole score: 0.7556 ---
---- id = 376 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[116]	valid_0's auc: 0.676906
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[210]	valid_0's auc: 0.685688
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[244]	valid_0's auc: 0.683283
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[274]	valid_0's auc: 0.680185
--- FINISHED \ whole score: 0.6813 ---
---- id = 435 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[509]	valid_0's auc: 0.759604
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[933]	valid_0's auc: 0.749247
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[491]	valid_0's auc: 0.759859
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[198]	valid_0's auc: 0.752155
--- FINISHED \ whole score: 0.7435 ---
---- id = 467 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[487]	valid_0's auc: 0.73981
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[490]	valid_0's auc: 0.778038
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[474]	valid_0's auc: 0.764033
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[81]	valid_0's auc: 0.756341
--- FINISHED \ whole score: 0.7438 ---
---- id = 537 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1162]	valid_0's auc: 0.817628
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[630]	valid_0's auc: 0.795377
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[622]	valid_0's auc: 0.794635
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[205]	valid_0's auc: 0.772606
--- FINISHED \ whole score: 0.7758 ---
---- id = 539 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[981]	valid_0's auc: 0.74721
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[676]	valid_0's auc: 0.718257
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[680]	valid_0's auc: 0.740181
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[682]	valid_0's auc: 0.732244
--- FINISHED \ whole score: 0.7324 ---
---- id = 629 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[498]	valid_0's auc: 0.801152
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1182]	valid_0's auc: 0.806698
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[781]	valid_0's auc: 0.795808
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[658]	valid_0's auc: 0.740691
--- FINISHED \ whole score: 0.7496 ---
---- id = 768 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[235]	valid_0's auc: 0.645311
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[457]	valid_0's auc: 0.649064
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[238]	valid_0's auc: 0.658341
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[150]	valid_0's auc: 0.648326
--- FINISHED \ whole score: 0.6497 ---
56486
CPU times: user 8h 50min 11s, sys: 6h 31min 24s, total: 15h 21min 35s
Wall time: 40min 11s


In [46]:
# Preview the first rows of the prediction table (the rendered output shows it
# indexed by session_id, with one column per target category id).
df_pred_all.head()

Unnamed: 0_level_0,38,110,113,114,134,171,172,173,376,435,467,537,539,629,768
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
663721,0.156579,0.017921,0.032155,0.29308,0.050165,0.010795,0.000273,0.053865,0.120354,0.002479,0.011313,0.006976,0.019468,0.001122,0.042962
663761,0.117785,0.06525,0.359008,0.269701,0.205154,0.066375,0.087745,0.041251,0.249032,0.036863,0.012215,0.008394,0.15225,0.003157,0.492035
663763,0.03251,0.007905,0.014013,0.673834,0.370776,0.009787,0.004982,0.019615,0.152244,0.032556,0.009569,0.006731,0.01455,0.004095,0.142574
663775,0.035916,0.004031,0.035653,0.30025,0.250395,0.037708,0.002278,0.165981,0.085855,0.177086,0.019469,0.016731,0.012426,0.002854,0.232483
663778,0.070932,0.003032,0.076217,0.222711,0.19418,0.027976,0.006542,0.074718,0.019122,0.02541,0.006975,0.003198,0.012877,0.000119,0.035298


In [47]:
# Display the score table: in the rendered output the rows are labelled
# 0/3/5/10 and the columns are the target category ids.
df_score_all

Unnamed: 0,38,110,113,114,134,171,172,173,376,435,467,537,539,629,768
0,0.758323,0.718438,0.721713,0.712149,0.692603,0.811171,0.805652,0.803732,0.688719,0.819689,0.817808,0.845903,0.79167,0.838965,0.685139
3,0.741194,0.688787,0.699961,0.698943,0.678454,0.79012,0.760031,0.779673,0.672434,0.788952,0.78627,0.814423,0.758952,0.810935,0.660236
5,0.739463,0.676657,0.699509,0.694285,0.673781,0.775899,0.754816,0.767903,0.668837,0.772575,0.771836,0.809279,0.749253,0.75843,0.648077
10,0.732398,0.665206,0.69714,0.699309,0.689105,0.751507,0.729693,0.755556,0.681328,0.743537,0.743778,0.775849,0.732365,0.749627,0.649706


In [48]:
# Mean score across the target-category columns, one value per row of the score table.
cv = df_score_all.mean(axis='columns')
print(cv)
# Single summary figure: the average of the per-row means.
print('- cv =', cv.mean())

0     0.767445
3     0.741958
5     0.730707
10    0.719740
dtype: float64
- cv = 0.7399623877634017


In [49]:
# Sanity check: the prediction table must have exactly as many rows as `test`.
assert len(df_pred_all) == len(test)

In [50]:
# Attach the predictions to the test session ids (inner join on session_id),
# then confirm the merged row count still equals that of `test`.
submission = test[['session_id']].merge(
    df_pred_all.reset_index(),
    how='inner',
    on='session_id',
)
assert len(submission) == len(test)

In [51]:
# Write the submission file: the session_id column is dropped and the index is
# omitted, so only the remaining prediction columns are saved.
submission.drop(columns='session_id').to_csv('../outputs/submission.csv', index=False)

#### baseline_16: LDAのみ
- feat = 114
- Wall time: 40min 11s
- cv = 0.73996 (0.767/0.741/0.730/0.719)
- LB = 0.695

#### baseline_15: 2020-01-01以降, doyを除外, register_numberをcategoryに
- feat = 266
- Wall time: 53min 34s
- cv = 0.78354 (0.800/0.788/0.779/0.766)
- LB = 0.7333

#### baseline_14: 2020-01-01以降
- feat = 267
- Wall time: 36min 38s
- cv = 0.80381 (0.819/0.807/0.799/0.788)
- LB = 0.7473

#### baseline_13: 2020-04-01以降
- feat = 208
- Wall time: 25min 3s
- cv = 0.78581 (0.803/0.788/0.779/0.772)
- LB = 0.7304

#### baseline_12: 特徴量削減, lr=0.05
- feat = 345
- Wall time: 2h 18min 19s
- cv = 0.82506 (0.835/0.826/0.821/0.816)
- LB = 0.7675

#### baseline_11: クーポンカテゴリの種類減らす, ユニークユーザー数, 平均単価, ユニーク商品数 
- feat = 375
- Wall time: 2h 18min 19s
- cv = 0.81843 (0.831/0.820/0.814/0.806)
- LB = 0.7626

#### baseline_10: クーポンカテゴリの表示回数
- feat = 412
- Wall time: 2h 8min 53s
- cv = 0.821003 (0.831/0.823/0.818/0.8103)
- LB = 0.7657

#### baseline_9-1: 'feature_fraction': 0.6,'bagging_fraction': 0.6,'bagging_freq': 2,
- feat = 333
- Wall time: 2h 2min 14s
- cv = 0.81961 (0.829/0.822/0.816/0.809)
- LB = 0.7658

#### baseline_9: 来店間隔、回数、曜日x時間の来店者数
- feat = 333
- Wall time: 2h 9min 51s
- cv = 0.82118 (0.831/0.824/0.818/0.810)
- LB = 0.7665

#### baseline_8: クーポン発行とターゲットの購買実績
- feat = 344
- Wall time: 2h 5min 13s
- cv = 0.820398 (0.832/0.822/0.817/0.809)
- LB = 0.7484

#### baseline_7: userごとのカテゴリの過去の購買実績
- feat = 329
- Wall time: 1h 59min 16s
- cv = 0.82148 (0.832/0.824/0.818/0.810)
- LB = 0.7671

#### baseline_6: session内のターゲットの購買
- feat = 89
- Wall time: 37min 47s
- cv = 0.66015
- LB = 0.6247

#### baseline_5
- feat = 74
- Wall time: 39min 18s
- cv = 0.66859
- LB = 0.6230

In [52]:
def feat_imp(model):
    """Tabulate one model's feature importances, smallest first.

    Parameters
    ----------
    model : object
        Anything exposing ``feature_importance()`` and ``feature_name()``
        (presumably a trained LightGBM Booster -- confirm against the
        training cell).

    Returns
    -------
    pandas.DataFrame
        Columns ``name`` and ``imp``, ordered by ``imp`` ascending. The row
        labels still refer to the features' original positions.
    """
    scores = model.feature_importance()
    table = pd.DataFrame({'name': model.feature_name(), 'imp': scores})
    return table.sort_values('imp')

def feature_importance(models):
    fi = pd.DataFrame(columns=['name'])
    for i, model in enumerate(models):
        fi_tmp = feat_imp(model)
        colname = 'imp_{}'.format(i)
        fi_tmp.rename(columns={'imp': colname}, inplace=True)
        fi = pd.merge(fi, fi_tmp, on=['name'], how='outer')
    fi['sum'] = fi.sum(axis=1)
    return fi.sort_values(['sum'], ascending=False)

In [53]:
# Select the entry at index 2 of models_list_list for the importance tables below.
# NOTE(review): what index 2 stands for is not visible in this section — confirm,
# and consider a named constant instead of the bare number.
models = models_list_list[2]

In [54]:
# Top 10 features by summed importance across the models in models[0].
# NOTE(review): this cell is repeated verbatim for indices 0-14; a loop with
# display() would remove the copy-paste.
feature_importance(models[0]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
112,register_number,2271,1916,3122,1894,9203
111,userid,825,626,1193,639,3283
107,LDA_5,748,622,1148,671,3189
110,LDA_3,782,621,1111,672,3186
109,LDA_4,773,662,1116,628,3179
108,LDA_9,754,637,1112,663,3166
105,LDA_0,734,612,1115,657,3118
106,LDA_6,737,588,1162,604,3091
104,date_rank,697,603,1059,641,3000
103,LDA_7,674,593,1055,585,2907


In [55]:
# Top 10 features by summed importance across the models in models[1].
feature_importance(models[1]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
112,register_number,821,1018,824,1483,4146
103,userid,398,541,413,881,2233
109,LDA_5,416,512,416,749,2093
101,LDA_0,383,457,406,814,2060
107,LDA_3,414,459,418,762,2053
111,mean_price_per_cat,420,442,405,773,2040
110,mean_price,418,492,380,748,2038
104,total,403,500,376,743,2022
105,LDA_9,408,477,388,741,2014
102,date_rank,384,467,376,787,2014


In [56]:
# Top 10 features by summed importance across the models in models[2].
feature_importance(models[2]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
112,register_number,1065,1611,1395,1430,5501
111,userid,430,606,550,626,2212
110,LDA_9,408,594,513,577,2092
107,LDA_3,360,582,569,538,2049
105,date_rank,355,583,534,560,2032
109,LDA_4,390,535,540,528,1993
106,LDA_0,360,575,509,534,1978
104,LDA_5,354,554,516,538,1962
103,mean_price_per_cat,349,555,522,526,1952
108,LDA_6,374,580,468,496,1918


In [57]:
# Top 10 features by summed importance across the models in models[3].
feature_importance(models[3]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
112,register_number,3397,3197,2260,2161,11015
111,LDA_6,1111,1002,646,571,3330
110,userid,1089,969,593,578,3229
109,LDA_9,1068,923,619,592,3202
106,LDA_4,1015,923,608,601,3147
108,LDA_5,1047,875,623,581,3126
104,LDA_3,975,926,644,577,3122
107,LDA_0,1030,921,581,554,3086
105,date_rank,980,879,623,603,3085
103,LDA_7,952,915,621,570,3058


In [58]:
# Top 10 features by summed importance across the models in models[4].
feature_importance(models[4]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
112,register_number,1911,1826,2003,1515,7255
111,userid,535,508,571,352,1966
108,LDA_4,501,532,554,376,1963
105,LDA_3,480,492,593,381,1946
102,LDA_6,453,523,589,374,1939
110,date_rank,523,480,575,349,1927
106,LDA_9,484,486,583,353,1906
109,LDA_7,505,484,541,375,1905
103,LDA_5,471,481,571,315,1838
104,LDA_0,476,480,550,321,1827


In [59]:
# Top 10 features by summed importance across the models in models[5].
feature_importance(models[5]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
112,register_number,3610,3562,3517,3609,14298
109,userid,1385,1326,1394,1485,5590
111,LDA_6,1408,1373,1356,1451,5588
110,LDA_0,1385,1281,1302,1256,5224
108,LDA_4,1317,1253,1271,1306,5147
104,LDA_9,1246,1276,1205,1401,5128
107,LDA_3,1278,1253,1234,1321,5086
105,LDA_5,1258,1269,1262,1293,5082
106,LDA_7,1265,1224,1211,1287,4987
102,date_rank,1111,1152,1199,1207,4669


In [60]:
# Top 10 features by summed importance across the models in models[6].
feature_importance(models[6]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
112,register_number,1347,2037,1625,975,5984
111,userid,744,1211,905,555,3415
110,LDA_5,725,1123,964,503,3315
109,LDA_6,698,1225,881,494,3298
108,LDA_3,681,1169,844,508,3202
103,LDA_0,626,1127,870,487,3110
107,LDA_4,679,1087,856,469,3091
104,LDA_7,646,1147,813,472,3078
106,LDA_9,665,1124,807,456,3052
105,date_rank,647,1052,820,434,2953


In [61]:
# Top 10 features by summed importance across the models in models[7].
feature_importance(models[7]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
112,register_number,5032,3495,3312,3866,15705
110,userid,1981,1307,1251,1521,6060
111,LDA_6,1993,1299,1246,1376,5914
108,LDA_3,1884,1239,1202,1412,5737
109,LDA_9,1892,1268,1206,1346,5712
104,LDA_4,1801,1244,1230,1357,5632
107,LDA_0,1874,1183,1188,1384,5629
106,LDA_5,1840,1167,1223,1311,5541
105,LDA_7,1823,1184,1125,1305,5437
103,date_rank,1766,1131,1052,1233,5182


In [62]:
# Top 10 features by summed importance across the models in models[8].
feature_importance(models[8]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
112,register_number,2453,1789,1769,2774,8785
111,userid,836,647,552,937,2972
105,date_rank,734,594,566,879,2773
108,LDA_3,755,599,547,852,2753
109,LDA_4,755,523,570,884,2732
106,LDA_9,740,558,529,836,2663
102,LDA_5,707,559,525,870,2661
110,LDA_6,763,524,516,809,2612
107,LDA_0,742,526,475,841,2584
104,LDA_7,719,505,493,807,2524


In [63]:
# Top 10 features by summed importance across the models in models[9].
feature_importance(models[9]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
112,register_number,3256,2738,2331,2169,10494
111,userid,1449,1224,996,919,4588
110,LDA_9,1425,1067,974,847,4313
108,LDA_6,1388,1105,948,857,4298
109,LDA_4,1390,1074,924,877,4265
102,LDA_3,1208,1116,893,850,4067
106,LDA_5,1263,1046,888,846,4043
107,LDA_0,1287,1003,828,813,3931
104,date_rank,1250,1024,857,785,3916
105,LDA_8,1254,968,859,770,3851


In [64]:
# Top 10 features by summed importance across the models in models[10].
feature_importance(models[10]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
112,register_number,1239,1436,571,728,3974
109,userid,544,682,171,322,1719
111,LDA_5,562,639,190,316,1707
105,LDA_6,522,665,178,278,1643
107,LDA_3,528,623,197,280,1628
108,LDA_9,532,636,140,288,1596
106,LDA_4,522,607,173,291,1593
110,date_rank,553,611,158,250,1572
102,LDA_7,495,612,166,272,1545
103,LDA_0,498,590,182,270,1540


In [65]:
# Top 10 features by summed importance across the models in models[11].
feature_importance(models[11]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
112,register_number,1940,1966,2478,2146,8530
111,userid,995,959,1233,1064,4251
110,LDA_6,959,933,1230,1021,4143
109,LDA_9,889,914,1146,946,3895
108,LDA_0,881,881,1126,930,3818
106,LDA_3,872,857,1130,958,3817
107,date_rank,876,797,1155,961,3789
105,LDA_4,870,841,1060,895,3666
104,LDA_5,867,861,1042,878,3648
103,LDA_7,822,768,1030,922,3542


In [66]:
# Top 10 features by summed importance across the models in models[12].
feature_importance(models[12]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
112,register_number,2266,1601,1528,1449,6844
110,userid,996,737,687,657,3077
111,LDA_6,1026,626,659,627,2938
109,LDA_5,963,647,658,645,2913
107,LDA_3,923,673,658,629,2883
106,LDA_0,920,649,613,613,2795
105,date_rank,907,647,660,578,2792
108,LDA_9,941,640,623,560,2764
104,LDA_7,896,616,649,533,2694
103,LDA_4,889,606,608,543,2646


In [67]:
# Top 10 features by summed importance across the models in models[13].
feature_importance(models[13]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
112,register_number,874,1885,811,1011,4581
111,userid,520,1648,476,567,3211
98,date_rank,395,1773,399,617,3184
108,LDA_4,466,1688,445,584,3183
107,total,455,1758,387,560,3160
102,mean_price,430,1666,391,566,3053
97,mean_price_per_cat,388,1728,388,548,3052
99,LDA_6,399,1650,415,581,3045
110,LDA_5,501,1536,409,569,3015
109,LDA_7,476,1598,374,542,2990


In [68]:
# Top 10 features by summed importance across the models in models[14].
feature_importance(models[14]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
112,register_number,1526,1710,1745,1392,6373
109,LDA_3,457,524,537,440,1958
111,LDA_5,482,510,538,427,1957
110,date_rank,471,537,545,382,1935
105,userid,445,526,563,399,1933
107,LDA_9,449,525,537,395,1906
108,LDA_8,455,500,524,408,1887
106,LDA_6,449,509,546,366,1870
104,LDA_4,440,488,482,405,1815
100,LDA_0,379,499,530,384,1792
