In [1]:
import gc
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import feather
import datetime
from matplotlib_venn import venn2
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import LatentDirichletAllocation as LDA
pd.set_option('display.max_Columns', 100)

In [2]:
# Load the raw inputs: feather files hold the cart log and the master/meta
# tables, CSV files hold the test sessions and the display/action lookup.
cartlog = pd.read_feather('../inputs/cartlog.f')
product_master = pd.read_feather('../inputs/product_master.f')
meta = pd.read_feather('../inputs/meta.f')
user_master = pd.read_feather('../inputs/user_master.f')
test = pd.read_csv('../inputs/test.csv')
display_action_id = pd.read_csv('../inputs/display_action_id.csv')

# JAN is cast to str because it is later used as a merge key against the
# cart log's value_1 column (presumably stored as strings -- TODO confirm).
product_master['JAN'] = product_master['JAN'].astype(str)

In [3]:
# Unique session ids listed in the test file, and the rows of the cart log
# that belong to those sessions.
test_sessions = test["session_id"].unique()
print(len(test_sessions))
test_input_log = cartlog[cartlog["session_id"].isin(test_sessions)]

56486


In [4]:
# Category ids used as prediction targets (one binary label per category).
# Comments give the category name as "major group / sub group".
target_category = [
    38,  # Ice cream / novelty
    110,  # Snacks & candy / gum
    113,  # Snacks & candy / cereal
    114,  # Snacks & candy / snacks
    134,  # Chocolate & biscuits-crackers / chocolate
    171,  # Beer-type / RTD (ready to drink)
    172,  # Beer-type / non-alcoholic
    173,  # Beer-type / beer-type
    376,  # Japanese sweets / rice crackers
    435,  # Large PET bottle / unsweetened tea (large PET)
    467,  # Small PET bottle / coffee (small PET)
    537,  # Water & sparkling water / large PET (sparkling water)
    539,  # Water & sparkling water / small PET (sparkling water)
    629,  # Canned drinks / coffee (can)
    768,  # Noodles / cup noodles
]

In [5]:
# Restrict the training data to sessions dated before 2020-08-01 that have a
# log row at 600 seconds or later and in which a payment occurred.
tmp_sessions = meta[(meta['date'] < '2020-08-01')]['session_id'].unique()
tmp_log = cartlog[cartlog["session_id"].isin(tmp_sessions)]
print('2020-08-01以前: ', len(tmp_sessions))

# Sessions in which a payment occurred (at least one row with is_payment == 1)
payment_sessions = set(tmp_log[tmp_log['is_payment']==1]['session_id'].unique())
print('購買が発生: ', len(payment_sessions))
# Sessions with at least one log row at spend_time >= 600 (10 minutes if
# spend_time is in seconds, as the threshold suggests)
over10min_sessions = set(tmp_log[tmp_log['spend_time']>=600]['session_id'].unique())
print('10分以上: ', len(over10min_sessions))
# Intersection of the two session sets
all_train_sessions = payment_sessions & over10min_sessions
print('積集合: ', len(all_train_sessions))

# Full log of every training session selected above
all_train_log = tmp_log[tmp_log["session_id"].isin(all_train_sessions)]
print('全trainのログ: ', len(all_train_log))

2020-08-01以前:  663708
購買が発生:  618462
10分以上:  404825
積集合:  391383
全trainのログ:  10826062


In [6]:
def agg_payment(cartlog) -> pd.DataFrame:
    """Sum purchased quantities per session and product, labelled by category.

    Only rows whose ``kind_1`` is "商品" (product) are counted. ``value_1`` is
    treated as the JAN code and mapped to ``category_id`` through the
    module-level ``product_master``; products missing from the master are
    dropped by the inner merge.

    Returns:
        DataFrame with columns ``session_id``, ``n_items`` and ``category_id``
        (one row per session/product pair; the JAN column itself is removed).
    """
    product_rows = cartlog.loc[cartlog["kind_1"] == "商品"]
    per_product = (
        product_rows
        .groupby(["session_id", "value_1"])["n_items"]
        .sum()
        .reset_index()
        .rename(columns={"value_1": "JAN"})
    )
    jan_to_category = product_master[["JAN", "category_id"]]
    with_category = per_product.merge(jan_to_category, on="JAN", how="inner")
    return with_category.drop(columns=['JAN'])

In [7]:
class RetailDataset:
    """Split the module-level ``all_train_log`` at a per-session time cutoff.

    Rows at or before the cutoff are the model input; rows after the cutoff
    are used to build the binary purchase targets.

    Args:
        thres_sec: cutoff (seconds) applied to sessions whose ``time_elapsed``
            is missing in ``meta``.
        meta: session table with ``session_id`` and ``time_elapsed`` columns
            (``time_elapsed`` is multiplied by 60 to obtain seconds).
    """

    def __init__(self, thres_sec, meta):
        self.thres_sec = thres_sec

        session_meta = meta.copy()
        session_meta['time_elapsed_sec'] = session_meta['time_elapsed'] * 60
        missing_cutoff = session_meta['time_elapsed_sec'].isnull()
        session_meta.loc[missing_cutoff, 'time_elapsed_sec'] = thres_sec
        self.meta = session_meta

        # Keep only the log rows observed up to each session's cutoff.
        log = self._log_with_cutoff()
        observed = log['spend_time'] <= log['time_elapsed_sec']
        self.public_train_log = log[observed]
        self.train_sessions = self.public_train_log["session_id"].unique()

    def _log_with_cutoff(self) -> pd.DataFrame:
        """Attach each session's cutoff (``time_elapsed_sec``) to the train log."""
        cutoffs = self.meta[["session_id", "time_elapsed_sec"]]
        return pd.merge(all_train_log, cutoffs, on=["session_id"], how="inner")

    def get_train_input_log(self) -> pd.DataFrame:
        """Return the log rows at or before the cutoff."""
        return self.public_train_log

    def get_train_target(self) -> pd.DataFrame:
        """Build the binary targets for the training sessions.

        A target is 1 when the session purchased at least one item of the
        category after the cutoff, otherwise 0. Sessions without any
        post-cutoff purchase of a target category get 0 everywhere.

        Returns:
            DataFrame with a ``session_id`` column followed by one column per
            target category that appears in the post-cutoff purchases.
        """
        log = self._log_with_cutoff()
        after_elapsed_log = log[log['spend_time'] > log['time_elapsed_sec']]

        purchased = agg_payment(after_elapsed_log)
        purchased = purchased[purchased['category_id'].isin(target_category)]
        item_counts = (
            purchased
            .groupby(["session_id", "category_id"])["n_items"]
            .sum()
            .unstack()
            .fillna(0)
            .astype(int)
        )
        # Collapse purchase counts into a 0/1 flag.
        bought_flag = (item_counts > 0).astype(int)

        sessions = pd.DataFrame(index=pd.Index(self.train_sessions, name="session_id"))
        return sessions.join(bought_flag).fillna(0).reset_index()

In [8]:
def get_train_log(elapsed_min):
    """Build the training input log and targets for one cutoff.

    Args:
        elapsed_min: cutoff in minutes; converted to seconds for RetailDataset.

    Returns:
        Tuple ``(train_input_log, y_train)``.
    """
    dataset = RetailDataset(elapsed_min * 60, meta)
    y_train = dataset.get_train_target()
    print('train_session', y_train.shape)
    return dataset.get_train_input_log(), y_train

In [9]:
# Elapsed-time cutoffs in minutes; one train set and one group of models is
# built per value.
ELAPSED_MIN = [0, 3, 5, 10]
# ELAPSED_MIN = [5]

In [10]:
# Version tag embedded in the file names of the cached train logs / LDA output.
LOG_VER = 2

In [11]:
def save_train_log(ver):
    """Build and cache the train log and targets for every cutoff in ELAPSED_MIN.

    Files are written to ``../inputs`` in feather format, with ``ver`` and the
    cutoff in the file name. Target column names are converted to strings
    before writing.
    """
    log_path = '../inputs/train{}_log_{}.f'
    y_path = '../inputs/train{}_y_{}.f'
    for elap_min in ELAPSED_MIN:
        train_log, train_y = get_train_log(elap_min)
        train_y = train_y.reset_index(drop=True)
        train_y.columns = list(map(str, train_y.columns))
        train_log.reset_index(drop=True).to_feather(log_path.format(ver, elap_min))
        train_y.to_feather(y_path.format(ver, elap_min))

In [12]:
def load_train_log(ver):
    """Load the cached train logs and targets written by ``save_train_log``.

    Returns:
        Tuple ``(train_log, train_y)`` of dicts keyed by the cutoff (minutes)
        from ELAPSED_MIN.
    """
    train_log = {
        elap_min: pd.read_feather('../inputs/train{}_log_{}.f'.format(ver, elap_min))
        for elap_min in ELAPSED_MIN
    }
    train_y = {
        elap_min: pd.read_feather('../inputs/train{}_y_{}.f'.format(ver, elap_min))
        for elap_min in ELAPSED_MIN
    }
    return train_log, train_y

In [13]:
# save_train_log(LOG_VER)

#### ver.5: 2020-04-01以前
- 0, train_session (277032, 16)
- 3, train_session (287301, 16)
- 5, train_session (288227, 16)
- 10, train_session (288664, 16)

#### ver.4: 2020-01-01以降、2020-08-01以前
- 0, train_session (170752, 16)
- 3, train_session (172654, 16)
- 5, train_session (172794, 16)
- 10, train_session (172835, 16)

#### ver.3: 2020-04-01以降、2020-08-01以前
- 0, train_session (101562, 16)
- 3, train_session (102348, 16)
- 5, train_session (102394, 16)
- 10, train_session (102410, 16)

#### ver.2: 2020-08-01以前
- 0, train_session (378594, 16)
- 3, train_session (389649, 16)
- 5, train_session (390621, 16)
- 10, train_session (391074, 16)

### 過去のログデータ

In [14]:
# Per-session, per-product purchase counts over the full training log
# (whole sessions, not cut at the elapsed-time threshold).
payed_item = agg_payment(all_train_log)

In [15]:
def LDA_topic(df_input, topic, index, prefix, random_state=0):
    """Fit LDA on a count-like table and return the per-row topic weights.

    Args:
        df_input: DataFrame holding the id column ``index`` plus the
            non-negative feature columns fed to LDA.
        topic: number of LDA components.
        index: name of the id column; it is moved to the index before fitting
            and restored as a column in the result.
        prefix: prefix prepended to the topic column names (0..topic-1).
        random_state: seed passed to the LDA estimator so repeated runs yield
            the same topics (the estimator is stochastic when left unseeded).

    Returns:
        DataFrame with the ``index`` column followed by ``topic`` columns.
    """
    df_cp = df_input.set_index(index)
    lda = LDA(n_components=topic, random_state=random_state)
    topic_weights = lda.fit_transform(df_cp)
    lda_out = pd.DataFrame(topic_weights, index=df_cp.index).add_prefix(prefix)
    return lda_out.reset_index()

### ユーザ情報

In [16]:
# Attach the user master attributes to every session.
user_features = pd.merge(meta[["session_id", "user_id"]], user_master, on="user_id", how="left")
# Treat out-of-range values as missing: ages outside [10, 80) and gender
# codes above 1. `np.nan` is used because the `np.NaN` alias was removed in
# NumPy 2.0.
user_features.loc[user_features['age'] >= 80, 'age'] = np.nan
user_features.loc[user_features['age'] < 10, 'age'] = np.nan
user_features.loc[user_features['gender'] > 1, 'gender'] = np.nan

In [17]:
def get_user_item(payed_item):
    """Build per-user purchase-count features for frequently bought categories.

    Args:
        payed_item: output of ``agg_payment`` (``session_id``, ``n_items``,
            ``category_id``).

    Returns:
        DataFrame with one row per user in ``user_master``: ``user_id`` plus
        one float32 ``user_pay_<category_id>`` column for every category whose
        total purchase count exceeds 5000, excluding the target categories.
        Users without purchases in the training data receive the column mean;
        negative counts are clamped to 0.
    """
    # Items purchased in the training data, summed per user and category.
    user_payed_item = pd.merge(payed_item, meta[['session_id', 'user_id']], on='session_id', how='left')
    group_user_item = user_payed_item.groupby(['user_id', 'category_id'])[['n_items']].sum().reset_index()
    pivot_user_item = group_user_item.pivot_table(index='user_id', columns='category_id', values='n_items')

    # Keep only categories whose total purchase count over all users exceeds 5000.
    sum_user_item = pivot_user_item.sum()
    user_item_index = sum_user_item[sum_user_item > 5000].index
    pivot_user_item = pivot_user_item[user_item_index].fillna(0)

    # Column means used to fill users that never appear in the training data.
    # Computed while user_id is still the index, so the reduction only sees
    # the numeric category columns (a user_id column would make mean() raise
    # on pandas >= 2.0, or be averaged in if it were numeric).
    user_item_mean = pivot_user_item.mean()

    # Align with every known user.
    all_user_item = pd.merge(user_master[['user_id']], pivot_user_item.reset_index(), on='user_id', how='left')

    # Target categories are excluded from the features.
    feature_cats = [c for c in user_item_index if c not in target_category]
    features = (
        all_user_item[feature_cats]
        .fillna(user_item_mean[feature_cats])
        .astype('float32')
        .clip(lower=0)
    )
    features.columns = ['user_pay_{}'.format(c) for c in feature_cats]

    return pd.concat([all_user_item[['user_id']], features], axis=1)

In [18]:
# Per-user purchase-count features (user_id + one column per kept category).
all_user_item = get_user_item(payed_item)
print(all_user_item.shape)

(40350, 231)


In [19]:
def save_item_lda(ver, all_user_item):
    """Fit a 10-topic LDA on the per-user features and cache it as feather."""
    out_path = '../inputs/user_lda_{}.f'.format(ver)
    LDA_topic(all_user_item, 10, 'user_id', 'LDA_').to_feather(out_path)

In [20]:
def load_item_lda(ver):
    """Read the cached per-user LDA topics written by ``save_item_lda``."""
    return pd.read_feather('../inputs/user_lda_{}.f'.format(ver))

In [21]:
# %%time
# save_item_lda(LOG_VER, all_user_item)

In [22]:
# Load the cached LDA topics (the cell that regenerates them is commented out,
# so the feather file must already exist) and append them to the user features.
user_lda = load_item_lda(LOG_VER)
all_user_item = pd.merge(all_user_item, user_lda, on='user_id', how='left')
print(all_user_item.shape)

(40350, 241)


### メタ情報

In [23]:
def get_meta_features(meta):
    """Derive calendar and visit-history features from the session meta table.

    Added columns: ``year``, ``month``, ``day``, ``dow``, ``week_time``
    (``dow * 24 + hour``), ``userid`` (label-encoded ``user_id``),
    ``timezone_count`` (number of sessions sharing the same ``week_time``),
    ``date_diff`` (days since the same user's previous visit, NaN for the
    first visit) and ``date_rank`` (rank of the visit date within the user).

    Args:
        meta: session table with at least ``session_id``, ``user_id``,
            ``date`` (datetime), ``hour``, ``time_elapsed`` and ``date_str``.

    Returns:
        Copy of ``meta`` with the features above and without ``user_id``,
        ``date``, ``time_elapsed`` and ``date_str``.
    """
    meta_features = meta.copy()
    meta_features['year'] = meta_features['date'].dt.year
    meta_features['month'] = meta_features['date'].dt.month
    meta_features['day'] = meta_features['date'].dt.day
    meta_features['dow'] = meta_features['date'].dt.dayofweek
    meta_features['week_time'] = meta_features['dow'] * 24 + meta_features['hour']
    le = preprocessing.LabelEncoder()
    meta_features['userid'] = le.fit_transform(meta_features['user_id'])

    # Number of sessions per (day of week x hour) slot.
    df_tz = meta_features.groupby(['week_time']).size().rename('timezone_count')
    df_tz = pd.DataFrame(df_tz).reset_index()
    meta_features = pd.merge(meta_features, df_tz, on='week_time', how='left')

    # Days since the same user's previous visit. Visits are ordered by date
    # within each user (session_id only breaks ties), and the result is merged
    # back on its keys: a positional index join would attach each value to the
    # wrong session whenever `meta` is not already sorted by session_id.
    visit_keys = ['session_id', 'user_id', 'date']
    visits = (
        meta_features[visit_keys]
        .drop_duplicates()
        .sort_values(['user_id', 'date', 'session_id'])
    )
    visits['date_diff'] = visits.groupby('user_id')['date'].diff().dt.days
    meta_features = pd.merge(meta_features, visits, on=visit_keys, how='left')

    # Rank of the visit date within each user.
    meta_features['date_rank'] = meta_features.groupby(['user_id'])['date'].rank(ascending=True)

    return meta_features.drop(columns=['user_id', 'date', 'time_elapsed', 'date_str'])

In [24]:
# Session-level calendar / visit-history features, keyed by session_id.
meta_features = get_meta_features(meta)

### ディスプレイアクション

In [25]:
# Map every distinct display name to a stable feature column name.
disp_name_dic = {
    disp: 'disp_cnt_{}'.format(i)
    for i, disp in enumerate(display_action_id['display_name'].unique())
}

# Map every distinct action name to a stable feature column name.
act_name_dic = {
    action: 'act_cnt_{}'.format(i)
    for i, action in enumerate(display_action_id['action_name'].unique())
}

In [26]:
def get_display_name_feature(input_log):
    """Count log rows per session and display name.

    Returns:
        DataFrame with ``session_id`` plus one ``disp_cnt_<i>`` column for
        every entry of ``disp_name_dic``; names absent from ``input_log`` are
        filled with 0 so the column set is always the same.
    """
    joined = input_log.merge(display_action_id, on='display_action_id', how='left')
    counts = (
        joined
        .groupby(['session_id', 'display_name'])
        .size()
        .reset_index(name='disp_name_count')
    )
    wide = (
        counts
        .pivot_table(index='session_id', columns='display_name', values='disp_name_count', aggfunc='sum')
        .reset_index()
        .fillna(0)
        .rename(columns=disp_name_dic)
    )

    # Start from an all-zero frame holding every known column, then copy in
    # the columns that actually occur in this log.
    disp_out = wide[['session_id']].assign(**{name: 0 for name in disp_name_dic.values()})
    for name in wide.columns:
        if name != 'session_id':
            disp_out[name] = wide[name]

    return disp_out

In [27]:
def get_action_name_feature(input_log):
    """Count log rows per session and action name.

    Returns:
        DataFrame with ``session_id`` plus one ``act_cnt_<i>`` column for
        every entry of ``act_name_dic``; names absent from ``input_log`` are
        filled with 0 so the column set is always the same.
    """
    joined = input_log.merge(display_action_id, on='display_action_id', how='left')
    counts = (
        joined
        .groupby(['session_id', 'action_name'])
        .size()
        .reset_index(name='act_name_count')
    )
    wide = (
        counts
        .pivot_table(index='session_id', columns='action_name', values='act_name_count', aggfunc='sum')
        .reset_index()
        .fillna(0)
        .rename(columns=act_name_dic)
    )

    # Start from an all-zero frame holding every known column, then copy in
    # the columns that actually occur in this log.
    act_out = wide[['session_id']].assign(**{name: 0 for name in act_name_dic.values()})
    for name in wide.columns:
        if name != 'session_id':
            act_out[name] = wide[name]

    return act_out

### セッション単位の特徴量

In [28]:
# def get_coupon_info(input_log):
#     session_coupon = input_log[input_log["kind_1"] == "クーポン"][['session_id', 'value_1']].rename(columns={'value_1':'coupon'})
#     session_coupon = pd.merge(session_coupon, lda_coupon, on='coupon', how='left').drop(columns=['coupon'])
#     session_coupon = session_coupon.groupby(['session_id']).max().reset_index()
#     return session_coupon

In [29]:
# def get_coupon_info(input_log):
#     session_coupon = input_log[input_log["kind_1"] == "クーポン"][['session_id', 'name_1']].rename(columns = {'name_1':'coupon'})
#     session_coupon = pd.merge(session_coupon, df_coupon_cat, on='coupon', how='left').drop(columns=['coupon'])
    
#     for cat in df_coupon_cat['coup_cat'].unique():
#         new_col = 'coup_cat_{}'.format(cat)
#         session_coupon[new_col] = 0
#         session_coupon.loc[session_coupon['coup_cat'] == cat, new_col] = 1
    
#     session_coupon.drop(columns=['coup_cat'], inplace=True)
#     return session_coupon.groupby('session_id').sum().reset_index()

In [30]:
def get_pre_payment_item(input_log):
    """Count target-category items already in the cart for each session.

    Args:
        input_log: cart log rows observed so far (``session_id``, ``kind_1``,
            ``value_1``, ``n_items``).

    Returns:
        DataFrame with one row per session found in ``input_log``:
        ``session_id`` plus one float ``pre_target_<category_id>`` column per
        entry of ``target_category`` (0 when nothing of that category was
        added).
    """
    session_unique = input_log['session_id'].unique()

    # Quantity per session and product, mapped to its category.
    item_rows = input_log.loc[input_log["kind_1"] == "商品"]
    agg = item_rows.groupby(["session_id", "value_1"])["n_items"].sum().reset_index()
    agg = agg.rename(columns={"value_1": "JAN"})
    agg = pd.merge(agg, product_master[["JAN", "category_id"]], on="JAN", how="inner")
    agg = agg[agg['category_id'].isin(target_category)]

    # Wide table: sessions x categories. Reindexing guarantees every session
    # and every target category is present without injecting dummy rows
    # (a dummy session id 0 mixes dtypes in the key column and would distort
    # a real session with that id).
    counts = agg.groupby(["session_id", "category_id"])["n_items"].sum().unstack()
    counts = counts.reindex(index=session_unique, columns=target_category)
    counts.columns = ['pre_target_{}'.format(c) for c in target_category]
    counts.index.name = "session_id"

    return counts.fillna(0).astype('float64').reset_index()

In [31]:
def get_session_kind_group(input_log):
    """Count log rows per session for every known ``kind_1`` value.

    Returns:
        DataFrame with ``session_id`` plus one ``group_count_<name>`` column
        per kind (NaN where the session has no row of that kind).
    """
    # (kind_1 value, column suffix); the product kind comes first.
    kind_suffixes = [
        ('商品', 'item'),
        ('クーポン', 'coupon'),
        ('会計', 'kaikei'),
        ('キー', 'key'),
        ('カテゴリ', 'categry'),
        ('バーコードスキャン', 'barcode'),
        ('UUID', 'uuid'),
        ('使用ポイント', 'usedpoint'),
        ('確認', 'confirm'),
        ('ブランドスイッチ', 'bland'),
        ('レシピ', 'recipe'),
        ('スマホスキャン', 'smartphone'),
        ('磁気スキャン', 'magnetic'),
        ('レコメンド', 'recommend'),
        ('倍率ポイント', 'point'),
    ]

    def count_rows(kind, suffix):
        rows = input_log[input_log['kind_1'] == kind]
        return rows.groupby(["session_id"]).size().rename('group_count_' + suffix)

    first_kind, first_suffix = kind_suffixes[0]
    group_count = count_rows(first_kind, first_suffix)
    for kind, suffix in kind_suffixes[1:]:
        group_count = pd.concat([group_count, count_rows(kind, suffix)], axis=1)

    return group_count.reset_index()

In [32]:
def get_session_item_info(input_log):
    """Summarise the products in each session's cart.

    Returns:
        DataFrame with ``session_id``, ``total`` (sum of n_items * unit_price),
        ``cart_item_cnt`` (sum of number_1), ``total_item_cnt`` (sum of
        n_items), ``item_nunique`` / ``cat_nunique`` (distinct product names /
        categories) and four ratio columns derived from them.
    """
    is_item = input_log['kind_1'] == '商品'
    item_log = (
        input_log[is_item]
        .rename(columns={"value_1": "JAN"})
        .merge(product_master[["JAN", "category_id"]], on="JAN", how="inner")
    )
    item_log['total'] = item_log['n_items'] * item_log['unit_price']

    aggregations = {
        'total': 'sum',
        'number_1': 'sum',
        'n_items': 'sum',
        'name_1': 'nunique',
        'category_id': 'nunique',
    }
    feature_names = {
        'number_1': 'cart_item_cnt',
        'n_items': 'total_item_cnt',
        'name_1': 'item_nunique',
        'category_id': 'cat_nunique',
    }
    session_item = (
        item_log
        .groupby(['session_id'])
        .agg(aggregations)
        .reset_index()
        .rename(columns=feature_names)
    )

    total = session_item['total']
    n_total = session_item['total_item_cnt']
    n_items = session_item['item_nunique']
    n_cats = session_item['cat_nunique']
    session_item['mean_price'] = total / n_total
    session_item['item_cnt_per_nuniq'] = n_total / n_items
    session_item['item_nuniq_per_cat_nuniq'] = n_items / n_cats
    session_item['mean_price_per_cat'] = total / n_cats
    return session_item

In [33]:
def get_session_info(input_log):
    """Compute basic per-session activity statistics.

    Returns:
        DataFrame with ``session_id``, ``n_actions`` (row count),
        ``spend_time`` (mean spend_time), ``uniq_user`` (distinct user ids)
        and the products ``n_actions_user`` / ``spend_time_user`` of the
        first two with ``uniq_user``.
    """
    by_session = input_log.groupby(["session_id"])

    session_features = pd.concat(
        [
            by_session.size().rename("n_actions"),             # number of actions
            by_session["spend_time"].mean(),                   # mean elapsed time
            by_session['user_id'].nunique().rename("uniq_user"),  # distinct users
        ],
        axis=1,
    )

    n_users = session_features['uniq_user']
    session_features['n_actions_user'] = session_features['n_actions'] * n_users
    session_features['spend_time_user'] = session_features['spend_time'] * n_users

    return session_features.reset_index()

### セッション単位で集計

In [34]:
def get_session_features(input_log):
    """Outer-join every session-level feature table on ``session_id``."""
    # Feature builders, applied in this order (the coupon features are
    # currently disabled).
    builders = (
        get_session_info,
        get_session_kind_group,
        get_display_name_feature,
        get_action_name_feature,
        get_pre_payment_item,
        get_session_item_info,
    )
    feature_tables = [build(input_log) for build in builders]

    df_ses = pd.DataFrame(columns=['session_id'])
    for table in feature_tables:
        df_ses = df_ses.merge(table, on='session_id', how='outer')

    return df_ses

### 特徴量を集約する

In [35]:
def merge_features(input_log, session):
    """Assemble the model feature table for the given sessions.

    Session-log features, user attributes and meta features are left-joined
    on ``session_id``; the per-user purchase features are then joined on
    ``user_id``, which is dropped afterwards.

    Args:
        input_log: cart log rows used to compute the session features.
        session: session ids to build rows for (one output row each).
    """
    out = pd.DataFrame({"session_id": session})
    for feat in (get_session_features(input_log), user_features, meta_features):
        out = out.merge(feat, on="session_id", how="left")

    # Per-user information
    out = out.merge(all_user_item, on='user_id', how='left').drop(columns='user_id')

    assert len(session) == len(out)
    return out

In [36]:
def get_train_all_features(elapsed_min, train_log_list, train_y_list):
    """Return ``(train_features, y_train)`` for one elapsed-minute cutoff.

    ``train_log_list`` and ``train_y_list`` are the dicts produced by
    ``load_train_log``, keyed by cutoff.
    """
    y_train = train_y_list[elapsed_min]
    train_features = merge_features(train_log_list[elapsed_min], y_train['session_id'])
    print('train_features', train_features.shape)
    return train_features, y_train

In [37]:
def get_test_all_feature(elapsed_min):
    """Build the feature table for test sessions with the given cutoff.

    Args:
        elapsed_min: value of ``time_elapsed`` selecting the test sessions.

    Returns:
        Feature DataFrame with one row per selected test session.
    """
    test_meta = meta[meta['session_id'].isin(test_sessions)]
    test_meta = test_meta[test_meta['time_elapsed'] == elapsed_min]

    # Restrict the log to the selected sessions. The previous left merge
    # against a session_id-only frame removed nothing, so every call computed
    # features for all test sessions and discarded most of them afterwards.
    in_cutoff = test_input_log['session_id'].isin(test_meta['session_id'])
    test_input_elapsed = test_input_log[in_cutoff]

    test_features = merge_features(test_input_elapsed, test_meta['session_id'])
    print('test_features', test_features.shape)
    return test_features

In [38]:
# Number of cross-validation splits used by train_lgbm.
n_fold = 4

In [39]:
# Default LightGBM parameters used by train_lgbm: one binary classifier per
# target category, evaluated with AUC on the validation fold.
lgbm_param = {
    'objective' : 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'seed' : 0,
    'learning_rate':  0.05,
#   'max_depth': 6,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1
}

In [40]:
def train_lgbm(X, y, params=lgbm_param):
    """Train one LightGBM model per stratified fold and collect OOF predictions.

    Args:
        X: feature DataFrame (rows aligned positionally with ``y``).
        y: binary target Series.
        params: LightGBM parameter dict.

    Returns:
        Tuple ``(oof_pred, models, score)``: out-of-fold predictions as a
        float array, the list of fitted boosters (one per fold) and the ROC
        AUC of the out-of-fold predictions against ``y``.
    """
    fold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=0)

    models = []
    # Builtin float instead of the `np.float` alias (removed in NumPy 1.24).
    oof_pred = np.zeros(len(y), dtype=float)

    cat_feat = ['age', 'gender', 'dow', 'register_number', 'userid']

    for idx_train, idx_valid in fold.split(X, y):
        x_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        x_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        lgbm_train = lgbm.Dataset(x_train, y_train, categorical_feature=cat_feat)
        lgbm_eval = lgbm.Dataset(x_valid, y_valid, reference=lgbm_train, categorical_feature=cat_feat)

        # Train with early stopping on the validation fold.
        lgbm_model = lgbm.train(params,
                                lgbm_train,
                                valid_sets=lgbm_eval,
                                categorical_feature=cat_feat,
                                num_boost_round=10000,
                                early_stopping_rounds=100,
                                verbose_eval=-1)
        y_pred = lgbm_model.predict(x_valid, num_iteration=lgbm_model.best_iteration)

        oof_pred[idx_valid] = y_pred
        models.append(lgbm_model)

    score = roc_auc_score(y, oof_pred)
    # The backslash is escaped explicitly; the printed text is unchanged.
    print('--- FINISHED \\ whole score: {:.4f} ---'.format(score))
    return oof_pred, models, score

In [41]:
def predict(models, feature):
    """Average the predictions of all fold models for ``feature``.

    Each model predicts with its own ``best_iteration``; the element-wise
    mean over the models is returned.
    """
    fold_preds = [
        model.predict(feature, num_iteration=model.best_iteration)
        for model in models
    ]
    return np.mean(fold_preds, axis=0)

In [42]:
# Load the cached train logs / targets for every cutoff (dicts keyed by minutes).
train_log_list, train_y_list = load_train_log(LOG_VER)

In [43]:
# Run a garbage-collection pass before the training loop.
gc.collect()

60

In [44]:
%%time
# Train and predict for every elapsed-minute cutoff.
#   df_pred_all      : test predictions of all cutoffs, stacked row-wise
#   df_score_all     : out-of-fold AUC, one row per cutoff, one column per target
#   models_list_list : fitted fold models, indexed [cutoff][target][fold]
df_pred_all = pd.DataFrame()
df_score_all = pd.DataFrame(index=ELAPSED_MIN)
models_list_list = []

for elapsed_min in ELAPSED_MIN:
    print(f'===== {elapsed_min} =====')
    train_features, y_train = get_train_all_features(elapsed_min, train_log_list, train_y_list)
    test_features = get_test_all_feature(elapsed_min)

    # Keep the test session ids as the prediction index, then remove the id
    # column from both feature tables so it is not used as a feature.
    df_pred = pd.DataFrame(index=test_features['session_id'])
    train_features.drop(columns=['session_id'], inplace=True)
    test_features.drop(columns=['session_id'], inplace=True)
    
    models_list = []
    # One binary model group per target column of y_train.
    for target in y_train.columns:
        if target == 'session_id':
            continue
        
        print(f"---- id = {target} -----")
        oof, models, score = train_lgbm(train_features, y_train[target])
        models_list.append(models)

        # Test prediction = mean over the fold models.
        pred = predict(models, test_features)
        df_pred[target] = pred
        df_score_all.loc[elapsed_min, target] = score
        
    models_list_list.append(models_list)
    df_pred_all = pd.concat([df_pred_all, df_pred])
    print(len(df_pred_all))

===== 0 =====
train_features (378594, 344)
test_features (14277, 344)
---- id = 38 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[400]	valid_0's auc: 0.81987
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[468]	valid_0's auc: 0.821424
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[455]	valid_0's auc: 0.822394
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[646]	valid_0's auc: 0.822375
--- FINISHED \ whole score: 0.8215 ---
---- id = 110 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[98]	valid_0's auc: 0.788292
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[730]	valid_0's auc: 0.792423
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[281]	valid_0's auc: 0.796225
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[133]	valid_0's auc: 0.795324
--- FINISHED \ whole score: 0.7824 ---
---- id = 113 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[935]	valid_0's auc: 0.801666
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[802]	valid_0's auc: 0.795102
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[400]	valid_0's auc: 0.799961
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[777]	valid_0's auc: 0.798141
--- FINISHED \ whole score: 0.7982 ---
---- id = 114 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[404]	valid_0's auc: 0.753875
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[486]	valid_0's auc: 0.757273
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[561]	valid_0's auc: 0.754253
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[486]	valid_0's auc: 0.755538
--- FINISHED \ whole score: 0.7552 ---
---- id = 134 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[453]	valid_0's auc: 0.741548
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[274]	valid_0's auc: 0.743563
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[460]	valid_0's auc: 0.745425
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[324]	valid_0's auc: 0.741544
--- FINISHED \ whole score: 0.7430 ---
---- id = 171 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1601]	valid_0's auc: 0.862046
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[821]	valid_0's auc: 0.860343
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1668]	valid_0's auc: 0.862783
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1189]	valid_0's auc: 0.8659
--- FINISHED \ whole score: 0.8626 ---
---- id = 172 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[789]	valid_0's auc: 0.873019
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[793]	valid_0's auc: 0.872879
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[557]	valid_0's auc: 0.87341
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[606]	valid_0's auc: 0.867051
--- FINISHED \ whole score: 0.8711 ---
---- id = 173 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[524]	valid_0's auc: 0.858929
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1155]	valid_0's auc: 0.859901
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1165]	valid_0's auc: 0.858122
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1180]	valid_0's auc: 0.85939
--- FINISHED \ whole score: 0.8589 ---
---- id = 376 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[395]	valid_0's auc: 0.750026
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[395]	valid_0's auc: 0.757926
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[497]	valid_0's auc: 0.751014
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[315]	valid_0's auc: 0.751659
--- FINISHED \ whole score: 0.7526 ---
---- id = 435 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1608]	valid_0's auc: 0.861132
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1065]	valid_0's auc: 0.866611
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[789]	valid_0's auc: 0.865153
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[732]	valid_0's auc: 0.861892
--- FINISHED \ whole score: 0.8632 ---
---- id = 467 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[605]	valid_0's auc: 0.866113
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[537]	valid_0's auc: 0.866708
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[462]	valid_0's auc: 0.866585
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[408]	valid_0's auc: 0.866898
--- FINISHED \ whole score: 0.8663 ---
---- id = 537 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[824]	valid_0's auc: 0.908943
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1198]	valid_0's auc: 0.904607
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1931]	valid_0's auc: 0.904267
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1099]	valid_0's auc: 0.906046
--- FINISHED \ whole score: 0.9030 ---
---- id = 539 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[539]	valid_0's auc: 0.851191
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[765]	valid_0's auc: 0.854265
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[785]	valid_0's auc: 0.851092
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1026]	valid_0's auc: 0.85309
--- FINISHED \ whole score: 0.8520 ---
---- id = 629 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[294]	valid_0's auc: 0.917637
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[292]	valid_0's auc: 0.920744
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[257]	valid_0's auc: 0.924662
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[299]	valid_0's auc: 0.919056
--- FINISHED \ whole score: 0.9204 ---
---- id = 768 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[534]	valid_0's auc: 0.74325
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[535]	valid_0's auc: 0.74063
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[445]	valid_0's auc: 0.739584
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[489]	valid_0's auc: 0.744555
--- FINISHED \ whole score: 0.7420 ---
14277
===== 3 =====
train_features (389649, 344)
test_features (11304, 344)
---- id = 38 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[399]	valid_0's auc: 0.822976
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[355]	valid_0's auc: 0.819852
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[511]	valid_0's auc: 0.820185
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[505]	valid_0's auc: 0.825493
--- FINISHED \ whole score: 0.8221 ---
---- id = 110 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[273]	valid_0's auc: 0.787782
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[842]	valid_0's auc: 0.789525
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[74]	valid_0's auc: 0.785864
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[182]	valid_0's auc: 0.79088
--- FINISHED \ whole score: 0.7729 ---
---- id = 113 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[505]	valid_0's auc: 0.799669
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[606]	valid_0's auc: 0.79086
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[326]	valid_0's auc: 0.795527
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[355]	valid_0's auc: 0.793428
--- FINISHED \ whole score: 0.7946 ---
---- id = 114 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[763]	valid_0's auc: 0.753494
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[724]	valid_0's auc: 0.754236
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[306]	valid_0's auc: 0.753426
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[522]	valid_0's auc: 0.751913
--- FINISHED \ whole score: 0.7532 ---
---- id = 134 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[281]	valid_0's auc: 0.741081
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[240]	valid_0's auc: 0.740001
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[294]	valid_0's auc: 0.739586
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[293]	valid_0's auc: 0.741257
--- FINISHED \ whole score: 0.7405 ---
---- id = 171 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[965]	valid_0's auc: 0.85625
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1101]	valid_0's auc: 0.855919
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1368]	valid_0's auc: 0.858233
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[936]	valid_0's auc: 0.857451
--- FINISHED \ whole score: 0.8569 ---
---- id = 172 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[223]	valid_0's auc: 0.869784
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[293]	valid_0's auc: 0.8624
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[832]	valid_0's auc: 0.871996
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[355]	valid_0's auc: 0.869618
--- FINISHED \ whole score: 0.8646 ---
---- id = 173 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[959]	valid_0's auc: 0.85453
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[771]	valid_0's auc: 0.855476
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[936]	valid_0's auc: 0.854078
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1065]	valid_0's auc: 0.852969
--- FINISHED \ whole score: 0.8542 ---
---- id = 376 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[291]	valid_0's auc: 0.74869
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[283]	valid_0's auc: 0.751129
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[307]	valid_0's auc: 0.7522
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[355]	valid_0's auc: 0.751729
--- FINISHED \ whole score: 0.7509 ---
---- id = 435 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[969]	valid_0's auc: 0.855214
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1397]	valid_0's auc: 0.848291
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[661]	valid_0's auc: 0.851227
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1280]	valid_0's auc: 0.852666
--- FINISHED \ whole score: 0.8514 ---
---- id = 467 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[402]	valid_0's auc: 0.859419
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[374]	valid_0's auc: 0.86457
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[678]	valid_0's auc: 0.863871
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[415]	valid_0's auc: 0.855883
--- FINISHED \ whole score: 0.8601 ---
---- id = 537 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[974]	valid_0's auc: 0.904033
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[683]	valid_0's auc: 0.897307
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[969]	valid_0's auc: 0.906592
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[415]	valid_0's auc: 0.897892
--- FINISHED \ whole score: 0.9002 ---
---- id = 539 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1047]	valid_0's auc: 0.850706
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1177]	valid_0's auc: 0.846639
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[731]	valid_0's auc: 0.847769
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[983]	valid_0's auc: 0.845878
--- FINISHED \ whole score: 0.8473 ---
---- id = 629 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[222]	valid_0's auc: 0.907912
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[127]	valid_0's auc: 0.915081
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[292]	valid_0's auc: 0.915058
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[564]	valid_0's auc: 0.922339
--- FINISHED \ whole score: 0.9110 ---
---- id = 768 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[555]	valid_0's auc: 0.733865
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[412]	valid_0's auc: 0.736381
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[463]	valid_0's auc: 0.731225
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[359]	valid_0's auc: 0.733226
--- FINISHED \ whole score: 0.7336 ---
25581
===== 5 =====
train_features (390621, 344)
test_features (14072, 344)
---- id = 38 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[293]	valid_0's auc: 0.823267
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[293]	valid_0's auc: 0.816302
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[512]	valid_0's auc: 0.820787
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[276]	valid_0's auc: 0.820875
--- FINISHED \ whole score: 0.8201 ---
---- id = 110 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[42]	valid_0's auc: 0.784453
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[41]	valid_0's auc: 0.782846
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[301]	valid_0's auc: 0.781646
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[101]	valid_0's auc: 0.777652
--- FINISHED \ whole score: 0.7677 ---
---- id = 113 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[664]	valid_0's auc: 0.79537
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[659]	valid_0's auc: 0.794991
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[305]	valid_0's auc: 0.78592
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[240]	valid_0's auc: 0.7894
--- FINISHED \ whole score: 0.7910 ---
---- id = 114 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[678]	valid_0's auc: 0.751107
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[748]	valid_0's auc: 0.75033
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[685]	valid_0's auc: 0.747679
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[760]	valid_0's auc: 0.749542
--- FINISHED \ whole score: 0.7497 ---
---- id = 134 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[202]	valid_0's auc: 0.734619
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[293]	valid_0's auc: 0.736118
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[206]	valid_0's auc: 0.737097
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[305]	valid_0's auc: 0.739494
--- FINISHED \ whole score: 0.7368 ---
---- id = 171 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1184]	valid_0's auc: 0.851468
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1209]	valid_0's auc: 0.852389
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1143]	valid_0's auc: 0.847572
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[678]	valid_0's auc: 0.851927
--- FINISHED \ whole score: 0.8507 ---
---- id = 172 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[182]	valid_0's auc: 0.859965
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[258]	valid_0's auc: 0.865363
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[465]	valid_0's auc: 0.857412
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[294]	valid_0's auc: 0.863735
--- FINISHED \ whole score: 0.8589 ---
---- id = 173 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1186]	valid_0's auc: 0.852138
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1265]	valid_0's auc: 0.847936
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[912]	valid_0's auc: 0.850977
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[850]	valid_0's auc: 0.848628
--- FINISHED \ whole score: 0.8499 ---
---- id = 376 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[285]	valid_0's auc: 0.748273
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[267]	valid_0's auc: 0.747795
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[299]	valid_0's auc: 0.750544
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[158]	valid_0's auc: 0.747822
--- FINISHED \ whole score: 0.7484 ---
---- id = 435 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[698]	valid_0's auc: 0.848349
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[746]	valid_0's auc: 0.840778
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1157]	valid_0's auc: 0.84408
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[973]	valid_0's auc: 0.848592
--- FINISHED \ whole score: 0.8452 ---
---- id = 467 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[420]	valid_0's auc: 0.857781
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[323]	valid_0's auc: 0.857413
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[340]	valid_0's auc: 0.856899
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[501]	valid_0's auc: 0.85136
--- FINISHED \ whole score: 0.8552 ---
---- id = 537 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[888]	valid_0's auc: 0.890196
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[765]	valid_0's auc: 0.897067
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[974]	valid_0's auc: 0.896718
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[416]	valid_0's auc: 0.89319
--- FINISHED \ whole score: 0.8925 ---
---- id = 539 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[862]	valid_0's auc: 0.845644
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[429]	valid_0's auc: 0.834306
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1011]	valid_0's auc: 0.843799
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[668]	valid_0's auc: 0.837021
--- FINISHED \ whole score: 0.8396 ---
---- id = 629 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[164]	valid_0's auc: 0.910565
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[505]	valid_0's auc: 0.904493
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[125]	valid_0's auc: 0.90586
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[43]	valid_0's auc: 0.902771
--- FINISHED \ whole score: 0.8935 ---
---- id = 768 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[406]	valid_0's auc: 0.730079
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[619]	valid_0's auc: 0.728281
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[470]	valid_0's auc: 0.73309
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[404]	valid_0's auc: 0.726788
--- FINISHED \ whole score: 0.7295 ---
39653
===== 10 =====
train_features (391074, 344)
test_features (16833, 344)




---- id = 38 -----
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[612]	valid_0's auc: 0.820687
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[642]	valid_0's auc: 0.815192
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[205]	valid_0's auc: 0.820044
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[537]	valid_0's auc: 0.818171
--- FINISHED \ whole score: 0.8178 ---
---- id = 110 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[170]	valid_0's auc: 0.774183
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[375]	valid_0's auc: 0.782482
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[91]	valid_0's auc: 0.777652
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[217]	valid_0's auc: 0.748663
--- FINISHED \ whole score: 0.7628 ---
---- id = 113 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[570]	valid_0's auc: 0.777988
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1090]	valid_0's auc: 0.786837
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[383]	valid_0's auc: 0.793964
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[326]	valid_0's auc: 0.789756
--- FINISHED \ whole score: 0.7816 ---
---- id = 114 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[678]	valid_0's auc: 0.755975
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[433]	valid_0's auc: 0.75397
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[433]	valid_0's auc: 0.754168
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[653]	valid_0's auc: 0.755869
--- FINISHED \ whole score: 0.7550 ---
---- id = 134 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[194]	valid_0's auc: 0.741223
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[505]	valid_0's auc: 0.744619
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[207]	valid_0's auc: 0.742003
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[417]	valid_0's auc: 0.743622
--- FINISHED \ whole score: 0.7426 ---
---- id = 171 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[597]	valid_0's auc: 0.839476
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1148]	valid_0's auc: 0.841363
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1445]	valid_0's auc: 0.839837
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1462]	valid_0's auc: 0.840118
--- FINISHED \ whole score: 0.8397 ---
---- id = 172 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[246]	valid_0's auc: 0.850876
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[413]	valid_0's auc: 0.827174
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[538]	valid_0's auc: 0.862105
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[568]	valid_0's auc: 0.837599
--- FINISHED \ whole score: 0.8410 ---
---- id = 173 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[541]	valid_0's auc: 0.838769
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[650]	valid_0's auc: 0.846361
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[713]	valid_0's auc: 0.841631
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[557]	valid_0's auc: 0.844569
--- FINISHED \ whole score: 0.8428 ---
---- id = 376 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[494]	valid_0's auc: 0.757589
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[420]	valid_0's auc: 0.750851
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[468]	valid_0's auc: 0.763202
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[377]	valid_0's auc: 0.755723
--- FINISHED \ whole score: 0.7568 ---
---- id = 435 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[815]	valid_0's auc: 0.837443
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1195]	valid_0's auc: 0.841873
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[443]	valid_0's auc: 0.837922
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[933]	valid_0's auc: 0.835489
--- FINISHED \ whole score: 0.8359 ---
---- id = 467 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[211]	valid_0's auc: 0.833017
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[133]	valid_0's auc: 0.83312
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[588]	valid_0's auc: 0.85189
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[200]	valid_0's auc: 0.836592
--- FINISHED \ whole score: 0.8345 ---
---- id = 537 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[729]	valid_0's auc: 0.881331
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[520]	valid_0's auc: 0.880741
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[736]	valid_0's auc: 0.883984
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[762]	valid_0's auc: 0.880808
--- FINISHED \ whole score: 0.8811 ---
---- id = 539 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[446]	valid_0's auc: 0.833169
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[976]	valid_0's auc: 0.826868
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[351]	valid_0's auc: 0.831166
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[586]	valid_0's auc: 0.830499
--- FINISHED \ whole score: 0.8264 ---
---- id = 629 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[48]	valid_0's auc: 0.879658
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[238]	valid_0's auc: 0.888689
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[230]	valid_0's auc: 0.876904
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[197]	valid_0's auc: 0.893525
--- FINISHED \ whole score: 0.8741 ---
---- id = 768 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[347]	valid_0's auc: 0.723614
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[462]	valid_0's auc: 0.731747
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[470]	valid_0's auc: 0.727894
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[316]	valid_0's auc: 0.731883
--- FINISHED \ whole score: 0.7287 ---
56486
CPU times: user 1d 21h 18min 10s, sys: 22h 14min 39s, total: 2d 19h 32min 50s
Wall time: 2h 58min 49s


In [45]:
# Preview the first rows of the test predictions; the displayed frame is indexed
# by session_id and has one column per target category id.
df_pred_all.head()

Unnamed: 0_level_0,38,110,113,114,134,171,172,173,376,435,467,537,539,629,768
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
663721,0.068139,0.016663,0.009452,0.393403,0.053768,0.038368,0.000456,0.083961,0.13993,0.006986,0.009957,0.001,0.018627,0.096627,0.014803
663761,0.232673,0.098959,0.374614,0.443095,0.307561,0.107206,0.26528,0.025694,0.318622,0.006329,0.003687,0.002075,0.11925,0.000635,0.413
663763,0.072436,0.047192,0.121131,0.908709,0.63496,0.008905,0.001806,0.0083,0.204473,0.026281,0.026601,0.00261,0.007935,0.001905,0.046756
663775,0.020725,0.006467,0.01115,0.123852,0.448874,0.161087,0.001802,0.750254,0.025371,0.23385,0.009787,0.002878,0.010244,0.001129,0.177203
663778,0.17813,0.008364,0.153008,0.166603,0.123617,0.027541,0.010018,0.044072,0.038942,0.019245,0.004438,0.005257,0.022208,0.001435,0.065702


In [46]:
# Show the validation AUC table: one column per target category id and one row
# per value printed as `===== N =====` in the training log (0, 3, 5, 10).
df_score_all

Unnamed: 0,38,110,113,114,134,171,172,173,376,435,467,537,539,629,768
0,0.821489,0.782355,0.798183,0.755226,0.742959,0.862645,0.871136,0.85894,0.752611,0.86323,0.866297,0.90302,0.852012,0.92036,0.741999
3,0.822086,0.772886,0.794601,0.753187,0.740474,0.856927,0.864555,0.854232,0.750914,0.851379,0.860131,0.900176,0.847342,0.91101,0.733599
5,0.820139,0.767672,0.79105,0.74967,0.736766,0.850742,0.858918,0.849884,0.74843,0.845172,0.855216,0.892492,0.839612,0.893494,0.729526
10,0.817807,0.762811,0.781576,0.754983,0.742578,0.839688,0.841023,0.842803,0.756811,0.835922,0.834476,0.88115,0.826352,0.874087,0.728748


In [47]:
# Row-wise mean of the AUC table: averages the category columns, giving one
# score per row of df_score_all.
cv = df_score_all.mean(axis='columns')
print(cv)
# Single headline number: the mean of the per-row means.
print('- cv =', cv.mean())

0     0.826164
3     0.820900
5     0.815252
10    0.808054
dtype: float64
- cv = 0.8175926599963461


In [48]:
# The prediction table must contain exactly as many rows as test.csv.
assert df_pred_all.shape[0] == test.shape[0]

In [49]:
# Join the predictions onto the session ids of test.csv (inner join on
# session_id, with test as the left-hand side).
submission = test[['session_id']].merge(
    df_pred_all.reset_index(),
    how='inner',
    on='session_id',
)
# Fails if the join changed the number of rows relative to test.csv.
assert len(submission) == len(test)

In [50]:
# Write only the prediction columns (session_id removed), without the index.
submission.drop('session_id', axis=1).to_csv('../outputs/submission.csv', index=False)

#### baseline_19: (ver.2) lr=0.05
- feat = 344
- Wall time: 2h 58min 49s
- cv = 0.81759 (0.826/0.820/0.815/0.808)
- LB = 0.767

#### ensem_2: base12 x0.5 + base15 x0.3 + base18 x0.2
- LB = 0.7665

#### ensem_1: base12 x0.5 + base15 x0.5
- LB = 0.7637

#### baseline_18: (ver.5) 2020-04-01以前, 購買データは全部
- feat = 344
- Wall time: 1h 28min 8s
- cv = 0.80855
- LB = 0.7107

#### baseline_15: (ver.4) 2020-01-01以降, doyを除外, register_numberをcategoryに
- feat = 266
- Wall time: 53min 34s
- cv = 0.78354 (0.800/0.788/0.779/0.766)
- LB = 0.7333

#### baseline_14: (ver.4) 2020-01-01以降
- feat = 267
- Wall time: 36min 38s
- cv = 0.80381 (0.819/0.807/0.799/0.788)
- LB = 0.7473

#### baseline_12: (ver.2) 特徴量削減, lr=0.05
- feat = 345
- Wall time: 2h 18min 19s
- cv = 0.82506 (0.835/0.826/0.821/0.816)
- LB = 0.7675

#### baseline_7: userごとのカテゴリの過去の購買実績
- feat = 329
- Wall time: 1h 59min 16s
- cv = 0.82148 (0.832/0.824/0.818/0.810)
- LB = 0.7671

In [51]:
def feat_imp(model):
    """Return one model's feature importances as a DataFrame.

    Parameters
    ----------
    model
        Object exposing ``feature_importance()`` and ``feature_name()``;
        both are expected to return sequences of equal length.

    Returns
    -------
    pandas.DataFrame
        Columns ``name`` and ``imp``, sorted by ``imp`` in ascending order.
    """
    fi = model.feature_importance()
    fn = model.feature_name()
    df_feature_importance = pd.DataFrame({'name': fn, 'imp': fi})
    # Return a sorted copy instead of sorting in place.
    return df_feature_importance.sort_values('imp')

def feature_importance(models):
    """Combine the feature importances of several models into one table.

    Parameters
    ----------
    models
        Iterable of objects accepted by ``feat_imp``.

    Returns
    -------
    pandas.DataFrame
        Column ``name``, one column ``imp_<i>`` per model (``i`` is the
        position of the model in ``models``), and ``sum`` holding the row-wise
        total of the ``imp_<i>`` columns. Rows are sorted by ``sum`` in
        descending order. An empty ``models`` yields an empty frame with the
        columns ``name`` and ``sum``.
    """
    fi = pd.DataFrame(columns=['name'])
    imp_cols = []
    for i, model in enumerate(models):
        colname = 'imp_{}'.format(i)
        fi_tmp = feat_imp(model).rename(columns={'imp': colname})
        # Outer join keeps a feature even when only some models report it.
        fi = pd.merge(fi, fi_tmp, on=['name'], how='outer')
        imp_cols.append(colname)
    # Sum the importance columns only. Summing the whole frame would also
    # cover the string column 'name'; older pandas dropped it silently, but
    # pandas 2.x raises a TypeError for such a mixed-type row-wise reduction.
    fi['sum'] = fi[imp_cols].sum(axis=1)
    return fi.sort_values(['sum'], ascending=False)

In [52]:
# Select the third entry of models_list_list for the importance tables below.
# NOTE(review): index 2 presumably picks one of the training settings (cf. the
# 0/3/5/10 rows of the CV table) -- confirm which one and name the constant.
models = models_list_list[2]

In [53]:
# Top 10 features by summed importance across the models in models[0].
# NOTE(review): this cell is repeated for models[0]..models[14]; a loop with
# display() would avoid the copy-paste.
feature_importance(models[0]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
342,userid,4455,4474,7887,4202,21018
341,register_number,1004,982,1561,952,4499
340,month,206,195,238,186,825
336,date_rank,98,102,210,88,498
335,date_diff,90,109,181,100,480
339,hour,114,106,142,111,473
338,user_pay_40,110,109,126,116,461
334,mean_price_per_cat,89,97,153,76,415
337,LDA_1,109,89,128,87,413
333,LDA_4,83,81,114,80,358


In [54]:
# Top 10 features by summed importance across the models in models[1].
feature_importance(models[1]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
342,userid,532,499,3418,1425,5874
341,register_number,86,97,746,217,1146
340,date_rank,50,45,200,75,370
339,date_diff,27,25,144,57,253
338,LDA_1,22,30,128,50,230
337,mean_price_per_cat,18,10,152,34,214
335,week_time,14,16,136,35,201
333,spend_time,11,14,131,21,177
328,mean_price,9,10,119,37,175
329,total,9,9,117,38,173


In [55]:
# Top 10 features by summed importance across the models in models[2].
feature_importance(models[2]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
342,userid,9218,8541,4217,3402,25378
341,register_number,1604,1727,916,704,4951
340,date_rank,315,310,174,151,950
339,date_diff,293,275,160,124,852
336,mean_price,248,283,96,59,686
338,mean_price_per_cat,281,231,96,59,667
337,total,251,234,75,55,615
335,LDA_1,209,176,115,84,584
332,week_time,191,232,73,66,562
334,spend_time,208,196,72,49,525


In [56]:
# Top 10 features by summed importance across the models in models[3].
feature_importance(models[3]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
342,userid,10666,11956,11649,12130,46401
341,register_number,2049,2171,1752,2151,8123
340,date_rank,296,284,243,267,1090
339,date_diff,222,255,214,234,925
338,month,160,189,171,195,715
337,mean_price,157,181,135,184,657
336,mean_price_per_cat,153,190,144,170,657
335,user_pay_135,150,154,153,147,604
333,LDA_4,129,157,145,149,580
331,spend_time,119,148,134,165,566


In [57]:
# Top 10 features by summed importance across the models in models[4].
feature_importance(models[4]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
342,userid,3007,4671,3100,4848,15626
341,register_number,804,1009,816,1065,3694
338,month,104,129,125,135,493
340,user_pay_135,120,119,123,127,489
339,date_rank,115,123,118,120,476
337,date_diff,102,121,95,124,442
336,LDA_4,93,120,88,101,402
334,LDA_5,44,59,52,83,238
335,mean_price,46,56,43,54,199
332,week_time,35,56,38,60,189


In [58]:
# Top 10 features by summed importance across the models in models[5].
feature_importance(models[5]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
342,userid,16831,17214,16144,9752,59941
341,register_number,3029,3135,2965,2013,11142
340,date_rank,758,719,744,443,2664
338,date_diff,443,414,439,241,1537
339,mean_price,443,420,403,156,1422
336,mean_price_per_cat,375,452,378,161,1366
337,total,377,333,354,154,1218
335,week_time,374,304,344,163,1185
332,LDA_1,286,298,296,200,1080
333,spend_time,327,303,286,153,1069


In [59]:
# Top 10 features by summed importance across the models in models[6].
feature_importance(models[6]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
342,userid,2216,3184,5954,3604,14958
341,register_number,439,596,1120,688,2843
340,date_rank,126,173,341,189,829
339,LDA_1,113,149,227,148,637
338,date_diff,97,136,251,149,633
337,mean_price,67,92,184,132,475
335,mean_price_per_cat,60,102,185,110,457
336,total,60,71,215,95,441
334,week_time,59,96,158,97,410
333,timezone_count,51,75,154,97,377


In [60]:
# Top 10 features by summed importance across the models in models[7].
feature_importance(models[7]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
342,userid,17294,18359,14122,13064,62839
341,register_number,3115,3348,2272,2168,10903
340,date_rank,591,619,471,466,2147
339,date_diff,449,452,358,292,1551
338,mean_price,382,437,255,236,1310
337,mean_price_per_cat,370,405,285,232,1292
335,total,328,365,243,205,1141
336,week_time,353,349,218,215,1135
330,LDA_1,252,273,245,234,1004
334,timezone_count,293,318,199,170,980


In [61]:
# Top 10 features by summed importance across the models in models[8].
feature_importance(models[8]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
342,userid,4418,4178,4689,2913,16198
341,register_number,943,929,958,478,3308
340,date_rank,161,135,169,61,526
339,date_diff,128,127,125,84,464
338,user_pay_135,109,102,121,87,419
334,user_pay_374,74,75,78,67,294
336,month,77,75,88,38,278
337,LDA_1,86,74,70,37,267
335,LDA_4,76,53,75,42,246
333,LDA_6,60,59,69,42,230


In [62]:
# Top 10 features by summed importance across the models in models[9].
feature_importance(models[9]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
342,userid,9307,9936,15014,12658,46915
341,register_number,1776,1881,2738,2294,8689
340,date_rank,389,430,680,591,2090
339,date_diff,298,353,586,408,1645
338,mean_price,281,275,544,423,1523
337,mean_price_per_cat,260,248,477,350,1335
335,total,215,264,428,348,1255
336,week_time,218,224,412,363,1217
332,spend_time,178,200,338,294,1010
331,spend_time_user,167,186,345,303,1001


In [63]:
# Top 10 features by summed importance across the models in models[10].
feature_importance(models[10]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
342,userid,5273,4017,4458,6372,20120
341,register_number,1090,927,846,1152,4015
340,date_rank,270,234,234,342,1080
339,date_diff,238,182,160,252,832
338,mean_price,192,107,142,228,669
337,mean_price_per_cat,170,120,134,220,644
336,week_time,165,117,114,228,624
332,LDA_1,125,113,125,197,560
334,total,147,101,104,202,554
335,spend_time,160,86,91,159,496


In [64]:
# Top 10 features by summed importance across the models in models[11].
feature_importance(models[11]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
342,userid,10498,9188,11417,5382,36485
341,register_number,2394,1929,2520,1178,8021
340,date_rank,629,518,674,236,2057
339,date_diff,543,445,542,232,1762
338,mean_price,407,346,512,168,1433
336,mean_price_per_cat,344,291,428,167,1230
337,total,350,307,406,133,1196
335,spend_time_user,320,248,402,98,1068
331,LDA_1,287,279,338,149,1053
334,week_time,298,223,345,123,989


In [65]:
# Top 10 features by summed importance across the models in models[12].
feature_importance(models[12]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
342,userid,11315,5736,12461,8946,38458
341,register_number,2066,1179,2522,1661,7428
340,date_rank,554,277,688,457,1976
339,date_diff,472,194,495,328,1489
338,mean_price,322,145,431,267,1165
337,mean_price_per_cat,317,129,394,262,1102
336,total,314,120,394,227,1055
335,week_time,291,127,361,223,1002
334,spend_time,275,112,344,214,945
332,LDA_1,256,135,299,187,877


In [66]:
# Top 10 features by summed importance across the models in models[13].
feature_importance(models[13]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
342,userid,1779,4872,1380,415,8446
341,register_number,433,1289,323,100,2145
340,date_rank,150,463,133,50,796
338,date_diff,78,347,55,24,504
339,mean_price,80,325,49,8,462
337,mean_price_per_cat,76,270,58,15,419
334,week_time,61,302,35,18,416
333,total,57,296,31,7,391
332,spend_time,53,260,29,14,356
336,LDA_1,73,193,47,27,340


In [67]:
# Top 10 features by summed importance across the models in models[14].
feature_importance(models[14]).head(10)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,sum
342,userid,6448,9639,7679,6441,30207
341,register_number,1269,1742,1287,1256,5554
340,date_diff,186,228,164,159,737
339,date_rank,126,208,177,145,656
338,month,110,159,121,110,500
337,hour,107,131,125,110,473
330,mean_price,77,175,102,78,432
333,week_time,82,150,99,99,430
335,LDA_1,90,106,105,107,408
329,mean_price_per_cat,74,143,117,62,396
