In [1]:
import gc
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import feather
import datetime
from matplotlib_venn import venn2
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import LatentDirichletAllocation as LDA
pd.set_option('display.max_Columns', 100)

In [2]:
# Load the raw input tables from ../inputs (feather and CSV files).
cartlog = pd.read_feather('../inputs/cartlog.f')
product_master = pd.read_feather('../inputs/product_master.f')
meta = pd.read_feather('../inputs/meta.f')
user_master = pd.read_feather('../inputs/user_master.f')
test = pd.read_csv('../inputs/test.csv')
display_action_id = pd.read_csv('../inputs/display_action_id.csv')

# Cast JAN to str; later cells merge it against cartlog's value_1 (renamed to "JAN").
# NOTE(review): presumably value_1 is stored as a string -- confirm dtypes match.
product_master['JAN'] = product_master['JAN'].astype(str)

In [3]:
# Distinct session ids listed in test.csv, and the cart-log rows that belong to them.
test_sessions = test["session_id"].unique()
print(len(test_sessions))
test_input_log = cartlog[cartlog["session_id"].isin(test_sessions)]

56486


In [4]:
# category_id values used as prediction targets (one binary target per category).
# Comments give the category name as "group / sub-category".
target_category = [
    38,  # Ice cream / novelty
    110,  # Snacks & candy / gum
    113,  # Snacks & candy / cereal
    114,  # Snacks & candy / snacks
    134,  # Chocolate & biscuits-crackers / chocolate
    171,  # Beer-type / RTD
    172,  # Beer-type / non-alcoholic
    173,  # Beer-type / beer-type
    376,  # Japanese sweets / rice crackers
    435,  # Large PET bottle / unsweetened tea (large PET)
    467,  # Small PET bottle / coffee (small PET)
    537,  # Water & sparkling water / large PET (sparkling water)
    539,  # Water & sparkling water / small PET (sparkling water)
    629,  # Canned drinks / coffee (can)
    768,  # Noodles / cup noodles
]

In [5]:
# Restrict the training data to sessions dated before 2020-08-01 that both
# contain a payment and have a log row at 10 minutes or later.
tmp_sessions = meta[meta['date'] < '2020-08-01']['session_id'].unique()
tmp_log = cartlog[cartlog["session_id"].isin(tmp_sessions)]
print('2020-08-01以前: ', len(tmp_sessions))

# Sessions with at least one log row where is_payment == 1
payment_sessions = set(tmp_log[tmp_log['is_payment']==1]['session_id'].unique())
print('購買が発生: ', len(payment_sessions))
# Sessions with at least one log row where spend_time >= 600
# (10 minutes, assuming spend_time is measured in seconds -- TODO confirm)
over10min_sessions = set(tmp_log[tmp_log['spend_time']>=600]['session_id'].unique())
print('10分以上: ', len(over10min_sessions))
# Intersection of the two session sets
all_train_sessions = payment_sessions & over10min_sessions
print('積集合: ', len(all_train_sessions))

# Every log row of the selected training sessions
all_train_log = tmp_log[tmp_log["session_id"].isin(all_train_sessions)]
print('全trainのログ: ', len(all_train_log))

2020-08-01以前:  663708
購買が発生:  618462
10分以上:  404825
積集合:  391383
全trainのログ:  10826062


In [6]:
def agg_payment(cartlog) -> pd.DataFrame:
    """Sum the item quantity per session and product and attach the category.

    Only rows whose ``kind_1`` is "商品" are used. ``n_items`` is summed per
    (``session_id``, ``value_1``) pair, ``value_1`` is treated as the JAN code
    and inner-joined to the module-level ``product_master`` to obtain
    ``category_id``; products missing from the master are dropped.

    Returns a frame with columns ``session_id``, ``n_items`` and
    ``category_id``. Rows are per product, so one (session, category) pair may
    appear several times.
    """
    item_rows = cartlog[cartlog["kind_1"] == "商品"]
    per_product = (
        item_rows.groupby(["session_id", "value_1"])["n_items"]
        .sum()
        .reset_index()
        .rename(columns={"value_1": "JAN"})
    )
    with_category = per_product.merge(
        product_master[["JAN", "category_id"]], on="JAN", how="inner"
    )
    return with_category.drop(columns=['JAN'])

In [7]:
class RetailDataset:
    """Split the training log into model input and prediction target.

    Each session gets a threshold ``time_elapsed_sec``: ``meta['time_elapsed']``
    multiplied by 60, or ``thres_sec`` where ``time_elapsed`` is null. Rows of
    the module-level ``all_train_log`` with ``spend_time`` at or below the
    threshold form the input log; rows above it are used to build the targets.

    Parameters
    ----------
    thres_sec : number
        Threshold (in the unit of ``spend_time``) for sessions without a
        ``time_elapsed`` value.
    meta : pd.DataFrame
        Needs ``session_id`` and ``time_elapsed`` columns; it is copied, not
        modified.
    """

    def __init__(self, thres_sec, meta):
        self.thres_sec = thres_sec
        self.meta = meta.copy()
        self.meta['time_elapsed_sec'] = self.meta['time_elapsed'] * 60
        self.meta.loc[self.meta['time_elapsed_sec'].isnull(), 'time_elapsed_sec'] = thres_sec

        # Keep only the rows logged up to the threshold -> public_train_log, train_sessions
        merge_train = self._merge_threshold()
        self.public_train_log = merge_train[merge_train['spend_time'] <= merge_train['time_elapsed_sec']]
        self.train_sessions = self.public_train_log["session_id"].unique()

    def _merge_threshold(self) -> pd.DataFrame:
        """Attach each session's ``time_elapsed_sec`` to ``all_train_log``."""
        # Rebuilt on demand rather than cached, so the full merged log is not
        # kept alive for the lifetime of the instance.
        return pd.merge(all_train_log, self.meta[["session_id", "time_elapsed_sec"]], on=["session_id"], how="inner")

    def get_train_input_log(self) -> pd.DataFrame:
        """Return the log rows at or before each session's threshold."""
        return self.public_train_log

    def get_train_target(self) -> pd.DataFrame:
        """Build the binary targets for the training sessions.

        Returns one row per session in ``train_sessions`` with a ``session_id``
        column and one column per entry of ``target_category`` (in that
        order). A value is 1 when the summed ``n_items`` of that category after
        the threshold is positive, else 0.
        """
        train_target = pd.DataFrame(
            index=self.train_sessions,
        )
        train_target.index.name = "session_id"

        # Aggregate the purchased quantity from the rows after the threshold
        merge_train = self._merge_threshold()
        after_elapsed_log = merge_train[merge_train['spend_time'] > merge_train['time_elapsed_sec']]

        train_item_num = agg_payment(after_elapsed_log)
        train_item_num = train_item_num[train_item_num['category_id'].isin(target_category)]
        train_target_pos = train_item_num.groupby(["session_id", "category_id"])["n_items"].sum().unstack().fillna(0).astype(int)
        train_target_pos = (train_target_pos > 0).astype(int)
        # A category with no purchases after the threshold has no column after
        # unstack(); reindex so every target column is always present.
        train_target_pos = train_target_pos.reindex(columns=target_category, fill_value=0)

        return train_target.join(train_target_pos).fillna(0).reset_index()

In [8]:
def get_train_log(elapsed_min):
    """Build the training input log and targets for one elapsed-minute setting.

    ``elapsed_min`` is converted to seconds and used as the fallback threshold
    of a ``RetailDataset`` built on the module-level ``meta``. Prints the shape
    of the target frame and returns ``(train_input_log, y_train)``.
    """
    threshold_sec = elapsed_min*60
    retail = RetailDataset(threshold_sec, meta)
    input_log = retail.get_train_input_log()
    targets = retail.get_train_target()
    print('train_session', targets.shape)
    return input_log, targets

In [9]:
# Elapsed-minute settings; one training set and one group of models is built per value.
ELAPSED_MIN = [0, 3, 5, 10]
# ELAPSED_MIN = [5]

In [10]:
def save_train_log():
    """Build and cache the training log/targets for every ELAPSED_MIN value.

    For each setting the input log is written to
    ``../inputs/train2_log_<min>.f`` and the targets to
    ``../inputs/train2_y_<min>.f`` (feather). Indexes are reset and the target
    column labels are converted to strings before writing.
    """
    log_template = '../inputs/train2_log_{}.f'
    y_template = '../inputs/train2_y_{}.f'
    for elap_min in ELAPSED_MIN:
        log_frame, target_frame = get_train_log(elap_min)
        log_frame = log_frame.reset_index(drop=True)
        target_frame = target_frame.reset_index(drop=True).rename(columns=str)
        log_frame.to_feather(log_template.format(elap_min))
        target_frame.to_feather(y_template.format(elap_min))

In [11]:
def load_train_log():
    """Read the cached training logs and targets written by save_train_log.

    Returns two dicts keyed by elapsed-minute value:
    ``(train_log, train_y)``.
    """
    logs, targets = {}, {}
    for minutes in ELAPSED_MIN:
        logs[minutes] = pd.read_feather('../inputs/train2_log_{}.f'.format(minutes))
        targets[minutes] = pd.read_feather('../inputs/train2_y_{}.f'.format(minutes))
    return logs, targets

In [12]:
# save_train_log()

- 0, train_session (378594, 16)
- 3, train_session (389649, 16)
- 5, train_session (390621, 16)
- 10, train_session (391074, 16)

### 過去のログデータ

In [13]:
# Per-session, per-product quantities over the whole training log (input to the user features).
payed_item = agg_payment(all_train_log)

#### クーポン発行とターゲットカテゴリの購入数の関係

In [14]:
# def get_coupon_log_data(payed_item):
#     # train用のデータから購買した商品
#     payed_item_in_cat = payed_item[payed_item['category_id'].isin(target_category)]
#     cat_pivot = payed_item_in_cat.pivot_table(
#         index='session_id', columns='category_id', values='n_items').fillna(0).add_prefix('coupon_').reset_index()

#     # 発券されたクーポン
#     coupon = all_train_log[all_train_log['kind_1'] == 'クーポン']
#     coupon_count = coupon['value_1'].value_counts()
#     print(len(coupon_count))
#     session_coupon = coupon[coupon['value_1'].isin(list(coupon_count.index))][['session_id', 'value_1']]
#     session_coupon = session_coupon.rename(columns={'value_1':'coupon'})
#     session_coupon = session_coupon.groupby(['session_id', 'coupon']).size().reset_index()

#     session_cat_coupon = pd.merge(cat_pivot, session_coupon[['session_id', 'coupon']], on='session_id', how='left')
#     session_cat_coupon['coupon'] = session_cat_coupon['coupon'].fillna(0)
#     session_cat_coupon.drop(columns=['session_id'], inplace=True)
#     group_coupon = session_cat_coupon.groupby(['coupon']).mean()
#     group_coupon[group_coupon < 0] = 0
#     return group_coupon.reset_index()

In [15]:
# coupon_name = {
#     'ml', '水', '乳', '茶', 'ポテ', 'ウォーター', 'カップ', '柿の種', 'サブレ', 'バター', 'コーヒー', 'ネスレ', 'ミルク',
#     'チョコ', 'コーラ', 'チップス', 'バニラ', '炭酸水', '天然水', 'ビール', '飴', 'ラーメン', 'コンソメ', 'ホット', 'パック', 'プリン',
#     'キャラメル', 'カルビー', 'アルフォート', 'フルグラ', 'ゼリー', 'アソート', 'キットカット', 
# }

In [16]:
# def coupon_category():
#     coupon = all_train_log[all_train_log['kind_1'] == 'クーポン']
#     coupon_count = coupon['name_1'].value_counts()
#     print(len(coupon_count))
#     df_coupon = pd.DataFrame(coupon_count).reset_index()

#     for i, name in enumerate(coupon_name):
#         df_coupon.loc[df_coupon['index'].str.contains(name), 'coup_cat'] = str(i)

#     df_coupon = df_coupon[df_coupon['coup_cat'].notnull()]
#     return df_coupon.rename(columns={'index':'coupon'})[['coupon', 'coup_cat']]

In [17]:
# df_coupon_cat = coupon_category()

In [18]:
def LDA_topic(df_input, topic, index, prefix, random_state=0):
    """Project the rows of ``df_input`` onto LDA topic weights.

    Parameters
    ----------
    df_input : pd.DataFrame
        Frame containing the ``index`` column; all other columns are fed to
        the LDA model.
    topic : int
        Number of topics (``n_components``).
    index : str
        Column used as the row identifier; it is restored as a column in the
        result.
    prefix : str
        Prefix added to the topic column names (``<prefix>0``, ``<prefix>1``, ...).
    random_state : int or None, default 0
        Seed passed to the LDA model so repeated runs give the same topics.
        Pass ``None`` for an unseeded fit.

    Returns
    -------
    pd.DataFrame
        ``index`` column followed by one column per topic.
    """
    df_cp = df_input.set_index(index)
    # Without a fixed seed the fitted topics differ between runs.
    lda = LDA(n_components=topic, random_state=random_state)
    topic_weights = lda.fit_transform(df_cp)
    lda_out = pd.DataFrame(topic_weights, index=df_cp.index).add_prefix(prefix)
    return lda_out.reset_index()

In [19]:
# group_coupon = get_coupon_log_data(payed_item)
# lda_coupon = LDA_topic(group_coupon, 6, 'coupon', 'LDA_coup_')

### ユーザ情報

In [20]:
# Attach the user_master attributes to every session through user_id.
user_features = pd.merge(meta[["session_id", "user_id"]], user_master, on="user_id", how="left")
# Ages of 80 and above or below 10 are treated as missing.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
user_features.loc[user_features['age'] >= 80, 'age'] = np.nan
user_features.loc[user_features['age'] < 10, 'age'] = np.nan
# Gender codes above 1 are treated as missing.
user_features.loc[user_features['gender'] > 1, 'gender'] = np.nan

In [21]:
def get_user_item(payed_item, min_total=5000):
    """Build per-user purchase-quantity features for non-target categories.

    Parameters
    ----------
    payed_item : pd.DataFrame
        Output of ``agg_payment``: ``session_id``, ``n_items``, ``category_id``.
    min_total : number, default 5000
        A category is kept only when its quantity summed over all users is
        strictly greater than this value.

    Returns
    -------
    pd.DataFrame
        One row per row of ``user_master`` with ``user_id`` and one float32
        column ``user_pay_<category_id>`` per kept category that is not in
        ``target_category``. Users without purchases in the training data get
        the column mean; negative values are clipped to 0.
    """
    # Quantity purchased per user and category in the training data
    user_payed_item = pd.merge(payed_item, meta[['session_id', 'user_id']], on='session_id', how='left')
    group_user_item = user_payed_item.groupby(['user_id', 'category_id'])[['n_items']].sum().reset_index()
    pivot_user_item = group_user_item.pivot_table(index='user_id', columns='category_id', values='n_items')

    # Keep categories whose total over all users exceeds min_total
    category_total = pivot_user_item.sum()
    kept_categories = category_total[category_total > min_total].index
    pivot_user_item = pivot_user_item[kept_categories].fillna(0)

    # Target categories are excluded from the features
    feature_categories = [c for c in pivot_user_item.columns if c not in target_category]

    # Column means, used to fill users that are absent from the training data.
    # Taken over the category columns only: including user_id in mean() fails
    # for string ids on recent pandas, and for numeric ids it would put
    # 'user_id' among the columns that get dropped.
    category_mean = pivot_user_item[feature_categories].mean()

    # Align to the full user list
    all_user_item = pd.merge(
        user_master[['user_id']],
        pivot_user_item[feature_categories].reset_index(),
        on='user_id',
        how='left',
    )

    features = (
        all_user_item[feature_categories]
        .fillna(category_mean)
        .astype('float32')
        .clip(lower=0)
    )
    features.columns = ['user_pay_{}'.format(c) for c in feature_categories]

    # Built with one concat instead of inserting hundreds of columns one by one.
    return pd.concat([all_user_item[['user_id']], features], axis=1)

In [22]:
# Per-user purchase features built from the whole training log.
all_user_item = get_user_item(payed_item)
print(all_user_item.shape)

(40350, 231)


In [23]:
# %%time
# user_lda = LDA_topic(all_user_item, 10, 'user_id', 'LDA_')
# user_lda.to_feather('../inputs/user_lda.f')

In [24]:
# Load the cached LDA columns and attach them to the per-user features by user_id.
user_lda = pd.read_feather('../inputs/user_lda.f')
all_user_item = pd.merge(all_user_item, user_lda, on='user_id', how='left')

In [25]:
# Shape after the LDA columns have been merged in.
print(all_user_item.shape)

(40350, 241)


### メタ情報

In [26]:
def get_meta_features(meta):
    """Derive calendar and visit-history features from the session metadata.

    Reads the ``session_id``, ``user_id``, ``date`` (datetime) and ``hour``
    columns of ``meta``, and expects ``time_elapsed`` and ``date_str`` to exist
    because they are dropped from the result. ``meta`` itself is not modified.

    Added columns:
    ``year``, ``month``, ``day``, ``dow``, ``doy`` -- parts of ``date``;
    ``week_time`` -- ``dow * 24 + hour``;
    ``userid`` -- label-encoded ``user_id``;
    ``timezone_count`` -- number of rows sharing the same ``week_time``;
    ``date_diff`` -- days since the same user's previous session (NaN for the
    user's first session);
    ``date_rank`` -- rank of ``date`` within the user.

    Returns the feature frame without ``user_id``, ``date``, ``time_elapsed``
    and ``date_str``.
    """
    meta_features = meta.copy()
    meta_features['year'] = meta_features['date'].dt.year
    meta_features['month'] = meta_features['date'].dt.month
    meta_features['day'] = meta_features['date'].dt.day
    meta_features['dow'] = meta_features['date'].dt.dayofweek
    meta_features['doy'] = meta_features['date'].dt.dayofyear
    meta_features['week_time'] = meta_features['dow'] * 24 + meta_features['hour']
    le = preprocessing.LabelEncoder()
    meta_features['userid'] = le.fit_transform(meta_features['user_id'])

    # Number of sessions per (day-of-week x hour) slot
    df_tz = meta_features.groupby(['week_time']).size().rename('timezone_count')
    df_tz = pd.DataFrame(df_tz).reset_index()
    meta_features = pd.merge(meta_features, df_tz, on='week_time', how='left')

    # Days since the same user's previous visit.
    # The gap is computed per user in chronological order and merged back on
    # session_id. A positional index join would only be correct when meta is
    # already sorted by a unique session_id.
    visits = (
        meta_features[['session_id', 'user_id', 'date']]
        .drop_duplicates()
        .sort_values(['user_id', 'date', 'session_id'])
    )
    visits['date_diff'] = visits.groupby('user_id')['date'].diff().dt.days
    # One row per session_id so the merge cannot multiply rows.
    visits = visits.drop_duplicates('session_id')[['session_id', 'date_diff']]
    meta_features = pd.merge(meta_features, visits, on='session_id', how='left')

    # Rank of the session date within the user
    meta_features['date_rank'] = meta_features.groupby(['user_id'])['date'].rank(ascending=True)

    return meta_features.drop(columns=['user_id', 'date', 'time_elapsed', 'date_str'])

In [27]:
# Session-level calendar / visit-history features for every row of meta.
meta_features = get_meta_features(meta)

### ディスプレイアクション

In [28]:
# Map every distinct display name to a feature column name "disp_cnt_<i>",
# numbered in order of first appearance in display_action_id.
disp_name_dic = {
    disp: 'disp_cnt_{}'.format(i)
    for i, disp in enumerate(display_action_id['display_name'].unique())
}

# Same mapping for action names -> "act_cnt_<i>".
act_name_dic = {
    action: 'act_cnt_{}'.format(i)
    for i, action in enumerate(display_action_id['action_name'].unique())
}

In [29]:
def get_display_name_feature(input_log):
    """Count log rows per session and display name.

    ``input_log`` is left-joined to the module-level ``display_action_id`` on
    ``display_action_id``. The result has ``session_id`` plus one column per
    value of ``disp_name_dic`` (in dict order); names that never occur in
    ``input_log`` are 0 for every session.
    """
    logged = input_log.merge(display_action_id, on='display_action_id', how='left')
    counts = (
        logged.groupby(['session_id', 'display_name'])
        .size()
        .reset_index(name='disp_name_count')
    )
    pivot = (
        counts.pivot_table(index='session_id', columns='display_name', values='disp_name_count', aggfunc='sum')
        .reset_index()
        .fillna(0)
        .rename(columns=disp_name_dic)
    )

    disp_out = pivot[['session_id']].copy()
    # Start every known column at 0 so the layout does not depend on the input.
    for name in disp_name_dic.values():
        disp_out[name] = 0

    # Overwrite the columns that were actually observed.
    observed = [name for name in pivot.columns if name != 'session_id']
    for name in observed:
        disp_out[name] = pivot[name]

    return disp_out

In [30]:
def get_action_name_feature(input_log):
    """Count log rows per session and action name.

    ``input_log`` is left-joined to the module-level ``display_action_id`` on
    ``display_action_id``. The result has ``session_id`` plus one column per
    value of ``act_name_dic`` (in dict order); names that never occur in
    ``input_log`` are 0 for every session.
    """
    logged = input_log.merge(display_action_id, on='display_action_id', how='left')
    counts = (
        logged.groupby(['session_id', 'action_name'])
        .size()
        .reset_index(name='act_name_count')
    )
    pivot = (
        counts.pivot_table(index='session_id', columns='action_name', values='act_name_count', aggfunc='sum')
        .reset_index()
        .fillna(0)
        .rename(columns=act_name_dic)
    )

    act_out = pivot[['session_id']].copy()
    # Start every known column at 0 so the layout does not depend on the input.
    for name in act_name_dic.values():
        act_out[name] = 0

    # Overwrite the columns that were actually observed.
    observed = [name for name in pivot.columns if name != 'session_id']
    for name in observed:
        act_out[name] = pivot[name]

    return act_out

### セッション単位の特徴量

In [31]:
# def get_coupon_info(input_log):
#     session_coupon = input_log[input_log["kind_1"] == "クーポン"][['session_id', 'value_1']].rename(columns={'value_1':'coupon'})
#     session_coupon = pd.merge(session_coupon, lda_coupon, on='coupon', how='left').drop(columns=['coupon'])
#     session_coupon = session_coupon.groupby(['session_id']).max().reset_index()
#     return session_coupon

In [32]:
# def get_coupon_info(input_log):
#     session_coupon = input_log[input_log["kind_1"] == "クーポン"][['session_id', 'name_1']].rename(columns = {'name_1':'coupon'})
#     session_coupon = pd.merge(session_coupon, df_coupon_cat, on='coupon', how='left').drop(columns=['coupon'])
    
#     for cat in df_coupon_cat['coup_cat'].unique():
#         new_col = 'coup_cat_{}'.format(cat)
#         session_coupon[new_col] = 0
#         session_coupon.loc[session_coupon['coup_cat'] == cat, new_col] = 1
    
#     session_coupon.drop(columns=['coup_cat'], inplace=True)
#     return session_coupon.groupby('session_id').sum().reset_index()

In [33]:
def get_pre_payment_item(input_log):
    """Quantity of each target category already present in the input log.

    Rows whose ``kind_1`` is "商品" are summed per session and JAN
    (``value_1``), mapped to ``category_id`` through the module-level
    ``product_master`` (inner join) and restricted to ``target_category``.

    Returns one row per distinct ``session_id`` of ``input_log`` (in order of
    first appearance) with ``session_id`` and one float column
    ``pre_target_<category_id>`` per target category, 0 where nothing was
    logged.
    """
    session_unique = input_log['session_id'].unique()
    agg = input_log.loc[input_log["kind_1"] == "商品"].groupby(["session_id", "value_1"])["n_items"].sum().reset_index()
    agg = agg.rename(columns={"value_1": "JAN"})
    agg = pd.merge(agg, product_master[["JAN", "category_id"]], on="JAN", how="inner")
    agg = agg[agg['category_id'].isin(target_category)]
    per_category = agg.groupby(["session_id", "category_id"])["n_items"].sum()

    session_index = pd.Index(session_unique, name="session_id")
    if per_category.empty:
        # No target-category items at all: every count is zero.
        counts = pd.DataFrame(0.0, index=session_index, columns=target_category)
    else:
        # reindex() guarantees one row per session and one column per target
        # category. Padding with dummy rows under session_id 0 would be
        # averaged into a real session 0 and would turn the key into float.
        counts = (
            per_category.unstack()
            .reindex(index=session_index, columns=target_category)
            .fillna(0)
            .astype(float)
        )

    counts.columns = ['pre_target_{}'.format(c) for c in target_category]
    return counts.reset_index()

In [34]:
def get_session_kind_group(input_log):
    """Count log rows per session for each ``kind_1`` value of interest.

    Returns ``session_id`` plus one ``group_count_<name>`` column per kind
    ("商品" first as ``group_count_item``, then the kinds listed below). A
    session with no row of a given kind gets NaN in that column.
    """
    kind_name ={
        'クーポン': 'coupon',
        '会計': 'kaikei',
        'キー': 'key',
        'カテゴリ': 'categry',
        'バーコードスキャン': 'barcode',
        'UUID': 'uuid',
        '使用ポイント': 'usedpoint',
        '確認': 'confirm',
        'ブランドスイッチ': 'bland',
        'レシピ': 'recipe',
        'スマホスキャン': 'smartphone',
        '磁気スキャン': 'magnetic',
        'レコメンド': 'recommend',
        '倍率ポイント': 'point',
    }
    # Item rows come first, followed by the other kinds in mapping order.
    kind_labels = [('商品', 'item')] + list(kind_name.items())
    per_kind_counts = [
        input_log[input_log['kind_1'] == kind]
        .groupby(["session_id"])
        .size()
        .rename('group_count_' + label)
        for kind, label in kind_labels
    ]
    return pd.concat(per_kind_counts, axis=1).reset_index()

In [35]:
def get_session_item_info(input_log):
    """Summarise the item rows ("商品") of each session.

    Item rows are inner-joined to the module-level ``product_master`` on the
    JAN code (``value_1``). Per session the result holds:
    ``total`` (sum of ``n_items * unit_price``), ``cart_item_cnt`` (sum of
    ``number_1``), ``total_item_cnt`` (sum of ``n_items``), ``item_nunique``
    (distinct ``name_1``), ``cat_nunique`` (distinct ``category_id``) and four
    ratios derived from them.
    """
    item_log = (
        input_log[input_log['kind_1'] == '商品']
        .rename(columns={"value_1": "JAN"})
        .merge(product_master[["JAN", "category_id"]], on="JAN", how="inner")
    )
    item_log = item_log.assign(total=item_log['n_items'] * item_log['unit_price'])

    session_item = (
        item_log.groupby(['session_id'])
        .agg(
            total=('total', 'sum'),
            cart_item_cnt=('number_1', 'sum'),
            total_item_cnt=('n_items', 'sum'),
            item_nunique=('name_1', 'nunique'),
            cat_nunique=('category_id', 'nunique'),
        )
        .reset_index()
    )

    total = session_item['total']
    n_items = session_item['total_item_cnt']
    n_products = session_item['item_nunique']
    n_categories = session_item['cat_nunique']
    session_item['mean_price'] = total / n_items
    session_item['item_cnt_per_nuniq'] = n_items / n_products
    session_item['item_nuniq_per_cat_nuniq'] = n_products / n_categories
    session_item['mean_price_per_cat'] = total / n_categories
    return session_item

In [36]:
def get_session_info(input_log):
    """Basic per-session statistics of the input log.

    Returns ``session_id``, ``n_actions`` (row count), ``spend_time`` (mean of
    ``spend_time``), ``uniq_user`` (distinct ``user_id``) and the two products
    ``n_actions_user`` and ``spend_time_user``.
    """
    by_session = input_log.groupby(["session_id"])

    session_features = pd.concat(
        [
            by_session.size().rename("n_actions"),       # number of log rows
            by_session["spend_time"].mean(),             # mean elapsed time
            by_session['user_id'].nunique().rename("uniq_user"),  # distinct users
        ],
        axis=1,
    )

    users = session_features['uniq_user']
    session_features['n_actions_user'] = session_features['n_actions'] * users
    session_features['spend_time_user'] = session_features['spend_time'] * users

    return session_features.reset_index()

### セッション単位で集計

In [37]:
def get_session_features(input_log):
    """Outer-join every session-level feature table on ``session_id``."""
    feature_builders = (
        get_session_info,
        get_session_kind_group,
        get_display_name_feature,
        get_action_name_feature,
        get_pre_payment_item,
        get_session_item_info,
    )

    df_ses = pd.DataFrame(columns=['session_id'])
    for build in feature_builders:
        df_ses = pd.merge(df_ses, build(input_log), on='session_id', how='outer')

    return df_ses

### 特徴量を集約する

In [38]:
def merge_features(input_log, session):
    """Assemble the model feature matrix for the given sessions.

    Left-joins, on ``session_id``, the session features computed from
    ``input_log`` and the module-level ``user_features`` and ``meta_features``;
    then left-joins ``all_user_item`` on ``user_id`` and drops ``user_id``.
    Asserts that the result has exactly one row per entry of ``session``.
    """
    out = pd.DataFrame({"session_id": session})
    for feat in (get_session_features(input_log), user_features, meta_features):
        out = out.merge(feat, on="session_id", how="left")

    # Per-user purchase features
    out = out.merge(all_user_item, on='user_id', how='left').drop(columns='user_id')

    assert len(session) == len(out)
    return out

In [39]:
def get_train_all_features(elapsed_min, train_log_list, train_y_list):
    """Build the training features for one elapsed-minute setting.

    Looks up the cached log and targets for ``elapsed_min``, builds the
    feature matrix for the sessions listed in the targets, prints its shape
    and returns ``(train_features, y_train)``.
    """
    y_train = train_y_list[elapsed_min]
    train_features = merge_features(train_log_list[elapsed_min], y_train['session_id'])
    print('train_features', train_features.shape)
    return train_features, y_train

In [40]:
def get_test_all_feature(elapsed_min):
    """Build the feature matrix for the test sessions of one elapsed-minute setting.

    Selects the rows of the module-level ``meta`` whose ``session_id`` is in
    ``test_sessions`` and whose ``time_elapsed`` equals ``elapsed_min``, builds
    features from the matching rows of ``test_input_log``, prints the shape and
    returns the features (one row per selected session).
    """
    in_test = meta['session_id'].isin(test_sessions)
    test_meta = meta[in_test & (meta['time_elapsed'] == elapsed_min)]

    # Keep only the log rows of the selected sessions. A left merge against
    # test_meta would keep every test row, so session features would be
    # computed for all test sessions on each call and then discarded.
    selected = test_input_log['session_id'].isin(test_meta['session_id'])
    test_input_elapsed = test_input_log[selected]

    test_features = merge_features(test_input_elapsed, test_meta['session_id'])
    print('test_features', test_features.shape)
    return test_features

In [41]:
# Number of cross-validation folds used by train_lgbm.
n_fold = 4

In [42]:
# Default LightGBM parameters for train_lgbm: binary objective, evaluated with AUC.
lgbm_param = {
    'objective' : 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'seed' : 0,
    'learning_rate':  0.05,
#   'max_depth': 6,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1
}

In [43]:
def train_lgbm(X, y, params=lgbm_param):
    """Train one LightGBM model per stratified CV fold for a binary target.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix; must contain the ``age``, ``gender`` and ``dow``
        columns, which are passed to LightGBM as categorical features.
    y : pd.Series
        Binary target, positionally aligned with ``X``.
    params : dict
        LightGBM training parameters (defaults to ``lgbm_param``).

    Returns
    -------
    oof_pred : np.ndarray
        Out-of-fold predictions, one per row of ``X``.
    models : list
        The fitted booster of each of the ``n_fold`` folds.
    score : float
        ROC AUC of ``oof_pred`` against ``y``.
    """
    fold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=0)
    cv = fold.split(X, y)

    models = []
    # Builtin float: the np.float alias was removed in NumPy 1.24.
    oof_pred = np.zeros_like(y, dtype=float)

    cat_feat = ['age', 'gender', 'dow']

    for i, (idx_train, idx_valid) in enumerate(cv):
        x_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        x_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        lgbm_train = lgbm.Dataset(x_train, y_train, categorical_feature = cat_feat)
        lgbm_eval = lgbm.Dataset(x_valid, y_valid, reference=lgbm_train, categorical_feature = cat_feat)

        # NOTE(review): early_stopping_rounds / verbose_eval are keyword
        # arguments of older lightgbm.train; newer releases expect callbacks
        # instead -- confirm against the installed LightGBM version.
        lgbm_model = lgbm.train(params,
                                lgbm_train,
                                valid_sets=lgbm_eval,
                                categorical_feature = cat_feat,
                                num_boost_round=10000,
                                early_stopping_rounds=100,
                                verbose_eval=-1)
        y_pred = lgbm_model.predict(x_valid, num_iteration=lgbm_model.best_iteration)

        oof_pred[idx_valid] = y_pred
        models.append(lgbm_model)

    score = roc_auc_score(y, oof_pred)
    # Backslash escaped explicitly; the printed text is unchanged.
    print('--- FINISHED \\ whole score: {:.4f} ---'.format(score))
    return oof_pred, models, score

In [44]:
def predict(models, feature):
    """Average the predictions of several fitted models.

    Each model is asked to predict ``feature`` at its own ``best_iteration``;
    the element-wise mean over the models is returned.
    """
    per_model = [
        model.predict(feature, num_iteration = model.best_iteration)
        for model in models
    ]
    return np.mean(per_model, axis=0)

In [45]:
# Load the cached per-setting training logs and targets (dicts keyed by elapsed minutes).
train_log_list, train_y_list = load_train_log()

In [46]:
# Run a garbage-collection pass before the training loop.
gc.collect()

20

In [47]:
%%time
# For every elapsed-minute setting: build train/test features, fit one CV
# ensemble per target column and predict the matching test sessions.
df_pred_all = pd.DataFrame()  # test predictions, stacked over all settings
df_score_all = pd.DataFrame(index=ELAPSED_MIN)  # out-of-fold AUC: rows = setting, columns = target
models_list_list = []  # one entry per setting; each entry holds one list of fold models per target

for elapsed_min in ELAPSED_MIN:
    print(f'===== {elapsed_min} =====')
    train_features, y_train = get_train_all_features(elapsed_min, train_log_list, train_y_list)
    test_features = get_test_all_feature(elapsed_min)

    # Predictions are indexed by the test session ids; the id column itself
    # is removed from both feature frames before training.
    df_pred = pd.DataFrame(index=test_features['session_id'])
    train_features.drop(columns=['session_id'], inplace=True)
    test_features.drop(columns=['session_id'], inplace=True)
    
    models_list = []
    for target in y_train.columns:
        # Every column of y_train except session_id is a target.
        if target == 'session_id':
            continue
        
        print(f"---- id = {target} -----")
        oof, models, score = train_lgbm(train_features, y_train[target])
        models_list.append(models)

        # Test prediction is the mean over the fold models.
        pred = predict(models, test_features)
        df_pred[target] = pred
        df_score_all.loc[elapsed_min, target] = score
        
    models_list_list.append(models_list)
    df_pred_all = pd.concat([df_pred_all, df_pred])
    print(len(df_pred_all))

===== 0 =====
train_features (378594, 345)
test_features (14277, 345)
---- id = 38 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1744]	valid_0's auc: 0.829248
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1313]	valid_0's auc: 0.832156
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1495]	valid_0's auc: 0.833393
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1366]	valid_0's auc: 0.830821
--- FINISHED \ whole score: 0.8314 ---
---- id = 110 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[444]	valid_0's auc: 0.803937
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[505]	valid_0's auc: 0.801332
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[760]	valid_0's auc: 0.807441
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[531]	valid_0's auc: 0.812489
--- FINISHED \ whole score: 0.8059 ---
---- id = 113 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1010]	valid_0's auc: 0.809348
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1647]	valid_0's auc: 0.808421
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1401]	valid_0's auc: 0.810123
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1068]	valid_0's auc: 0.806907
--- FINISHED \ whole score: 0.8084 ---
---- id = 114 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2099]	valid_0's auc: 0.762146
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2878]	valid_0's auc: 0.764925
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2092]	valid_0's auc: 0.762256
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2125]	valid_0's auc: 0.764093
--- FINISHED \ whole score: 0.7633 ---
---- id = 134 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1496]	valid_0's auc: 0.749724
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1630]	valid_0's auc: 0.749905
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1684]	valid_0's auc: 0.751463
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1247]	valid_0's auc: 0.748973
--- FINISHED \ whole score: 0.7500 ---
---- id = 171 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2918]	valid_0's auc: 0.872215
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2275]	valid_0's auc: 0.871734
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2016]	valid_0's auc: 0.871993
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2928]	valid_0's auc: 0.87587
--- FINISHED \ whole score: 0.8729 ---
---- id = 172 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1196]	valid_0's auc: 0.878067
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[987]	valid_0's auc: 0.879783
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[585]	valid_0's auc: 0.879638
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[992]	valid_0's auc: 0.875787
--- FINISHED \ whole score: 0.8776 ---
---- id = 173 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2772]	valid_0's auc: 0.869748
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2438]	valid_0's auc: 0.867854
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2311]	valid_0's auc: 0.868219
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2288]	valid_0's auc: 0.866907
--- FINISHED \ whole score: 0.8682 ---
---- id = 376 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1540]	valid_0's auc: 0.76016
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1021]	valid_0's auc: 0.76462
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1273]	valid_0's auc: 0.758918
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1158]	valid_0's auc: 0.759565
--- FINISHED \ whole score: 0.7607 ---
---- id = 435 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1613]	valid_0's auc: 0.869295
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1960]	valid_0's auc: 0.874846
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1286]	valid_0's auc: 0.873376
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1963]	valid_0's auc: 0.874498
--- FINISHED \ whole score: 0.8729 ---
---- id = 467 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1033]	valid_0's auc: 0.877905
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[970]	valid_0's auc: 0.877442
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1014]	valid_0's auc: 0.873039
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1097]	valid_0's auc: 0.877817
--- FINISHED \ whole score: 0.8765 ---
---- id = 537 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1304]	valid_0's auc: 0.914366
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1582]	valid_0's auc: 0.906948
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1375]	valid_0's auc: 0.907707
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1346]	valid_0's auc: 0.912256
--- FINISHED \ whole score: 0.9101 ---
---- id = 539 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2031]	valid_0's auc: 0.861301
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1364]	valid_0's auc: 0.86489
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1880]	valid_0's auc: 0.865654
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1322]	valid_0's auc: 0.864654
--- FINISHED \ whole score: 0.8638 ---
---- id = 629 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[546]	valid_0's auc: 0.920583
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[861]	valid_0's auc: 0.925626
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[467]	valid_0's auc: 0.930123
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[535]	valid_0's auc: 0.919933
--- FINISHED \ whole score: 0.9234 ---
---- id = 768 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1333]	valid_0's auc: 0.752378
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1940]	valid_0's auc: 0.752133
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1475]	valid_0's auc: 0.751115
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1724]	valid_0's auc: 0.754419
--- FINISHED \ whole score: 0.7525 ---
14277
===== 3 =====
train_features (389649, 345)
test_features (11304, 345)




---- id = 38 -----
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1199]	valid_0's auc: 0.825203
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1918]	valid_0's auc: 0.827671
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1504]	valid_0's auc: 0.823229
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1261]	valid_0's auc: 0.83022
--- FINISHED \ whole score: 0.8264 ---
---- id = 110 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[338]	valid_0's auc: 0.791965
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[479]	valid_0's auc: 0.792264
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[305]	valid_0's auc: 0.79259
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[617]	valid_0's auc: 0.793638
--- FINISHED \ whole score: 0.7920 ---
---- id = 113 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1145]	valid_0's auc: 0.806849
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1038]	valid_0's auc: 0.797481
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1142]	valid_0's auc: 0.800431
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1264]	valid_0's auc: 0.801561
--- FINISHED \ whole score: 0.8016 ---
---- id = 114 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1809]	valid_0's auc: 0.757108
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1817]	valid_0's auc: 0.759016
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1745]	valid_0's auc: 0.756447
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1787]	valid_0's auc: 0.755517
--- FINISHED \ whole score: 0.7570 ---
---- id = 134 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1234]	valid_0's auc: 0.742556
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1168]	valid_0's auc: 0.743313
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1052]	valid_0's auc: 0.743317
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1646]	valid_0's auc: 0.744278
--- FINISHED \ whole score: 0.7434 ---
---- id = 171 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2140]	valid_0's auc: 0.863416
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2918]	valid_0's auc: 0.8623
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2749]	valid_0's auc: 0.865623
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2395]	valid_0's auc: 0.866595
--- FINISHED \ whole score: 0.8644 ---
---- id = 172 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[552]	valid_0's auc: 0.869431
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1207]	valid_0's auc: 0.86486
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[906]	valid_0's auc: 0.869884
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1248]	valid_0's auc: 0.870072
--- FINISHED \ whole score: 0.8674 ---
---- id = 173 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2215]	valid_0's auc: 0.861774
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2272]	valid_0's auc: 0.859678
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2452]	valid_0's auc: 0.861362
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1981]	valid_0's auc: 0.859342
--- FINISHED \ whole score: 0.8605 ---
---- id = 376 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[930]	valid_0's auc: 0.751267
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1478]	valid_0's auc: 0.75435
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1185]	valid_0's auc: 0.757893
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1345]	valid_0's auc: 0.755812
--- FINISHED \ whole score: 0.7548 ---
---- id = 435 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1817]	valid_0's auc: 0.860939
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1325]	valid_0's auc: 0.855528
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1809]	valid_0's auc: 0.85844
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2127]	valid_0's auc: 0.858275
--- FINISHED \ whole score: 0.8581 ---
---- id = 467 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1059]	valid_0's auc: 0.86443
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[986]	valid_0's auc: 0.869596
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1009]	valid_0's auc: 0.865751
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[947]	valid_0's auc: 0.861624
--- FINISHED \ whole score: 0.8653 ---
---- id = 537 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1289]	valid_0's auc: 0.901081
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[864]	valid_0's auc: 0.895893
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1283]	valid_0's auc: 0.905208
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1608]	valid_0's auc: 0.901053
--- FINISHED \ whole score: 0.9005 ---
---- id = 539 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1099]	valid_0's auc: 0.851889
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1148]	valid_0's auc: 0.850228
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1404]	valid_0's auc: 0.854196
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1253]	valid_0's auc: 0.850147
--- FINISHED \ whole score: 0.8515 ---
---- id = 629 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[584]	valid_0's auc: 0.911278
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[639]	valid_0's auc: 0.915229
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[349]	valid_0's auc: 0.912101
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[536]	valid_0's auc: 0.920425
--- FINISHED \ whole score: 0.9143 ---
---- id = 768 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1568]	valid_0's auc: 0.739288
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1267]	valid_0's auc: 0.739993
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[818]	valid_0's auc: 0.736119
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1396]	valid_0's auc: 0.740801
--- FINISHED \ whole score: 0.7390 ---
25581
===== 5 =====
train_features (390621, 345)
test_features (14072, 345)
---- id = 38 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1657]	valid_0's auc: 0.829108
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1362]	valid_0's auc: 0.822278
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1244]	valid_0's auc: 0.823225
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1418]	valid_0's auc: 0.826644
--- FINISHED \ whole score: 0.8253 ---
---- id = 110 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[371]	valid_0's auc: 0.795687
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[644]	valid_0's auc: 0.789741
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[284]	valid_0's auc: 0.784018
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[531]	valid_0's auc: 0.777291
--- FINISHED \ whole score: 0.7854 ---
---- id = 113 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1140]	valid_0's auc: 0.796154
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1138]	valid_0's auc: 0.800914
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[918]	valid_0's auc: 0.793654
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1895]	valid_0's auc: 0.797707
--- FINISHED \ whole score: 0.7965 ---
---- id = 114 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1779]	valid_0's auc: 0.752611
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2121]	valid_0's auc: 0.754454
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2312]	valid_0's auc: 0.751973
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2312]	valid_0's auc: 0.752443
--- FINISHED \ whole score: 0.7529 ---
---- id = 134 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1645]	valid_0's auc: 0.738404
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1272]	valid_0's auc: 0.739523
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1428]	valid_0's auc: 0.740708
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1321]	valid_0's auc: 0.741722
--- FINISHED \ whole score: 0.7400 ---
---- id = 171 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2197]	valid_0's auc: 0.857125
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2255]	valid_0's auc: 0.858811
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2066]	valid_0's auc: 0.854521
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2444]	valid_0's auc: 0.860235
--- FINISHED \ whole score: 0.8577 ---
---- id = 172 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[780]	valid_0's auc: 0.863438
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[937]	valid_0's auc: 0.864262
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1196]	valid_0's auc: 0.861342
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[499]	valid_0's auc: 0.864953
--- FINISHED \ whole score: 0.8620 ---
---- id = 173 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2327]	valid_0's auc: 0.857687
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2592]	valid_0's auc: 0.8558
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2301]	valid_0's auc: 0.857327
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2411]	valid_0's auc: 0.854937
--- FINISHED \ whole score: 0.8564 ---
---- id = 376 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1147]	valid_0's auc: 0.752781
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1034]	valid_0's auc: 0.753147
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1016]	valid_0's auc: 0.754432
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1000]	valid_0's auc: 0.750634
--- FINISHED \ whole score: 0.7527 ---
---- id = 435 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1777]	valid_0's auc: 0.856644
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1436]	valid_0's auc: 0.846137
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1572]	valid_0's auc: 0.84948
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1212]	valid_0's auc: 0.854076
--- FINISHED \ whole score: 0.8514 ---
---- id = 467 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[763]	valid_0's auc: 0.862518
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[946]	valid_0's auc: 0.864997
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1013]	valid_0's auc: 0.860528
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1030]	valid_0's auc: 0.85668
--- FINISHED \ whole score: 0.8610 ---
---- id = 537 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1544]	valid_0's auc: 0.892977
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1204]	valid_0's auc: 0.895766
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1094]	valid_0's auc: 0.897552
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1390]	valid_0's auc: 0.894738
--- FINISHED \ whole score: 0.8949 ---
---- id = 539 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1415]	valid_0's auc: 0.849413
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[808]	valid_0's auc: 0.837208
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1083]	valid_0's auc: 0.84626
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1573]	valid_0's auc: 0.847359
--- FINISHED \ whole score: 0.8448 ---
---- id = 629 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[666]	valid_0's auc: 0.908723
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[414]	valid_0's auc: 0.908935
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[516]	valid_0's auc: 0.911725
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[350]	valid_0's auc: 0.902367
--- FINISHED \ whole score: 0.9072 ---
---- id = 768 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1124]	valid_0's auc: 0.738502
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1481]	valid_0's auc: 0.733439
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1447]	valid_0's auc: 0.738244
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1099]	valid_0's auc: 0.729548
--- FINISHED \ whole score: 0.7349 ---
39653
===== 10 =====
train_features (391074, 345)
test_features (16833, 345)
---- id = 38 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1490]	valid_0's auc: 0.82463
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1518]	valid_0's auc: 0.822737
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1645]	valid_0's auc: 0.825023
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1558]	valid_0's auc: 0.82494
--- FINISHED \ whole score: 0.8243 ---
---- id = 110 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[290]	valid_0's auc: 0.782856
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[358]	valid_0's auc: 0.794642
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[319]	valid_0's auc: 0.787353
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[362]	valid_0's auc: 0.768581
--- FINISHED \ whole score: 0.7830 ---
---- id = 113 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[776]	valid_0's auc: 0.784621
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1065]	valid_0's auc: 0.797211
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1481]	valid_0's auc: 0.802096
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1108]	valid_0's auc: 0.801432
--- FINISHED \ whole score: 0.7962 ---
---- id = 114 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1937]	valid_0's auc: 0.757795
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1493]	valid_0's auc: 0.760128
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1840]	valid_0's auc: 0.760099
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2214]	valid_0's auc: 0.759273
--- FINISHED \ whole score: 0.7593 ---
---- id = 134 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1388]	valid_0's auc: 0.745557
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1627]	valid_0's auc: 0.750892
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[823]	valid_0's auc: 0.747957
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1236]	valid_0's auc: 0.750195
--- FINISHED \ whole score: 0.7486 ---
---- id = 171 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1828]	valid_0's auc: 0.848496
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1799]	valid_0's auc: 0.846213
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2117]	valid_0's auc: 0.844904
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1752]	valid_0's auc: 0.846017
--- FINISHED \ whole score: 0.8463 ---
---- id = 172 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[523]	valid_0's auc: 0.841058
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[706]	valid_0's auc: 0.825799
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1304]	valid_0's auc: 0.857675
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[426]	valid_0's auc: 0.840071
--- FINISHED \ whole score: 0.8392 ---
---- id = 173 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2384]	valid_0's auc: 0.850799
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2205]	valid_0's auc: 0.85239
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2440]	valid_0's auc: 0.853778
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1829]	valid_0's auc: 0.851439
--- FINISHED \ whole score: 0.8520 ---
---- id = 376 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1245]	valid_0's auc: 0.763147
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1073]	valid_0's auc: 0.759364
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[695]	valid_0's auc: 0.768516
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[885]	valid_0's auc: 0.761914
--- FINISHED \ whole score: 0.7630 ---
---- id = 435 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1837]	valid_0's auc: 0.846793
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1520]	valid_0's auc: 0.846881
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1241]	valid_0's auc: 0.8451
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1479]	valid_0's auc: 0.844506
--- FINISHED \ whole score: 0.8454 ---
---- id = 467 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[790]	valid_0's auc: 0.842958
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[765]	valid_0's auc: 0.840925
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[785]	valid_0's auc: 0.858947
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[488]	valid_0's auc: 0.844848
--- FINISHED \ whole score: 0.8465 ---
---- id = 537 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1178]	valid_0's auc: 0.883665
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[784]	valid_0's auc: 0.880731
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1006]	valid_0's auc: 0.885145
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1103]	valid_0's auc: 0.880972
--- FINISHED \ whole score: 0.8823 ---
---- id = 539 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1252]	valid_0's auc: 0.836856
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[921]	valid_0's auc: 0.828807
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1509]	valid_0's auc: 0.836942
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1121]	valid_0's auc: 0.840898
--- FINISHED \ whole score: 0.8356 ---
---- id = 629 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[369]	valid_0's auc: 0.886791
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[511]	valid_0's auc: 0.892462
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[365]	valid_0's auc: 0.883691
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[461]	valid_0's auc: 0.897466
--- FINISHED \ whole score: 0.8899 ---
---- id = 768 -----




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1028]	valid_0's auc: 0.732977
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1105]	valid_0's auc: 0.735754
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[896]	valid_0's auc: 0.730222
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1160]	valid_0's auc: 0.741433
--- FINISHED \ whole score: 0.7351 ---
56486
CPU times: user 2d 19h 39min 44s, sys: 21h 46min 46s, total: 3d 17h 26min 30s
Wall time: 3h 53min 39s


In [48]:
# Preview the first five rows of the prediction table.
df_pred_all.head(n=5)

Unnamed: 0_level_0,38,110,113,114,134,171,172,173,376,435,467,537,539,629,768
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
663721,0.092665,0.014219,0.012817,0.396272,0.074485,0.067461,0.004397,0.058387,0.117032,0.017035,0.029398,0.00231,0.032586,0.089829,0.016217
663761,0.242649,0.100834,0.420604,0.477669,0.325915,0.188416,0.361077,0.043487,0.321602,0.003638,0.004904,0.015535,0.183381,0.000831,0.455646
663763,0.227018,0.033312,0.088646,0.862953,0.475925,0.006648,0.006304,0.003521,0.257165,0.051511,0.066139,0.001035,0.014025,0.022463,0.068059
663775,0.035411,0.016179,0.017881,0.155518,0.26795,0.161665,0.014868,0.617147,0.052686,0.230837,0.029077,0.005381,0.044041,0.009911,0.238391
663778,0.187925,0.013212,0.139206,0.17863,0.113601,0.069761,0.014029,0.072576,0.052591,0.052869,0.007779,0.03362,0.058312,0.002632,0.07231


In [49]:
# Score table: one column per target category id; the values match the
# "whole score" lines printed per id in the training log.
# NOTE(review): the row labels (0/3/5/10) appear to correspond to the
# '===== N =====' run headers in that log — confirm against the training cell.
df_score_all

Unnamed: 0,38,110,113,114,134,171,172,173,376,435,467,537,539,629,768
0,0.831351,0.805947,0.808429,0.763332,0.750008,0.872885,0.877558,0.868187,0.760733,0.872878,0.876491,0.910131,0.863827,0.92342,0.752467
3,0.826433,0.792019,0.80156,0.757015,0.743352,0.864422,0.867392,0.860523,0.754814,0.858122,0.865322,0.900508,0.851548,0.91428,0.738995
5,0.825311,0.785368,0.796469,0.752859,0.740048,0.857675,0.861986,0.856392,0.752729,0.851429,0.861021,0.894892,0.844778,0.907186,0.734918
10,0.824329,0.782983,0.796218,0.759283,0.748562,0.846344,0.839182,0.852011,0.763035,0.845445,0.846529,0.882302,0.835606,0.889854,0.735102


In [50]:
# Average the category scores within each row of the score table, then
# collapse those row means into one overall CV number.
cv = df_score_all.mean(axis='columns')
print(cv)
overall_cv = cv.mean()
print('- cv =', overall_cv)

0     0.835843
3     0.826420
5     0.821537
10    0.816452
dtype: float64
- cv = 0.8250632409328572


In [51]:
# Sanity check: the prediction table must have as many rows as `test`.
n_predicted, n_expected = len(df_pred_all), len(test)
assert n_predicted == n_expected

In [52]:
# Attach the predictions to the test sessions via session_id.
# validate='one_to_one' makes pandas raise MergeError if session_id is
# duplicated on either side. The length check alone cannot catch that: one
# duplicated key plus one missing key still yields len(test) rows.
submission = pd.merge(
    test[['session_id']],
    df_pred_all.reset_index(),
    on='session_id',
    how='inner',
    validate='one_to_one',
)
# An inner join silently drops unmatched sessions, so confirm none were lost.
assert len(submission) == len(test)

In [53]:
# Write the submission file: every column except session_id, with no row index.
prediction_columns = submission.drop(columns='session_id')
prediction_columns.to_csv('../outputs/submission.csv', index=False)

#### baseline_12: 特徴量削減, lr=0.05
- feat = 345
- Wall time: 3h 53min 39s
- cv = 0.82506 (0.835/0.826/0.821/0.816)
- LB = 0.7675

#### baseline_11: クーポンカテゴリの種類減らす, ユニークユーザー数, 平均単価, ユニーク商品数 
- feat = 375
- Wall time: 2h 18min 19s
- cv = 0.81843 (0.831/0.820/0.814/0.806)
- LB = 0.7626

#### baseline_10: クーポンカテゴリの表示回数
- feat = 412
- Wall time: 2h 8min 53s
- cv = 0.821003 (0.831/0.823/0.818/0.8103)
- LB = 0.7657

#### baseline_9-1: 'feature_fraction': 0.6,'bagging_fraction': 0.6,'bagging_freq': 2,
- feat = 333
- Wall time: 2h 2min 14s
- cv = 0.81961 (0.829/0.822/0.816/0.809)
- LB = 0.7658

#### baseline_9: 来店間隔、回数、曜日x時間の来店者数
- feat = 333
- Wall time: 2h 9min 51s
- cv = 0.82118 (0.831/0.824/0.818/0.810)
- LB = 0.7665

#### baseline_8: クーポン発行とターゲットの購買実績
- feat = 344
- Wall time: 2h 5min 13s
- cv = 0.820398 (0.832/0.822/0.817/0.809)
- LB = 0.7484

#### baseline_7: userごとのカテゴリの過去の購買実績
- feat = 329
- Wall time: 1h 59min 16s
- cv = 0.82148 (0.832/0.824/0.818/0.810)
- LB = 0.7671

#### baseline_6: session内のターゲットの購買
- feat = 89
- Wall time: 37min 47s
- cv = 0.66015
- LB = 0.6247

#### baseline_5
- feat = 74
- Wall time: 39min 18s
- cv = 0.66859
- LB = 0.6230