- go back to ver 65
- change max_depth to 7

In [1]:
!pip install pytorch-tabnet
!pip install jpholiday

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-3.1.1-py3-none-any.whl (39 kB)
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-3.1.1
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m
Collecting jpholiday
  Downloading jpholiday-0.1.5-py3-none-any.whl (8.7 kB)
Installing collected packages: jpholiday
Successfully installed jpholiday-0.1.5
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [2]:
import os
import sys
import random
import numpy as np 
import pandas as pd 
import lightgbm as lgb
import datetime
import jpholiday
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
sys.path.append('../input/multilabelstraifier/')
from ml_stratifiers import MultilabelStratifiedKFold

from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import tensorflow as tf

from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool

import warnings
warnings.filterwarnings('ignore')
pd.set_option("max_rows", 110)

In [3]:
NFOLDS = 5

In [4]:
DIR = '../input/atma-retail20210129/'
log_df = pd.read_csv(DIR+'carlog.csv', dtype={ 'value_1': str }, parse_dates=['date'])
test_df = pd.read_csv(DIR+'test.csv')
meta_df = pd.read_csv(DIR+"meta.csv")
display_action_id = pd.read_csv(DIR+"display_action_id.csv")
product_master_df = pd.read_csv(DIR+"product_master.csv", dtype={ 'JAN': str })
user_master = pd.read_csv(DIR+"user_master.csv")
submission = pd.read_csv(DIR+"atmaCup9__sample_submission.csv")
test_sessions = test_df['session_id'].unique()

In [5]:
all_actions = list(display_action_id.display_action_id.unique())
del display_action_id

In [6]:
TARGET_CATEGORIES = [
    # お酒に関するもの
    'ビール系__RTD', 'ビール系__ビール系', 'ビール系__ノンアルコール',
    
    # お菓子に関するもの
    'スナック・キャンディー__スナック', 
    'チョコ・ビスクラ__チョコレート', 
    'スナック・キャンディー__ガム', 
    'スナック・キャンディー__シリアル',
    'アイスクリーム__ノベルティー', 
    '和菓子__米菓',
    
    # 飲料に関するもの
    '水・炭酸水__大型PET（炭酸水）',
    '水・炭酸水__小型PET（炭酸水）',
    '缶飲料__コーヒー（缶）',
    '小型PET__コーヒー（小型PET）',
    '大型PET__無糖茶（大型PET）',
    
    # 麺類
    '麺類__カップ麺',
]

In [7]:
cat2id = dict(zip(product_master_df['category_name'], product_master_df['category_id']))
TARGET_IDS = pd.Series(TARGET_CATEGORIES).map(cat2id).values.tolist()
category_id2code = dict(zip(TARGET_IDS, TARGET_CATEGORIES))

for x in zip(TARGET_IDS, TARGET_CATEGORIES):
    print(x)

def only_purchase_records(input_df: pd.DataFrame) -> pd.DataFrame:
    idx = input_df['kind_1'] == '商品'
    out_df = input_df[idx].reset_index(drop=True)
    return out_df

def create_payment(input_df: pd.DataFrame) -> pd.DataFrame:
    """
    ログデータから session_id / JAN ごとの購買情報に変換します.

    Args:
        input_df:
            レジカートログデータ

    Returns:
        session_id, JAN, n_items (合計購買数) の DataFrame
    """

    # 購買情報は商品のものだけ.
    out_df = only_purchase_records(input_df)
    out_df = out_df.groupby(['session_id', 'value_1'])['n_items'].sum().reset_index()
    out_df = out_df.rename(columns={
        'value_1': 'JAN'
    })
    return out_df

def annot_category(input_df: pd.DataFrame,
                   master_df: pd.DataFrame):
    """
    カテゴリ ID をひも付けます.

    Args:
        input_df:
            変換するデータ.
            `value_1`  or `JAN` を column として持っている必要があります.
        master_df:
            商品マスタのデータフレーム

    Returns:

    """
    input_df = input_df.rename(columns={'value_1': 'JAN'})
    out_df = pd.merge(input_df['JAN'],
                      master_df[['JAN', 'category_id']], on='JAN', how='left')
    return out_df['category_id']

def only_payment_session_record(input_log_df):
    """支払いが紐づくセッションへ絞り込みを行なう"""
    payed_sessions = input_log_df[input_log_df['is_payment'] == 1]['session_id'].unique()
    idx = input_log_df['session_id'].isin(payed_sessions)
    out_df = input_log_df[idx].reset_index(drop=True)
    return out_df

def create_target_from_log(log_df: pd.DataFrame,
                           product_master_df: pd.DataFrame,
                          only_payment=True):

    if only_payment:
        log_df = only_payment_session_record(log_df)
    pay_df = create_payment(log_df)
    pay_df['category_id'] = annot_category(pay_df, master_df=product_master_df)

    # null の category を削除. JAN が紐付かない時に発生する.
    idx_null = pay_df['category_id'].isnull()
    pay_df = pay_df[~idx_null].reset_index(drop=True)
    # Nullが混じっている時 float になるため int へ明示的に戻す
    pay_df['category_id'] = pay_df['category_id'].astype(int)

    idx = pay_df['category_id'].isin(TARGET_IDS)
    target_df = pd.pivot_table(data=pay_df[idx],
                               index='session_id',
                               columns='category_id',
                               values='n_items',
                               aggfunc='sum')

    sessions = sorted(log_df['session_id'].unique())
    print(len(sessions))
    target_df = target_df.reindex(sessions)
    target_df = target_df.fillna(0).astype(int)
    return target_df, pay_df

(171, 'ビール系__RTD')
(173, 'ビール系__ビール系')
(172, 'ビール系__ノンアルコール')
(114, 'スナック・キャンディー__スナック')
(134, 'チョコ・ビスクラ__チョコレート')
(110, 'スナック・キャンディー__ガム')
(113, 'スナック・キャンディー__シリアル')
(38, 'アイスクリーム__ノベルティー')
(376, '和菓子__米菓')
(537, '水・炭酸水__大型PET（炭酸水）')
(539, '水・炭酸水__小型PET（炭酸水）')
(629, '缶飲料__コーヒー（缶）')
(467, '小型PET__コーヒー（小型PET）')
(435, '大型PET__無糖茶（大型PET）')
(768, '麺類__カップ麺')


In [8]:
def target_encoding(X_train, y_train, X_test, col_name, target_name, user_df, replace=True, option = "mean"):
    X_train = X_train.copy()
    X_test = X_test.copy()
    rare_users = list(user_df[user_df["total_appearance"]<10]["user_id"])

    Xy = pd.DataFrame({'trans_col': X_train[col_name], 'target': y_train})
    
    if option == "sum":
        target_mean_all = Xy.groupby('trans_col')['target'].sum()
    else:
        target_mean_all = Xy.groupby('trans_col')['target'].mean()  
        
    if replace:
        X_test[col_name+str(target_name)] = X_test[col_name].map(target_mean_all)
        X_test.loc[X_test.user_id.isin(rare_users), col_name+str(target_name)] = -1
        X_test[col_name+str(target_name)].fillna(-1, inplace=True)
    else:
        X_test[f'te_{col_name}_{target_name}'] = X_test[col_name].map(target_mean_all)
        X_test.loc[X_test.user_id.isin(rare_users), f'te_{col_name}_{target_name}'] = -1
        X_test[f'te_{col_name}_{target_name}'].fillna(-1, inplace=True)

    oof_target = np.zeros(X_train.shape[0])
    kf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=0)
    for idx_1, idx_2 in kf.split(X_train, y_train):
        if option == "sum":
            target_mean = Xy.iloc[idx_1, :].groupby('trans_col')['target'].sum()
        else:
            target_mean = Xy.iloc[idx_1, :].groupby('trans_col')['target'].mean()
            
        oof_target[idx_2] = X_train[col_name].iloc[idx_2].map(target_mean)

    if replace:
        X_train[col_name+str(target_name)] = oof_target
        X_train.loc[X_train.user_id.isin(rare_users), col_name+str(target_name)] = -1
        X_train[col_name+str(target_name)].fillna(-1, inplace=True)

    else:
        X_train[f'te_{col_name}_{target_name}'] = oof_target
        X_train.loc[X_train.user_id.isin(rare_users), f'te_{col_name}_{target_name}'] = -1
        X_train[f'te_{col_name}_{target_name}'].fillna(-1, inplace=True)

    return X_train, X_test

In [9]:
def target_encoding_with_covid(X_train, y_train, X_test, col_name, target_name, user_df, replace=True, option = "mean", when="after"):
    X_train = X_train.copy()
    X_test = X_test.copy()
    rare_users = list(user_df[user_df["total_appearance"]<10]["user_id"])
    
    if when == "after":
        cons_index = X_train[X_train.date_date>="2020-03-01"].index
    else:
        cons_index = X_train[X_train.date_date<"2020-03-01"].index
        
    Xy = pd.DataFrame({'trans_col': X_train.loc[cons_index][col_name], 'target': y_train.loc[cons_index]})
    
    if option == "sum":
        target_mean_all = Xy.groupby('trans_col')['target'].sum()
    else:
        target_mean_all = Xy.groupby('trans_col')['target'].mean()  
        
    if replace:
        X_test[col_name+str(target_name)+"_"+str(when)] = X_test[col_name].map(target_mean_all)
        X_test.loc[X_test.user_id.isin(rare_users), col_name+str(target_name)+"_"+str(when)] = -1
        X_test[col_name+str(target_name)+"_"+str(when)].fillna(-1, inplace=True)
    else:
        X_test[f'te_{col_name}_{target_name}_{when}'] = X_test[col_name].map(target_mean_all)
        X_test.loc[X_test.user_id.isin(rare_users), f'te_{col_name}_{target_name}_{when}'] = -1
        X_test[f'te_{col_name}_{target_name}_{when}'].fillna(-1, inplace=True)

    oof_target = np.zeros(X_train.shape[0])
    kf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=0)
    for idx_1, idx_2 in kf.split(X_train, y_train):
        idx_1 = list(set(idx_1) & set(cons_index))
        Xy = pd.DataFrame({'trans_col': X_train.loc[idx_1][col_name], 'target': y_train.loc[idx_1]})
        if option == "sum":
            target_mean = Xy.groupby('trans_col')['target'].sum()
        else:
            target_mean = Xy.groupby('trans_col')['target'].mean()
            
        oof_target[idx_2] = X_train[col_name].iloc[idx_2].map(target_mean)

    if replace:
        X_train[col_name+str(target_name)+"_"+str(when)] = oof_target
        X_train.loc[X_train.user_id.isin(rare_users), col_name+str(target_name)+"_"+str(when)] = -1
        X_train[col_name+str(target_name)+"_"+str(when)].fillna(-1, inplace=True)

    else:
        X_train[f'te_{col_name}_{target_name}_{when}'] = oof_target
        X_train.loc[X_train.user_id.isin(rare_users), f'te_{col_name}_{target_name}_{when}'] = -1
        X_train[f'te_{col_name}_{target_name}_{when}'].fillna(-1, inplace=True)

    return X_train, X_test

# data formation

In [10]:
user_data = meta_df["user_id"].value_counts().reset_index()
user_data.columns = ["user_id", "total_appearance"]

In [11]:
# user data
for id_ in all_actions:
    id_count_session = list(log_df[log_df["display_action_id"]==id_]["session_id"].unique())
    id_meta = meta_df[meta_df.session_id.isin(id_count_session)]["user_id"].value_counts().reset_index()
    id_meta.columns = ["user_id", "count"+str(id_)]
    user_data = pd.merge(user_data, id_meta, on="user_id", how="left").fillna(0)
    user_data["ratio_action"+str(id_)] =  user_data["count"+str(id_)] / user_data["total_appearance"]
    del user_data["count"+str(id_)]

In [12]:
def isBizDay(DATE):
    Date = datetime.date(int(DATE[0:4]), int(DATE[5:7]), int(DATE[8:10]))
    if jpholiday.is_holiday(Date):
        return 1
    else:
        return 0

meta_df["dayofweek"] = pd.to_datetime(meta_df["date"]).dt.dayofweek

meta_df["tmp_date"] = pd.to_datetime(meta_df["date"])
meta_df["days_from_prev"] = meta_df.groupby("user_id")["tmp_date"].diff().dt.days
del meta_df["tmp_date"]

In [13]:
# 完全なデータを持っているログに絞る
test_sessions = test_df['session_id'].unique()
idx_test = log_df['session_id'].isin(test_sessions)
whole_log_df = log_df[~idx_test].reset_index(drop=True)
payment_session_df = only_payment_session_record(whole_log_df)

In [14]:
# 商品購買の最後(max spend time)が10分より大きいセッションを取り出す
is_item_record = payment_session_df['kind_1'] == '商品'
max_payed_time = payment_session_df[is_item_record].groupby('session_id')['spend_time'].max()
max_payed_time_over_10min = max_payed_time[max_payed_time > 10 * 60]

train_sessions = max_payed_time_over_10min.index.tolist()
train_whole_log_df = payment_session_df[payment_session_df['session_id'].isin(train_sessions)].reset_index(drop=True)

del max_payed_time, max_payed_time_over_10min, payment_session_df

In [15]:
# ユーザーごとに10分以上の買い物でどれくらいの金額を使うのか、どれくらいの個数買うか、どれくらい時間を費やしているのか
train_whole_log_df["cost"] = train_whole_log_df["number_1"] * train_whole_log_df["unit_price"]

total_cost_per_session = train_whole_log_df[is_item_record].groupby("session_id")["cost"].sum()
total_cost_per_session = pd.merge(total_cost_per_session, meta_df[["user_id", "session_id"]], on="session_id", how="left")

total_nitems_per_session = train_whole_log_df[is_item_record].groupby("session_id")["n_items"].sum()
total_nitems_per_session = pd.merge(total_nitems_per_session, meta_df[["user_id", "session_id"]], on="session_id", how="left")

total_spend_time_per_session = train_whole_log_df[is_item_record].groupby("session_id")["spend_time"].sum()
total_spend_time_per_session = pd.merge(total_spend_time_per_session, meta_df[["user_id", "session_id"]], on="session_id", how="left")

user_data = pd.merge(user_data, total_cost_per_session.groupby("user_id")["cost"].agg(["mean", "std"]).add_prefix("total_cost_"),
                    on="user_id", how="left").fillna(-1)

user_data = pd.merge(user_data, total_nitems_per_session.groupby("user_id")["n_items"].agg(["mean", "std"]).add_prefix("total_nitems_"),
                    on="user_id", how="left").fillna(-1)

user_data = pd.merge(user_data, total_spend_time_per_session.groupby("user_id")["spend_time"].agg(["mean", "std"]).add_prefix("spend_time_"),
                    on="user_id", how="left").fillna(-1)

for i in user_data.columns: 
    if i not in ["user_id", "total_appearance"]:
        user_data.loc[user_data.total_appearance<10, i] = -1
        
del total_cost_per_session, total_nitems_per_session, total_spend_time_per_session, is_item_record

In [16]:
seed = 42
np.random.seed(seed)
random.seed(seed)

time_elasped_count = meta_df['time_elapsed'].value_counts(normalize=True)

train_time_elapsed = np.random.choice(time_elasped_count.index.astype(int), 
                                      p=time_elasped_count.values, 
                                      size=len(train_sessions))
train_meta_df = pd.DataFrame({
    'session_id': train_sessions,
    'time_elapsed': train_time_elapsed
})

train_meta_df = pd.merge(train_meta_df, 
                         meta_df.drop(columns=['time_elapsed']), 
                         on='session_id', 
                         how='left')
del train_time_elapsed, train_sessions

In [17]:
_df = pd.merge(train_whole_log_df[['session_id', 'spend_time']], train_meta_df, on='session_id', how='left')
idx_show = _df['spend_time'] <= _df['time_elapsed'] * 60

del _df

train_public_df = train_whole_log_df[idx_show].reset_index(drop=True)
train_private_df = train_whole_log_df[~idx_show].reset_index(drop=True)

del idx_show, train_whole_log_df

# テストのログデータと合わせて推論時に見ても良いログ `public_log_df` として保存しておく
public_log_df = pd.concat([
    train_public_df, log_df[log_df['session_id'].isin(test_sessions)]
], axis=0, ignore_index=True)
# meta に紐づく情報は後でよく使うので, テストデータにも meta 情報をマージしておきます. 
# train_meata_df / test_meta_df が今後特徴を作る上で key になるデータになります。
test_meta_df = pd.merge(test_df, meta_df, on='session_id', how='left')

del log_df, train_public_df

In [18]:
train_target_df, _  = create_target_from_log(train_private_df, 
                                             product_master_df=product_master_df,
                                            only_payment=False)

train_target_df = train_target_df.reset_index(drop=True)
train_target_df[train_target_df >= 1] = 1
train_target_df[train_target_df <= 0] = 0

del train_private_df

366478


# feature engineering 

In [19]:
class AbstractBaseBlock:
    def fit(self, input_df, y=None):
        return self.transform(input_df)

    def transform(self, input_df):
        raise NotImplementedError()
        
class MetaInformationBlock(AbstractBaseBlock):
    def transform(self, input_df):
        use_columns = [
            'hour', 'register_number', 'time_elapsed'
        ]
        return input_df[use_columns].copy()
    
class DateBlock(AbstractBaseBlock):
    def transform(self, input_df):
        is_holiday = input_df['date'].apply(lambda x: isBizDay(x))
        date = pd.to_datetime(input_df['date'])

        out_df = pd.DataFrame({
            'date': date,
            'dayofweek': date.dt.dayofweek,
            'day': date.dt.day,
            'year': date.dt.year,
            'month': date.dt.month,
            'is_holiday': is_holiday,
        })

        # 金曜日の夜はお祭り騒ぎ
        out_df['hanakin'] = np.where((date.dt.dayofweek == 4) & (input_df['hour'] > 17), 1, 0)
        return out_df.add_prefix('date_')
    
class CountEncodingBlock(AbstractBaseBlock):
    """CountEncodingを行なう block"""
    def __init__(self, column: str):
        self.column = column

    def fit(self, input_df, y=None):
        vc = input_df[self.column].value_counts()
        self.count_ = vc
        return self.transform(input_df)

    def transform(self, input_df):
        out_df = pd.DataFrame()
        out_df[self.column] = input_df[self.column].map(self.count_)
        return out_df.add_prefix('CE_')
    
class HourActionPortfolioBlock(AbstractBaseBlock):
    """時間ごとの `display_action_id` の出現回数を紐付ける block. """
    def fit(self, input_df, y=None):
        _df = pd.pivot_table(
               data=whole_log_df, 
              index='hour',
              columns=whole_log_df['display_action_id'].map(display_action2name),
              values='session_id',
              aggfunc='count').fillna(0)

        self.pivot_df_ = _df

        return self.transform(input_df)

    def transform(self, input_df):
        out_df = pd.merge(input_df['hour'], 
                          self.pivot_df_, 
                          on='hour', how='left').drop(columns=['hour'])
        return out_df.add_prefix('hour_ratio=')
    
class UserHistoryBlock(AbstractBaseBlock):
    """ユーザーの購買履歴を部門名ごとに集計したベクトルを付与する特徴量 block"""

    def fit(self, input_df, y=None):
        purchase_df = only_purchase_records(whole_log_df)
        purchase_df = purchase_df.rename(columns={ 'value_1': 'JAN' })
        category = annot_category(purchase_df, product_master_df)
        idx_null = category.isnull()  # JAN が紐付かないやつ

        # target の情報はリークになる可能性があるので削除する
        idx_none_target = ~category.isin(TARGET_IDS)

        # 商品マスタの部門名を取り出して集計
        bumon_name = pd.merge(purchase_df['JAN'], 
                              product_master_df[['JAN', '部門名']], on='JAN', how='left')['部門名']

        _df = pd.pivot_table(data=purchase_df[idx_none_target], 
               index='user_id', 
               columns=bumon_name[idx_none_target],
              values='n_items',
              aggfunc='sum')\
                .fillna(0)
        
        # ユーザーごとに平均化. 
        _df = _df.div(_df.sum(axis=1), axis=0)
        
        self.agg_df_ = _df
        del _df

        return self.transform(input_df)

    def transform(self, input_df):
        out_df = pd.merge(input_df['user_id'], self.agg_df_, on='user_id', how='left').drop(columns=['user_id'])
        out_df = out_df.fillna(0)
        return out_df.add_prefix('ratio_部門名=')
    
def join(df):
    x = [str(e) for e in list(df)]
    return " ".join(x)
    
class PublicLogBlock(AbstractBaseBlock):
    """見えているログに関する特徴量"""
    def fit(self, input_df, y=None):
        self.agg_df_ = pd.concat([
            # 買っている商品の数
            public_log_df.groupby('session_id')['n_items'].sum().rename('total_items'),
            # 買っている商品 (JANレベル) のユニーク数
            public_log_df[public_log_df['kind_1'] == '商品'].groupby('session_id')['value_1'].nunique().rename('JAN_nunique'),
        ], axis=1)        
        
        # public_logの範囲で特定カテゴリ商品を買っているか
        tmp = pd.merge(public_log_df[public_log_df['kind_1'] == '商品'][["session_id", "value_1"]], 
                       product_master_df[["JAN", "category_id"]], 
                       left_on="value_1",
                       right_on="JAN",how="left")
        
        # association analysis + targets themselves
        for cat in [114, 134, 135, 136, 143, 207, 209, 210, 368, 370, 376, 508, 509, 587, 716, 720, 724, 768,
                   171, 173, 172, 110, 113, 38, 537, 539, 629, 467, 435]:
            self.agg_df_["buy_catCD"+str(cat)] = tmp.groupby('session_id').apply(lambda x: cat in x["category_id"].values)
        del tmp
        
        return self.transform(input_df)

    def transform(self, input_df):
        out_df = pd.merge(input_df['session_id'], self.agg_df_, on='session_id', how='left').drop(columns=['session_id'])
        out_df = out_df.fillna(0)
        return out_df.add_prefix('pub_log=')
    
class UserInfoBlock(AbstractBaseBlock):
    def fit(self, input_df, y=None):
        user_hour_count = pd.pivot_table(data=meta_df,
                                         index='user_id', 
                                         values="date", 
                                         columns='hour', 
                                         aggfunc='count').fillna(0).add_prefix('visit_hour=')
        self.user_hour_count = user_hour_count
        
        user_dayofweek_count = pd.pivot_table(meta_df, 
                       index="user_id", 
                       columns="dayofweek",
                       values="date", 
                       aggfunc="count").fillna(0).add_prefix("user_dayofweek=")
        self.user_dayofweek_count = user_dayofweek_count
                
        return self.transform(input_df)
    
    def transform(self, input_df):
        out_df = pd.merge(input_df['user_id'], self.user_hour_count, on='user_id', how='left')
        out_df = pd.merge(out_df, self.user_dayofweek_count, on='user_id', how='left')
        out_df = pd.merge(out_df, user_master, on='user_id', how='left')
        out_df = pd.merge(out_df, user_data, on='user_id', how='left')
        return out_df
    
class BeforeBuyIntervalBlock(AbstractBaseBlock):
    def __init__(self, category_id):
        self.category_id = category_id

    def fit(self, input_df, y=None):
        log_df = pd.concat([public_log_df, whole_log_df], ignore_index=True)
        jans = product_master_df[product_master_df['category_id'] == self.category_id]['JAN'].unique()
        x = log_df[(log_df['kind_1'] == '商品') & (log_df['value_1'].isin(jans))].groupby('session_id')['n_items'].sum()

        buy_sessions = x[x > 0].index
        df = meta_df[['session_id', 'user_id', 'date']].copy()

        df['buy'] = df['session_id'].isin(buy_sessions).astype(int)
        out_df = pd.DataFrame()

        # 過去平均何回買っているか
        df['past_avg_buy'] = (df.groupby('user_id')['buy'].cumsum() - df['buy']) / (df.groupby('user_id').cumcount() + 1)

        # 直前買ってから何日か
        _x = pd.concat([
            df[['user_id', 'date']],
            (df.groupby('user_id')['buy'].cumsum() - df['buy'])
        ], axis=1)
        _x = _x.merge(_x.groupby(['user_id', 'buy'])['date'].first().rename('first_date'), on=['user_id', 'buy'], how='left')
        _x = _x.merge(_x.groupby('user_id')['date'].first().rename('user_first_date'), on='user_id', how='left')
        _x["date"] = pd.to_datetime(_x["date"])
        _x["first_date"] = pd.to_datetime(_x["first_date"])
        from_before_buy = np.where(_x['first_date'] == _x['user_first_date'], None, (_x['date'] - _x['first_date']).dt.days)
        df['days_before_buy'] = from_before_buy
        self.agg_df_ = df
        
        return self.transform(input_df)
    
    def transform(self, input_df):
        out_df = pd.merge(input_df['session_id'], 
                          self.agg_df_, 
                          on='session_id', 
                          how='left').drop(columns=['session_id', 'user_id', 'date', 'buy'])
        return out_df.add_prefix('BBInterval_{}_'.format(self.category_id))

In [20]:
feature_blocks = [
    #*[CountEncodingBlock(column=c) for c in ['user_id', 'register_number']],
    DateBlock(),
    PublicLogBlock(),
    MetaInformationBlock(),
    UserHistoryBlock(),
    UserInfoBlock(),
    *[BeforeBuyIntervalBlock(i) for i in TARGET_IDS]
]

feat_train_df = pd.DataFrame()
for block in feature_blocks:
    out_i = block.fit(train_meta_df)
    assert len(train_meta_df) == len(out_i), block
    feat_train_df = pd.concat([feat_train_df, out_i], axis=1)
del train_meta_df

feat_test_df = pd.DataFrame()
for block in feature_blocks:
    out_i = block.transform(test_meta_df)
    assert len(test_meta_df) == len(out_i), block
    feat_test_df = pd.concat([feat_test_df, out_i], axis=1)
del test_meta_df

del meta_df, public_log_df, user_master, feature_blocks, whole_log_df

In [21]:
for j in train_target_df.columns:
    feat_train_df, feat_test_df = target_encoding(feat_train_df, train_target_df[j], feat_test_df, 
                                                  "user_id", j, user_data, False, "mean")

In [22]:
for j in train_target_df.columns:
    feat_train_df, feat_test_df = target_encoding_with_covid(feat_train_df, train_target_df[j], feat_test_df, 
                                                  "user_id", j, user_data, False, "mean", "after")

In [23]:
del feat_train_df["user_id"], feat_test_df["user_id"], feat_train_df["date_date"], feat_test_df["date_date"], user_data

In [24]:
features = [i for i in feat_train_df.columns]
print(features)

['date_dayofweek', 'date_day', 'date_year', 'date_month', 'date_is_holiday', 'date_hanakin', 'pub_log=total_items', 'pub_log=JAN_nunique', 'pub_log=buy_catCD114', 'pub_log=buy_catCD134', 'pub_log=buy_catCD135', 'pub_log=buy_catCD136', 'pub_log=buy_catCD143', 'pub_log=buy_catCD207', 'pub_log=buy_catCD209', 'pub_log=buy_catCD210', 'pub_log=buy_catCD368', 'pub_log=buy_catCD370', 'pub_log=buy_catCD376', 'pub_log=buy_catCD508', 'pub_log=buy_catCD509', 'pub_log=buy_catCD587', 'pub_log=buy_catCD716', 'pub_log=buy_catCD720', 'pub_log=buy_catCD724', 'pub_log=buy_catCD768', 'pub_log=buy_catCD171', 'pub_log=buy_catCD173', 'pub_log=buy_catCD172', 'pub_log=buy_catCD110', 'pub_log=buy_catCD113', 'pub_log=buy_catCD38', 'pub_log=buy_catCD537', 'pub_log=buy_catCD539', 'pub_log=buy_catCD629', 'pub_log=buy_catCD467', 'pub_log=buy_catCD435', 'hour', 'register_number', 'time_elapsed', 'ratio_部門名=100円均一', 'ratio_部門名=AV家電', 'ratio_部門名=おもちゃ', 'ratio_部門名=たばこ', 'ratio_部門名=アイスクリーム', 'ratio_部門名=インテリア', 'ratio_部門名

In [25]:
# not to increase memory use during training
feat_train_np = np.ndarray(shape=(len(feat_train_df), len(features)), dtype=np.float32)
feat_test_np = np.ndarray(shape=(len(feat_test_df), len(features)), dtype=np.float32)

for idx, feature in enumerate(features):
    feat_train_np[:,idx] = feat_train_df[feature].astype(np.float32)
    feat_test_np[:,idx] = feat_test_df[feature].astype(np.float32)
    del feat_train_df[feature], feat_test_df[feature]
    
feat_train_df = feat_train_np
feat_test_df = feat_test_np
train_target_df = train_target_df.to_numpy()

train_target_df.shape, feat_train_df.shape, feat_test_df.shape

((366478, 15), (366478, 308), (56486, 308))

# xgb modelling

In [26]:
classifier = MultiOutputClassifier(XGBClassifier(tree_method='gpu_hist'))

clf = Pipeline([('classify', classifier)
               ])

params = {#'classify__estimator__colsample_bytree': 0.7522,
          'classify__estimator__gamma': 3.6975,
          'classify__estimator__learning_rate': 0.1,
          'classify__estimator__max_delta_step': 2.0706,
          'classify__estimator__max_depth': 7,
          'classify__estimator__min_child_weight': 31.5800,
          'classify__estimator__n_estimators': 166,
          #'classify__estimator__subsample': 0.8,
          'classify__estimator__eval_metric': "auc",
         }

clf.set_params(**params)

Pipeline(steps=[('classify',
                 MultiOutputClassifier(estimator=XGBClassifier(base_score=None,
                                                               booster=None,
                                                               colsample_bylevel=None,
                                                               colsample_bynode=None,
                                                               colsample_bytree=None,
                                                               eval_metric='auc',
                                                               gamma=3.6975,
                                                               gpu_id=None,
                                                               importance_type='gain',
                                                               interaction_constraints=None,
                                                               learning_rate=0.1,
                                                          

In [27]:
xgb_oof_preds = np.zeros((feat_train_df.shape[0], train_target_df.shape[1]))
xgb_test_preds = np.zeros((feat_test_df.shape[0], train_target_df.shape[1]))
oof_losses = []
mskf = MultilabelStratifiedKFold(n_splits=NFOLDS, random_state=0, shuffle=True)
for fn, (trn_idx, val_idx) in enumerate(mskf.split(feat_train_df, train_target_df)):
    print('Starting fold: ', fn)
    X_train, X_val = feat_train_df[trn_idx,:], feat_train_df[val_idx,:]
    y_train, y_val = train_target_df[trn_idx,:], train_target_df[val_idx,:]
        
    clf.fit(X_train, y_train)
    val_preds = clf.predict_proba(X_val) # list of preds per class
    val_preds = np.array(val_preds)[:,:,1].T # take the positive class
    xgb_oof_preds[val_idx] = val_preds
    
    score = roc_auc_score(y_val, val_preds, average='macro')
    print(score)
    oof_losses.append(score)
    preds = clf.predict_proba(feat_test_df)

    preds = np.array(preds)[:,:,1].T # take the positive class
    xgb_test_preds += preds / NFOLDS
    
print(oof_losses)
print('Mean OOF loss across folds', np.mean(oof_losses))
print('STD OOF loss across folds', np.std(oof_losses))
print('OOF score: ', roc_auc_score(train_target_df, xgb_oof_preds, average='macro'))

Starting fold:  0
0.8414680919409944
Starting fold:  1
0.8404503498337793
Starting fold:  2
0.8424446670756048
Starting fold:  3
0.8426184665345091
Starting fold:  4
0.8421301780800833
[0.8414680919409944, 0.8404503498337793, 0.8424446670756048, 0.8426184665345091, 0.8421301780800833]
Mean OOF loss across folds 0.8418223506929943
STD OOF loss across folds 0.000790407671328354
OOF score:  0.8417740594612987


In [28]:
# https://stackoverflow.com/questions/54562464/can-i-show-feature-importance-for-multioutputclassifier
feat_impts = [] 
for model in classifier.estimators_:
    feat_impts.append(model.feature_importances_)

feature_importance_values = np.mean(feat_impts, axis=0)
feature_importance_values = pd.DataFrame(feature_importance_values)
feature_importance_values.index = features

In [29]:
np.transpose(feature_importance_values)

Unnamed: 0,date_dayofweek,date_day,date_year,date_month,date_is_holiday,date_hanakin,pub_log=total_items,pub_log=JAN_nunique,pub_log=buy_catCD114,pub_log=buy_catCD134,...,te_user_id_171_after,te_user_id_172_after,te_user_id_173_after,te_user_id_376_after,te_user_id_435_after,te_user_id_467_after,te_user_id_537_after,te_user_id_539_after,te_user_id_629_after,te_user_id_768_after
0,0.003499,0.003001,0.004578,0.003964,0.002588,0.001057,0.00336,0.003498,0.004369,0.002233,...,0.002723,0.002492,0.002994,0.003255,0.003023,0.003285,0.002678,0.003222,0.002696,0.003078


# catboost modelling

In [30]:
# classifier = MultiOutputClassifier(CatBoostClassifier(task_type='GPU'))

# clf = Pipeline([('classify', classifier)
#                ])

# params = {'classify__estimator__learning_rate': 0.1,
#           'classify__estimator__depth': 6, 
#           'classify__estimator__l2_leaf_reg': 3, 
#           'classify__estimator__loss_function': 'Logloss', 
#           'classify__estimator__eval_metric': 'AUC', 
#           'classify__estimator__iterations': 100,
#           'classify__estimator__od_type': 'Iter', 
#           'classify__estimator__boosting_type': 'Plain', 
#           'classify__estimator__bootstrap_type': 'Bernoulli', 
#           'classify__estimator__allow_const_label': True, 
#           'classify__estimator__random_state': 0,
#           'classify__estimator__verbose': 0
#          }

# clf.set_params(**params)

In [31]:
# cat_oof_preds = np.zeros((feat_train_df.shape[0], train_target_df.shape[1]))
# cat_test_preds = np.zeros((feat_test_df.shape[0], train_target_df.shape[1]))
# oof_losses = []
# mskf = MultilabelStratifiedKFold(n_splits=NFOLDS, random_state=0, shuffle=True)
# for fn, (trn_idx, val_idx) in enumerate(mskf.split(feat_train_df, train_target_df)):
#     print('Starting fold: ', fn)
#     X_train, X_val = feat_train_df[trn_idx,:], feat_train_df[val_idx,:]
#     y_train, y_val = train_target_df[trn_idx,:], train_target_df[val_idx,:]
    
#     clf.fit(X_train, y_train)

#     val_preds = clf.predict_proba(X_val) # list of preds per class
#     val_preds = np.array(val_preds) 
#     val_preds = np.array(val_preds)[:,:,1].T # take the positive class
#     cat_oof_preds[val_idx] = val_preds
    
#     score = roc_auc_score(y_val, val_preds, average='macro')
#     print(score)
#     oof_losses.append(score)
#     preds = clf.predict_proba(feat_test_df)    
#     preds = np.array(preds)[:,:,1].T # take the positive class
#     cat_test_preds += preds / NFOLDS
    
# print(oof_losses)
# print('Mean OOF loss across folds', np.mean(oof_losses))
# print('STD OOF loss across folds', np.std(oof_losses))
# print('OOF score: ', roc_auc_score(train_target_df, cat_oof_preds, average='macro'))

# tabnet modelling

In [32]:
# class LogitsLogLoss(Metric):
#     def __init__(self):
#         self._name = "logits_ll"
#         self._maximize = False

#     def __call__(self, y_true, y_pred):

#         logits = 1 / (1 + np.exp(-y_pred))
        
#         aux = (1-y_true)*np.log(1-logits+1e-15) + y_true*np.log(logits+1e-15)
#         return np.mean(-aux)

In [33]:
# class AUC(Metric):
#     def __init__(self):
#         self._name = "auc_ll"
#         self._maximize = True

#     def __call__(self, y_true, y_pred):
#         logits = 1 / (1 + np.exp(-y_pred))
#         auc = roc_auc_score(y_true, logits, average="macro")
#         return auc

In [34]:
# MAX_EPOCH=200
# device = "cuda" if torch.cuda.is_available() else "cpu"

# def seed_everything(seed_value):
#     random.seed(seed_value)
#     np.random.seed(seed_value)
#     torch.manual_seed(seed_value)
#     os.environ['PYTHONHASHSEED'] = str(seed_value)
    
#     if torch.cuda.is_available(): 
#         torch.cuda.manual_seed(seed_value)
#         torch.cuda.manual_seed_all(seed_value)
#         torch.backends.cudnn.deterministic = True
#         torch.backends.cudnn.benchmark = False
        
# def modelling_tabnet(tr, target, te, sample_seed):
#     seed_everything(sample_seed) 
#     tabnet_params = dict(n_d=8, n_a=8, n_steps=1, gamma=1.3, seed = sample_seed,
#                      lambda_sparse=0, optimizer_fn=torch.optim.Adam,
#                      optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
#                      mask_type='entmax',
#                      scheduler_params=dict(mode="min",
#                                            patience=5,
#                                            min_lr=1e-5,
#                                            factor=0.9,),
#                      scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
#                      verbose=10,
#                      )
#     test_cv_preds = []

#     oof_preds = np.zeros([len(tr),target.shape[1]])
#     scores = []
#     NB_SPLITS = 5
#     mskf = MultilabelStratifiedKFold(n_splits=NB_SPLITS, random_state=0, shuffle=True)
#     for fold_nb, (train_idx, val_idx) in enumerate(mskf.split(tr, target)):
#         print("FOLDS : ", fold_nb+1)

#         ## model
#         X_train, y_train = tr[train_idx, :], target[train_idx, :]
#         X_val, y_val = tr[val_idx, :], target[val_idx, :]
#         model = TabNetRegressor(**tabnet_params)
        
#         model.fit(X_train=X_train,
#               y_train=y_train,
#               eval_set=[(X_val, y_val)],
#               eval_name = ["val"],
#               eval_metric = ["auc_ll"],
#               max_epochs=MAX_EPOCH,
#               patience=20, batch_size=1024, virtual_batch_size=128,
#               num_workers=1, drop_last=False,
#               # use binary cross entropy as this is not a regression problem
#               loss_fn=torch.nn.functional.binary_cross_entropy_with_logits)
    
#         preds_val = model.predict(X_val)
#         # Apply sigmoid to the predictions
#         preds =  1 / (1 + np.exp(-preds_val))
#         score = np.max(model.history["val_auc_ll"])
#         oof_preds[val_idx,:] = preds
#         scores.append(score)

#         # preds on test
#         preds_test = model.predict(te)
#         test_cv_preds.append(1 / (1 + np.exp(-preds_test)))
        
#     test_preds_all = np.stack(test_cv_preds)
#     aucs = []
#     for task_id in range(15):
#         aucs.append(roc_auc_score(y_true=target[:, task_id],y_score=oof_preds[:, task_id]))
#     print(f"Overall AUC : {np.mean(aucs)}")
#     return oof_preds, test_preds_all

# tabnet_oof = np.zeros([feat_train_df.shape[0], len(TARGET_CATEGORIES)])
# tabnet_pred = np.zeros([feat_test_df.shape[0], len(TARGET_CATEGORIES)])

# seeds = [0]
# for seed_ in seeds:
#     oof_preds, test_preds_all = modelling_tabnet(feat_train_df, train_target_df, feat_test_df, seed_)
#     tabnet_oof += oof_preds / len(seeds)
#     tabnet_pred += test_preds_all.mean(axis=0) / len(seeds)
    
# print('OOF score: ', roc_auc_score(train_target_df, tabnet_oof, average='macro'))
# score = roc_auc_score(train_target_df, oof_preds, average='macro')

# submission

In [35]:
submission.iloc[:,:] = xgb_test_preds
submission.to_csv("atmacup9_"+str(score)[:-10]+"_.csv", index=False)