In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import feather
import datetime
from matplotlib_venn import venn2
from sklearn.model_selection import KFold
import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
pd.set_option('display.max_Columns', 100)

In [44]:
# Load the raw input tables: feather files for the logs and masters,
# CSV for the list of sessions to predict.
cartlog = pd.read_feather('../inputs/cartlog.f')
# JAN codes are handled as strings from here on.
product_master = pd.read_feather('../inputs/product_master.f').astype({'JAN': str})
# Derive a seconds column from `time_elapsed`
# (presumably recorded in minutes -- TODO confirm against the data description).
meta = pd.read_feather('../inputs/meta.f').assign(
    time_elapsed_sec=lambda frame: frame['time_elapsed'] * 60
)
user_master = pd.read_feather('../inputs/user_master.f')
test = pd.read_csv('../inputs/test.csv')

In [45]:
class RetailDataset:
    """Split the cart logs into model inputs and training targets.

    Every session has a threshold ``time_elapsed_sec`` (taken from ``meta``;
    sessions where it is null fall back to ``thres_sec``). Log rows whose
    ``spend_time`` is at or below the threshold are model input, rows above
    it define the training target.

    Parameters
    ----------
    thres_sec :
        Threshold written into ``meta['time_elapsed_sec']`` wherever it is null.
    cartlog : pd.DataFrame
        Log rows. The methods read ``session_id``, ``spend_time``,
        ``is_payment``, ``kind_1``, ``value_1`` and ``n_items``.
    product : pd.DataFrame
        Product master with ``JAN`` and ``category_id`` columns.
    meta : pd.DataFrame
        Session metadata with ``session_id``, ``date`` and
        ``time_elapsed_sec`` columns.
    user : pd.DataFrame
        User master. Stored as ``user_master``; not read by any method here.
    test : pd.DataFrame
        Frame whose ``session_id`` column lists the sessions to predict.

    All frames are copied on construction, so the caller's objects are not
    modified.
    """

    def __init__(self, thres_sec, cartlog, product, meta, user, test):
        self.thres_sec = thres_sec
        self.cartlog = cartlog.copy()
        self.product_master = product.copy()
        self.meta = meta.copy()
        self.user_master = user.copy()
        self.test = test.copy()

        # Sessions without a recorded threshold fall back to ``thres_sec``.
        self.meta.loc[self.meta['time_elapsed_sec'].isnull(), 'time_elapsed_sec'] = thres_sec
        # Category ids that are prediction targets (one output column each).
        self.target_category_ids = [
            38,  # ice cream: novelty
            110,  # snacks & candy: gum
            113,  # snacks & candy: cereal
            114,  # snacks & candy: snacks
            134,  # chocolate / biscuits / crackers: chocolate
            171,  # beer-type: RTD
            172,  # beer-type: non-alcoholic
            173,  # beer-type: beer-type
            376,  # Japanese sweets: rice crackers
            435,  # large PET: unsweetened tea (large PET)
            467,  # small PET: coffee (small PET)
            537,  # water / sparkling water: large PET (sparkling water)
            539,  # water / sparkling water: small PET (sparkling water)
            629,  # canned drinks: coffee (can)
            768,  # noodles: cup noodles
        ]

    def get_test_sessions(self) -> set:
        """Return the set of session ids listed in the test frame."""
        return set(self.test["session_id"].unique())

    def get_test_input_log(self) -> pd.DataFrame:
        """Return the log rows that belong to a test session.

        Note that some test sessions have no log rows at all.
        """
        test_sessions = self.get_test_sessions()
        return self.cartlog[self.cartlog["session_id"].isin(test_sessions)]

    def get_log_first_half(self) -> pd.DataFrame:
        """Return the log rows of sessions in the training period.

        The training period is every session whose ``date`` in ``meta`` is
        before 2020-08-01.
        """
        first_half_sessions = set(
            self.meta.query("date < '2020-08-01'")["session_id"].unique()
        )
        return self.cartlog[self.cartlog["session_id"].isin(first_half_sessions)]

    def get_train_output_log(self) -> pd.DataFrame:
        """Return training-period log rows recorded after the threshold.

        The training-period log is joined with each session's
        ``time_elapsed_sec`` and only rows with
        ``spend_time > time_elapsed_sec`` are kept.
        """
        return pd.merge(
            self.get_log_first_half(),
            self.meta[["session_id", "time_elapsed_sec"]],
            on=["session_id"],
            how="inner",
        ).query("spend_time > time_elapsed_sec")

    def get_train_sessions(self) -> set:
        """Return the training-period sessions that have rows after the threshold."""
        return set(self.get_train_output_log()["session_id"].unique())

    def get_train_input_log(self) -> pd.DataFrame:
        """Return the model-input rows of the training sessions.

        Only sessions returned by ``get_train_sessions`` are kept, and within
        them only rows with ``spend_time <= time_elapsed_sec``.
        """
        # Filter the full log once and reuse it; the filter is the expensive part.
        first_half_log = self.get_log_first_half()
        train_sessions = self.get_train_sessions()
        return pd.merge(
            first_half_log[first_half_log["session_id"].isin(train_sessions)],
            self.meta[["session_id", "time_elapsed_sec"]],
            on=["session_id"],
            how="inner",
        ).query("spend_time <= time_elapsed_sec")

    def get_payment_sessions(self) -> set:
        """Return the sessions that contain at least one row with ``is_payment == 1``."""
        return set(self.cartlog.query("is_payment == 1")["session_id"].unique())

    def agg_payment(self, cartlog) -> pd.DataFrame:
        """Sum ``n_items`` per session and product.

        Only rows whose ``kind_1`` is "商品" (product) are counted. The result
        has the columns ``session_id``, ``JAN`` (renamed from ``value_1``) and
        ``n_items``.
        """
        # Purchase information exists only on product rows.
        target_index = cartlog["kind_1"] == "商品"

        # Add up the item counts per JAN code (value_1).
        agg = (
            cartlog.loc[target_index]
            .groupby(["session_id", "value_1"])["n_items"]
            .sum()
            .reset_index()
        )
        # JAN keeps the dtype of ``value_1``; it has to match
        # ``product_master['JAN']`` for the merge in ``get_train_target``.
        agg = agg.rename(columns={"value_1": "JAN"})
        return agg

    def get_train_target(self) -> pd.DataFrame:
        """Return the binary training target, one row per training session.

        The frame is indexed by ``session_id`` (sorted ascending) and has one
        column per entry of ``target_category_ids``. A cell is 1 when the
        summed ``n_items`` of that category after the threshold is positive,
        otherwise 0.
        """
        # Build the output log once; both the session list and the purchase
        # counts are derived from it.
        train_output_log = self.get_train_output_log()

        # A set has no defined order, so sort the ids for a reproducible row order.
        session_index = pd.Index(
            sorted(train_output_log["session_id"].unique()), name="session_id"
        )
        train_target = pd.DataFrame(
            0, index=session_index, columns=self.target_category_ids, dtype=int
        )

        # Attach the category to every purchased product and keep target categories.
        items_per_session_jan = self.agg_payment(train_output_log)
        items_with_category = pd.merge(
            items_per_session_jan,
            self.product_master[["JAN", "category_id"]],
            on="JAN",
            how="inner",
        )
        target_items = items_with_category[
            items_with_category["category_id"].isin(self.target_category_ids)
        ]
        if target_items.empty:
            # No target-category purchase at all: every label stays 0.
            return train_target

        counts = (
            target_items.groupby(["session_id", "category_id"])["n_items"]
            .sum()
            .unstack()
            .fillna(0)
            .astype(int)
            # A category that never occurs has no column after unstack();
            # reindex so the matrix always lines up with the target columns.
            .reindex(columns=self.target_category_ids, fill_value=0)
        )
        train_target.loc[counts.index] = (counts > 0).astype(int).values
        return train_target

In [3]:
# Logs within the first 10 minutes (threshold expressed in seconds)
log_sec = 10 * 60

In [46]:
# Build the dataset helper; log_sec is the threshold used for sessions whose
# time_elapsed_sec is missing. Then derive the training input log and the
# per-session target matrix, printing their shapes as a sanity check.
dataset = RetailDataset(log_sec, cartlog, product_master, meta, user_master, test)
train_input_log = dataset.get_train_input_log()
print(train_input_log.shape)
y_train = dataset.get_train_target()
print(y_train.shape)

(5129175, 20)
(404500, 15)


In [5]:
# Preview the first rows of the training input log.
train_input_log.head()

Unnamed: 0,session_id,register_number,date,hour,spend_time,display_action_id,user_id,value_1,name_1,kind_1,number_1,value_2,name_2,kind_2,number_2,unit_price,n_items,is_payment,date_str,time_elapsed_sec
0,105,1005,2019-02-14,9,0,136,CN9sWHXp6RdCuyFkW5aemG,,,,,,,,,,0,0,2019-02-14,600.0
1,105,1005,2019-02-14,9,50,209,CN9sWHXp6RdCuyFkW5aemG,4901670110210.0,サランラップ３０ｃｍ＊２,商品,1.0,,,,,138.0,1,0,2019-02-14,600.0
2,105,1005,2019-02-14,9,96,209,CN9sWHXp6RdCuyFkW5aemG,4522646718089.0,食品用ラップ　２０m,商品,1.0,,,,,59.0,1,0,2019-02-14,600.0
3,105,1005,2019-02-14,9,152,209,CN9sWHXp6RdCuyFkW5aemG,4901422153502.0,ｸﾚﾗｯﾌﾟ徳用,商品,1.0,,,,,379.0,1,0,2019-02-14,600.0
4,105,1005,2019-02-14,9,184,202,CN9sWHXp6RdCuyFkW5aemG,4901422153502.0,ｸﾚﾗｯﾌﾟ徳用,商品,1.0,,,,,379.0,-1,0,2019-02-14,600.0


In [6]:
# Log rows of the sessions to predict, and the set of those session ids.
# The two can disagree in coverage: a test session may have no log rows.
test_input_log = dataset.get_test_input_log()
print(test_input_log.shape)
test_sessions = dataset.get_test_sessions()
print(len(test_sessions))

(333059, 19)
56486


### セッション単位で集計

In [7]:
def get_session_features(input_log):
    """Aggregate a log into one feature row per session.

    Returns a frame indexed by ``session_id`` with the columns
    ``n_actions`` (number of log rows), ``n_add_items`` (number of rows whose
    ``kind_1`` is "商品"; NaN for sessions without such rows) and
    ``spend_time`` (mean of ``spend_time``).
    """
    by_session = input_log.groupby(["session_id"])
    item_rows = input_log[input_log['kind_1'] == '商品']

    feature_columns = [
        by_session.size().rename("n_actions"),
        item_rows.groupby(["session_id"]).size().rename("n_add_items"),
        # The mean keeps the source column name, so the feature is called "spend_time".
        by_session["spend_time"].mean(),
    ]
    return pd.concat(feature_columns, axis=1)

In [8]:
# Per-session aggregates computed from the training input log.
train_session_features = get_session_features(train_input_log)

### ユーザ情報を取得する

In [9]:
# Look up each session's user in the user master and keep only the user
# attributes; user_id itself is dropped once the join is done.
user_features = (
    dataset.meta[["session_id", "user_id"]]
    .merge(dataset.user_master, on="user_id", how="left")
    .drop(columns=["user_id"])
)
user_features.head()

Unnamed: 0,session_id,age,gender
0,0,40.0,1.0
1,1,30.0,0.0
2,2,50.0,1.0
3,3,30.0,0.0
4,4,30.0,1.0


### 特徴量を集約する

In [10]:
# Assemble the training feature table in the row order of y_train, left-joining
# the session aggregates and the user attributes onto the session ids.
# NOTE(review): session_id stays in the table as a regular column.
train_session_ids = y_train.index
print(len(train_session_ids))
train_features = (
    pd.DataFrame({"session_id": train_session_ids})
    .merge(train_session_features, on="session_id", how="left")
    .merge(user_features, on="session_id", how="left")
)
# The joins must not add or drop rows, otherwise features and targets misalign.
assert len(y_train) == len(train_features)
train_features.head()

404500


Unnamed: 0,session_id,n_actions,n_add_items,spend_time,age,gender
0,105,12.0,11.0,270.25,40.0,0.0
1,106,17.0,15.0,324.117647,60.0,1.0
2,107,22.0,9.0,247.0,30.0,0.0
3,108,11.0,6.0,194.090909,60.0,0.0
4,109,10.0,8.0,290.7,70.0,0.0


In [12]:
# Number of cross-validation folds (read by train_lgbm).
n_fold = 4

In [13]:
# LightGBM parameters shared by every model trained in this notebook.
lgbm_param = dict(
    objective='binary',
    boosting_type='gbdt',
    metric='auc',
    seed=0,
    learning_rate=0.1,
    verbose=-1,
)

In [14]:
def train_lgbm(X, y, params=lgbm_param):
    """Train one LightGBM model per CV fold and collect out-of-fold predictions.

    Parameters
    ----------
    X : pd.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : pd.Series
        Target aligned row by row with ``X``.
    params : dict
        LightGBM parameters; defaults to the module-level ``lgbm_param``.

    Returns
    -------
    oof_pred : np.ndarray
        Out-of-fold prediction for every row of ``X``.
    models : list
        The trained model of each fold, in fold order.
    score : float
        ROC-AUC of ``oof_pred`` against ``y``.
    """
    splitter = KFold(n_splits=n_fold, shuffle=True, random_state=0)

    models = []
    # The ``np.float`` alias was removed in NumPy 1.24; the builtin ``float``
    # selects the same float64 dtype.
    oof_pred = np.zeros_like(y, dtype=float)

    for i, (idx_train, idx_valid) in enumerate(splitter.split(X)):
        x_tr, y_tr = X.iloc[idx_train], y.iloc[idx_train]
        x_val, y_val = X.iloc[idx_valid], y.iloc[idx_valid]
        print('x_train', x_tr.shape)
        print('x_valid', x_val.shape)

        lgbm_train = lgbm.Dataset(x_tr, y_tr)
        lgbm_eval = lgbm.Dataset(x_val, y_val, reference=lgbm_train)

        # NOTE(review): ``early_stopping_rounds`` / ``verbose_eval`` are accepted
        # by ``lgbm.train`` only in LightGBM < 4.0; newer releases expect the
        # ``lgbm.early_stopping`` / ``lgbm.log_evaluation`` callbacks instead.
        lgbm_model = lgbm.train(
            params,
            lgbm_train,
            valid_sets=lgbm_eval,
            num_boost_round=10000,
            early_stopping_rounds=100,
            verbose_eval=-1,
        )
        # Predict with the best iteration found by early stopping.
        y_pred = lgbm_model.predict(x_val, num_iteration=lgbm_model.best_iteration)

        oof_pred[idx_valid] = y_pred
        models.append(lgbm_model)
        print('--- Fold {} Score: {:.4f} ---'.format(i, roc_auc_score(y_val, y_pred)))

    score = roc_auc_score(y, oof_pred)
    # The backslash is escaped explicitly; the printed text is unchanged.
    print('--- FINISHED \\ whole score: {:.4f} ---'.format(score))
    return oof_pred, models, score

In [15]:
# Train the fold models for every target category; models_list[i] holds the
# models for the i-th column of y_train. The out-of-fold predictions and the
# score returned by train_lgbm are not kept.
# NOTE(review): train_features still contains the session_id column, so it is
# passed to the models as a feature -- confirm this is intended.
models_list = []
for target in y_train.columns:
    print(f"target category id = {target}")
    oof, models, score = train_lgbm(train_features, y_train[target])
    models_list.append(models)

target category id = 38
x_train (303375, 6)
x_valid (101125, 6)
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[32]	valid_0's auc: 0.604285
--- Fold 0 Score: 0.6043 ---
x_train (303375, 6)
x_valid (101125, 6)
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[51]	valid_0's auc: 0.601123
--- Fold 1 Score: 0.6011 ---
x_train (303375, 6)
x_valid (101125, 6)
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[38]	valid_0's auc: 0.607356
--- Fold 2 Score: 0.6074 ---
x_train (303375, 6)
x_valid (101125, 6)
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[24]	valid_0's auc: 0.59966
--- Fold 3 Score: 0.5997 ---
--- FINISHED \ whole score: 0.6030 ---
target category id = 110
x_train (303375, 6)
x_valid (101125, 6)
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration 

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[71]	valid_0's auc: 0.584722
--- Fold 3 Score: 0.5847 ---
--- FINISHED \ whole score: 0.5833 ---
target category id = 467
x_train (303375, 6)
x_valid (101125, 6)
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[41]	valid_0's auc: 0.581138
--- Fold 0 Score: 0.5811 ---
x_train (303375, 6)
x_valid (101125, 6)
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[29]	valid_0's auc: 0.570412
--- Fold 1 Score: 0.5704 ---
x_train (303375, 6)
x_valid (101125, 6)
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[118]	valid_0's auc: 0.575998
--- Fold 2 Score: 0.5760 ---
x_train (303375, 6)
x_valid (101125, 6)
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[16]	valid_0's auc: 0.574486
--- Fold 3 Score: 0.5745 ---


In [20]:
test_session_features = get_session_features(test_input_log)
print(len(test_sessions))
# Build the feature table in the row order of the test file. `test_sessions`
# is a set, whose iteration order is arbitrary; predictions are later written
# without session ids, so rows must line up with test.csv.
test_features = test[["session_id"]].drop_duplicates().reset_index(drop=True)
test_features = pd.merge(test_features, test_session_features, on="session_id", how="left")
test_features = pd.merge(test_features, user_features, on="session_id", how="left")
# The joins must not add or drop rows.
assert len(test_sessions) == len(test_features)
test_features.head()

56486


Unnamed: 0,session_id,n_actions,n_add_items,spend_time,age,gender
0,786432,1.0,,0.0,30.0,0.0
1,786433,1.0,,0.0,60.0,1.0
2,786435,1.0,,0.0,40.0,0.0
3,786436,16.0,15.0,352.1875,50.0,1.0
4,786437,28.0,23.0,253.678571,60.0,1.0


In [25]:
def predict(models, feature):
    """Average the predictions of several models on the same features.

    Each model is asked to predict ``feature`` at its own ``best_iteration``;
    the element-wise mean over the models is returned.
    """
    per_model = [
        booster.predict(feature, num_iteration=booster.best_iteration)
        for booster in models
    ]
    return np.mean(per_model, axis=0)

In [35]:
# Predict every target category for the test sessions. df_pred shares the row
# index of test_features and gets one column per target category id, filled
# with the mean prediction of that category's fold models.
df_pred = pd.DataFrame(index=test_features.index)
for i, target in enumerate(y_train.columns):
    print(f"target category id = {target}")
    pred = predict(models_list[i], test_features)
    df_pred[target] = pred

target category id = 38
target category id = 110
target category id = 113
target category id = 114
target category id = 134
target category id = 171
target category id = 172
target category id = 173
target category id = 376
target category id = 435
target category id = 467
target category id = 537
target category id = 539
target category id = 629
target category id = 768


In [36]:
# Load the sample submission and check that the prediction table has the same
# number of rows (only the length is compared here).
submission = pd.read_csv('../inputs/atmaCup#9__sample_submission.csv')
assert len(df_pred) == len(submission)

In [41]:
# Write the predictions without the row index; the header row holds the
# target category ids.
df_pred.to_csv('../outputs/submission.csv', index=False)