In [1]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
personal_access_token = user_secrets.get_secret("personal_access_token")

!rm -rf /kaggle/working/kaggle_otto
!git clone -b gcs-func-add https://$personal_access_token@github.com/coffeemountain/kaggle_otto.git

import sys
sys.path.append('/kaggle/working/kaggle_otto/src')

!pip install dataclasses_json
from covis_matrix_generator import *

Cloning into 'kaggle_otto'...
remote: Enumerating objects: 271, done.[K
remote: Counting objects: 100% (179/179), done.[K
remote: Compressing objects: 100% (101/101), done.[K
remote: Total 271 (delta 109), reused 125 (delta 72), pack-reused 92[K
Receiving objects: 100% (271/271), 296.10 KiB | 1.71 MiB/s, done.
Resolving deltas: 100% (142/142), done.
[0mcan not import cudf


In [2]:
import numpy as np
import pandas as pd
import collections
from collections import Counter
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
import pickle
import glob
import gc

# Directory that the train / test / test_labels parquet files are read from.
INPUT_DIR = '/kaggle/input/otto-train-and-test-data-for-local-validation/'
# Event type this notebook models; used to filter `type` columns and as the
# name of the label column added to the candidate table.
TARGET = 'clicks'

In [3]:
# Read the two event logs from INPUT_DIR: `train.parquet` becomes df_train and
# `test.parquet` becomes df_val (the validation sessions).
df_train, df_val = (
    pd.read_parquet(f'{INPUT_DIR}{split}.parquet') for split in ('train', 'test')
)

# stage1. candidate 生成

In [4]:
# Wire the co-visitation generator together with its weighting helper and the
# cloud-storage client (the client is reused by later cells for caching).
USE_GPU = True
weight_func_mixin = WeightFuncMixin()
cs_client = CloudStorageClient()
covis_matrix_generator = CovisMatrixGenerator(
    use_gpu=USE_GPU,
    cloud_storage_client=cs_client,
    weight_func_mixin=weight_func_mixin,
)

# Input files matched under the chunked-parquet dataset directory.
files = glob.glob('../input/otto-chunk-data-inparquet-format/*_parquet/*')

# Settings for the time-weighted matrix; `load_or_generate` receives the
# settings object together with the input files.
click_covis_config = Config(
    target_types=[0, 1, 2],
    weight_func='time_weight_v1',
    min_event_threshold=30,
    max_sec_threshold=24 * 60 * 60,
    save_topk=20,
)
click_timeweight_data = covis_matrix_generator.load_or_generate(click_covis_config, files)

# The generator is not used again once the matrix data is in hand.
del covis_matrix_generator, click_covis_config
gc.collect()

found same setting covis matrix. loading files ...


151

In [5]:
%%time

def pqt_to_dict(df):
    """Turn an (aid_x, aid_y) pair table into a ``{aid_x: [aid_y, ...]}`` dict.

    Rows are grouped by ``aid_x`` and the ``aid_y`` values of each group are
    collected into a list, in the order they appear in ``df``.
    """
    neighbours_by_aid = df.groupby('aid_x')['aid_y'].apply(list)
    return neighbours_by_aid.to_dict()

# Merge the per-chunk co-visitation frames into a single
# {aid: [co-visited aids]} lookup.
top_20_clicks = {}
for covis_chunk in click_timeweight_data:
    top_20_clicks.update(pqt_to_dict(covis_chunk))

# Drop every reference to the chunk frames (including the loop variable, which
# would otherwise keep the last chunk alive) before collecting.
del click_timeweight_data, covis_chunk
gc.collect()

# Most frequent aids of the target event type in the validation log, used as a
# fallback when a session yields too few candidates.
# TARGET is the string name of the event type, while the rest of this notebook
# treats `df_val['type']` as the numeric codes 0/1/2 (click/cart/order).
# Comparing a numeric column with the string matches no row and would leave
# the fallback empty, so translate the name to its code when the column is numeric.
EVENT_TYPE_IDS = {'clicks': 0, 'carts': 1, 'orders': 2}
if pd.api.types.is_numeric_dtype(df_val['type']):
    target_type = EVENT_TYPE_IDS[TARGET]
else:
    target_type = TARGET
val_popular_aid = (
    df_val.loc[df_val['type'] == target_type, 'aid']
    .value_counts()
    .index.values[:20]
)

CPU times: user 35.5 s, sys: 2.14 s, total: 37.6 s
Wall time: 37.5 s


In [6]:
# step1
import itertools

# Multiplier applied to an event's recency weight when re-ranking a session's
# own history; keys are the numeric `type` codes found in the event log.
type_weight_multipliers = {0: 1, 1: 6, 2: 3}

def generate_candidate(df, k=50, covis=None, popular_aids=None, history_threshold=20):
    """Return up to ``k`` distinct candidate aids for one session.

    Parameters
    ----------
    df : DataFrame
        Events of a single session with ``aid`` and ``type`` columns, in the
        order they occurred.
    k : int
        Maximum number of candidates to return.
    covis : dict, optional
        ``{aid: [co-visited aids]}`` lookup. Defaults to the notebook-level
        ``top_20_clicks``.
    popular_aids : iterable, optional
        Fallback aids, most popular first. Defaults to the notebook-level
        ``val_popular_aid``.
    history_threshold : int
        When the session holds at least this many distinct aids, candidates
        are taken from the session's own history only.

    Returns
    -------
    list
        Candidate aids without duplicates, best first. It can be shorter than
        ``k`` when history, co-visitation and fallback together provide fewer
        distinct aids.
    """
    if covis is None:
        covis = top_20_clicks
    if popular_aids is None:
        popular_aids = val_popular_aid

    aids = df['aid'].tolist()
    types = df['type'].tolist()
    # Distinct aids of the session, most recent first.
    unique_aids = list(dict.fromkeys(aids[::-1]))

    # Long history: rank the session's own aids by recency and event type.
    if len(unique_aids) >= history_threshold:
        # Weights grow from 2**0.1 - 1 for the oldest event to 1 for the newest.
        weights = np.logspace(0.1, 1, len(aids), base=2, endpoint=True) - 1
        scores = Counter()
        for aid, weight, event_type in zip(aids, weights, types):
            scores[aid] += weight * type_weight_multipliers[event_type]
        return [aid for aid, _ in scores.most_common(k)]

    # Short history: extend it with the aids most often co-visited with it.
    seen = set(unique_aids)  # set membership instead of repeated list scans
    neighbours = itertools.chain.from_iterable(
        covis[aid] for aid in unique_aids if aid in covis
    )
    covisited = [aid for aid, _ in Counter(neighbours).most_common(k) if aid not in seen]
    result = unique_aids + covisited
    seen.update(covisited)

    # Pad with popular aids, skipping any that are already candidates so the
    # same aid is never emitted twice for one session.
    for aid in popular_aids:
        if len(result) >= k:
            break
        if aid not in seen:
            result.append(aid)
            seen.add(aid)
    return result[:k]

In [7]:
from tqdm import tqdm
tqdm.pandas()

# First-stage candidates are cached in cloud storage and only rebuilt when the
# cached object is missing.
# NOTE(review): the other caches in this notebook use the '.pqt' suffix, so
# 'qpt' looks like a typo. It is kept because it is the key this cell both
# saves and loads - confirm before renaming.
filename = 'v1.qpt'
bucket_name = '1st-phase-candidates'

if cs_client.exists_data(filename, bucket_name):
    candidates = cs_client.load_dataframe(filename, bucket_name)
else:
    # One list of aids per session, exploded into one (session, item) row each.
    candidates = df_val.groupby('session').progress_apply(generate_candidate).explode()
    candidates.name = 'item'
    candidates = candidates.to_frame().reset_index()
    cs_client.save_dataframe(candidates, filename, bucket_name)

# Release the large lookup structures. Rebinding to None (instead of `del`)
# keeps this cell re-runnable: a second `del` would raise NameError. The
# function itself is left defined because removing it frees no meaningful memory.
top_20_clicks = val_popular_aid = None
gc.collect()

50

# step2 item特徴 : train + valid A

In [8]:
%%time

filename = 'v1.pqt'
bucket_name = 'item-features'

if cs_client.exists_data(filename, bucket_name):
    item_features = cs_client.load_dataframe(filename, bucket_name)
else:
    # Item statistics are meant to cover train AND validation events, so the
    # two logs are stacked. A left merge on (session, aid, type) would only
    # keep the train rows, leaving items seen only in validation without features.
    # NOTE: a cached object built by the previous logic has to be removed (or
    # the filename bumped) before this branch runs again.
    event_columns = ['session', 'aid', 'type']
    all_events = pd.concat(
        [df_train[event_columns], df_val[event_columns]],
        ignore_index=True,
    )
    # Per item: number of events, number of distinct sessions, mean type code.
    item_features = all_events.groupby('aid').agg({'aid': 'count', 'session': 'nunique', 'type': 'mean'})
    item_features.columns = ['item_item_count', 'item_user_count', 'item_by_count']
    del all_events
    cs_client.save_dataframe(item_features, filename, bucket_name)

CPU times: user 193 ms, sys: 153 ms, total: 346 ms
Wall time: 390 ms


# step3 user特徴 : valid A

In [9]:
%%time

filename = 'v1.pqt'
bucket_name = 'user-features'

if not cs_client.exists_data(filename, bucket_name):
    # No cached copy: aggregate the validation log per session
    # (event count, distinct aids, mean type code) and store the result.
    user_features = df_val.groupby('session').agg(
        {'session': 'count', 'aid': 'nunique', 'type': 'mean'}
    )
    user_features.columns = ['user_user_count','user_item_count','user_buy_ratio']
    cs_client.save_dataframe(user_features, filename, bucket_name)
else:
    # Reuse the cached feature table.
    user_features = cs_client.load_dataframe(filename, bucket_name)

CPU times: user 149 ms, sys: 68.3 ms, total: 217 ms
Wall time: 334 ms


# step4 user × item 特徴量生成 : valid A

In [10]:
def create_user_item_features(events=None):
    """Build per-(session, aid) interaction features from an event log.

    Parameters
    ----------
    events : DataFrame, optional
        Event log with ``session``, ``aid`` and ``type`` columns, where
        ``type`` is 0 (click), 1 (cart) or 2 (order). Defaults to the
        notebook-level ``df_val``.

    Returns
    -------
    DataFrame
        One row per distinct (session, aid) pair in ``events`` with columns
        ``session``, ``aid``, ``user_item_{click,cart,order}_flag`` (1.0 if
        the session has at least one event of that type on the item, else
        0.0) and ``user_item_{click,cart,order}_count`` (number of such events).
    """
    if events is None:
        events = df_val

    type_names = {0: 'click', 1: 'cart', 2: 'order'}

    # Count events per (session, aid, type) once and pivot the type into
    # columns. This avoids merging frames that repeat each (session, aid) key
    # once per event, which multiplies rows before they are de-duplicated.
    counts = (
        events.groupby(['session', 'aid', 'type'])
        .size()
        .unstack('type', fill_value=0)
        # Guarantee all three type columns, in a fixed order, even when a type
        # never occurs in the log.
        .reindex(columns=list(type_names), fill_value=0)
        .astype('float64')
    )
    flags = counts.gt(0).astype('float64')

    flags.columns = [f'user_item_{name}_flag' for name in type_names.values()]
    counts.columns = [f'user_item_{name}_count' for name in type_names.values()]

    user_item_features = pd.concat([flags, counts], axis=1).reset_index()
    # Keep the key columns in the dtypes of the source log so later joins on
    # (session, aid) see the same types as before.
    return user_item_features.astype(
        {'session': events['session'].dtype, 'aid': events['aid'].dtype}
    )

In [11]:
%%time

filename = 'v1.pqt'
bucket_name = 'user-item-features'

if not cs_client.exists_data(filename, bucket_name):
    # Nothing cached yet: compute the features and store them for next time.
    user_item_features = create_user_item_features()
    cs_client.save_dataframe(user_item_features, filename, bucket_name)
else:
    # Reuse the cached feature table.
    user_item_features = cs_client.load_dataframe(filename, bucket_name)

CPU times: user 797 ms, sys: 572 ms, total: 1.37 s
Wall time: 1.05 s


# step5 : step1に step2,3,4を追加

In [12]:
%%time
# Attach the item, user and user-item features to the candidate table in one
# chain. Missing item/user features are filled with -1 and missing user-item
# features with 0, in the same order as the individual steps.
candidates = (
    candidates
    .rename(columns={'session': 'user'})
    # step2: item features, keyed by the feature table's index
    .merge(item_features, left_on='item', right_index=True, how='left')
    .fillna(-1)
    # step3: user features, keyed by the feature table's index
    .merge(
        user_features.rename(columns={'session': 'user'}),
        left_on='user',
        right_index=True,
        how='left',
    )
    .fillna(-1)
    # step4: user-item features, keyed by the (user, item) columns
    .merge(
        user_item_features.rename(columns={'session': 'user', 'aid': 'item'}),
        on=['user', 'item'],
        how='left',
    )
    .fillna(0)
)

# The feature tables are no longer needed once they are joined.
del item_features, user_features, user_item_features
gc.collect()

CPU times: user 55.3 s, sys: 28.9 s, total: 1min 24s
Wall time: 1min 24s


0

# step6 : 学習できる形にするgt:0,gt:1

In [13]:
# Ground-truth labels: one row per session and event type, with the true aids
# stored as a list-like in `ground_truth`.
tar = pd.read_parquet(INPUT_DIR + 'test_labels.parquet')

# Explode the frame itself so every true aid gets its own (user, item) row.
# Assigning an exploded Series back by index only works when every
# `ground_truth` holds exactly one element; it fails as soon as one holds several.
tar = (
    tar.loc[tar['type'] == TARGET, ['session', 'ground_truth']]
    .explode('ground_truth')
    .dropna(subset=['ground_truth'])  # an empty ground truth explodes to NaN
    .rename(columns={'session': 'user', 'ground_truth': 'item'})
    .astype({'item': 'int32'})
    .drop_duplicates()  # keep the left join below from multiplying candidate rows
)
tar[TARGET] = 1

# Candidates that appear in the ground truth get label 1, all others 0.
candidates = candidates.merge(tar, on=['user', 'item'], how='left').fillna(0)

del tar
gc.collect()

24

# stage2. ranker

# Training

In [14]:
# Feature columns are everything except the two id columns and the label.
# Selecting by name instead of by position keeps the list correct even when
# the label is not the last column of `candidates`.
FEATURES = [col for col in candidates.columns if col not in ('user', 'item', TARGET)]
FEATURES

['item_item_count',
 'item_user_count',
 'item_by_count',
 'user_user_count',
 'user_item_count',
 'user_buy_ratio',
 'user_item_click_flag',
 'user_item_cart_flag',
 'user_item_order_flag',
 'user_item_click_count',
 'user_item_cart_count',
 'user_item_order_count']

In [15]:
# NOTE(review): lightgbm is already imported as `lgb` in an earlier cell, so
# this install runs after the import - confirm whether it is still needed.
!pip install lightgbm

[0m

In [20]:
OUTPUT_DIR = '/kaggle/working/'

features = [
    'item_item_count',
    'item_user_count',
    'item_by_count',
    'user_user_count',
    'user_item_count',
    'user_buy_ratio',
    'user_item_click_flag',
    'user_item_cart_flag',
    'user_item_order_flag',
    'user_item_click_count',
    'user_item_cart_count',
    'user_item_order_count'
]
X_train = candidates[features]
y_train = candidates[TARGET]

params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'lambdarank_truncation_level': 10,
    'ndcg_eval_at': 20,
    'n_estimators': 20,
    'boosting_type': 'dart',
    'random_state': 0,
}

# The ranking objective needs the number of rows of every query (user), listed
# in the order the rows appear. Sessions can have different numbers of
# candidates, so the sizes are counted rather than assumed. `sort=False` keeps
# the groups in order of appearance, and the frame is grouped directly instead
# of through a `.loc[X_train.index, :]` copy of the whole table.
group_sizes = candidates.groupby('user', sort=False).size().values

# Group sizes only describe the data correctly when each user's rows are
# contiguous; fail loudly instead of training on misaligned groups.
user_ids = candidates['user'].to_numpy()
n_user_runs = int((user_ids[1:] != user_ids[:-1]).sum()) + 1 if len(user_ids) else 0
if n_user_runs != len(group_sizes):
    raise ValueError(
        "rows of `candidates` are not contiguous per user; sort by 'user' before building the ranking groups"
    )

train_sets = lgb.Dataset(X_train, y_train, group=group_sizes)

# Print the metric after every boosting round. LightGBM 4 dropped the
# `verbose_eval` argument in favour of the `log_evaluation` callback, so use
# the callback when this installation provides it.
if hasattr(lgb, 'log_evaluation'):
    logging_kwargs = {'callbacks': [lgb.log_evaluation(period=1)]}
else:
    logging_kwargs = {'verbose_eval': 1}

# NOTE: the "validation" set is the training set itself, so the reported
# ndcg@20 is a training-set metric.
model = lgb.train(
    params,
    train_sets,
    valid_sets=[train_sets],
    valid_names=['validation'],
    **logging_kwargs
)

# Use a context manager so the file handle is flushed and closed.
with open(OUTPUT_DIR + 'lgb_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1309
[LightGBM] [Info] Number of data points in the train set: 56506027, number of used features: 12
[1]	validation's ndcg@20: 0.742974
[2]	validation's ndcg@20: 0.735119
[3]	validation's ndcg@20: 0.73464
[4]	validation's ndcg@20: 0.733941
[5]	validation's ndcg@20: 0.733249
[6]	validation's ndcg@20: 0.733228
[7]	validation's ndcg@20: 0.733525
[8]	validation's ndcg@20: 0.733355
[9]	validation's ndcg@20: 0.732524
[10]	validation's ndcg@20: 0.730444
[11]	validation's ndcg@20: 0.729597
[12]	validation's ndcg@20: 0.728835
[13]	validation's ndcg@20: 0.729314
[14]	validation's ndcg@20: 0.728183
[15]	validation's ndcg@20: 0.728539
[16]	validation's ndcg@20: 0.728324
[17]	validation's ndcg@20: 0.728376
[18]	validation's ndcg@20: 0.72849
[19]	validation's ndcg@20: 0.727541
[20]	validation's ndcg@20: 0.726589


In [22]:
# Display the trained booster's current iteration; the recorded output (20)
# equals the `n_estimators` value passed in the training parameters.
model.current_iteration()

20