#まとめ

* train (最初の 3 週間)
* valid A（第4週の前半）
* valid B（第4週の後半）
* all train (train + validA + validB)
* test (第5週の前半)
* LB (第5週の後半)


In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


#config

In [2]:
class CFG:
  VER = 'baseline_ver14'
  types = 'carts'

In [3]:
import numpy as np
import pandas as pd

import collections
from collections import Counter

import lightgbm as lgb
from sklearn.model_selection import GroupKFold
import pickle

import glob

import gc

# Data Load

In [4]:
INPUT_DIR = '/content/drive/MyDrive/kaggle/2022/OTTO/input/otto-train-and-test-data-for-local-validation/'
OUTPUT_DIR = '/content/drive/MyDrive/kaggle/2022/OTTO/input/cris_baseline/output/'

df_train = pd.read_parquet(INPUT_DIR+'train.parquet')
df_val = pd.read_parquet(INPUT_DIR+'test.parquet')

print(len(df_train))
print(len(df_val))

163955180
7683577


In [5]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [6]:
#debug
#df_train = df_train.head(100000)
#df_val = df_val.head(100000)
#df_val

#Inference

In [7]:
RE_INPUT_DIR = '/content/drive/MyDrive/kaggle/2022/OTTO/input/otto-chunk-data-inparquet-format/'

In [8]:
type_labels = {'clicks':0, 'carts':1, 'orders':2}

def load_test():    
    dfs = []
    for e, chunk_file in enumerate(glob.glob(RE_INPUT_DIR + 'test_parquet/*')):
        chunk = pd.read_parquet(chunk_file)
        chunk.ts = (chunk.ts/1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
    return pd.concat(dfs).reset_index(drop=True) #.astype({"ts": "datetime64[ms]"})

df_test = load_test()
print('Test data has shape',df_test.shape)
df_test.head()

Test data has shape (6928123, 4)


Unnamed: 0,session,aid,ts,type
0,12899779,59625,1661724000,0
1,12899780,1142000,1661724000,0
2,12899780,582732,1661724058,0
3,12899780,973453,1661724109,0
4,12899780,736515,1661724136,0


# 再step1 sessionのごとにitem候補生成 : test 

### convis matrix

In [9]:
%%time
VER = 6
DISK_PIECES = 4

def pqt_to_dict(df):
    return df.groupby('aid_x').aid_y.apply(list).to_dict()

# LOAD THREE CO-VISITATION MATRICES
top_15_carts_orders = pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/kaggle/2022/OTTO/input/cris_baseline/output/top_15_carts_orders_v{VER}_0.pqt') )
for k in range(1,DISK_PIECES): 
    top_15_carts_orders.update( pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/kaggle/2022/OTTO/input/cris_baseline/output/top_15_carts_orders_v{VER}_{k}.pqt') ) )

top_20_buy2buy = pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/kaggle/2022/OTTO/input/cris_baseline/output/top_15_buy2buy_v{VER}_0.pqt') )


CPU times: user 53.3 s, sys: 2.46 s, total: 55.8 s
Wall time: 1min


In [10]:
#人気
test_popular_aid = df_test.loc[df_test['type']==1,'aid'].value_counts().index.values[:20]

#def popular_candidate(df):
#  return test_popular_aid

In [11]:
# step1
import itertools

type_weight_multipliers = {0: 1, 1: 6, 2: 3}


def generate_candidate(df, k=50):

  # USER HISTORY AIDS AND TYPES
  aids=df.aid.tolist()
  types = df.type.tolist()
  # UNIQUE AIDS AND UNIQUE BUYS
  unique_aids = list(dict.fromkeys(aids[::-1] ))
  df = df.loc[(df['type']==1)|(df['type']==2)]
  unique_buys = list(dict.fromkeys( df.aid.tolist()[::-1] ))
  # RERANK CANDIDATES USING WEIGHTS
  if len(unique_aids)>=20:
      weights=np.logspace(0.5,1,len(aids),base=2, endpoint=True)-1
      aids_temp = Counter() 
      # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
      for aid,w,t in zip(aids,weights,types): 
          aids_temp[aid] += w * type_weight_multipliers[t]
      # RERANK CANDIDATES USING "BUY2BUY" CO-VISITATION MATRIX
      aids3 = list(itertools.chain(*[top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy]))
      for aid in aids3: aids_temp[aid] += 0.1
      sorted_aids = [k for k,v in aids_temp.most_common(k)]
      return sorted_aids[:k]
  # USE "CART ORDER" CO-VISITATION MATRIX
  aids2 = list(itertools.chain(*[top_15_carts_orders[aid] for aid in unique_aids if aid in top_15_carts_orders]))
  # USE "BUY2BUY" CO-VISITATION MATRIX
  aids3 = list(itertools.chain(*[top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy]))
  # RERANK CANDIDATES
  top_aids2 = [aid2 for aid2, cnt in Counter(aids2+aids3).most_common(k) if aid2 not in unique_aids] 
  result = unique_aids + top_aids2[:k]
  # USE TOP20 TEST ORDERS
  return (result + list(test_popular_aid)[:k])[:k]

In [62]:
candidates[candidates['user']==12899781]

Unnamed: 0,user,item,item_item_count,item_user_count,item_by_count,item_click_count_features,item_cart_count_features,item_order_count_features,user_user_count,user_item_count,user_buy_ratio,user_item_click_flag,user_item_cart_flag,user_item_order_flag,user_item_click_count,user_item_cart_count,user_item_order_count,user_per_item_click_count,user_per_item_cart_count,user_per_item_order_count
86,12899781,1307153,2659,1329,0.093628,2194.0,139.0,44.0,11,5,0.090881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87,12899781,1066725,1064,741,0.010338,891.0,9.0,1.0,11,5,0.090881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88,12899781,1228668,2059,1173,0.056335,1390.0,70.0,15.0,11,5,0.090881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
89,12899781,1022447,2608,1375,0.046021,2121.0,63.0,18.0,11,5,0.090881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
90,12899781,3542,18864,10425,0.085754,11723.0,840.0,182.0,11,5,0.090881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
91,12899781,1008624,12597,7289,0.058838,9061.0,563.0,0.0,11,5,0.090881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
92,12899781,1242608,12937,6646,0.105103,8483.0,634.0,161.0,11,5,0.090881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93,12899781,379160,1147,846,0.035736,676.0,14.0,1.0,11,5,0.090881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
94,12899781,986164,76988,33038,0.147949,53107.0,5616.0,1800.0,11,5,0.090881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95,12899781,1583317,511,369,0.056763,0.0,0.0,0.0,11,5,0.090881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
from tqdm import tqdm
tqdm.pandas()

candidates = df_test.groupby('session').progress_apply(generate_candidate).explode()
candidates

100%|██████████| 1671803/1671803 [22:24<00:00, 1243.56it/s]


session
12899779      59625
12899779     469285
12899779    1657590
12899779     731692
12899779     941596
             ...   
14571581     558573
14571581     471073
14571581     332654
14571581     688602
14571581      29735
Length: 70582058, dtype: object

In [13]:
candidates.name = 'item'
candidates = candidates.to_frame().reset_index()
candidates

Unnamed: 0,session,item
0,12899779,59625
1,12899779,469285
2,12899779,1657590
3,12899779,731692
4,12899779,941596
...,...,...
70582053,14571581,558573
70582054,14571581,471073
70582055,14571581,332654
70582056,14571581,688602


In [14]:
#なぜかNanがいるので処理
candidates['item'].isnull().sum()

0

In [15]:
candidates['item'] = candidates['item'].fillna(-1)
candidates['item'] = candidates['item'].astype('int32')
candidates = candidates.sort_values('session').reset_index(drop=True)
candidates

Unnamed: 0,session,item
0,12899779,59625
1,12899779,554660
2,12899779,660655
3,12899779,1116095
4,12899779,152547
...,...,...
70582053,14571581,1072049
70582054,14571581,1571699
70582055,14571581,196038
70582056,14571581,485256


# 再step2 item特徴 : all_train + test

In [16]:
def load_train():    
    dfs = []
    for e, chunk_file in enumerate(glob.glob(RE_INPUT_DIR + 'train_parquet/*')):
        chunk = pd.read_parquet(chunk_file)
        chunk.ts = (chunk.ts/1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
    return pd.concat(dfs).reset_index(drop=True) #.astype({"ts": "datetime64[ms]"})

all_train = load_train()
print('all_train has shape',all_train.shape)
all_train.head()

all_train has shape (216716096, 4)


Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800,0
1,0,1563459,1659304904,0
2,0,1309446,1659367439,0
3,0,16246,1659367719,0
4,0,1781822,1659367871,0


In [17]:
all_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216716096 entries, 0 to 216716095
Data columns (total 4 columns):
 #   Column   Dtype
---  ------   -----
 0   session  int64
 1   aid      int64
 2   ts       int32
 3   type     int8 
dtypes: int32(1), int64(2), int8(1)
memory usage: 4.2 GB


In [18]:
all_train = reduce_mem_usage(all_train)
all_train.info()

Memory usage of dataframe is 4340.21 MB
Memory usage after optimization is: 2686.80 MB
Decreased by 38.1%
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216716096 entries, 0 to 216716095
Data columns (total 4 columns):
 #   Column   Dtype
---  ------   -----
 0   session  int32
 1   aid      int32
 2   ts       int32
 3   type     int8 
dtypes: int32(3), int8(1)
memory usage: 2.6 GB


In [19]:
%%time
#all_train = pd.merge(df_train, df_val, on=['session','aid','type'], how='left')
item_features = pd.merge(all_train, df_test, on=['session','aid','type'], how='left').groupby('aid').agg({'aid': 'count', 'session': 'nunique', 'type': 'mean'})
item_features

CPU times: user 3min 34s, sys: 26.8 s, total: 4min 1s
Wall time: 4min


Unnamed: 0_level_0,aid,session,type
aid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,44,36,0.000000
1,34,30,0.029412
2,17,16,0.000000
3,2557,1277,0.102464
4,213,138,0.042254
...,...,...,...
1855598,7,7,0.000000
1855599,13,11,0.000000
1855600,89,56,0.089888
1855601,92,62,0.076087


In [20]:
item_features.columns = ['item_item_count', 'item_user_count', 'item_by_count']
item_features.to_parquet('item_features.pqt')
item_features

Unnamed: 0_level_0,item_item_count,item_user_count,item_by_count
aid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,44,36,0.000000
1,34,30,0.029412
2,17,16,0.000000
3,2557,1277,0.102464
4,213,138,0.042254
...,...,...,...
1855598,7,7,0.000000
1855599,13,11,0.000000
1855600,89,56,0.089888
1855601,92,62,0.076087


このアイテムがトレイン内でクリック,cart,orderされた回数

In [21]:
%%time
item_click_count_features = df_train[df_train['type']==0].groupby('aid').agg({'session': 'count'})
item_click_count_features.columns = ['item_click_count_features']
#item_features.to_parquet('item_features.pqt')
item_click_count_features

item_cart_count_features = df_train[df_train['type']==1].groupby('aid').agg({'session': 'count'})
item_cart_count_features.columns = ['item_cart_count_features']
#item_features.to_parquet('item_features.pqt')
item_cart_count_features

item_order_count_features = df_train[df_train['type']==2].groupby('aid').agg({'session': 'count'})
item_order_count_features.columns = ['item_order_count_features']
#item_features.to_parquet('item_features.pqt')
item_order_count_features

CPU times: user 17.1 s, sys: 376 ms, total: 17.4 s
Wall time: 17.3 s


Unnamed: 0_level_0,item_order_count_features
aid,Unnamed: 1_level_1
3,15
10,1
11,2
14,3
16,1
...,...
1855589,1
1855592,3
1855594,54
1855597,1


In [22]:
item_features = item_features.merge(item_click_count_features, on=['aid'], how='outer').fillna(0)
item_features = item_features.merge(item_cart_count_features, on=['aid'], how='outer').fillna(0)
item_features = item_features.merge(item_order_count_features, on=['aid'], how='outer').fillna(0)
#item_features = item_features.drop_duplicates('aid')
#user_item_count = user_item_count.rename(columns={'session':'user','aid':'item'})
item_features

Unnamed: 0_level_0,item_item_count,item_user_count,item_by_count,item_click_count_features,item_cart_count_features,item_order_count_features
aid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,44,36,0.000000,33.0,0.0,0.0
1,34,30,0.029412,31.0,1.0,0.0
2,17,16,0.000000,12.0,0.0,0.0
3,2557,1277,0.102464,1120.0,66.0,15.0
4,213,138,0.042254,135.0,5.0,0.0
...,...,...,...,...,...,...
1855598,7,7,0.000000,6.0,0.0,0.0
1855599,13,11,0.000000,9.0,0.0,0.0
1855600,89,56,0.089888,64.0,3.0,1.0
1855601,92,62,0.076087,85.0,7.0,0.0


In [23]:
del item_click_count_features, item_cart_count_features, item_order_count_features
gc.collect

<function gc.collect(generation=2)>

In [24]:
del all_train
gc.collect()

26

# 再step3 user特徴 :test

In [25]:
user_features = df_test.groupby('session').agg({'session': 'count', 'aid': 'nunique', 'type': 'mean'})
user_features.columns = ['user_user_count','user_item_count','user_buy_ratio']
# CONVERT COLUMNS TO INT32 and FLOAT32 HERE
user_features.to_parquet('user_features.pqt')
user_features

Unnamed: 0_level_0,user_user_count,user_item_count,user_buy_ratio
session,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12899779,1,1,0.000000
12899780,5,4,0.000000
12899781,11,5,0.090909
12899782,70,38,0.457143
12899783,11,9,0.000000
...,...,...,...
14571577,1,1,0.000000
14571578,1,1,0.000000
14571579,1,1,0.000000
14571580,1,1,0.000000


# 再step4 user × item 特徴量生成 : valid A

このアイテムはユーザーによって既にクリック,cart,orderされていますか

In [26]:
#user(session) x item(aid)に対するaction(type)の有無(1/0)を特徴量にする
def create_action_flag(df, type_number=0, column_name="user_item_click_flag"):
    action_flag_df = df.loc[df["type"] == type_number, ["session", "aid"]].drop_duplicates()
    action_flag_df[column_name] = 1
    return action_flag_df

In [27]:
user_item_click_flag = create_action_flag(df_val, type_number=0, column_name="user_item_click_flag")
user_item_cart_flag = create_action_flag(df_val, type_number=1, column_name="user_item_cart_flag")
user_item_order_flag = create_action_flag(df_val, type_number=2, column_name="user_item_order_flag")

In [28]:
tmp = pd.merge(
    user_item_click_flag,
    user_item_cart_flag,
    on=["session", "aid"],
    how="outer"
)
user_by_item_features = pd.merge(
    tmp,
    user_item_order_flag,
    on=["session", "aid"],
    how="outer"
).fillna(0)
user_by_item_features

Unnamed: 0,session,aid,user_item_click_flag,user_item_cart_flag,user_item_order_flag
0,11098528,11830,1.0,0.0,0.0
1,11098529,1105029,1.0,0.0,0.0
2,11098530,264500,1.0,0.0,0.0
3,11098530,409236,1.0,1.0,0.0
4,11098531,452188,1.0,0.0,1.0
...,...,...,...,...,...
5535985,12896465,1551275,0.0,0.0,1.0
5535986,12896465,1306971,0.0,0.0,1.0
5535987,12896768,1303029,0.0,0.0,1.0
5535988,12898765,73333,0.0,0.0,1.0


In [29]:
#df_val[df_val['type']==0].groupby('session').agg({'type': 'count'})

ユーザーはこの項目を複数回クリックしましたか? 幾つか

In [30]:
#click
user_item_click_count = df_val[df_val['type']==0].groupby(['session','aid']).agg({'aid': 'count'})
user_item_click_count.columns = ['user_item_click_count']
user_item_click_count = df_val[['session','aid']].merge(user_item_click_count, how='left', on=['session','aid'])
#user_item_click_count.to_parquet('user_item_click_count.pqt')
user_item_click_count

#cart
user_item_cart_count = df_val[df_val['type']==1].groupby(['session','aid']).agg({'aid': 'count'})
user_item_cart_count.columns = ['user_item_cart_count']
user_item_cart_count = df_val[['session','aid']].merge(user_item_cart_count, how='left', on=['session','aid'])
#user_item_click_count.to_parquet('user_item_click_count.pqt')
user_item_cart_count

#order
user_item_order_count = df_val[df_val['type']==2].groupby(['session','aid']).agg({'aid': 'count'})
user_item_order_count.columns = ['user_item_order_count']
user_item_order_count = df_val[['session','aid']].merge(user_item_order_count, how='left', on=['session','aid'])
#user_item_click_count.to_parquet('user_item_click_count.pqt')
user_item_order_count

user_item_count = user_item_click_count.merge(user_item_cart_count, on=['session','aid'], how='outer').fillna(0)
user_item_count = user_item_count.merge(user_item_order_count, on=['session','aid'], how='outer').fillna(0)
user_item_count = user_item_count.drop_duplicates(['session','aid'])
#user_item_count = user_item_count.rename(columns={'session':'user','aid':'item'})
user_item_count

Unnamed: 0,session,aid,user_item_click_count,user_item_cart_count,user_item_order_count
0,11098528,11830,1.0,0.0,0.0
1,11098529,1105029,1.0,0.0,0.0
2,11098530,264500,2.0,0.0,0.0
10,11098530,409236,3.0,1.0,0.0
74,11098531,452188,2.0,0.0,1.0
...,...,...,...,...,...
112908356,12899774,33035,1.0,0.0,0.0
112908357,12899775,1743151,1.0,0.0,0.0
112908358,12899776,548599,1.0,0.0,0.0
112908359,12899777,384045,1.0,0.0,0.0


In [31]:
#oka × suzuki merge

user_item_features = user_by_item_features.merge(user_item_count, on=['session','aid'], how='outer').fillna(0)
user_item_features

Unnamed: 0,session,aid,user_item_click_flag,user_item_cart_flag,user_item_order_flag,user_item_click_count,user_item_cart_count,user_item_order_count
0,11098528,11830,1.0,0.0,0.0,1.0,0.0,0.0
1,11098529,1105029,1.0,0.0,0.0,1.0,0.0,0.0
2,11098530,264500,1.0,0.0,0.0,2.0,0.0,0.0
3,11098530,409236,1.0,1.0,0.0,3.0,1.0,0.0
4,11098531,452188,1.0,0.0,1.0,2.0,0.0,1.0
...,...,...,...,...,...,...,...,...
5535985,12896465,1551275,0.0,0.0,1.0,0.0,0.0,1.0
5535986,12896465,1306971,0.0,0.0,1.0,0.0,0.0,1.0
5535987,12896768,1303029,0.0,0.0,1.0,0.0,0.0,1.0
5535988,12898765,73333,0.0,0.0,1.0,0.0,0.0,1.0


ユーザーがすでにクリック,cart,orderしたアイテムの数

In [32]:
#click
user_per_item_click_count = df_val[df_val['type']==0].groupby('session').agg({'aid': 'count'})
user_per_item_click_count.columns = ['user_per_item_click_count']
user_per_item_click_count = df_val[['session','aid']].merge(user_per_item_click_count, how='left', on='session')
#user_item_click_count.to_parquet('user_item_click_count.pqt')
user_per_item_click_count

#cart
user_per_item_cart_count = df_val[df_val['type']==1].groupby('session').agg({'aid': 'count'})
user_per_item_cart_count.columns = ['user_per_item_cart_count']
user_per_item_cart_count = df_val[['session','aid']].merge(user_per_item_cart_count, how='left', on='session')
#user_item_click_count.to_parquet('user_item_click_count.pqt')
user_per_item_cart_count

#order
user_per_item_order_count = df_val[df_val['type']==2].groupby('session').agg({'aid': 'count'})
user_per_item_order_count.columns = ['user_per_item_order_count']
user_per_item_order_count = df_val[['session','aid']].merge(user_per_item_order_count, how='left', on='session')
#user_item_click_count.to_parquet('user_item_click_count.pqt')
user_per_item_order_count

user_item_count = user_per_item_click_count.merge(user_per_item_cart_count, on=['session','aid'], how='outer').fillna(0)
user_item_count = user_item_count.merge(user_per_item_order_count, on=['session','aid'], how='outer').fillna(0)
user_item_count = user_item_count.drop_duplicates(['session','aid'])
#user_item_count = user_item_count.rename(columns={'session':'user','aid':'item'})
user_item_count

Unnamed: 0,session,aid,user_per_item_click_count,user_per_item_cart_count,user_per_item_order_count
0,11098528,11830,1.0,0.0,0.0
1,11098529,1105029,1.0,0.0,0.0
2,11098530,264500,5.0,1.0,0.0
10,11098530,409236,5.0,1.0,0.0
74,11098531,452188,20.0,0.0,4.0
...,...,...,...,...,...
112908356,12899774,33035,1.0,0.0,0.0
112908357,12899775,1743151,1.0,0.0,0.0
112908358,12899776,548599,1.0,0.0,0.0
112908359,12899777,384045,1.0,0.0,0.0


In [33]:
user_item_features = user_item_features.merge(user_item_count, on=['session','aid'], how='outer').fillna(0)
user_item_features

Unnamed: 0,session,aid,user_item_click_flag,user_item_cart_flag,user_item_order_flag,user_item_click_count,user_item_cart_count,user_item_order_count,user_per_item_click_count,user_per_item_cart_count,user_per_item_order_count
0,11098528,11830,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,11098529,1105029,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,11098530,264500,1.0,0.0,0.0,2.0,0.0,0.0,5.0,1.0,0.0
3,11098530,409236,1.0,1.0,0.0,3.0,1.0,0.0,5.0,1.0,0.0
4,11098531,452188,1.0,0.0,1.0,2.0,0.0,1.0,20.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...
5535985,12896465,1551275,0.0,0.0,1.0,0.0,0.0,1.0,5.0,3.0,9.0
5535986,12896465,1306971,0.0,0.0,1.0,0.0,0.0,1.0,5.0,3.0,9.0
5535987,12896768,1303029,0.0,0.0,1.0,0.0,0.0,1.0,11.0,3.0,2.0
5535988,12898765,73333,0.0,0.0,1.0,0.0,0.0,1.0,5.0,5.0,4.0


最後にクリック,cart,orderされたアイテムにflag

In [34]:
'''
#click
user_item_click_last = df_val[df_val['type']==0].groupby('session').agg({'aid': 'last'})
user_item_click_last.columns = ['user_item_click_last']
user_item_click_last = df_val[['session','aid']].merge(user_item_click_last, how='left', on='session')
#user_item_click_last.to_parquet('user_item_click_last.pqt')
user_item_click_last

#cart
user_item_cart_last = df_val[df_val['type']==1].groupby('session').agg({'aid': 'last'})
user_item_cart_last.columns = ['user_item_cart_last']
user_item_cart_last = df_val[['session','aid']].merge(user_item_cart_last, how='left', on='session')
#user_item_cuser_item_cart_lastlick_count.to_parquet('user_item_cart_last.pqt')
user_item_cart_last

#order
user_item_order_last = df_val[df_val['type']==2].groupby('session').agg({'aid': 'last'})
user_item_order_last.columns = ['user_item_order_last']
user_item_order_last = df_val[['session','aid']].merge(user_item_order_last, how='left', on='session')
#user_item_order_order.to_parquet('user_item_order_order.pqt')
user_item_order_last

user_item_count = user_item_click_last.merge(user_item_cart_last, on=['session','aid'], how='outer').fillna(0)
user_item_count = user_item_count.merge(user_item_order_last, on=['session','aid'], how='outer').fillna(0)
user_item_count = user_item_count.drop_duplicates(['session','aid'])
#user_item_count = user_item_count.rename(columns={'session':'user','aid':'item'})
user_item_count

user_item_features = user_item_features.merge(user_item_count, on=['session','aid'], how='outer').fillna(0)
user_item_features
'''

"\n#click\nuser_item_click_last = df_val[df_val['type']==0].groupby('session').agg({'aid': 'last'})\nuser_item_click_last.columns = ['user_item_click_last']\nuser_item_click_last = df_val[['session','aid']].merge(user_item_click_last, how='left', on='session')\n#user_item_click_last.to_parquet('user_item_click_last.pqt')\nuser_item_click_last\n\n#cart\nuser_item_cart_last = df_val[df_val['type']==1].groupby('session').agg({'aid': 'last'})\nuser_item_cart_last.columns = ['user_item_cart_last']\nuser_item_cart_last = df_val[['session','aid']].merge(user_item_cart_last, how='left', on='session')\n#user_item_cuser_item_cart_lastlick_count.to_parquet('user_item_cart_last.pqt')\nuser_item_cart_last\n\n#order\nuser_item_order_last = df_val[df_val['type']==2].groupby('session').agg({'aid': 'last'})\nuser_item_order_last.columns = ['user_item_order_last']\nuser_item_order_last = df_val[['session','aid']].merge(user_item_order_last, how='left', on='session')\n#user_item_order_order.to_parquet('u

In [35]:
#del user_by_item_features, user_per_item_cart_count, user_per_item_order_count, user_item_click_last, user_item_cart_last, user_item_order_last, user_item_count
#gc.collect

In [36]:
del user_by_item_features, user_per_item_cart_count, user_per_item_order_count, user_item_count
gc.collect

<function gc.collect(generation=2)>

# 再step5 : 再step1に 再step2,3を追加

In [37]:
candidates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70582058 entries, 0 to 70582057
Data columns (total 2 columns):
 #   Column   Dtype
---  ------   -----
 0   session  int64
 1   item     int32
dtypes: int32(1), int64(1)
memory usage: 807.7 MB


In [38]:
candidates = reduce_mem_usage(candidates)
candidates.info()

Memory usage of dataframe is 807.75 MB
Memory usage after optimization is: 538.50 MB
Decreased by 33.3%
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70582058 entries, 0 to 70582057
Data columns (total 2 columns):
 #   Column   Dtype
---  ------   -----
 0   session  int32
 1   item     int32
dtypes: int32(2)
memory usage: 538.5 MB


In [39]:
item_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1855603 entries, 0 to 1855602
Data columns (total 6 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   item_item_count            int64  
 1   item_user_count            int64  
 2   item_by_count              float64
 3   item_click_count_features  float64
 4   item_cart_count_features   float64
 5   item_order_count_features  float64
dtypes: float64(4), int64(2)
memory usage: 99.1 MB


In [40]:
item_features = reduce_mem_usage(item_features)
item_features.info()

Memory usage of dataframe is 99.10 MB
Memory usage after optimization is: 46.01 MB
Decreased by 53.6%
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1855603 entries, 0 to 1855602
Data columns (total 6 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   item_item_count            int32  
 1   item_user_count            int32  
 2   item_by_count              float16
 3   item_click_count_features  float32
 4   item_cart_count_features   float16
 5   item_order_count_features  float16
dtypes: float16(3), float32(1), int32(2)
memory usage: 46.0 MB


In [41]:
user_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1671803 entries, 12899779 to 14571581
Data columns (total 3 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   user_user_count  1671803 non-null  int64  
 1   user_item_count  1671803 non-null  int64  
 2   user_buy_ratio   1671803 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 51.0 MB


In [42]:
user_features = reduce_mem_usage(user_features)
user_features.info()

Memory usage of dataframe is 51.02 MB
Memory usage after optimization is: 22.32 MB
Decreased by 56.2%
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1671803 entries, 12899779 to 14571581
Data columns (total 3 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   user_user_count  1671803 non-null  int16  
 1   user_item_count  1671803 non-null  int16  
 2   user_buy_ratio   1671803 non-null  float16
dtypes: float16(1), int16(2)
memory usage: 22.3 MB


In [43]:
user_item_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5535990 entries, 0 to 5535989
Data columns (total 11 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   session                    int32  
 1   aid                        int32  
 2   user_item_click_flag       float64
 3   user_item_cart_flag        float64
 4   user_item_order_flag       float64
 5   user_item_click_count      float64
 6   user_item_cart_count       float64
 7   user_item_order_count      float64
 8   user_per_item_click_count  float64
 9   user_per_item_cart_count   float64
 10  user_per_item_order_count  float64
dtypes: float64(9), int32(2)
memory usage: 464.6 MB


In [44]:
user_item_features = reduce_mem_usage(user_item_features)
user_item_features.info()

Memory usage of dataframe is 464.60 MB
Memory usage after optimization is: 179.50 MB
Decreased by 61.4%
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5535990 entries, 0 to 5535989
Data columns (total 11 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   session                    int32  
 1   aid                        int32  
 2   user_item_click_flag       float16
 3   user_item_cart_flag        float16
 4   user_item_order_flag       float16
 5   user_item_click_count      float16
 6   user_item_cart_count       float16
 7   user_item_order_count      float16
 8   user_per_item_click_count  float16
 9   user_per_item_cart_count   float16
 10  user_per_item_order_count  float16
dtypes: float16(9), int32(2)
memory usage: 179.5 MB


In [45]:
candidates = candidates.rename(columns={'session':'user'})

#step2
#item_features = pd.read_parquet('item_features.pqt')
candidates = candidates.merge(item_features, left_on='item', right_index=True, how='left').fillna(-1)

#step3
#user_features = pd.read_parquet('user_features.pqt')
user_features = user_features.rename(columns={'session':'user'})
candidates = candidates.merge(user_features, left_on='user', right_index=True, how='left').fillna(-1)

#step4
#user_features = pd.read_parquet('user_features.pqt')
user_item_features = user_item_features.rename(columns={'session':'user'})
user_item_features = user_item_features.rename(columns={'aid':'item'})
candidates = candidates.merge(user_item_features, on=['user','item'], how='left').fillna(0)
candidates

Unnamed: 0,user,item,item_item_count,item_user_count,item_by_count,item_click_count_features,item_cart_count_features,item_order_count_features,user_user_count,user_item_count,user_buy_ratio,user_item_click_flag,user_item_cart_flag,user_item_order_flag,user_item_click_count,user_item_cart_count,user_item_order_count,user_per_item_click_count,user_per_item_cart_count,user_per_item_order_count
0,12899779,59625,12,11,0.000000,10.0,0.0,0.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12899779,554660,80197,41515,0.108948,53809.0,5136.0,452.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12899779,660655,5532,1990,0.345215,1.0,0.0,0.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,12899779,1116095,54644,28747,0.106995,30519.0,2720.0,384.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12899779,152547,51153,10419,0.362305,17644.0,9824.0,0.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70582053,14571581,1072049,685,396,0.160645,427.0,64.0,12.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70582054,14571581,1571699,591,351,0.054138,378.0,14.0,3.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70582055,14571581,196038,6527,4052,0.090576,5514.0,393.0,66.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70582056,14571581,485256,126836,27497,0.234009,0.0,0.0,0.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
del item_features, user_features, user_item_features
gc.collect

<function gc.collect(generation=2)>

In [47]:
candidates.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70582058 entries, 0 to 70582057
Data columns (total 20 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   user                       int32  
 1   item                       int32  
 2   item_item_count            int32  
 3   item_user_count            int32  
 4   item_by_count              float16
 5   item_click_count_features  float32
 6   item_cart_count_features   float16
 7   item_order_count_features  float16
 8   user_user_count            int16  
 9   user_item_count            int16  
 10  user_buy_ratio             float16
 11  user_item_click_flag       float16
 12  user_item_cart_flag        float16
 13  user_item_order_flag       float16
 14  user_item_click_count      float16
 15  user_item_cart_count       float16
 16  user_item_order_count      float16
 17  user_per_item_click_count  float16
 18  user_per_item_cart_count   float16
 19  user_per_item_order_count  float16
dtype

In [48]:
candidates = reduce_mem_usage(candidates)
candidates.info()

Memory usage of dataframe is 3904.11 MB
Memory usage after optimization is: 3904.11 MB
Decreased by 0.0%
<class 'pandas.core.frame.DataFrame'>
Int64Index: 70582058 entries, 0 to 70582057
Data columns (total 20 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   user                       int32  
 1   item                       int32  
 2   item_item_count            int32  
 3   item_user_count            int32  
 4   item_by_count              float16
 5   item_click_count_features  float32
 6   item_cart_count_features   float16
 7   item_order_count_features  float16
 8   user_user_count            int16  
 9   user_item_count            int16  
 10  user_buy_ratio             float16
 11  user_item_click_flag       float16
 12  user_item_cart_flag        float16
 13  user_item_order_flag       float16
 14  user_item_click_count      float16
 15  user_item_cart_count       float16
 16  user_item_order_count      float16
 17  user_per_item_c

#pred

In [49]:
#スライスずれるのでtargetは最後のカラムにしておく必要あり。
FEATURES = list(candidates.columns[2:])
FEATURES

['item_item_count',
 'item_user_count',
 'item_by_count',
 'item_click_count_features',
 'item_cart_count_features',
 'item_order_count_features',
 'user_user_count',
 'user_item_count',
 'user_buy_ratio',
 'user_item_click_flag',
 'user_item_cart_flag',
 'user_item_order_flag',
 'user_item_click_count',
 'user_item_cart_count',
 'user_item_order_count',
 'user_per_item_click_count',
 'user_per_item_cart_count',
 'user_per_item_order_count']

In [50]:
!pip install lightgbm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [51]:
preds = np.zeros(len(candidates))
for fold in range(4):
    with open(OUTPUT_DIR + f'lgb_model_{CFG.VER}_{fold}.pkl', 'rb') as f:
      model = pickle.load(f)
    preds += model.predict(candidates[FEATURES], num_iteration=model.best_iteration)/5

predictions = candidates[['user','item']].copy()
predictions['pred'] = preds
predictions

Unnamed: 0,user,item,pred
0,12899779,59625,-0.172866
1,12899779,554660,-0.091880
2,12899779,660655,-0.133983
3,12899779,1116095,-0.091880
4,12899779,152547,-0.049762
...,...,...,...
70582053,14571581,1072049,-0.108448
70582054,14571581,1571699,-0.172866
70582055,14571581,196038,-0.130884
70582056,14571581,485256,-0.136229


In [52]:
predictions = predictions.sort_values(['user','pred'], ascending=[True,False]).reset_index(drop=True)
predictions

Unnamed: 0,user,item,pred
0,12899779,152547,-0.049762
1,12899779,33343,-0.080432
2,12899779,1022566,-0.090749
3,12899779,544144,-0.090749
4,12899779,329725,-0.090749
...,...,...,...
70582053,14571581,1158237,-0.172866
70582054,14571581,1124107,-0.172866
70582055,14571581,940217,-0.172866
70582056,14571581,1236674,-0.172866


In [53]:
predictions['n'] = predictions.groupby('user').item.cumcount().astype('int8')
predictions

Unnamed: 0,user,item,pred,n
0,12899779,152547,-0.049762,0
1,12899779,33343,-0.080432,1
2,12899779,1022566,-0.090749,2
3,12899779,544144,-0.090749,3
4,12899779,329725,-0.090749,4
...,...,...,...,...
70582053,14571581,1158237,-0.172866,31
70582054,14571581,1124107,-0.172866,32
70582055,14571581,940217,-0.172866,33
70582056,14571581,1236674,-0.172866,34


In [54]:
predictions = predictions.loc[predictions.n<20]
predictions

Unnamed: 0,user,item,pred,n
0,12899779,152547,-0.049762,0
1,12899779,33343,-0.080432,1
2,12899779,1022566,-0.090749,2
3,12899779,544144,-0.090749,3
4,12899779,329725,-0.090749,4
...,...,...,...,...
70582037,14571581,1460571,-0.092380,15
70582038,14571581,558573,-0.092814,16
70582039,14571581,579690,-0.104987,17
70582040,14571581,1072049,-0.108448,18


In [55]:
sub = predictions.groupby('user').item.apply(list)
sub = sub.to_frame().reset_index()
sub

Unnamed: 0,user,item
0,12899779,"[152547, 33343, 1022566, 544144, 329725, 10435..."
1,12899780,"[152547, 33343, 1460571, 231487, 215561, 11256..."
2,12899781,"[1460571, 986164, 754412, 331708, 893268, 8110..."
3,12899782,"[603159, 303142, 1258776, 1445562, 1840480, 12..."
4,12899783,"[1500659, 74735, 1383529, 1586171, 588923, 887..."
...,...,...
1671798,14571577,"[152547, 33343, 1022566, 544144, 1043508, 1006..."
1671799,14571578,"[152547, 33343, 1022566, 544144, 986164, 32972..."
1671800,14571579,"[152547, 33343, 1022566, 544144, 1043508, 1006..."
1671801,14571580,"[152547, 33343, 1022566, 544144, 986164, 32972..."


In [56]:
sub.item = sub.item.apply(lambda x: " ".join(map(str,x)))
sub

Unnamed: 0,user,item
0,12899779,152547 33343 1022566 544144 329725 1043508 332...
1,12899780,152547 33343 1460571 231487 215561 1125638 832...
2,12899781,1460571 986164 754412 331708 893268 811084 918...
3,12899782,603159 303142 1258776 1445562 1840480 1295242 ...
4,12899783,1500659 74735 1383529 1586171 588923 887179 73...
...,...,...
1671798,14571577,152547 33343 1022566 544144 1043508 1006198 33...
1671799,14571578,152547 33343 1022566 544144 986164 329725 1043...
1671800,14571579,152547 33343 1022566 544144 1043508 1006198 33...
1671801,14571580,152547 33343 1022566 544144 986164 329725 1043...


In [57]:
sub.columns = ['session_type','labels']
sub.session_type = sub.session_type.astype('str')+ f'_{CFG.types}'
sub

Unnamed: 0,session_type,labels
0,12899779_carts,152547 33343 1022566 544144 329725 1043508 332...
1,12899780_carts,152547 33343 1460571 231487 215561 1125638 832...
2,12899781_carts,1460571 986164 754412 331708 893268 811084 918...
3,12899782_carts,603159 303142 1258776 1445562 1840480 1295242 ...
4,12899783_carts,1500659 74735 1383529 1586171 588923 887179 73...
...,...,...
1671798,14571577_carts,152547 33343 1022566 544144 1043508 1006198 33...
1671799,14571578_carts,152547 33343 1022566 544144 986164 329725 1043...
1671800,14571579_carts,152547 33343 1022566 544144 1043508 1006198 33...
1671801,14571580_carts,152547 33343 1022566 544144 986164 329725 1043...


# save

In [58]:
with open(OUTPUT_DIR+f'{CFG.VER}_{CFG.types}_preds.pkl','wb') as f:
  pickle.dump(sub, f)