# Create covisitation matrices to be used in candidate generation #
We create three types of covisitation matrix:
1. What else did people who clicked/carted/ordered the focal product also click/cart/order, weighted towards interactions occurring more recently in the data.
2. What else did people who clicked/carted/ordered the focal product also click/cart/order, weighted towards items being carted and ordered.
3. What else did people who carted/ordered the product also cart/order.

The joining operations are memory intensive, so we split the dataframe into chunks and work through them piece by piece to avoid running out of RAM.

This process is inspired by this Kaggle notebook: https://www.kaggle.com/code/cdeotte/candidate-rerank-model-lb-0-575

In [2]:
# Change into the repository checkout. The path is relative, so this only works while the
# kernel's working directory is the folder that contains the checkout; re-running the cell
# from inside the checkout will not find the folder again.
%cd kaggle-otto-recommender-2022/

/home/jupyter/kaggle-otto-recommender-2022


In [5]:
# !rm -rf train_candidate_features/

# Config 

In [33]:
from config import local, data_path, path_to_module

# sample_prop is forwarded to get_train / get_test when the events are loaded.
# NOTE(review): None presumably means "use every row" — confirm in otto_utils.
sample_prop = None
# validation is forwarded to get_train / get_test and also selects the output folder:
# './train_candidate_features' when True, './test_candidate_features' otherwise.
validation = True

In [34]:
local

False

In [35]:
data_path

'/home/jupyter/kaggle-otto-recommender-2022/data'

In [36]:
path_to_module

'/home/jupyter/kaggle-otto-recommender-2022'

In [37]:
# Choose the working directory (and, on Colab, the module path) for the current environment.
# NOTE(review): a truthy `local` selects the Google Colab / Drive branch, which reads as the
# opposite of what the name suggests — confirm the intended meaning in config.py.
if local:
  from google.colab import drive
  drive.mount('/content/drive')
  %cd /content/drive/MyDrive/'Kaggle Otto Reccommender'/data
  path_to_module = '/content/drive/MyDrive/Kaggle Otto Reccommender/'
else:
  # !mkdir /my_mnt_dir
  # !google-drive-ocamlfuse /my_mnt_dir
  # Work from the data directory taken from config; path_to_module keeps its config value.
  %cd {data_path}
  # path_to_module = '/home/jupyter/kaggle-otto-recommender-2022'

# Add the module directory to the import path so that project modules imported in later
# cells can be found even though the working directory is now the data folder.
import sys    
sys.path.append(path_to_module)

/home/jupyter/kaggle-otto-recommender-2022/data


In [38]:
# path_to_module

In [39]:
import glob
import numpy as np
import pandas as pd
import gc
from tqdm import tqdm
from otto_utils import get_train, get_test, convert_columns, make_directory
import os

In [40]:
!pip install fastparquet

[0m

In [41]:

# Output root depends on whether we are building validation (train) or submission (test) features.
if validation:
  path_to_candidate_features = './train_candidate_features'
else:
  path_to_candidate_features = './test_candidate_features'

# Create the root folder first, then one sub-folder per covisitation matrix.
make_directory(path_to_candidate_features)
for matrix_folder in ('covisitation_parquet', 'cart_order_parquet', 'also_buy_parquet'):
  make_directory(f'{path_to_candidate_features}/{matrix_folder}')

# Number of top co-visited items kept per item in the first matrix.
n = 20

In [42]:
path_to_candidate_features

'./train_candidate_features'

In [43]:
%pwd

'/home/jupyter/kaggle-otto-recommender-2022/data'

## Read data 

In [44]:
# Stack the train and test events into one frame; both loaders receive the same
# validation / sample_prop settings. The loaders are called inline so that no extra
# reference to the individual frames outlives the concat.
reduced_df = pd.concat(
    [loader(validation=validation, sample_prop=sample_prop) for loader in (get_train, get_test)]
)
# Divide ts by 1000 and store it as int32 (the cast truncates the fractional part).
reduced_df['ts'] = (reduced_df['ts'] / 1000).astype('int32')

In [45]:
# Pass the combined events through the project helper convert_columns and keep its result.
# NOTE(review): presumably this casts columns to more compact dtypes — confirm in otto_utils.
reduced_df = convert_columns(reduced_df)

In [46]:
# Sorted unique session ids and article ids (aids) present in the data.
sessions = np.sort(reduced_df['session'].unique())
aids = np.sort(reduced_df['aid'].unique())

# The number of chunks scales with the row count relative to a fixed reference of
# 163,955,180 rows: 200 session chunks and 100 aid chunks at that size.
# NOTE(review): presumably the row count the chunk sizes were tuned on — confirm.
reference_rows = 163955180
# max(1, ...) keeps np.array_split valid on small or sampled data, where the scaled count
# would otherwise truncate to 0 and np.array_split would raise a ValueError.
n_session_chunks = max(1, int(reduced_df.shape[0] * 200 / reference_rows))
n_aid_chunks = max(1, int(reduced_df.shape[0] * 100 / reference_rows))

# Contiguous, sorted id ranges; later cells select rows by the min/max id of each chunk.
session_lists = [np_array.tolist() for np_array in np.array_split(sessions, n_session_chunks)]
aid_lists = [np_array.tolist() for np_array in np.array_split(aids, n_aid_chunks)]

# Fixed timestamp bounds (same unit as reduced_df['ts']) used to scale the recency weight.
# NOTE(review): hard-coded rather than derived from the data — confirm they cover the ts
# range of the split selected by `validation`.
max_ts = 1662328791
min_ts = 1659304800
diff_ts = max_ts - min_ts

In [47]:
# Keep only the 30 most recent events of every session, processing one session-id range
# at a time so that the sort never has to hold the whole frame.
latest_event_chunks = []
for session_ids in tqdm(session_lists):
  first_session, last_session = min(session_ids), max(session_ids)
  in_chunk = (reduced_df['session'] >= first_session) & (reduced_df['session'] <= last_session)
  # Newest event first within each session.
  ordered = (
      reduced_df[in_chunk]
      .sort_values(['session', 'ts'], ascending=[True, False])
      .reset_index(drop=True)
  )
  # 0 for the newest event of a session, 1 for the next one, and so on.
  recency_rank = ordered.groupby('session').cumcount()
  latest_event_chunks.append(ordered.loc[recency_rank < 30])
reduced_df = pd.concat(latest_event_chunks)
del latest_event_chunks

100% 209/209 [01:18<00:00,  2.66it/s]


In [51]:
reduced_df.shape

(115162296, 4)

## Matrix 1 

Build the clicks/cart/order to clicks/cart/order covisitation matrix weighted towards things happening more recently. 

In [6]:
! ls

10_lightgbm_inference.ipynb		8_evaulate_model.ipynb
11_combine_candidates_and_submit.ipynb	9_xgboost_inference.ipynb
1_get_data.ipynb			README.md
2_generate_covisitation_matrix.ipynb	__pycache__
3_generate_candidates_parquets.ipynb	config.py
4_merge_candidates.ipynb		data
5_add_features.ipynb			docs
6_train_xgboost.ipynb			kaggle.json
7_train_lightgbm.ipynb			otto_utils.py


In [7]:
ls data/train_candidate_features/covisitation_parquet/

wgt_covisitation_0_top20.parquet  wgt_covisitation_2_top20.parquet
wgt_covisitation_1_top20.parquet  wgt_covisitation_3_top20.parquet


In [8]:
! rm data/train_candidate_features/covisitation_parquet/*

In [49]:
# Matrix 1: recency-weighted covisitation.
# For every aid chunk, pair each event on an aid in the chunk with every other event of the
# same session, keep pairs of different aids less than 24 hours apart, count each
# (session, aid_x, aid_y) once, and sum a weight that grows linearly with ts_x
# (1 at min_ts, 4 at max_ts). The top `n` partners per aid_x are written to one parquet
# file per aid chunk.

# The session-range bounds are the same for every aid chunk, so compute them once.
session_bounds = [(min(session_list), max(session_list)) for session_list in session_lists]

for i, aid_list in enumerate(tqdm(aid_lists)):
  aid_lo, aid_hi = min(aid_list), max(aid_list)
  tmp_list = []

  for session_lo, session_hi in session_bounds:
    df = reduced_df[(reduced_df['session'] >= session_lo) & (reduced_df['session'] <= session_hi)]
    # Self-join on session: left side restricted to the aids of this chunk.
    tmp = (
        df.loc[(df['aid'] >= aid_lo) & (df['aid'] <= aid_hi)]
        .merge(df,
              how = 'inner',
              on = 'session')
    )
    tmp = (
        tmp
        .loc[ ((tmp.ts_x - tmp.ts_y).abs() < 24 * 60 * 60) & (tmp.aid_x != tmp.aid_y) ]
        .drop_duplicates(['session', 'aid_x', 'aid_y'])
        # assign() builds the column on a fresh frame instead of writing into a filtered slice.
        .assign(wgt=lambda pairs: 1 + 3 * (pairs.ts_x - min_ts) / diff_ts)
    )
    tmp = (
        tmp.groupby(['aid_x', 'aid_y'], as_index=False)
        .agg({'wgt' : 'sum'})
        .rename(columns={'wgt' : 'pairings'})
    )
    tmp['pairings'] = tmp['pairings'].astype('float32')
    tmp_list.append(tmp)

  # Combine the per-session-chunk partial sums (concatenated once only) and rank the
  # partners of every aid_x by total weight.
  out = (
      pd.concat(tmp_list)
      .groupby(['aid_x', 'aid_y'], as_index=False)
      .agg({'pairings' : 'sum'})
      .sort_values(by=['aid_x', 'pairings'], ascending=[True, False])
  )

  out['n'] = out.groupby(['aid_x']).cumcount() + 1
  # Filter and cast in one chain so no column is assigned on a filtered slice.
  out = out.loc[out['n'] <= n].astype({'aid_x': 'int32', 'aid_y': 'int32'})
  out.to_parquet(f'{path_to_candidate_features}/covisitation_parquet/wgt_covisitation_{i}_top{n}.parquet', index=False)
  del tmp_list, out

  4% 4/104 [02:21<58:52, 35.33s/it]


KeyboardInterrupt: 

Build the clicks/cart/order to clicks/cart/order covisitation matrix, weighted by the type of interaction so that carts and orders count for more than clicks.

## Covisitation Matrix 

In [None]:
# Matrix 2: type-weighted covisitation.
# Same self-join as the recency matrix (same session, different aids, less than 24 hours
# apart), but every pair is weighted by the event type of the paired (right-hand) item and
# repeated pairs within a session are all counted. The top 15 partners per aid_x are written
# to one parquet file per aid chunk.

# Weight of a pair, keyed by the event type of the right-hand item (type_y).
type_weight_map = {
    'clicks' : 1,
    'carts' : 6,
    'orders' : 3
}

# The session-range bounds are the same for every aid chunk, so compute them once.
session_bounds = [(min(session_list), max(session_list)) for session_list in session_lists]

for i, aid_list in enumerate(tqdm(aid_lists)):
  aid_lo, aid_hi = min(aid_list), max(aid_list)
  tmp_list = []

  for session_lo, session_hi in session_bounds:
    df = reduced_df[(reduced_df['session'] >= session_lo) & (reduced_df['session'] <= session_hi)]
    # Self-join on session: left side restricted to the aids of this chunk.
    tmp = (
        df.loc[(df['aid'] >= aid_lo) & (df['aid'] <= aid_hi)]
        .merge(df,
              how = 'inner',
              on = 'session')
    )
    tmp = (
        tmp
        .loc[ ((tmp.ts_x - tmp.ts_y).abs() < 24 * 60 * 60) & (tmp.aid_x != tmp.aid_y) ]
        # assign() builds the column on a fresh frame instead of writing into a filtered slice.
        .assign(wgt=lambda pairs: pairs['type_y'].map(type_weight_map))
    )
    tmp = (
        tmp.groupby(['aid_x', 'aid_y'], as_index=False)
        .agg({'wgt' : 'sum'})
        .rename(columns={'wgt' : 'pairings'})
    )
    tmp['pairings'] = tmp['pairings'].astype('int32')
    tmp_list.append(tmp)

  # Combine the per-session-chunk partial sums (concatenated once only) and rank the
  # partners of every aid_x by total weight.
  out = (
      pd.concat(tmp_list)
      .groupby(['aid_x', 'aid_y'], as_index=False)
      .agg({'pairings' : 'sum'})
      .sort_values(by=['aid_x', 'pairings'], ascending=[True, False])
  )

  out['n'] = out.groupby(['aid_x']).cumcount() + 1
  # Filter and cast in one chain so no column is assigned on a filtered slice.
  out = out.loc[out['n'] <= 15].astype({'aid_x': 'int32', 'aid_y': 'int32'})
  out.to_parquet(f'{path_to_candidate_features}/cart_order_parquet/cart_order_top15_{i}.parquet', index=False)

  del tmp_list, out

Build the covisitation matrix of what people carted/ordered alongside other carts and orders, e.g. those who bought x also bought...

In [None]:
%pwd

In [None]:
! ls

## Also Buy 

In [None]:
# Re-chunk with far fewer, larger chunks for the also-buy matrix: 3 session chunks and
# 10 aid chunks per 163,955,180 rows of reduced_df.
# max(1, ...) keeps np.array_split valid on small or sampled data, where the scaled count
# would otherwise truncate to 0 and np.array_split would raise a ValueError.
n_session_chunks = max(1, int(reduced_df.shape[0] * 3 / 163955180))
n_aid_chunks = max(1, int(reduced_df.shape[0] * 10 / 163955180))
session_lists = [np_array.tolist() for np_array in np.array_split(np.array(sessions), n_session_chunks)]
aid_lists = [np_array.tolist() for np_array in np.array_split(np.array(aids), n_aid_chunks)]

In [None]:
# Matrix 3: "also buy".
# Uses cart and order events only. For every aid chunk, pair each event on an aid in the
# chunk with every other cart/order event of the same session, keep pairs of different aids
# less than 14 days apart, and count the pairs. The top 15 partners per aid_x are written to
# one parquet file per aid chunk.

# The carts/orders filter and the session-range bounds are the same for every aid chunk,
# so apply and compute them once.
cart_order_events = reduced_df.loc[reduced_df['type'].isin(['carts', 'orders'])]
session_bounds = [(min(session_list), max(session_list)) for session_list in session_lists]

for i, aid_list in enumerate(tqdm(aid_lists)):
  aid_lo, aid_hi = min(aid_list), max(aid_list)
  tmp_list = []

  for session_lo, session_hi in session_bounds:
    df = cart_order_events[
        (cart_order_events['session'] >= session_lo) & (cart_order_events['session'] <= session_hi)
    ]
    # Self-join on session: left side restricted to the aids of this chunk.
    tmp = (
        df.loc[(df['aid'] >= aid_lo) & (df['aid'] <= aid_hi)]
        .merge(df,
              how = 'inner',
              on = 'session')
    )
    tmp = (
        tmp
        .loc[ ((tmp.ts_x - tmp.ts_y).abs() < 14 * 24 * 60 * 60) & (tmp.aid_x != tmp.aid_y) ]
        # Every pair counts once; assign() avoids writing into a filtered slice.
        .assign(wgt=1)
    )
    tmp = (
        tmp.groupby(['aid_x', 'aid_y'], as_index=False)
        .agg({'wgt' : 'sum'})
        .rename(columns={'wgt' : 'pairings'})
    )
    tmp['pairings'] = tmp['pairings'].astype('int32')
    tmp_list.append(tmp)

  # Combine the per-session-chunk partial counts (concatenated once only) and rank the
  # partners of every aid_x by total count.
  out = (
      pd.concat(tmp_list)
      .groupby(['aid_x', 'aid_y'], as_index=False)
      .agg({'pairings' : 'sum'})
      .sort_values(by=['aid_x', 'pairings'], ascending=[True, False])
  )

  out['n'] = out.groupby(['aid_x']).cumcount() + 1
  # Filter and cast in one chain so no column is assigned on a filtered slice.
  out = out.loc[out['n'] <= 15].astype({'aid_x': 'int32', 'aid_y': 'int32'})
  out.to_parquet(f'{path_to_candidate_features}/also_buy_parquet/also_buy_top15_{i}.parquet', index=False)

  del tmp_list, out

del cart_order_events

In [None]:
path_to_candidate_features