# Generate Candidates #
We take candidates from a range of sources:
* Items already interacted with in the session
* Our covisitation matrices
* A word2vec model
* An ALS recommender


In [1]:
from config import data_path, path_to_module
print(f"path to module: {path_to_module}")
print(f"data path: {data_path}")
import sys   
sys.path.append(path_to_module)
%cd {data_path}
%pwd

path to module: /home/jupyter/kaggle-otto-recommender-2022
data path: /home/jupyter/kaggle-otto-recommender-2022/data
/home/jupyter/kaggle-otto-recommender-2022/data


'/home/jupyter/kaggle-otto-recommender-2022/data'

In [2]:
!pip install implicit
!pip install Annoy
!pip install fastparquet

[0mCollecting fastparquet
  Downloading fastparquet-0.8.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting cramjam>=2.3.0
  Downloading cramjam-2.6.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m122.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cramjam, fastparquet
Successfully installed cramjam-2.6.2 fastparquet-0.8.1
[0m

In [3]:
import glob
import numpy as np
import pandas as pd
import gc
import os
import seaborn as sns
from otto_utils import get_train, get_test, convert_columns, save_parquet, make_directory, create_sub
from tqdm import tqdm
import scipy.sparse as sps
import implicit
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from annoy import AnnoyIndex


  f"CUDA extension is built, but disabling GPU support because of '{e}'",


In [4]:
# Run configuration for the whole notebook.
# sample_prop is forwarded to get_test/get_train; presumably a sampling fraction
# where None means "use all sessions" -- TODO confirm in otto_utils.
sample_prop = None
# Forwarded to get_test/get_train and used below to pick the output directory.
validation = True
# Section switches: each flag gates the candidate-generation cell of the same name.
covisitation = True
cart_order = True
also_buy = True
als = True
word2vec = True

# Root directory that every candidate source reads its inputs from and writes its output to.
path_to_candidate_features = './train_candidate_features' if validation else './test_candidate_features'
# NOTE(review): not referenced by the cells visible here (the functions below take
# their own `n` argument) -- confirm before relying on or removing it.
n=20

In [5]:
# Sessions to generate candidates for. The event 'type' column is exposed as
# 'source', and a constant 'joiner' column (always 1) is attached.
reduced_df = (
    get_test(validation, sample_prop)
    .rename(columns={'type': 'source'})
    .assign(joiner=1)
)

In [6]:
reduced_df.shape

(7683577, 5)

## Calculate the candidates from items already in the basket ##

In [8]:
%pwd

'/home/jupyter/kaggle-otto-recommender-2022/data'

In [7]:
path_to_candidate_features

'./train_candidate_features'

In [9]:
# Basket candidates: every aid the session has already interacted with, scored by
# recency and event type, then ranked within the session.
#
# Work on a copy of just the columns needed here. Binding `training_skeleton`
# directly to `reduced_df` made both names refer to the same frame, so the helper
# columns below leaked into `reduced_df`, which every later section reuses.
training_skeleton = reduced_df[['session', 'aid', 'source']].copy()
# Position-based weight growing from 2**0.1 - 1 to 2**1 - 1 across the rows of a session.
# NOTE(review): this only acts as a recency weight if rows are in chronological
# order within each session -- confirm against get_test.
training_skeleton['time_weight'] = training_skeleton.groupby('session')['aid'].transform(lambda x: np.logspace(0.1, 1, x.shape[0], base=2, endpoint=True)) - 1 # Changing this from 0.1 to 0.5 impacts recall by 0.01
# Event-type multiplier: 1 by default, 3 for carts, 6 for orders.
training_skeleton['type_weight'] = 1
training_skeleton.loc[training_skeleton.source == 'carts', 'type_weight'] = 3
training_skeleton.loc[training_skeleton.source == 'orders', 'type_weight'] = 6
training_skeleton['weight'] = training_skeleton['time_weight'] * training_skeleton['type_weight']
# One row per (session, aid): repeated interactions add their weights together.
training_skeleton = training_skeleton.groupby(['session', 'aid'], as_index=False).agg({'weight' : 'sum'})
training_skeleton.sort_values(by=['session', 'weight'], ascending=[True, False], inplace=True)
# n_basket is the 1-based rank of the aid within its session (1 = highest weight).
training_skeleton['n_basket'] = training_skeleton.groupby('session').cumcount() + 1
training_skeleton = convert_columns(training_skeleton)

save_parquet(training_skeleton, f'{path_to_candidate_features}/basket', files=100, split_column = 'session')

100% 100/100 [00:01<00:00, 66.26it/s]


## Calculate the candidates from the covisitation matrices ##

In [17]:
files = glob.glob(f'{path_to_candidate_features}/also_buy_parquet/*.pqt')

In [18]:
files

['./train_candidate_features/also_buy_parquet/top_15_buy2buy_v5_0.pqt']

In [20]:
%pwd

'/home/jupyter/kaggle-otto-recommender-2022/data'

In [23]:
%ls train_candidate_features/cart_order_parquet/

top_15_carts_orders_v5_0.pqt  top_15_carts_orders_v5_2.pqt
top_15_carts_orders_v5_1.pqt  top_15_carts_orders_v5_3.pqt


In [27]:
!pip install pyarrow

[0m

In [30]:
!pwd

/home/jupyter/kaggle-otto-recommender-2022/data


In [31]:
# ! ls also_buy

In [10]:
if covisitation:
  # Candidates from the click covisitation matrix. The columns used below are
  # aid_x (source aid), aid_y (paired aid) and pairings.
  files = sorted(glob.glob(f'{path_to_candidate_features}/covisitation_parquet/top_20_clicks_*_*.pqt'))
  if not files:
    raise FileNotFoundError(f'no covisitation shards found under {path_to_candidate_features}/covisitation_parquet')
  # Read shard by shard: pd.read_parquet documents a single path, and handing it a
  # list of paths fails on some pandas/pyarrow combinations.
  covisitation_matrix = convert_columns(pd.concat([pd.read_parquet(file) for file in files], ignore_index=True))
  for column in ['aid_x', 'aid_y']:
    covisitation_matrix[column] = covisitation_matrix[column].astype('int32')

  # Process the sessions in 10 contiguous id ranges to bound the size of each merge.
  sessions = reduced_df['session'].unique()
  sessions.sort()

  covisitation_list = []
  for session_block in tqdm(np.array_split(np.array(sessions), 10)):
    if session_block.size == 0:
      continue  # fewer than 10 distinct sessions: nothing falls in this range
    # `sessions` is sorted, so the first and last entries are the range bounds.
    chunk = reduced_df.loc[(reduced_df['session'] >= session_block[0]) & (reduced_df['session'] <= session_block[-1])]
    covisitation_options = (
        chunk.merge(
            covisitation_matrix,
            how='left',
            left_on = ['aid'],
            right_on = ['aid_x'])
        # Count each (session, source aid, paired aid) combination once.
        .drop_duplicates(subset=['session', 'aid_x', 'aid_y'], keep='last')
        # Rows without a pairing have a missing aid_y and drop out of the groupby.
        .groupby(['session', 'aid_y'], as_index=False)
        .agg({'ts': 'max', 'aid' : 'count', 'pairings' : 'sum'})
        # Rank by number of session aids pointing at the candidate, then latest ts, then pairings.
        .sort_values(by=['session', 'aid', 'ts', 'pairings'], ascending=[True, False, False, False])
        .drop(columns=['aid', 'ts'])
        .rename(columns={'aid_y' : 'aid'})
    )
    covisitation_options['n'] = covisitation_options.groupby('session').cumcount() + 1
    # Keep the top 150 per session; the rank column 'n' is saved with the candidates.
    # .copy() makes the later column assignments operate on an independent frame.
    covisitation_options = covisitation_options.loc[covisitation_options['n'] <= 150].copy()

    for column in ['aid', 'session']:
      covisitation_options[column] = covisitation_options[column].astype('int32')
    covisitation_list.append(covisitation_options)
    del chunk

  covisitation_options = pd.concat(covisitation_list)
  del covisitation_list

  save_parquet(covisitation_options, f'{path_to_candidate_features}/covisitation', files=100, split_column = 'session')

  del covisitation_options

ArrowInvalid: Error creating dataset. Could not read schema from './train_candidate_features/covisitation_parquet/top_20_clicks_v5_0.pqt': Could not open Parquet input source './train_candidate_features/covisitation_parquet/top_20_clicks_v5_0.pqt': Parquet magic bytes not found in footer. Either the file is corrupted or this is not a parquet file.. Is this a 'parquet' file?

In [None]:
if cart_order:
  # Candidates from the cart/order covisitation matrix. The columns used below are
  # aid_x (source aid), aid_y (paired aid) and pairings.
  files = sorted(glob.glob(f'{path_to_candidate_features}/cart_order_parquet/*'))
  if not files:
    raise FileNotFoundError(f'no covisitation shards found under {path_to_candidate_features}/cart_order_parquet')
  # Read shard by shard: pd.read_parquet documents a single path, and handing it a
  # list of paths fails on some pandas/pyarrow combinations.
  covisitation_matrix = convert_columns(pd.concat([pd.read_parquet(file) for file in files], ignore_index=True))
  for column in ['aid_x', 'aid_y']:
    covisitation_matrix[column] = covisitation_matrix[column].astype('int32')

  # Process the sessions in 10 contiguous id ranges to bound the size of each merge.
  sessions = reduced_df['session'].unique()
  sessions.sort()

  covisitation_list = []
  for session_block in tqdm(np.array_split(np.array(sessions), 10)):
    if session_block.size == 0:
      continue  # fewer than 10 distinct sessions: nothing falls in this range
    # `sessions` is sorted, so the first and last entries are the range bounds.
    chunk = reduced_df.loc[(reduced_df['session'] >= session_block[0]) & (reduced_df['session'] <= session_block[-1])]
    covisitation_options = (
        chunk.merge(
            covisitation_matrix,
            how='left',
            left_on = ['aid'],
            right_on = ['aid_x'])
        # Count each (session, source aid, paired aid) combination once.
        .drop_duplicates(subset=['session', 'aid_x', 'aid_y'], keep='last')
        # Rows without a pairing have a missing aid_y and drop out of the groupby.
        .groupby(['session', 'aid_y'], as_index=False)
        .agg({'ts': 'max', 'aid' : 'count', 'pairings' : 'sum'})
        # Rank by number of session aids pointing at the candidate, then latest ts, then pairings.
        .sort_values(by=['session', 'aid', 'ts', 'pairings'], ascending=[True, False, False, False])
        .drop(columns=['aid', 'ts'])
        .rename(columns={'aid_y' : 'aid'})
    )
    covisitation_options['n'] = covisitation_options.groupby('session').cumcount() + 1
    # Keep the top 150 per session; the rank column 'n' is saved with the candidates.
    # .copy() makes the later column assignments operate on an independent frame.
    covisitation_options = covisitation_options.loc[covisitation_options['n'] <= 150].copy()

    for column in ['aid', 'session']:
      covisitation_options[column] = covisitation_options[column].astype('int32')
    covisitation_list.append(covisitation_options)
    del chunk

  covisitation_options = pd.concat(covisitation_list)
  del covisitation_list

  save_parquet(covisitation_options, f'{path_to_candidate_features}/cart_order', files=100, split_column = 'session')

  del covisitation_options


In [32]:
also_buy

True

In [34]:
files

['./train_candidate_features/also_buy_parquet/top_15_buy2buy_v5_0.pqt']

In [37]:
pd.read_parquet(files)

ValueError: cannot construct a FileSource from a path without a FileSystem

Exception ignored in: 'pyarrow._dataset._make_file_source'
ValueError: cannot construct a FileSource from a path without a FileSystem


ArrowInvalid: Called Open() on an uninitialized FileSource

In [38]:
if also_buy:
  # Candidates from the buy-to-buy covisitation matrix, seeded only by the aids a
  # session carted or ordered. The columns used below are aid_x (source aid),
  # aid_y (paired aid) and pairings.
  files = sorted(glob.glob(f'{path_to_candidate_features}/also_buy_parquet/*'))
  if not files:
    raise FileNotFoundError(f'no covisitation shards found under {path_to_candidate_features}/also_buy_parquet')
  # Read shard by shard: pd.read_parquet documents a single path, and handing it a
  # list of paths fails on some pandas/pyarrow combinations.
  covisitation_matrix = convert_columns(pd.concat([pd.read_parquet(file) for file in files], ignore_index=True))
  for column in ['aid_x', 'aid_y']:
    covisitation_matrix[column] = covisitation_matrix[column].astype('int32')

  # Process the sessions in 10 contiguous id ranges to bound the size of each merge.
  sessions = reduced_df['session'].unique()
  sessions.sort()

  covisitation_list = []
  for session_block in tqdm(np.array_split(np.array(sessions), 10)):
    if session_block.size == 0:
      continue  # fewer than 10 distinct sessions: nothing falls in this range
    # `sessions` is sorted, so the first and last entries are the range bounds.
    chunk = reduced_df.loc[
        (reduced_df['session'] >= session_block[0])
        & (reduced_df['session'] <= session_block[-1])
        & (reduced_df['source'].isin(['carts', 'orders']))]
    covisitation_options = (
        chunk.merge(
            covisitation_matrix,
            how='left',
            left_on = ['aid'],
            right_on = ['aid_x'])
        # Count each (session, source aid, paired aid) combination once.
        .drop_duplicates(subset=['session', 'aid_x', 'aid_y'], keep='last')
        # Rows without a pairing have a missing aid_y and drop out of the groupby.
        .groupby(['session', 'aid_y'], as_index=False)
        .agg({'ts': 'max', 'aid' : 'count', 'pairings' : 'sum'})
        # Rank by number of session aids pointing at the candidate, then latest ts, then pairings.
        .sort_values(by=['session', 'aid', 'ts', 'pairings'], ascending=[True, False, False, False])
        .drop(columns=['aid', 'ts'])
        .rename(columns={'aid_y' : 'aid'})
    )
    covisitation_options['n'] = covisitation_options.groupby('session').cumcount() + 1
    # Keep the top 150 per session; the rank column 'n' is saved with the candidates.
    # .copy() makes the later column assignments operate on an independent frame.
    covisitation_options = covisitation_options.loc[covisitation_options['n'] <= 150].copy()

    for column in ['aid', 'session']:
      covisitation_options[column] = covisitation_options[column].astype('int32')
    covisitation_list.append(covisitation_options)
    del chunk

  covisitation_options = pd.concat(covisitation_list)
  del covisitation_list

  save_parquet(covisitation_options, f'{path_to_candidate_features}/also_buy', files=100, split_column = 'session')
  del covisitation_options

  0% 0/10 [00:00<?, ?it/s]


KeyError: "Column(s) ['pairings'] do not exist"

## Create the word2vec candidates

In [None]:
## Word2Vec functions:
def get_session_vector(df, w2vec):
  ''' returns the mean embedding of the distinct aids in df

  df    : frame with an `aid` column whose values are valid keys of `w2vec.wv`
  w2vec : object exposing the embeddings through `w2vec.wv[aid]`

  Each distinct aid contributes once, however often it occurs in df.
  Raises ValueError when df holds no aids (previously an UnboundLocalError).
  '''
  aids = df.aid.unique()
  if len(aids) == 0:
    raise ValueError('cannot build a session vector from an empty session')
  # Accumulate in order of first appearance; `vec + ...` allocates a new array,
  # so the stored embeddings are never modified.
  vec = w2vec.wv[aids[0]]
  for aid in aids[1:]:
    vec = vec + w2vec.wv[aid]
  return vec / len(aids)

def get_close_aids(df, w2vec, index, idx2aid, n=20):
  ''' returns the n indexed aids nearest to the mean embedding of the session in df

  The session vector comes from get_session_vector and the lookup is delegated
  to get_nearest_neighbours, whose frame is returned unchanged.
  '''
  return get_nearest_neighbours(get_session_vector(df, w2vec), index, idx2aid, n)

def get_nearest_neighbours(x, index, idx2aid, n=20):
  ''' returns a frame (aid, w2vec_dist) holding the n neighbours of vector x

  index   : object providing `get_nns_by_vector` (an AnnoyIndex in this notebook)
  idx2aid : mapping from the index's item numbers back to aids
  Rows keep the order in which the index returns them.
  '''
  neighbour_ids, neighbour_distances = index.get_nns_by_vector(x, n, search_k=-1, include_distances=True)
  return pd.DataFrame(data={
      'aid' : list(map(idx2aid.__getitem__, neighbour_ids)),
      'w2vec_dist' : neighbour_distances,
  })

def get_word2vec_recs(train, test, n=20):
  ''' returns up to n word2vec nearest-neighbour candidates for every test session

  train, test : event frames with `session` and `aid` columns
  n           : neighbours requested per session

  A word2vec model is trained on the aid sequences of all train and test
  sessions. Only aids whose test-set event count lies above the `pop_thresh`
  quantile are added to the Annoy index, so only those can be recommended.
  Each test session is then looked up by the mean vector of its distinct aids.

  Returns a frame with columns session, aid (int32), w2vec_dist and n, where n
  is the 1-based neighbour rank within the session.
  '''
  vector_size = 32
  epochs = 9
  sg = 1
  pop_thresh = 0.82415
  window = 8
  distance = 'angular'

  reduced_df = pd.concat([train, test[['session','aid']]])
  del train  # drops only this function's reference; the caller's frame is untouched
  # One "sentence" per session: its aids, as strings, in row order.
  sentences = reduced_df.groupby('session', as_index=False).agg({'aid' : lambda x: [str(i) for i in x.tolist()]}).rename(columns={'aid' : 'sentence'})
  sentences = sentences['sentence'].to_list()

  # NOTE(review): `size` and `iter` are the gensim < 4 keyword names; gensim 4
  # renamed them to `vector_size` and `epochs`.
  w2vec = Word2Vec(sentences=sentences, size=vector_size, iter = epochs, sg=sg, min_count=1, workers=14, window=window)

  index = AnnoyIndex(vector_size, distance)

  popular_aids = test.groupby('aid', as_index=False).agg({'session' : 'count'})
  popular_aids = popular_aids.loc[popular_aids['session'] > popular_aids['session'].quantile(pop_thresh)]
  popular_aid_list = popular_aids.aid.unique()

  # Annoy items are numbered 0..k-1; idx2aid translates them back to aid strings.
  idx2aid = {}
  for i, aid in enumerate(popular_aid_list):
    aid = str(aid)
    idx2aid[i] = aid
    index.add_item(i, w2vec.wv[aid])
  index.build(40)

  # The model's vocabulary is keyed by strings, so look sessions up with string aids.
  # No per-row vector column is materialised: get_close_aids reads the embeddings
  # straight from the model.
  reduced_test = test.assign(aid=test['aid'].astype('str'))

  reduced_test = reduced_test.groupby('session').apply(lambda x: get_close_aids(x, w2vec, index, idx2aid, n)).reset_index().drop(columns='level_1')
  reduced_test['aid'] = reduced_test['aid'].astype('int32')
  reduced_test['n'] = reduced_test.groupby('session').cumcount() + 1

  return reduced_test

In [None]:
if word2vec:
  # Train on the full training events plus the sessions in reduced_df, and
  # request 100 nearest-neighbour candidates per session.
  train = get_train(validation, sample_prop)
  word2vec_recs = get_word2vec_recs(train, reduced_df, 100)
  
  # Persist next to the other candidate sources, split on the session column.
  save_parquet(word2vec_recs, f'{path_to_candidate_features}/word2vec', files=100, split_column = 'session')

## Create the ALS candidates

In [None]:
## ALS functions
def get_items_to_exclude(reduced_df, proportion=0):
  ''' returns the item_ids that are least popular in the test set

  Popularity is the sum of `test_set_actions` over the rows whose `dataset` is
  'test'. An item is returned when its total is at or below the `proportion`
  quantile of those totals, so proportion=0 still returns the items tied for
  the minimum. The list is ordered by ascending item_id.
  '''
  test_rows = reduced_df[reduced_df['dataset'] == 'test']
  actions_per_item = test_rows.groupby('item_id')['test_set_actions'].sum()
  cutoff = actions_per_item.quantile(proportion)
  return actions_per_item[actions_per_item <= cutoff].index.tolist()

def get_users_to_keep(reduced_df, n=0):
  ''' returns the user_ids having at least n rows with a non-null aid

  The list is ordered by ascending user_id.
  '''
  interactions_per_user = reduced_df.groupby('user_id')['aid'].count()
  return interactions_per_user[interactions_per_user >= n].index.tolist()

def get_als_recommendations(train, test, n_recs=20):
  ''' returns n_recs ALS recommendations for every test session

  train, test : event frames with `session` and `aid` columns; the combined
                frame must also carry a `ts` column, which is dropped below
  n_recs      : recommendations requested per session

  Side effect: a `dataset` column is added to BOTH input frames in place.

  Returns a frame with columns session, aid, confidence and n, where n is the
  1-based position of the recommendation within its session.
  '''
  # Hyper-parameters of the ALS model and of the user/item filters.
  iterations = 2
  factors = 800
  regularization = 1.7050
  minimum_clicks = 22
  popularity_threshold = 0.10

  # NOTE(review): these assignments mutate the caller's frames.
  train['dataset'] = 'train'
  test['dataset'] = 'test'
  reduced_df = pd.concat([train, test])
  del train  # drops only this function's reference; the caller's frame is untouched

  # Moves the old index into an 'index' column (dropped further down).
  reduced_df.reset_index(inplace=True)

  # Category codes give contiguous integer ids for sessions (users) and aids (items).
  reduced_df['user'] = reduced_df['session'].astype('category')
  reduced_df['user_id'] = reduced_df['user'].cat.codes
  reduced_df['item'] = reduced_df['aid'].astype('category')
  reduced_df['item_id'] = reduced_df['item'].cat.codes
  reduced_df['test_set_actions'] = 0
  reduced_df.loc[reduced_df['dataset'] == 'test', 'test_set_actions'] = 1
  reduced_df = convert_columns(reduced_df)

  # The test users are taken to be the user_ids from (number of train sessions)
  # up to (number of sessions overall).
  # NOTE(review): this assumes train and test sessions are disjoint and that every
  # test session id sorts after every train session id -- confirm.
  test_indices_start = len(reduced_df.loc[reduced_df['dataset'] == 'train'].session.unique())
  test_indices_end = len(reduced_df.session.unique())
  # item_id -> original aid, used to translate the recommendations back.
  item_ids = {k: v for k, v in zip(reduced_df['item_id'], reduced_df['item'])}
  # NOTE(review): the loop variable shadows the builtin `id` inside the comprehension.
  validation_user_ids = [id for id in range(test_indices_start, test_indices_end)]
  reduced_df.drop(columns=['user', 'ts', 'index','item'], inplace=True)

  # One entry of 1 per event row, stored as int8.
  # NOTE(review): scipy sums duplicate (row, col) entries when converting COO to
  # CSR, so a session/item pair repeated more than 127 times would exceed int8 -- confirm.
  user_item = sps.coo_matrix(
      (np.ones(reduced_df.shape[0]), # We're using a matrix of ones, but using type weights or repurchase weights could help!
      (reduced_df['user_id'],
      reduced_df['item_id'])),
      dtype='int8'
    ).tocsr()

  model = implicit.als.AlternatingLeastSquares(
      iterations = iterations,
      factors=factors,
      regularization=regularization,
      dtype=np.float32
  )

  users_to_keep = get_users_to_keep(reduced_df, n=minimum_clicks)
  items_to_exclude = get_items_to_exclude(reduced_df, proportion=popularity_threshold)

  # Fit only on the rows of users with at least `minimum_clicks` events.
  user_item_train = user_item[users_to_keep, :]

  model.fit(user_item_train, show_progress=True)

  # Recommend for the test users from their own interaction rows. Items that are
  # unpopular in the test set are filtered out; already-seen items are allowed.
  args = {'userid' : validation_user_ids,
          'user_items' : user_item[validation_user_ids,:],
          'filter_items' : items_to_exclude,
          'filter_already_liked_items' : False,
          'recalculate_user' : True,
          'N' : n_recs
          }

  recs = model.recommend(**args)

  # recs[0] / recs[1] are stored as the 'aid' and 'confidence' columns; presumably
  # the recommended item ids and their scores, one row per user -- TODO confirm
  # against the installed implicit version.
  # NOTE(review): sessions are labelled in order of first appearance in the test
  # rows, which matches validation_user_ids only if that order is ascending -- confirm.
  recs = pd.DataFrame(data={'session' : reduced_df.loc[reduced_df['dataset'] == 'test']['session'].unique(),
                                'aid' : recs[0][:].tolist(),
                                  'confidence' : recs[1][:].tolist()})
  # Explode the per-session lists into one row per recommendation.
  recs = recs.set_index('session').apply(pd.Series.explode).reset_index()
  recs['aid'] = recs['aid'].map(item_ids)
  recs['n'] = recs.groupby('session').cumcount() + 1
  return recs

In [None]:
if als:
  # Only the session and aid columns of the training events are requested here.
  # NOTE: get_als_recommendations adds a 'dataset' column to both `train` and
  # `reduced_df` in place.
  train = get_train(validation, sample_prop, columns=['session', 'aid'])
  als_recs = get_als_recommendations(train, reduced_df, 200)

  # Persist next to the other candidate sources, split on the session column.
  save_parquet(als_recs, f'{path_to_candidate_features}/als', files=100, split_column = 'session')