In [1]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
personal_access_token = user_secrets.get_secret("personal_access_token")

!rm -rf /kaggle/working/kaggle_otto
!git clone -b covis-matrix https://$personal_access_token@github.com/coffeemountain/kaggle_otto.git
    
import sys
sys.path.append('/kaggle/working/kaggle_otto/src')

from covis_matrix_generator import *

Cloning into 'kaggle_otto'...
remote: Enumerating objects: 110, done.[K
remote: Counting objects: 100% (110/110), done.[K
remote: Compressing objects: 100% (89/89), done.[K
remote: Total 110 (delta 51), reused 60 (delta 17), pack-reused 0[K
Receiving objects: 100% (110/110), 63.61 KiB | 1.16 MiB/s, done.
Resolving deltas: 100% (51/51), done.


In [2]:
# Toggle forwarded to the generator as `use_gpu`.
USE_GPU = True
# `WeightFuncMixin` and `CovisMatrixGenerator` come from the star import of
# `covis_matrix_generator`. Presumably the mixin provides the weighting
# functions selected by `Config.weight_func` — TODO confirm against the module.
weight_func_mixin = WeightFuncMixin()
# A single generator instance is reused by every `generate(...)` call below.
covis_matrix_generator = CovisMatrixGenerator(weight_func_mixin, use_gpu=USE_GPU)

In [None]:
# Every file inside any `*_parquet` directory of the chunked OTTO dataset.
# This list is reused by the later `generate(...)` calls as well.
files = glob.glob('../input/otto-chunk-data-inparquet-format/*_parquet/*')

# Matrix built from all three event types (0/1/2 = clicks/carts/orders, see
# `type_labels`).
# NOTE(review): despite the `click` naming, the parquet parts under
# `/kaggle/working/click` are later loaded as `top_20_buys` and consumed by
# `suggest_buys`, not by `suggest_clicks`.
click_config = Config(
    target_types=[0, 1, 2],  # clicks, carts and orders
    weight_func='type_weight_1_6_3',  # presumably weights clicks/carts/orders as 1/6/3 — confirm in WeightFuncMixin
    min_event_threshold=30,  # semantics defined by Config; not visible in this notebook
    max_sec_threshold=24 * 60 * 60,  # 86,400 — one day, assuming the unit is seconds as the name suggests
    save_topk=15,  # presumably the number of neighbours kept per aid — TODO confirm
    output_dir='click'
)
covis_matrix_generator.generate(click_config, files)

In [None]:
# Matrix restricted to carts (1) and orders (2). The parts under
# `/kaggle/working/buy2buy` are later loaded as `top_20_buy2buy`.
buy_config = Config(
    target_types=[1, 2],  # carts and orders only
    weight_func='type_weight_1_1_1',  # presumably equal weight for every event type — confirm in WeightFuncMixin
    min_event_threshold=30,  # semantics defined by Config; not visible in this notebook
    max_sec_threshold=14 * 24 * 60 * 60,  # 1,209,600 — fourteen days, assuming the unit is seconds
    save_topk=15,  # presumably the number of neighbours kept per aid — TODO confirm
    output_dir='buy2buy'
)
covis_matrix_generator.generate(buy_config, files)

In [None]:
# Matrix over all three event types using the `time_weight_v1` weighting. The
# parts under `/kaggle/working/click_timeweight` are later loaded as
# `top_20_clicks` and consumed by `suggest_clicks`.
click_timeweight_config = Config(
    target_types=[0, 1, 2],  # clicks, carts and orders
    weight_func='time_weight_v1',  # presumably a timestamp-based weight — confirm in WeightFuncMixin
    min_event_threshold=30,  # semantics defined by Config; not visible in this notebook
    max_sec_threshold=24 * 60 * 60,  # 86,400 — one day, assuming the unit is seconds
    save_topk=20,  # presumably the number of neighbours kept per aid — TODO confirm
    output_dir='click_timeweight',
)
covis_matrix_generator.generate(click_timeweight_config, files)

In [None]:
# Drop the notebook's reference to the generator now that all three matrices
# have been generated; presumably this lets its (GPU) memory be reclaimed
# before the test data is loaded — TODO confirm it holds no other references.
del covis_matrix_generator

In [None]:
# Integer codes used for the event-type strings throughout the notebook.
type_labels = {'clicks':0, 'carts':1, 'orders':2}

def load_test():
    """Load every test parquet chunk into one DataFrame.

    Each chunk's `ts` column is divided by 1000 and stored as int32
    (presumably milliseconds to seconds), and its `type` strings are replaced
    by the int8 codes in `type_labels`.

    Returns:
        A DataFrame with a fresh 0..n-1 index holding all chunks, concatenated
        in sorted file-name order.
    """
    # Sort the paths so the row order does not depend on filesystem listing order.
    chunk_files = sorted(glob.glob('../input/otto-chunk-data-inparquet-format/test_parquet/*'))
    dfs = []
    for chunk_file in chunk_files:
        chunk = pd.read_parquet(chunk_file)
        chunk['ts'] = (chunk['ts'] / 1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
    return pd.concat(dfs).reset_index(drop=True)

test_df = load_test()
print('Test data has shape',test_df.shape)
test_df.head()

In [None]:
%%time

def pqt_to_dict(df):
    """Return a dict mapping each `aid_x` to the list of `aid_y` values on its rows."""
    partners_by_aid = df.groupby('aid_x')['aid_y'].apply(list)
    return partners_by_aid.to_dict()

# Number of `part_{k}.pqt` files read from each matrix directory.
N_COVIS_PARTS = 4

def _load_covis_matrix(path_to_dir, n_parts=N_COVIS_PARTS):
    """Merge `part_0.pqt` .. `part_{n_parts-1}.pqt` of one directory into a single dict."""
    merged = {}
    for k in range(n_parts):
        merged.update(pqt_to_dict(pd.read_parquet(f'{path_to_dir}/part_{k}.pqt')))
    return merged

# LOAD THREE CO-VISITATION MATRICES
# NOTE: the `click` directory feeds `top_20_buys` and `click_timeweight` feeds
# `top_20_clicks`; this mirrors how the matrices are consumed further down.
top_20_buys = _load_covis_matrix('/kaggle/working/click')
top_20_buy2buy = _load_covis_matrix('/kaggle/working/buy2buy')
top_20_clicks = _load_covis_matrix('/kaggle/working/click_timeweight')

# TOP CLICKS AND ORDERS IN TEST
# `load_test` has already mapped `type` to the integer codes in `type_labels`,
# so the filter must compare against those codes. Comparing against the strings
# 'clicks' / 'orders' matches nothing and leaves both fallbacks empty.
top_clicks = test_df.loc[test_df['type']==type_labels['clicks'],'aid'].value_counts().index.values[:20]
top_orders = test_df.loc[test_df['type']==type_labels['orders'],'aid'].value_counts().index.values[:20]

print('Here are size of our 3 co-visitation matrices:')
print( len( top_20_clicks ), len( top_20_buy2buy ), len( top_20_buys ) )

In [None]:
# Per-event-type multipliers keyed by the integer type codes
# (0 = clicks, 1 = carts, 2 = orders).
type_weight_multipliers = {0: 1, 1: 6, 2: 3}

def suggest_clicks(df):
    """Recommend up to 20 aids for the next click of one session.

    Args:
        df: the rows of a single session with `aid` and integer `type`
            columns. Later rows are treated as more recent; the caller sorts
            by `ts` before grouping.

    Returns:
        A list of at most 20 aids. Sessions with 20 or more distinct aids are
        answered purely from their own history, re-ranked by recency and event
        type. Shorter sessions return their history (most recent first), then
        candidates from `top_20_clicks`, then the globally popular
        `top_clicks`.
    """
    # USER HISTORY AIDS AND TYPES
    aids = df['aid'].tolist()
    types = df['type'].tolist()
    # Distinct aids, most recent first.
    unique_aids = list(dict.fromkeys(aids[::-1]))
    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids) >= 20:
        # Weights grow from 2**0.1 - 1 to 1 across the session, so later
        # events count more.
        weights = np.logspace(0.1, 1, len(aids), base=2, endpoint=True) - 1
        aids_temp = Counter()
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid, w, t in zip(aids, weights, types):
            aids_temp[aid] += w * type_weight_multipliers[t]
        return [aid for aid, _ in aids_temp.most_common(20)]
    # USE "CLICKS" CO-VISITATION MATRIX
    aids2 = list(itertools.chain.from_iterable(
        top_20_clicks[aid] for aid in unique_aids if aid in top_20_clicks
    ))
    # RERANK CANDIDATES
    seen = set(unique_aids)
    top_aids2 = [aid2 for aid2, _ in Counter(aids2).most_common(20) if aid2 not in seen]
    result = unique_aids + top_aids2[:20 - len(unique_aids)]
    # USE TOP20 TEST CLICKS
    # Skip popular aids that are already recommended so no slot is spent on a
    # duplicate.
    seen.update(result)
    padding = [aid for aid in top_clicks if aid not in seen]
    return result + padding[:20 - len(result)]

def suggest_buys(df):
    """Recommend up to 20 aids for the next cart/order of one session.

    Args:
        df: the rows of a single session with `aid` and integer `type`
            columns (1 = carts, 2 = orders). Later rows are treated as more
            recent; the caller sorts by `ts` before grouping.

    Returns:
        A list of at most 20 aids. Sessions with 20 or more distinct aids are
        re-ranked from their own history, with a small boost for buy2buy
        neighbours of items already carted or ordered. Shorter sessions
        return their history (most recent first), then candidates from
        `top_20_buys` and `top_20_buy2buy`, then the globally popular
        `top_orders`.
    """
    # USER HISTORY AIDS AND TYPES
    aids = df['aid'].tolist()
    types = df['type'].tolist()
    # UNIQUE AIDS AND UNIQUE BUYS
    unique_aids = list(dict.fromkeys(aids[::-1]))
    buy_rows = df.loc[(df['type'] == 1) | (df['type'] == 2)]
    unique_buys = list(dict.fromkeys(buy_rows['aid'].tolist()[::-1]))
    # "BUY2BUY" CO-VISITATION CANDIDATES, needed by both branches below.
    aids3 = list(itertools.chain.from_iterable(
        top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy
    ))
    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids) >= 20:
        # Weights grow from 2**0.5 - 1 to 1 across the session, so later
        # events count more.
        weights = np.logspace(0.5, 1, len(aids), base=2, endpoint=True) - 1
        aids_temp = Counter()
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid, w, t in zip(aids, weights, types):
            aids_temp[aid] += w * type_weight_multipliers[t]
        # RERANK CANDIDATES USING "BUY2BUY" CO-VISITATION MATRIX
        for aid in aids3:
            aids_temp[aid] += 0.1
        return [aid for aid, _ in aids_temp.most_common(20)]
    # USE "CART ORDER" CO-VISITATION MATRIX
    aids2 = list(itertools.chain.from_iterable(
        top_20_buys[aid] for aid in unique_aids if aid in top_20_buys
    ))
    # RERANK CANDIDATES
    seen = set(unique_aids)
    top_aids2 = [aid2 for aid2, _ in Counter(aids2 + aids3).most_common(20) if aid2 not in seen]
    result = unique_aids + top_aids2[:20 - len(unique_aids)]
    # USE TOP20 TEST ORDERS
    # Skip popular aids that are already recommended so no slot is spent on a
    # duplicate.
    seen.update(result)
    padding = [aid for aid in top_orders if aid not in seen]
    return result + padding[:20 - len(result)]

In [None]:
%%time

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools

# Sort and group the test events once and share the grouping between both
# predictors; the original repeated the identical sort and groupby for each.
# Within a session the rows reach the suggest_* functions ordered by `ts`.
sessions_by_time = test_df.sort_values(["session", "ts"]).groupby(["session"])

# One list of recommended aids per session, indexed by session id.
pred_df_clicks = sessions_by_time.apply(suggest_clicks)

# The same buy predictions are later used for both carts and orders.
pred_df_buys = sessions_by_time.apply(suggest_buys)

In [None]:
def _labels_frame(predictions, suffix):
    """Append `suffix` to every index label of `predictions` and return it as a frame.

    The suffixed index becomes the first column after `reset_index()`; the
    prediction lists land in a `labels` column.
    """
    tagged = predictions.add_suffix(suffix)
    return pd.DataFrame(tagged, columns=["labels"]).reset_index()

# Clicks use the click predictions; orders and carts both reuse the buy predictions.
clicks_pred_df = _labels_frame(pred_df_clicks, "_clicks")
orders_pred_df = _labels_frame(pred_df_buys, "_orders")
carts_pred_df = _labels_frame(pred_df_buys, "_carts")

In [None]:
# Stack the per-type frames in the order clicks, orders, carts.
pred_df = pd.concat(
    [
        clicks_pred_df,
        orders_pred_df,
        carts_pred_df,
    ]
)
pred_df.columns = ["session_type", "labels"]
# Flatten each list of aids into a single space-separated string.
pred_df["labels"] = pred_df["labels"].apply(
    lambda aid_list: " ".join(str(aid) for aid in aid_list)
)
pred_df.to_csv("submission.csv", index=False)
pred_df.head()