In [1]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
personal_access_token = user_secrets.get_secret("personal_access_token")

!rm -rf /kaggle/working/kaggle_otto
!git clone https://$personal_access_token@github.com/coffeemountain/kaggle_otto.git
    
import sys
sys.path.append('/kaggle/working/kaggle_otto/src')

!pip install dataclasses_json
from covis_matrix_generator import *

Cloning into 'kaggle_otto'...
remote: Enumerating objects: 244, done.[K
remote: Counting objects: 100% (152/152), done.[K
remote: Compressing objects: 100% (77/77), done.[K
remote: Total 244 (delta 90), reused 119 (delta 69), pack-reused 92[K
Receiving objects: 100% (244/244), 269.12 KiB | 466.00 KiB/s, done.
Resolving deltas: 100% (123/123), done.
Collecting dataclasses_json
  Downloading dataclasses_json-0.5.7-py3-none-any.whl (25 kB)
Collecting marshmallow<4.0.0,>=3.3.0
  Downloading marshmallow-3.19.0-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.1/49.1 kB[0m [31m599.7 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting typing-inspect>=0.4.0
  Downloading typing_inspect-0.8.0-py3-none-any.whl (8.7 kB)
Collecting marshmallow-enum<2.0.0,>=1.5.1
  Downloading marshmallow_enum-1.5.1-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: typing-inspect, marshmallow, marshmallow-enum, dataclasses_json
Successfully inst

In [2]:
USE_GPU = True
weight_func_mixin = WeightFuncMixin()
cs_client = CloudStorageClient()
covis_matrix_generator = CovisMatrixGenerator(weight_func_mixin=weight_func_mixin, cloud_storage_client=cs_client, use_gpu=USE_GPU)

files = glob.glob('../input/otto-chunk-data-inparquet-format/*_parquet/*')

In [3]:
buy_data = covis_matrix_generator.load_or_generate(
    Config(
        target_types=[0, 1, 2], 
        weight_func='type_weight_1_6_3',
        min_event_threshold=30,
        max_sec_threshold=24 * 60 * 60,
        save_topk=15,
    ),
    files)

found same setting covis matrix. loading files ...


In [4]:
buy2buy_data = covis_matrix_generator.load_or_generate(
    Config(
        target_types=[1, 2],
        weight_func='type_weight_1_1_1',
        min_event_threshold=30,
        max_sec_threshold=14 * 24 * 60 * 60,
        save_topk=15,
        output_dir='buy2buy'
    ),
    files)

found same setting covis matrix. loading files ...


In [5]:
click_timeweight_data = covis_matrix_generator.load_or_generate(
    Config(
        target_types=[0, 1, 2],
        weight_func='time_weight_v1',
        min_event_threshold=30,
        max_sec_threshold=24 * 60 * 60,
        save_topk=20,
    ),
    files)

found same setting covis matrix. loading files ...


In [6]:
del covis_matrix_generator

In [7]:
type_labels = {'clicks':0, 'carts':1, 'orders':2}

def load_test():
    dfs = []
    for e, chunk_file in enumerate(glob.glob('../input/otto-chunk-data-inparquet-format/test_parquet/*')):
        chunk = pd.read_parquet(chunk_file)
        chunk.ts = (chunk.ts/1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
    return pd.concat(dfs).reset_index(drop=True) #.astype({"ts": "datetime64[ms]"})

test_df = load_test()
print('Test data has shape',test_df.shape)
test_df.head()

Test data has shape (6928123, 4)


Unnamed: 0,session,aid,ts,type
0,13099779,245308,1661795832,0
1,13099779,245308,1661795862,1
2,13099779,972319,1661795888,0
3,13099779,972319,1661795898,1
4,13099779,245308,1661795907,0


In [10]:
%%time

def pqt_to_dict(df):
    return df.groupby('aid_x').aid_y.apply(list).to_dict()

# LOAD THREE CO-VISITATION MATRICES
top_20_buys = {}
for df in buy_data:
    top_20_buys.update(pqt_to_dict(df))

top_20_buy2buy = {}
for df in buy2buy_data:
    top_20_buy2buy.update(pqt_to_dict(df))
    
top_20_clicks = {}
for df in click_timeweight_data:
    top_20_clicks.update(pqt_to_dict(df))

del buy_data, buy2buy_data, click_timeweight_data

print('Here are size of our 3 co-visitation matrices:')
print( len( top_20_buys ), len( top_20_buy2buy ), len( top_20_clicks ) )

Here are size of our 3 co-visitation matrices:
1837166 1168768 1837166
CPU times: user 1min 45s, sys: 3.79 s, total: 1min 49s
Wall time: 1min 50s


In [11]:
# TOP CLICKS AND ORDERS IN TEST
top_clicks = test_df.loc[test_df['type']=='clicks','aid'].value_counts().index.values[:20]
top_orders = test_df.loc[test_df['type']=='orders','aid'].value_counts().index.values[:20]

In [12]:
#type_weight_multipliers = {'clicks': 1, 'carts': 6, 'orders': 3}
type_weight_multipliers = {0: 1, 1: 6, 2: 3}

def suggest_clicks(df):
    # USER HISTORY AIDS AND TYPES
    aids=df.aid.tolist()
    types = df.type.tolist()
    unique_aids = list(dict.fromkeys(aids[::-1] ))
    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids)>=20:
        weights=np.logspace(0.1,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter() 
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid,w,t in zip(aids,weights,types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        sorted_aids = [k for k,v in aids_temp.most_common(20)]
        return sorted_aids
    # USE "CLICKS" CO-VISITATION MATRIX
    aids2 = list(itertools.chain(*[top_20_clicks[aid] for aid in unique_aids if aid in top_20_clicks]))
    # RERANK CANDIDATES
    top_aids2 = [aid2 for aid2, cnt in Counter(aids2).most_common(20) if aid2 not in unique_aids]    
    result = unique_aids + top_aids2[:20 - len(unique_aids)]
    # USE TOP20 TEST CLICKS
    return result + list(top_clicks)[:20-len(result)]

def suggest_buys(df):
    # USER HISTORY AIDS AND TYPES
    aids=df.aid.tolist()
    types = df.type.tolist()
    # UNIQUE AIDS AND UNIQUE BUYS
    unique_aids = list(dict.fromkeys(aids[::-1] ))
    df = df.loc[(df['type']==1)|(df['type']==2)]
    unique_buys = list(dict.fromkeys( df.aid.tolist()[::-1] ))
    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids)>=20:
        weights=np.logspace(0.5,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter() 
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid,w,t in zip(aids,weights,types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        # RERANK CANDIDATES USING "BUY2BUY" CO-VISITATION MATRIX
        aids3 = list(itertools.chain(*[top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy]))
        for aid in aids3: aids_temp[aid] += 0.1
        sorted_aids = [k for k,v in aids_temp.most_common(20)]
        return sorted_aids
    # USE "CART ORDER" CO-VISITATION MATRIX
    aids2 = list(itertools.chain(*[top_20_buys[aid] for aid in unique_aids if aid in top_20_buys]))
    # USE "BUY2BUY" CO-VISITATION MATRIX
    aids3 = list(itertools.chain(*[top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy]))
    # RERANK CANDIDATES
    top_aids2 = [aid2 for aid2, cnt in Counter(aids2+aids3).most_common(20) if aid2 not in unique_aids] 
    result = unique_aids + top_aids2[:20 - len(unique_aids)]
    # USE TOP20 TEST ORDERS
    return result + list(top_orders)[:20-len(result)]

In [13]:
%%time

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools

pred_df_clicks = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply(
    lambda x: suggest_clicks(x)
)

pred_df_buys = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply(
    lambda x: suggest_buys(x)
)

CPU times: user 37min 56s, sys: 16.1 s, total: 38min 12s
Wall time: 38min 21s


In [14]:
clicks_pred_df = pd.DataFrame(pred_df_clicks.add_suffix("_clicks"), columns=["labels"]).reset_index()
orders_pred_df = pd.DataFrame(pred_df_buys.add_suffix("_orders"), columns=["labels"]).reset_index()
carts_pred_df = pd.DataFrame(pred_df_buys.add_suffix("_carts"), columns=["labels"]).reset_index()

In [15]:
pred_df = pd.concat([clicks_pred_df, orders_pred_df, carts_pred_df])
pred_df.columns = ["session_type", "labels"]
pred_df["labels"] = pred_df.labels.apply(lambda x: " ".join(map(str,x)))
pred_df.to_csv("submission.csv", index=False)
pred_df.head()

Unnamed: 0,session_type,labels
0,12899779_clicks,59625 1253524 737445 438191 731692 1790770 942...
1,12899780_clicks,1142000 736515 973453 582732 1502122 889686 48...
2,12899781_clicks,918667 199008 194067 57315 141736 1460571 7594...
3,12899782_clicks,834354 595994 740494 889671 987399 779477 1344...
4,12899783_clicks,1817895 607638 1754419 1216820 1729553 300127 ...
