In [2]:
!pip install pickle5

Collecting pickle5
  Downloading pickle5-0.0.12-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (256 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 256.4/256.4 kB 3.0 MB/s eta 0:00:00
Installing collected packages: pickle5
Successfully installed pickle5-0.0.12
[0m

In [3]:
import os
import pandas as pd

from datetime import datetime
from tqdm import tqdm

from collections import defaultdict
import math
import numpy as np
import random
import copy
from collections import Counter

In [4]:
# Kaggle input locations, defined once so the directory is not repeated per file.
OTTO_OPT_DIR = '../input/otto-full-optimized-memory-footprint'
OTTO_RAW_DIR = '../input/otto-recommender-system'

# pickle5 is a back-port that only installs on Python < 3.8. On newer
# interpreters the standard-library pickle is used instead, so the cell no
# longer fails with ImportError there.
try:
    import pickle5 as pickle
except ImportError:
    import pickle


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE(security): unpickling can execute arbitrary code. Only load files
    # that come from a dataset you trust.
    with open(path, "rb") as fh:
        return pickle.load(fh)


train_df = pd.read_parquet(os.path.join(OTTO_OPT_DIR, 'train.parquet'))
test_df = pd.read_parquet(os.path.join(OTTO_OPT_DIR, 'test.parquet'))

# Presumably lookups between event-type ids and event-type names (inferred
# from the file names; id2type is later indexed with 0, 1 and 2).
id2type = _load_pickle(os.path.join(OTTO_OPT_DIR, 'id2type.pkl'))
type2id = _load_pickle(os.path.join(OTTO_OPT_DIR, 'type2id.pkl'))

sample_sub_df = pd.read_csv(os.path.join(OTTO_RAW_DIR, 'sample_submission.csv'))

In [19]:
# Run-level settings. 'train_session_num' is the number of training sessions
# that get randomly sampled before any statistics are computed.
config = dict(
    train_session_num=3_000_000,
)

In [6]:
# Convert the item ids to int32 and then to their string form, in both frames.
for events in (train_df, test_df):
    events['aid'] = events['aid'].astype('int32').astype('str')

In [20]:
# Down-sample the training events to a fixed number of whole sessions.
all_sessions = train_df['session'].unique().tolist()

# random.sample raises ValueError when asked for more items than exist, so the
# requested size is capped at the number of available sessions.
sample_size = min(config['train_session_num'], len(all_sessions))

# A dedicated seeded generator makes the sample (and everything derived from
# it) reproducible across kernel restarts. config may optionally carry 'seed'.
train_session = random.Random(config.get('seed', 42)).sample(all_sessions, sample_size)

# isin() does one hashed membership test per row; this avoids handing a
# multi-million element list to the query-string evaluator.
train_df = train_df[train_df['session'].isin(train_session)].reset_index(drop=True)

In [8]:
# Add a 'time_stamp' column holding the calendar day ('YYYY-MM-DD') of each
# event; 'ts' is interpreted as seconds since the epoch.
for events in (train_df, test_df):
    event_time = pd.to_datetime(events['ts'], unit='s')
    events['time_stamp'] = event_time.dt.strftime('%Y-%m-%d')

In [9]:
def generate_pairs(df, top_n=50):
    """Count, for every item, which items immediately follow it.

    Events are ordered by ``ts`` within each session. Two consecutive events
    form an (item, next item) pair only when they share the same ``session``
    and the same ``time_stamp`` value, so pairs never span two sessions or two
    days.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``session``, ``ts``, ``aid`` and
        ``time_stamp``. The frame is not modified.
    top_n : int, default 50
        How many of the most frequent followers are kept per item.

    Returns
    -------
    dict
        ``{aid: Counter({next_aid: count, ...})}`` with at most ``top_n``
        entries per Counter. Items that are never followed by another event
        in the same session and day have no key.
    """
    ordered = df.sort_values(by=['session', 'ts'])

    # A grouped shift yields the following aid inside each (session, day) and
    # NaN for the last event of the group. This replaces building a string
    # key per row, mapping group sizes and ranking just to drop that last row.
    next_aid = ordered.groupby(['session', 'time_stamp'], sort=False)['aid'].shift(-1)
    pairs = ordered.assign(aid_next=next_aid)
    pairs = pairs[pairs['aid_next'].notna()]

    sim_aids = {}
    for aid, followers in pairs.groupby('aid')['aid_next']:
        # Re-wrap the truncated list in a Counter so callers can merge the
        # per-item counts with Counter.update().
        sim_aids[aid] = Counter(dict(Counter(followers).most_common(top_n)))
    return sim_aids

In [21]:
# Build the item -> following-item counts from the sampled training events.
# recommend() reads this module-level name.
sim_aids = generate_pairs(train_df)

In [31]:
def recommend(aids, popular_items, sim_lookup=None, top_k=20):
    """Return up to ``top_k`` distinct item ids recommended for one session.

    The list is filled in three stages, each stage skipping ids already chosen:

    1. the session's own items, de-duplicated, later positions in ``aids`` first;
    2. items that most often follow those items according to ``sim_lookup``;
    3. ``popular_items`` in the given order.

    Parameters
    ----------
    aids : list
        Item ids of the session, in the order the caller considers oldest to
        newest.
    popular_items : list
        Fallback item ids, most preferred first.
    sim_lookup : dict, optional
        ``{aid: Counter({next_aid: count})}``. Defaults to the notebook-level
        ``sim_aids`` table.
    top_k : int, default 20
        Maximum number of ids returned.

    Returns
    -------
    list
        At most ``top_k`` ids with no duplicates. Shorter only when all three
        sources together provide fewer than ``top_k`` distinct ids.
    """
    if sim_lookup is None:
        sim_lookup = sim_aids

    # dict.fromkeys keeps first-seen order, so walking the session backwards
    # gives unique ids with the latest ones first. Unlike a set this is
    # deterministic between runs, and repeated ids no longer use up several
    # of the top_k slots.
    recent_unique = list(dict.fromkeys(reversed(aids)))
    if len(recent_unique) >= top_k:
        return recent_unique[:top_k]

    seen = set(recent_unique)
    candidates = Counter()
    for aid in recent_unique:
        candidates.update(sim_lookup.get(aid, Counter()))

    # Fewer than top_k ids are seen at this point, so 2 * top_k candidates
    # always leave enough unseen ones to fill the remaining slots if they exist.
    ranked = [aid2 for aid2, _ in candidates.most_common(2 * top_k) if aid2 not in seen]
    final_rec_list = recent_unique + ranked[:top_k - len(recent_unique)]

    if len(final_rec_list) < top_k:
        seen.update(final_rec_list)
        for aid in popular_items:
            if aid in seen:
                continue
            final_rec_list.append(aid)
            seen.add(aid)
            if len(final_rec_list) == top_k:
                break
    return final_rec_list

In [32]:
# Collect each test session's item ids as one list. Rows are ordered by event
# type first and timestamp second, so the tail of every list holds the events
# with the highest type id.
test_df = test_df.sort_values(["session", "type", "ts"])
test_session_dict = test_df.groupby('session')['aid'].agg(list).to_dict()

# Fallback items: every training aid, most frequent first.
popular_items = train_df['aid'].value_counts().index.tolist()

session_id_list = []
item_id_list = []
for session_id, session_item_list in tqdm(test_session_dict.items()):
    session_id_list.append(session_id)
    item_id_list.append(list(recommend(session_item_list, popular_items)))

# One row per session; labels are the recommended ids joined by single spaces.
res_df = pd.DataFrame()
res_df['session_type'] = session_id_list
res_df['labels'] = [' '.join(map(str, labels)) for labels in item_id_list]

100%|██████████| 1671803/1671803 [00:48<00:00, 34583.58it/s]


In [33]:
# Repeat the per-session predictions once for each event type id (0, 1, 2),
# rewriting the session id as "<session>_<id2type entry>".
# NOTE: res_df is overwritten below, so running this cell twice appends the
# suffix twice.
res_list = []
for type_ in [0, 1, 2]:
    suffix = id2type[type_]
    tagged = res_df['session_type'].map(lambda sid: '{}_{}'.format(sid, suffix))
    res_list.append(res_df.assign(session_type=tagged))
res_df = pd.concat(res_list, axis=0)

In [34]:
# Write the predictions to 'baseline.csv' in the working directory, leaving
# out the DataFrame index.
res_df.to_csv('baseline.csv',index=False)