In [1]:
!pip install recbole

Collecting recbole
  Downloading recbole-1.0.1-py3-none-any.whl (2.0 MB)
     |████████████████████████████████| 2.0 MB 607 kB/s            
Collecting scipy==1.6.0
  Downloading scipy-1.6.0-cp37-cp37m-manylinux1_x86_64.whl (27.4 MB)
     |████████████████████████████████| 27.4 MB 51.3 MB/s            
Collecting colorlog==4.7.2
  Downloading colorlog-4.7.2-py2.py3-none-any.whl (10 kB)
Installing collected packages: scipy, colorlog, recbole
  Attempting uninstall: scipy
    Found existing installation: scipy 1.7.3
    Uninstalling scipy-1.7.3:
      Successfully uninstalled scipy-1.7.3
  Attempting uninstall: colorlog
    Found existing installation: colorlog 6.6.0
    Uninstalling colorlog-6.6.0:
      Successfully uninstalled colorlog-6.6.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
yellowbrick 1.3.post1 requires numpy<1.20,>=1.16.

In [2]:
import gc
import os
import numpy as np
import pandas as pd

import torch
import logging
from logging import getLogger
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.sequential_recommender import GRU4RecF, SASRecF, SASRec
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger
from recbole.utils.case_study import full_sort_topk
from recbole.data.interaction import Interaction

In [3]:
VALID = True
model_label = SASRecF

In [4]:
def apk(actual, predicted, k=12):
    if len(predicted)>k:
        predicted = predicted[:k]
        
    score = 0.0
    num_hits = 0.0
    
    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=12):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

# 1. Create atomic file

### 1.A create atomic of item features

In [5]:
df = pd.read_csv(r"/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv", dtype={'article_id': 'str'})

for col in df.columns:
    print(col, len(pd.unique(df[col])))

article_id 105542
product_code 47224
prod_name 45875
product_type_no 132
product_type_name 131
product_group_name 19
graphical_appearance_no 30
graphical_appearance_name 30
colour_group_code 50
colour_group_name 50
perceived_colour_value_id 8
perceived_colour_value_name 8
perceived_colour_master_id 20
perceived_colour_master_name 20
department_no 299
department_name 250
index_code 10
index_name 10
index_group_no 5
index_group_name 5
section_no 57
section_name 56
garment_group_no 21
garment_group_name 21
detail_desc 43405


In [6]:
df = df.drop(columns = ['product_type_name', 'graphical_appearance_name', 'colour_group_name', 'perceived_colour_value_name',
                        'perceived_colour_master_name', 'index_name', 'index_group_name', 'section_name', 
                        'garment_group_name', 'prod_name', 'department_name', 'detail_desc'])
temp = df.rename(
    columns={'article_id': 'item_id:token', 'product_code': 'product_code:token', 'product_type_no': 'product_type_no:float',
             'product_group_name': 'product_group_name:token_seq', 'graphical_appearance_no': 'graphical_appearance_no:token', 
             'colour_group_code': 'colour_group_code:token', 'perceived_colour_value_id': 'perceived_colour_value_id:token', 
             'perceived_colour_master_id': 'perceived_colour_master_id:token', 'department_no': 'department_no:token', 
             'index_code': 'index_code:token', 'index_group_no': 'index_group_no:token', 'section_no': 'section_no:token', 
             'garment_group_no': 'garment_group_no:token'})
temp.head()

Unnamed: 0,item_id:token,product_code:token,product_type_no:float,product_group_name:token_seq,graphical_appearance_no:token,colour_group_code:token,perceived_colour_value_id:token,perceived_colour_master_id:token,department_no:token,index_code:token,index_group_no:token,section_no:token,garment_group_no:token
0,108775015,108775,253,Garment Upper body,1010016,9,4,5,1676,A,1,16,1002
1,108775044,108775,253,Garment Upper body,1010016,10,3,9,1676,A,1,16,1002
2,108775051,108775,253,Garment Upper body,1010017,11,1,9,1676,A,1,16,1002
3,110065001,110065,306,Underwear,1010016,9,4,5,1339,B,1,61,1017
4,110065002,110065,306,Underwear,1010016,10,3,9,1339,B,1,61,1017


In [7]:
!mkdir /kaggle/working/recbox_data
temp.to_csv(r'/kaggle/working/recbox_data/recbox_data.item', index=False, sep='\t')

### 1.B create atomic of interation features

In [8]:
df = pd.read_csv(r"/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv", 
                 dtype={'article_id': 'str'})
df['t_dat'] = pd.to_datetime(df['t_dat'], format="%Y-%m-%d")
df['timestamp'] = df.t_dat.values.astype(np.int64) // 10 ** 9

**We fill with data in only 2020(timestamp > > 1585620000) and create inter file**
For anyone need instruction about inter file, please check below links:
* https://recbole.io/docs/user_guide/data_intro.html
* https://recbole.io/docs/user_guide/data/atomic_files.html

if you want a full of iterations without limiting timestamp, please check here:

In [9]:
if VALID:
    valid_time = df[df.t_dat=="2020-09-16"]["timestamp"].values[0]
    valid = df[df['timestamp'] >= valid_time].groupby('customer_id')['article_id'].apply(list).reset_index()
    valid = valid.rename({'article_id':'prediction'},axis=1)
    valid['prediction'] = valid.prediction.apply(lambda x: ' '.join([str(k) for k in x]))
    temp = df[(df['timestamp'] > 1585620000)][['customer_id', 'article_id', 'timestamp']].rename( # & (df['timestamp'] < valid_time)
    columns={'customer_id': 'user_id:token', 'article_id': 'item_id:token', 'timestamp': 'timestamp:float'})    
else:
    temp = df[df['timestamp'] > 1585620000][['customer_id', 'article_id', 'timestamp']].rename(
    columns={'customer_id': 'user_id:token', 'article_id': 'item_id:token', 'timestamp': 'timestamp:float'})    

In [10]:
df[df['timestamp'] >= valid_time].shape[0] / temp.shape[0]

0.03059662469616447

In [11]:
# We save atomic file in dataset format for using with recbole
temp.to_csv('/kaggle/working/recbox_data/recbox_data.inter', index=False, sep='\t')
del temp, df
gc.collect()

55

# Create dataset and train model with Recbole
link: https://recbole.io/docs/user_guide/usage/use_modules.html

In [12]:
# can change parameters here
parameter_dict = {
    'data_path': '/kaggle/working',
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'item_id',
    'TIME_FIELD': 'timestamp',
    'user_inter_num_interval': "[35,inf)",
    'item_inter_num_interval': "[40,inf)",
    'load_col': {'inter': ['user_id', 'item_id', 'timestamp'],
                 'item': ['item_id', 'product_code', 'product_type_no', 'product_group_name', 'graphical_appearance_no',
                      'colour_group_code', 'perceived_colour_value_id', 'perceived_colour_master_id',
                      'department_no', 'index_code', 'index_group_no', 'section_no', 'garment_group_no']
             },
    'selected_features': ['product_code', 'product_type_no', 'product_group_name', 'graphical_appearance_no',
                          'colour_group_code', 'perceived_colour_value_id', 'perceived_colour_master_id',
                          'department_no', 'index_code', 'index_group_no', 'section_no', 'garment_group_no'],
    'learning_rate': 0.005,
    'weight_decay': 1e-4,
    'neg_sampling': None,
    'epochs': 20,
    'metrics' : ['MRR', 'MAP'],
    'valid_metric': 'MAP@10',
    'eval_args': {
        'split': {'RS': [99.9694, 0.0306, 0]},
        'group_by': 'user',
        'order': 'TO',
        'mode': 'full'}
}

config = Config(model=str(model_label).split("'")[1].split(".")[-1], 
                dataset='recbox_data', 
                config_dict=parameter_dict)

# init random seed
init_seed(config['seed'], config['reproducibility'])

# logger initialization
init_logger(config)
logger = getLogger()
# Create handlers
c_handler = logging.StreamHandler()
c_handler.setLevel(logging.INFO)
logger.addHandler(c_handler)

# write config info into log
logger.info(config)


General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = /kaggle/working/recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 20
train_batch_size = 2048
learner = adam
learning_rate = 0.005
neg_sampling = None
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0001
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [99.9694, 0.0306, 0]}, 'group_by': 'user', 'order': 'TO', 'mode': 'full'}
repeatable = True
metrics = ['MRR', 'MAP']
topk = [10]
valid_metric = MAP@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_separator =  
USER_ID_FIELD = user_id
ITEM_ID_FIELD = item_id
RATING_FIELD = rating
TIME_FIELD = timestamp
seq_len = None
LABEL_FIELD = labe

In [13]:
dataset = create_dataset(config)
logger.info(dataset)

recbox_data
The number of users: 25039
Average actions of users: 53.40770029555076
The number of items: 9149
Average actions of items: 146.17643200699607
The number of inters: 1337222
The sparsity of the dataset: 99.41626880189104%
Remain Fields: ['user_id', 'item_id', 'timestamp', 'product_code', 'product_type_no', 'product_group_name', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id', 'perceived_colour_master_id', 'department_no', 'index_code', 'index_group_no', 'section_no', 'garment_group_no']


In [14]:
# dataset splitting
train_data, valid_data, test_data = data_preparation(config, dataset)

[Training]: train_batch_size = [2048] negative sampling: [None]
[Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [99.9694, 0.0306, 0]}, 'group_by': 'user', 'order': 'TO', 'mode': 'full'}]


In [15]:
# model loading and initialization
model = model_label(config, train_data.dataset).to(config['device'])
logger.info(model)

# trainer loading and initialization
trainer = Trainer(config, model)

# model training
if VALID:
    best_valid_score, best_valid_result = trainer.fit(train_data, valid_data)
else:
    best_valid_score, best_valid_result = trainer.fit(train_data)

SASRecF(
  (item_embedding): Embedding(9149, 64, padding_idx=0)
  (position_embedding): Embedding(50, 64)
  (feature_embed_layer): FeatureSeqEmbLayer(
    (token_embedding_table): ModuleDict(
      (item): FMEmbedding(
        (embedding): Embedding(4850, 64)
      )
    )
    (float_embedding_table): ModuleDict(
      (item): Embedding(1, 64)
    )
    (token_seq_embedding_table): ModuleDict(
      (item): ModuleList(
        (0): Embedding(16, 64)
      )
    )
  )
  (trm_encoder): TransformerEncoder(
    (layer): ModuleList(
      (0): TransformerLayer(
        (multi_head_attention): MultiHeadAttention(
          (query): Linear(in_features=64, out_features=64, bias=True)
          (key): Linear(in_features=64, out_features=64, bias=True)
          (value): Linear(in_features=64, out_features=64, bias=True)
          (softmax): Softmax(dim=-1)
          (attn_dropout): Dropout(p=0.5, inplace=False)
          (dense): Linear(in_features=64, out_features=64, bias=True)
          (Lay

# 4. Create recommendation result from trained model

I note document here for any one want to customize it: https://recbole.io/docs/user_guide/usage/case_study.html

In [16]:
external_user_ids = dataset.id2token(
    dataset.uid_field, list(range(dataset.user_num)))[1:]#fist element in array is 'PAD'(default of Recbole) ->remove it 

In [17]:
def add_last_item(old_interaction, last_item_id, max_len=50):
    new_seq_items = old_interaction['item_id_list'][-1]
    if old_interaction['item_length'][-1].item() < max_len:
        new_seq_items[old_interaction['item_length'][-1].item()] = last_item_id
    else:
        new_seq_items = torch.roll(new_seq_items, -1)
        new_seq_items[-1] = last_item_id
    return new_seq_items.view(1, len(new_seq_items))

def predict_for_all_item(external_user_id, dataset, model):
    model.eval()
    with torch.no_grad():
        uid_series = dataset.token2id(dataset.uid_field, [external_user_id])
        index = np.isin(dataset.inter_feat[dataset.uid_field].numpy(), uid_series)
        input_interaction = dataset[index]
        test = {
            'item_id_list': add_last_item(input_interaction, 
                                          input_interaction['item_id'][-1].item(), model.max_seq_length),
            'item_length': torch.tensor(
                [input_interaction['item_length'][-1].item() + 1
                 if input_interaction['item_length'][-1].item() < model.max_seq_length else model.max_seq_length])
        }
        new_inter = Interaction(test)
        new_inter = new_inter.to(config['device'])
        new_scores = model.full_sort_predict(new_inter)
        new_scores = new_scores.view(-1, test_data.dataset.item_num)
        new_scores[:, 0] = -np.inf  # set scores of [pad] to -inf
    return torch.topk(new_scores, 12)

In [18]:
topk_items = []
for external_user_id in external_user_ids:
    _, topk_iid_list = predict_for_all_item(external_user_id, dataset, model)
    last_topk_iid_list = topk_iid_list[-1]
    external_item_list = dataset.id2token(dataset.iid_field, last_topk_iid_list.cpu()).tolist()
    topk_items.append(external_item_list)
print(len(topk_items))

25038


In [19]:
external_item_str = [' '.join(x) for x in topk_items]
result = pd.DataFrame(external_user_ids, columns=['customer_id'])
result['prediction'] = external_item_str
result.head()

Unnamed: 0,customer_id,prediction
0,0064cd1ee810d4caabd1182a8f177479b82b18961bd76b...,0894956001 0918292001 0914537002 0730683050 08...
1,00d7ebd46f6a6d53630d41386b6ef6a505cdc4c80011ff...,0918522001 0924243001 0923758001 0918525001 08...
2,00eebac2c2e37626461e74e8395711964c4e01a7afa643...,0860498001 0824490001 0852773002 0896848001 08...
3,0109ad0b5a76924a1b58be677409bb601cc8bead9a87b8...,0901955001 0865587001 0861477001 0600886001 07...
4,013f00f9e218549246a3aa82b3f3a0c22a693bc25fa735...,0839402002 0771602001 0861731002 0865172003 08...


In [20]:
del external_item_str
del topk_items
del external_user_ids
del train_data
del valid_data
del test_data
del model
del Trainer
del logger
del dataset
gc.collect()

42

#  default recomendation for user who can not be predicted by sequential model.
I use this approach in notebook: https://www.kaggle.com/hervind/h-m-faster-trending-products-weekly You can check it for more detail information. I will juse copy only code here

In [21]:
def cust_blend(dt, W = [1,1,1]):
    #Global ensemble weights
    #W = [1.15,0.95,0.85]
    
    #Create a list of all model predictions
    REC = []
    REC.append(dt['prediction0'].split())
    REC.append(dt['prediction1'].split())
    REC.append(dt['prediction2'].split())
    
    #Create a dictionary of items recommended. 
    #Assign a weight according the order of appearance and multiply by global weights
    res = {}
    for M in range(len(REC)):
        for n, v in enumerate(REC[M]):
            if v in res:
                res[v] += (W[M]/(n+1))
            else:
                res[v] = (W[M]/(n+1))
    
    # Sort dictionary by item weights
    res = list(dict(sorted(res.items(), key=lambda item: -item[1])).keys())
    
    # Return the top 12 itens only
    return ' '.join(res[:12])

In [22]:
sub0 = pd.read_csv('../input/hm-pre-recommendation/submissio_byfone_chris.csv').sort_values('customer_id').reset_index(drop=True)
sub1 = pd.read_csv('../input/hm-pre-recommendation/submission_trending.csv').sort_values('customer_id').reset_index(drop=True)
sub2 = pd.read_csv('../input/hm-pre-recommendation/submission_exponential_decay.csv').sort_values('customer_id').reset_index(drop=True)

sub0.columns = ['customer_id', 'prediction0']
sub0['prediction1'] = sub1['prediction']
sub0['prediction2'] = sub2['prediction']
del sub1, sub2
gc.collect()

21

In [23]:
sub0['prediction'] = sub0.apply(cust_blend, W = [1.05,1.00,0.95], axis=1)
del sub0['prediction0']
del sub0['prediction1']
del sub0['prediction2']
gc.collect()
#sub0.to_csv(f'submission.csv', index=False)

0

# 5. Combine result from most bought items and NN model

In [24]:
submit_df = sub0.copy() #pd.read_csv('submission.csv')
del sub0
submit_df = pd.merge(submit_df, result, on='customer_id', how='outer')
if VALID:
    valid_users = list(submit_df[~submit_df.prediction_y.isnull()]["customer_id"])
submit_df = submit_df.fillna(-1)
submit_df['prediction'] = submit_df.apply(
    lambda x: x['prediction_y'] if x['prediction_y'] != -1 else x['prediction_x'], axis=1)
submit_df = submit_df.drop(columns=['prediction_y', 'prediction_x'])

In [25]:
# local score
if VALID:
    sub_check = submit_df.copy()
    sub_check = sub_check.set_index('customer_id').reset_index()
    print("t", mapk(valid.prediction.str.split(), sub_check.prediction.str.split(), k=12), valid.shape[0])    
    print("v", 
            mapk(valid[valid.customer_id.isin(valid_users)].prediction.str.split(), 
               sub_check[sub_check.customer_id.isin(valid_users)].prediction.str.split(), k=12),
            valid[valid.customer_id.isin(valid_users)].shape[0])    
    print("  ")
    print(sub_check["prediction"].value_counts())  

t 0.0039022005616532536 68984
v 0.0018913636160436706 7665
  
0448509014 0573085028 0924243001 0751471001 0706016001 0924243002 0673677002 0715624001 0918522001 0706016003 0158340001 0579541001    316809
0706016001 0448509014 0924243001 0751471001 0573085028 0706016002 0924243002 0673677002 0918522001 0715624001 0706016003 0158340001       749
0568601006 0448509014 0924243001 0751471001 0573085028 0568597006 0924243002 0706016001 0673677002 0918522001 0715624001 0706016003       702
0720125001 0448509014 0924243001 0706016001 0751471001 0573085028 0924243002 0673677002 0918522001 0715624001 0706016003 0158340001       621
0673396002 0448509014 0924243001 0706016001 0751471001 0573085028 0924243002 0673677002 0918522001 0715624001 0706016003 0158340001       591
                                                                                                                                        ...  
0906226002 0867979001 0915529003 0918836001 0906226001 0852584001 0878604002 076279601

In [26]:
submit_df.to_csv('submission.csv', index=False)