In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.metrics import accuracy_score

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
#import torchvision.transforms as tt

from tqdm.notebook import tqdm

from utils.metrics import recall_k, ndcg_k
from data.dataset_creation import DatasetInit, CustomDatasetSmall, CustomDatasetLarge, ToDevice

import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import DataLoader
from scipy.sparse import coo_matrix
import numpy as np

import wandb
import os
os.environ["WANDB_API_KEY"] = '0df87019b9f607855a97d62699a36b21b11eea04'

import numpy as np
import random
import torch
import os

def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

from tqdm import tqdm

In [2]:
set_seed(2007)

Random seed set as 2007


In [3]:
####
history_len=20
item_embed_size=128
user_embed_size=32

user_item_hidden_size = 128
user_item_history_hidden_size = 128
lstm_hidden_size = 128
dense_1_hidden_size = 128
dense_2_hidden_size = 128

dataset_name = 'dunnhumby_cj' + 'test'
###

In [4]:
!wget https://raw.githubusercontent.com/mzhariann/recanet/main/data/dunnhumby_cj/train_baskets_sample.csv
!mv train_baskets_sample.csv data/dunnhumby_cj/train_baskets_sample.csv

--2023-05-27 19:17:59--  https://raw.githubusercontent.com/mzhariann/recanet/main/data/dunnhumby_cj/train_baskets_sample.csv
Распознаётся raw.githubusercontent.com (raw.githubusercontent.com)… 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Подключение к raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... соединение установлено.
HTTP-запрос отправлен. Ожидание ответа… 200 OK
Длина: 2706540 (2,6M) [text/plain]
Сохранение в: «train_baskets_sample.csv»


2023-05-27 19:18:02 (1,55 MB/s) - «train_baskets_sample.csv» сохранён [2706540/2706540]



In [5]:
path_train = f'data/{dataset_name}/train_baskets.csv'
path_test = f'data/{dataset_name}/test_baskets.csv'
path_val = f'data/{dataset_name}/valid_baskets.csv'

# path_train = f'data/dunnhumby_cj/train_baskets_sample.csv'
# path_test = f'data/dunnhumby_cj/test_baskets.csv'
# path_val = f'data/dunnhumby_cj/valid_baskets.csv'

In [6]:
!mkdir data/dunnhumby_cj_test

mkdir: data/dunnhumby_cj_test: File exists


In [7]:
dataset = DatasetInit(
    path_train=path_train,
    path_val=path_val,
    path_test=path_test,
    dataset=dataset_name,
    history_len=history_len,
    basket_count_min=3,
    min_item_count=5
    )

number of test users: 981
items: 22924
filtered items: 5078


In [8]:
batch_size = 10000 

train_dataset = CustomDatasetLarge(
    dataset=dataset, 
    mode='train'
    )
# train_dataset = CustomDatasetSmall(
#     dataset=dataset, 
#     mode='train'
#     )
train_loader = DataLoader(
    dataset=train_dataset, 
    batch_size=batch_size, 
    shuffle=True
    )

val_dataset = CustomDatasetSmall(
    dataset=dataset, 
    mode='val'
    )
val_loader = DataLoader(
    dataset=val_dataset, 
    batch_size=batch_size, 
    shuffle=False
    )

test_dataset = CustomDatasetSmall(
    dataset=dataset, 
    mode='test'
    )
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False
    )

30it [00:00, 98.44it/s] 

num users: 981
1 user passed


981it [00:12, 77.32it/s] 
533it [00:00, 2322.38it/s]

201 user passed
301 user passed
401 user passed
601 user passed


1250it [00:00, 3289.39it/s]


901 user passed
1001 user passed
1101 user passed
1201 user passed


701it [00:00, 3551.15it/s]

101 user passed
201 user passed
401 user passed
601 user passed
701 user passed
801 user passed
901 user passed


1250it [00:00, 2812.70it/s]


1001 user passed
1101 user passed
1201 user passed


In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cpu device


In [10]:
train_loader = ToDevice(
    dl=train_loader, 
    device=device
    )
val_loader = ToDevice(
    dl=val_loader, 
    device=device
    )
test_loader = ToDevice(
    dl=test_loader, 
    device=device
    )

In [11]:
torch.cuda.empty_cache()

In [12]:
def predict(
        model,
        test_loader,
        regime='test'
        ):
    
    test_items, test_users, test_history2, test_labels = dataset.create_test_data(
        regime
        )
    
    preds = []
    pred_scores = []

    with torch.no_grad():
        for item_input, user_input,  history_input, target in tqdm(test_loader):
            y_pred = model(
                item_input=item_input,
                user_input=user_input,
                history_input=history_input
                )
            
            pred = [round(value) for value in y_pred.flatten().tolist()]
            preds.extend(pred)

            pred_scores.extend(y_pred.flatten().tolist())

    prediction_baskets = {}
    prediction_scores = {}
    for user in tqdm(dataset.test_users):
        top_items = []
        if user in dataset.user_id_mapper:
            user_id = dataset.user_id_mapper[user]
            indices = np.argwhere(test_users == user_id)
            item_scores = np.array(pred_scores)[indices].flatten()
            item_ids = test_items[indices].flatten()
            item_score_dic = {}
            for i, item_id in enumerate(item_ids):
                item_score_dic[dataset.id_item_mapper[item_id]] = item_scores[i]
            sorted_item_scores = sorted(item_score_dic.items(), key=lambda x: x[1], reverse = True)
            top_items = [x[0] for x in sorted_item_scores]
            prediction_scores[user] = sorted_item_scores
            
        prediction_baskets[user] = top_items

    return prediction_baskets


In [13]:
def train(
        model,
        epochs,
        checkpoint=True,
        wandb_name='ReCANet'
        ):
    
    torch.cuda.empty_cache()
    
    wandb.init(
        project='recanet',
        name=wandb_name,
        tags=[
            'all_data',
        ]
    )
    
    wandb.define_metric('epoch_step/train')
    wandb.define_metric('epoch/train/*', step_metric='epoch_step/train')
    
    wandb.define_metric('batch_step/train')
    wandb.define_metric('train/batch/*', step_metric='batch_step/train')
    
    wandb.define_metric('epoch_step/valid')
    wandb.define_metric('epoch/valid/*', step_metric='epoch_step/valid')
    
    wandb.define_metric('batch_step/valid')
    wandb.define_metric('batch/valid/*', step_metric='batch_step/valid')
    
    wandb.define_metric('epoch_step/test')
    wandb.define_metric('epoch/test/*', step_metric='epoch_step/test')
    
    loss_train = []
    loss_val = []
    metric_val = []
    metric_train = []

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.AdamW(params=parameters, lr=0.001)
    criterion = nn.BCELoss(reduction='mean')

    for i in tqdm(range(epochs)):
        print(f'Epoch {i}')
        loss_train_epoch = []
        loss_val_epoch = []
        metric_val_epoch = []
        metric_train_epoch = []

        model.train()
        wandb.watch(
            model,
            log='all'
            )
        
        for step_id, (item_input, user_input, history_input, target) in tqdm(enumerate(train_loader)):
            optimizer.zero_grad()
            
            y_pred = model(
                item_input=item_input,
                user_input=user_input,
                history_input=history_input)
            
            predictions = [round(value) for value in y_pred.flatten().tolist()]
        
            loss = criterion(
                y_pred,
                target.to(device)
                )
            
            loss.backward()
            optimizer.step()
            
            loss_train_epoch.append(loss.item())
            
            metric_value = accuracy_score(
                target.detach().cpu().numpy(),
                predictions
                )
            metric_train_epoch.append(metric_value)
            
            wandb.log({
                'batch/train/loss': loss.item(),
                'batch/train/accuracy': metric_value,
                'batch_step/train': step_id
            })
            
        loss_train.append(np.mean(loss_train_epoch))
        metric_train.append(np.mean(metric_train_epoch))
        wandb.log({
            'epoch_step/train': i,
            'epoch/train/loss': np.mean(loss_train_epoch),
            'epoch/train/accuracy': np.mean(metric_train_epoch)
        })

        model.eval()
        full_y = []
        full_predictions = []
        pred_scores = []

        with torch.no_grad():
            for step_id, (item_input, user_input,  history_input, target) in enumerate(val_loader):
                y_pred = model(item_input, user_input, history_input)
                
                loss = criterion(y_pred, target)

                predictions = [round(value) for value in y_pred.flatten().tolist()]

                full_y.extend(target.detach().cpu())
                pred_scores.extend(y_pred.detach().cpu())
                full_predictions.extend(predictions)

                loss_val_epoch.append(loss.item())
                metric_val_epoch.append(accuracy_score(target.cpu(), predictions))
                
                wandb.log({
                    'batch/valid/loss': loss.item(),
                    'batch/valid/accuracy': accuracy_score(target.cpu(), predictions),
                    'batch_step/valid': step_id
                })
                
        accuracy = accuracy_score(full_y, full_predictions)
        print("Accuracy epoch: %.2f%%" % (accuracy * 100.0))

        val_baskets = pd.read_csv(path_val)
        user_val_baskets_df = val_baskets.groupby('user_id')['item_id'].apply(list).reset_index()
        user_val_baskets_dict = dict(zip(user_val_baskets_df['user_id'], user_val_baskets_df['item_id']))

        user_predictions = predict(model, val_loader, 'val')
        final_users = set(dataset.test_users).intersection(set(list(user_val_baskets_dict.keys())))
        print('predictions ready', len(user_predictions))
        print('number of final test users:', len(final_users))
        log_dict_valid = {}
        for k in [5, 10, 20, 'B']:
            print(k)
            recall_scores = {}
            ndcg_scores = {}
            for user in final_users:

                top_items = []
                if user in user_predictions:
                    top_items = user_predictions[user]

                if k == 'B':
                    recall_scores[user] = recall_k(
                        y_true=user_val_baskets_dict[user],
                        y_pred=top_items,
                        k=len(user_val_baskets_dict[user])
                        )
                    ndcg_scores[user] = ndcg_k(
                        y_true=user_val_baskets_dict[user],
                        y_pred=top_items,
                        k=len(user_val_baskets_dict[user])
                        )
                else:
                    recall_scores[user] = recall_k(
                        y_true=user_val_baskets_dict[user],
                        y_pred=top_items,
                        k=k
                        )
                    ndcg_scores[user] = ndcg_k(
                        y_true=user_val_baskets_dict[user],
                        y_pred=top_items,
                        k=k
                        )
            print('recall:', np.mean(list(recall_scores.values())))
            print('ndcg:', np.mean(list(ndcg_scores.values())))
            log_dict_valid[f'epoch/valid/recall_{k}'] = np.mean(list(recall_scores.values()))
            log_dict_valid[f'epoch/valid/ndcg_{k}'] = np.mean(list(ndcg_scores.values()))

        wandb.log({
            **{
                'epoch_step/valid': i,
                'epoch/valid/accuracy': np.mean(metric_val_epoch),
                'epoch/valid/loss': np.mean(loss_val_epoch)
            },
            **log_dict_valid,
            })

        loss_val.append(np.mean(loss_val_epoch))
        metric_val.append(np.mean(metric_val_epoch))
        
        test_baskets = pd.read_csv(path_test)
        user_test_baskets_df = test_baskets.groupby('user_id')['item_id'].apply(list).reset_index()
        user_test_baskets_dict = dict(zip(user_test_baskets_df['user_id'], user_test_baskets_df['item_id']))

        user_predictions = predict(model, test_loader, 'test')
        final_users = set(dataset.test_users).intersection(set(list(user_test_baskets_dict.keys())))
        print('predictions ready', len(user_predictions))
        print('number of final test users:', len(final_users))
        log_dict_test = {}
        for k in [5, 10, 20, 'B']:
            print(k)
            recall_scores = {}
            ndcg_scores = {}
            for user in final_users:

                top_items = []
                if user in user_predictions:
                    top_items = user_predictions[user]

                if k == 'B':
                    recall_scores[user] = recall_k(
                        y_true=user_test_baskets_dict[user],
                        y_pred=top_items,
                        k=len(user_test_baskets_dict[user])
                        )
                    ndcg_scores[user] = ndcg_k(
                        y_true=user_test_baskets_dict[user],
                        y_pred=top_items,
                        k=len(user_test_baskets_dict[user]))
                else:
                    recall_scores[user] = recall_k(
                        y_true=user_test_baskets_dict[user],
                        y_pred=top_items,
                        k=k)
                    ndcg_scores[user] = ndcg_k(
                        y_true=user_test_baskets_dict[user],
                        y_pred=top_items,
                        k=k
                        )
                    # 
            print('recall:', np.mean(list(recall_scores.values())))
            print('ndcg:', np.mean(list(ndcg_scores.values())))
            log_dict_test[f'epoch/test/recall_{k}'] = np.mean(list(recall_scores.values()))
            log_dict_test[f'epoch/test/ndcg_{k}'] = np.mean(list(ndcg_scores.values()))
        
        wandb.log({
            **{
                'epoch_step/test': i,   
            },
            **log_dict_test,
            })
        
        if checkpoint:
            path = f'wallykop_epoch{i}_recanet_dunnhumby_20.pth'
            torch.save(model.state_dict(), path)
            np.save('loss_train.npy', loss_train)
            np.save('loss_val.npy', loss_val)
            np.save('metric_train.npy', metric_train)
            np.save('metric_val.npy', metric_val)
            
            wandb.save(path)
        
    wandb.finish()
    return loss_train, loss_val


In [42]:
from models.reCANet_base import ReCaNet

In [30]:
from models.reCANet_Attention import ReCaNet_Attention, Bidir_ReCaNet_Attention

In [34]:
from models.reCANet_Pooling import ReCaNet_Pooling
from models.pooling_layers import AttentionPooling

In [43]:
from models.reCANet_MHSA import ReCaNet_separate_MHSA, ReCaNet_user_item_MHSA

In [52]:
from models.reCANet_Transformer import ReCaNet_Transformer, ReCaNet_MHSA_Transformer

In [53]:
# poolings2 = [AttentionPooling(lstm_hidden_size)]
# model = ReCaNet_MHSA_Transformer(
#     num_items=dataset.num_items, 
#     item_embed_size=item_embed_size, 
#     num_users=dataset.num_users, 
#     user_embed_size=user_embed_size, 
#     history_len = history_len, 
#     user_item_hidden_size = user_item_hidden_size,
#     user_item_history_hidden_size = user_item_history_hidden_size, 
#     lstm_hidden_size = lstm_hidden_size,
#     dense_1_hidden_size = dense_1_hidden_size, 
#     dense_2_hidden_size = dense_2_hidden_size,
#     # poolings=poolings2
#     ).to(device)


In [None]:
model = ReCaNet(
    num_items=dataset.num_items, 
    item_embed_size=item_embed_size, 
    num_users=dataset.num_users, 
    user_embed_size=user_embed_size, 
    history_len = history_len, 
    user_item_hidden_size = user_item_hidden_size,
    user_item_history_hidden_size = user_item_history_hidden_size, 
    lstm_hidden_size = lstm_hidden_size,
    dense_1_hidden_size = dense_1_hidden_size, 
    dense_2_hidden_size = dense_2_hidden_size)

In [None]:
model = ReCaNet_MHSA_Transformer(
    num_items=dataset.num_items, 
    item_embed_size=item_embed_size, 
    num_users=dataset.num_users, 
    user_embed_size=user_embed_size, 
    history_len = history_len, 
    user_item_hidden_size = user_item_hidden_size,
    user_item_history_hidden_size = user_item_history_hidden_size, 
    lstm_hidden_size = lstm_hidden_size,
    dense_1_hidden_size = dense_1_hidden_size, 
    dense_2_hidden_size = dense_2_hidden_size,
    )


In [None]:
model = ReCaNet_Transformer(
    num_items=dataset.num_items, 
    item_embed_size=item_embed_size, 
    num_users=dataset.num_users, 
    user_embed_size=user_embed_size, 
    history_len = history_len, 
    user_item_hidden_size = user_item_hidden_size,
    user_item_history_hidden_size = user_item_history_hidden_size, 
    lstm_hidden_size = lstm_hidden_size,
    dense_1_hidden_size = dense_1_hidden_size, 
    dense_2_hidden_size = dense_2_hidden_size,
    )


In [None]:
model = ReCaNet_separate_MHSA(
    num_items=dataset.num_items, 
    item_embed_size=item_embed_size, 
    num_users=dataset.num_users, 
    user_embed_size=user_embed_size, 
    history_len = history_len, 
    user_item_hidden_size = user_item_hidden_size,
    user_item_history_hidden_size = user_item_history_hidden_size, 
    lstm_hidden_size = lstm_hidden_size,
    dense_1_hidden_size = dense_1_hidden_size, 
    dense_2_hidden_size = dense_2_hidden_size,
    )

In [None]:
model = ReCaNet_user_item_MHSA(
    num_items=dataset.num_items, 
    item_embed_size=item_embed_size, 
    num_users=dataset.num_users, 
    user_embed_size=user_embed_size, 
    history_len = history_len, 
    user_item_hidden_size = user_item_hidden_size,
    user_item_history_hidden_size = user_item_history_hidden_size, 
    lstm_hidden_size = lstm_hidden_size,
    dense_1_hidden_size = dense_1_hidden_size, 
    dense_2_hidden_size = dense_2_hidden_size,
    )

In [None]:
poolings2 = [AttentionPooling(lstm_hidden_size)]
model = ReCaNet_Pooling(
    num_items=dataset.num_items, 
    item_embed_size=item_embed_size, 
    num_users=dataset.num_users, 
    user_embed_size=user_embed_size, 
    history_len = history_len, 
    user_item_hidden_size = user_item_hidden_size,
    user_item_history_hidden_size = user_item_history_hidden_size, 
    lstm_hidden_size = lstm_hidden_size,
    dense_1_hidden_size = dense_1_hidden_size, 
    dense_2_hidden_size = dense_2_hidden_size,
    poolings=poolings2
    )


In [None]:
model = Bidir_ReCaNet_Attention(
    num_items=dataset.num_items, 
    item_embed_size=item_embed_size, 
    num_users=dataset.num_users, 
    user_embed_size=user_embed_size, 
    history_len = history_len, 
    user_item_hidden_size = user_item_hidden_size,
    user_item_history_hidden_size = user_item_history_hidden_size, 
    lstm_hidden_size = lstm_hidden_size,
    dense_1_hidden_size = dense_1_hidden_size, 
    dense_2_hidden_size = dense_2_hidden_size,
    )

In [None]:
loss_train, loss_val = train(
    model=model, 
    epochs=5, 
    checkpoint=True, 
    wandb_name='ReCANet'
    )