In [None]:
! git clone git@github.com:yupopov/fantlab-recommender-system.git
import os
os.chdir('fantlab-recommender-system')

Cloning into 'fantlab-recommender-system'...
remote: Enumerating objects: 578, done.
remote: Counting objects: 100% (175/175), done.
remote: Compressing objects: 100% (119/119), done.
remote: Total 578 (delta 112), reused 112 (delta 53), pack-reused 403
Receiving objects: 100% (578/578), 545.34 MiB | 27.61 MiB/s, done.
Resolving deltas: 100% (307/307), done.


In [None]:
# Set the git author identity (e-mail and name) via shell commands.
# `--global` writes to the user-level git configuration of the machine running
# the notebook, not just to this repository.
# NOTE(review): a personal e-mail address is hard-coded here; consider reading
# it from an environment variable before sharing the notebook.
! git config --global user.email "yuri.ppv@gmail.com"
! git config --global user.name "Yury Popov"

In [None]:
%%capture
%load_ext autoreload
%autoreload 2

import gzip
import json
from tqdm.auto import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.nn import Module, Embedding, LSTM, RNN, GRU, Linear, Sequential, Dropout, \
    CrossEntropyLoss
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from scipy.sparse import coo_matrix, csr_matrix, csc_matrix, load_npz
! pip install lightfm

from src.preprocessing.datasets import RNNDatasetMaker
from src.models.get_top_k_predictions_with_label import get_top_k_predictions_with_labels
from src.models.rnn_recommender import RecurrentLanguageModel, RecurrentRecommender
from src.models.trainer import Trainer
from src.experiments.run_n_experiments import plot_experiments, run_experiment, run_n_experiments


In [None]:
# Build the dataset object used by every later cell.
# Judging by the log printed below, this step loads marks and embeddings,
# filters marks by date, splits them into train and test sets, and drops
# rarely-marked works.
# NOTE(review): RNNDatasetMaker is called with its defaults; the thresholds
# shown in the log are presumably those defaults — confirm in
# src/preprocessing/datasets.
rnn_dataset_maker = RNNDatasetMaker()
rnn_dataset = rnn_dataset_maker.get_rnn_dataset()

Loading marks...
Loading embeddings...
Done.
Stats before filtering by date:
Marks: 7368410	Unique titles: 23867	Unique users: 61739

Deleting marks dated before 2012...
Stats after filtering by date:
Marks: 5056872	Unique titles: 23867	Unique users: 47578

Splitting the marks by 2020-12-07 00:00:00...
Train set stats:
Marks: 4551770	Unique titles: 23768	Unique users: 43998

Test set stats:
Marks: 505102	Unique titles: 23715	Unique users: 13440

Dropping works with less than 50 marks in the train set...
Train set stats:
Marks: 4342747	Unique titles: 18130	Unique users: 43774

Dropping works with less than 10 marks in the test set...
Test set stats:
Marks: 446391	Unique titles: 12199	Unique users: 13264

Dropping works with interactions only in the train or the test set...
Train set stats:
Marks: 3778163	Unique titles: 11230	Unique users: 42840

Test set stats:
Marks: 407321	Unique titles: 11230	Unique users: 12939

Dropping users with less than 10 marks in the train set...
Stats after 

## Model training

In [None]:
def make_preds(model, dataset, k=20):
    """Produce top-k predictions for the test interactions and print precision.

    Wraps ``model`` in a ``RecurrentRecommender`` built on
    ``dataset.pred_dataset`` and queries
    ``get_top_k_predictions_with_labels`` with the dataset's test and train
    interactions.

    Args:
        model: the model handed to ``RecurrentRecommender``.
        dataset: object exposing ``pred_dataset``, ``test_interactions`` and
            ``train_interactions``.
        k: number of predictions requested per row; also the divisor of the
            printed precision.

    Returns:
        The top-k predictions returned by ``get_top_k_predictions_with_labels``.
        The labels are used only for the printed metric and are not returned.
    """
    top_k_preds, hit_labels = get_top_k_predictions_with_labels(
        RecurrentRecommender(model, dataset.pred_dataset),
        dataset.test_interactions,
        dataset.train_interactions,
        k=k,
    )

    # Row-wise label sums, averaged over rows and normalised by k.
    # NOTE(review): presumably labels are 0/1 hit indicators, one row per
    # user, which would make this precision@k — confirm against the helper.
    label_sums = hit_labels.sum(axis=1)
    precision = label_sums.mean() / k
    print(f'Prediction precision: {precision:.4f}')

    return top_k_preds

In [None]:
# Configuration dictionary passed to RecurrentLanguageModel.
net_config = {
    "freeze_embs": True,
    "cell_type": "GRU",
    "cell_dropout": 0.5,
    "num_layers": 2,
    "hidden_size": 256,
    "out_activation": "relu",
    "out_dropout": 0.5,
    "out_sizes": [500, 500],
}

# Configuration dictionary passed to Trainer (and reused for the DataLoader
# batch size below).
trainer_config = {
    "n_epochs": 30,
    "batch_size": 256,
    # Train on the GPU when one is available, otherwise fall back to the CPU.
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    'optimizer_cls': torch.optim.Adam,
    'optimizer_params': {
        'lr': 1e-2,
        'weight_decay': 1e-6,
    },
    'scheduler_cls': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'scheduler_params': {
        # patience=0: the scheduler reacts to the first epoch without improvement.
        'patience': 0,
        'verbose': True,
    },
}

train_dataset = rnn_dataset.train_dataset
train_dataloader = DataLoader(
    train_dataset,
    batch_size=trainer_config['batch_size'],
    collate_fn=train_dataset.collate_fn,
    shuffle=True,
)

val_dataset = rnn_dataset.val_dataset
# Validation data is iterated in a fixed order: reshuffling it every epoch
# gains nothing and can make the validation metric (which presumably drives
# ReduceLROnPlateau with patience=0) noisier than necessary.
# NOTE(review): collate_fn is taken from train_dataset, as in the original
# code; confirm it is interchangeable with val_dataset.collate_fn.
val_dataloader = DataLoader(
    val_dataset,
    batch_size=trainer_config['batch_size'],
    collate_fn=train_dataset.collate_fn,
    shuffle=False,
)

model = RecurrentLanguageModel(net_config, rnn_dataset.item_vocab, rnn_dataset.embs)

trainer = Trainer(trainer_config)

trainer.fit(model, train_dataloader, val_dataloader)

# Print the precision of the trained model and keep its top-k predictions.
preds = make_preds(model, rnn_dataset)



In [None]:
# Run a single experiment (n_experiments=1) with the configs defined above and
# keep the results for plotting.
# NOTE(review): the meaning of `bottom`/`top` is defined in
# src/experiments/run_n_experiments — confirm there before changing them.
exp_res = run_n_experiments(rnn_dataset, net_config, trainer_config, bottom=5, top=8, n_experiments=1)

In [None]:
# Plot the collected experiment results, passing the keys of `exp_res` as the
# first argument.
# NOTE(review): `bottom=7.6` is presumably a lower axis limit — confirm in
# src/experiments/run_n_experiments.
plot_experiments(exp_res.keys(), exp_res, bottom=7.6, n_experiments=1)