In [1]:
from src.utils import read_ml_1m
from src.utils import split_recsys_data
from src.utils import split_by_user_folds

In [2]:
ratings, item_data, shape = read_ml_1m()

In [26]:
ratings.head()

Unnamed: 0,user,item,rating,timestamp
0,0,0,5,978300760
1,0,1,3,978302109
2,0,2,3,978301968
3,0,3,4,978300275
4,0,4,5,978824291


In [27]:
ratings.user.nunique()

6040

In [28]:
ratings.item.nunique()

3706

In [29]:
ratings.shape

(1000209, 4)

In [4]:
item_data.head()

Unnamed: 0,item,title,genre
0,0,Toy Story (1995),Animation|Children's|Comedy
1,1,Jumanji (1995),Adventure|Children's|Fantasy
2,2,Grumpier Old Men (1995),Comedy|Romance
3,3,Waiting to Exhale (1995),Comedy|Drama
4,4,Father of the Bride Part II (1995),Comedy


# calculate distancies

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

In [14]:
vectorizer = CountVectorizer(binary=True, lowercase=False)
item_genre_features = vectorizer.fit_transform(item_data.genre.str.split('|').apply(lambda genres: ' '.join(genres)))
normalized_item_genre_features = normalize(item_genre_features)
item_similarity = normalized_item_genre_features.dot(normalized_item_genre_features.T)
item_similarity = item_similarity.tolil()
distancies = item_similarity.copy()
distancies.data = 1 - item_similarity.data

# calculate item long tail

In [25]:
import pandas as pd
import math
n_users = ratings['user'].nunique()
long_tail = ratings.groupby('item')['user'].agg(pd.Series.nunique)
long_tail = (long_tail / n_users)
long_tail = (-long_tail.apply(math.log)).to_dict()

{0: 1.2531769614634005,
 1: 2.442761028337237,
 2: 2.2509607275887387,
 3: 1.5245673463169958,
 4: 1.2660126102661724,
 5: 0.9576992670291636,
 6: 2.1493809347708184,
 7: 1.4968190343259513,
 8: 2.0689012596444036,
 9: 1.25724318838466,
 10: 1.7401351038227477,
 11: 3.475050674074274,
 12: 2.765988038208429,
 13: 1.4111028744662304,
 14: 1.9239672349220693,
 15: 1.2497047357526514,
 16: 2.6148494088511627,
 17: 2.327733107277274,
 18: 2.184066492758708,
 19: 1.3989569761641223,
 20: 1.6517096327959202,
 21: 2.6239403805524146,
 22: 0.8494524978330202,
 23: 0.9637572691130787,
 24: 2.8655176335554624,
 25: 2.7607386823222853,
 26: 0.9790648061490188,
 27: 1.3627330617814932,
 28: 4.415699849780469,
 29: 2.21847527244425,
 30: 2.236908974133088,
 31: 2.640051200825113,
 32: 2.5555565224825814,
 33: 1.4975589529686617,
 34: 2.511753899824188,
 35: 2.740012551805169,
 36: 3.8008845124904314,
 37: 2.348317024420761,
 38: 0.898649248712668,
 39: 1.5744607804619495,
 40: 1.0674794670527483,
 

In [5]:
train_set, test_set = split_recsys_data(ratings, test_size=0.1)
train_set.head()

Unnamed: 0,user,item,rating,timestamp
0,6039,669,4,956703932
1,6039,323,4,956703954
2,6039,128,5,956703954
3,6039,41,4,956703977
4,6039,1092,5,956703977


In [6]:
test_set.head()

Unnamed: 0,user,item,rating,timestamp
0,24,1315,3,978133433
1,24,1316,2,978133433
2,24,1223,3,978133460
3,24,1317,4,978133460
4,23,538,2,978133665


In [7]:
folds = split_by_user_folds(test_set)

In [72]:
from implicit.bpr import BayesianPersonalizedRanking
from implicit.evaluation import ndcg_at_k, mean_average_precision_at_k, precision_at_k

import scipy.sparse as sp

In [73]:
train_matrix = sp.csr_matrix(
    (
        (train_set.rating > 0).astype('float32'),
        (train_set.item, train_set.item)
    ),
    dtype='float32',
    shape=(shape[1], shape[0])
    
)

test_matrix = sp.csr_matrix(
    (
        (test_set.rating > 0).astype('float32'),
        (test_set.item, test_set.user)
    ),
    dtype='float32',
    shape=(shape[1], shape[0])
)

In [74]:
model = BayesianPersonalizedRanking(factors=50)

In [75]:
model.fit(train_matrix)

HBox(children=(IntProgress(value=0), HTML(value='')))




In [76]:
ndcg_at_k(model, train_matrix.T.tocsr(), test_matrix.T.tocsr(), 10)

HBox(children=(IntProgress(value=0, max=6040), HTML(value='')))




0.028830271337937873

In [77]:
BayesianPersonalizedRanking??

In [33]:
hash('https://leetcode.com/problems/design-tinyurl')

-8295338473908790202

In [34]:
import torch

In [1]:
from src.utils import read_ml_1m, get_folds_by_time
from src.sampling_utils import uniform_sampling
from src.data_utils import PairwiseRankingData
from src.models.pairwise import LambdaRankFactorization
from typing import Tuple, Dict, Union, List
import pandas as pd
import scipy.sparse as sp
import torch.utils.data as data
import torch.optim as optim

In [2]:
def evaluate_pairwise(
        evaluate_set: Tuple[pd.DataFrame, pd.DataFrame],
        params: Dict[str, Union[int, float, str]],
        shape: Tuple[int, int],
        metrics: List[str],
        k_list: List[int],
        distancies: sp.lil.lil_matrix = None,
        item_long_tails: Dict[int, float] = None,
        n_items: int = None) -> pd.DataFrame:

    train_df = evaluate_set[0]
    test_df = evaluate_set[1]

    train_dataset_with_negatives = uniform_sampling(train_df, item_num=shape[1], sample_size=params['sample_size'])
    train_dataset_with_negatives = PairwiseRankingData(train_dataset_with_negatives)
    relevant_lists = test_df.groupby('user')['item'].agg(list).to_dict()

    train_loader = data.DataLoader(
        train_dataset_with_negatives,
        batch_size=params['batch_size'],
        shuffle=True,
        num_workers=4
    )

    model = LambdaRankFactorization(
        user_num=shape[0],
        item_num=shape[1],
        factors=params['factors'],
        epochs=params['epochs'],
        lr=params['lr'],
        user_regularization=params['user_reg'],
        item_regularization=params['item_reg']
    )

    optimizer = optim.Adam(model.parameters(), lr=model.lr)
    model.fit(train_loader, optimizer, show_progress=True)
    model.eval()

    test_users = torch.Tensor(test_df['user'].unique().astype(int)).long()

    scores = defaultdict(list)
    for metric in metrics:
        for k in k_list:
            recommend_lists = model.predict(test_users, k)
            recommend_lists = {user: [item for item, _ in recommend_lists[user]] for user in recommend_lists}
            score = get_score_by_metric(recommend_lists,
                                        relevant_lists,
                                        metric,
                                        k,
                                        distancies,
                                        item_long_tails,
                                        n_items)

            scores[metric].append(score)

    scores = pd.DataFrame(scores)
    scores['k'] = k_list

    return scores

In [None]:
ratings, movies, shape = read_ml_1m()
folds = get_folds_by_time(ratings, n_folds=3, test_size=0.1)
results = evaluate_pairwise(evaluate_set=folds['train'],
                            params={
                                'sample_size': 3,
                                'batch_size': 1,
                                'factors': 5,
                                'epochs': 1,
                                'lr': 0.01,
                                'user_reg': 0.1,
                                'item_reg': 0.1
                            },
                            shape=shape,
                            metrics=['precision', 'map', 'ndcg'],
                            k_list=[1, 5, 10, 20, 50])

[Epoch 001]:   2%|▏         | 59852/2443579 [05:15<3:39:38, 180.88it/s]