# Aula 04 - Exemplos

In [None]:
import numpy as np
import pandas as pd

## Fazendo download da base

In [None]:
# Fetch the course dataset: install the `wget` package, download the compressed
# archive from GitHub, then unpack it (the listing printed below shows the files
# landing under ./dataset/).
# NOTE(review): `!pip` / `python3` run in a subshell and may target a different
# interpreter than the notebook kernel — confirm this matches the environment.
!pip install wget
!python3 -m wget https://github.com/mmanzato/MBABigData/raw/master/ml-20m-compact.tar.gz
!tar -xvzf ml-20m-compact.tar.gz

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=1f4fdeacdf36430f9cc2b2ff955cafac960906fb5aafb0bfd4562d6d3da8b497
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2

Saved under ml-20m-compact.tar.gz
dataset/
dataset/tags_sample.csv
dataset/._.DS_Store
dataset/.DS_Store
dataset/movies_sample.csv
dataset/._genome-tags.csv
dataset/genome-tags.csv
dataset/._ml-youtube.csv
dataset/ml-youtube.csv
dataset/._genome-scores.csv
dataset/genome-scores.csv
dataset/ratings_sample.csv


## Ler e preparar dados (vide notebooks anteriores)


In [None]:
# Load the sampled movies and ratings tables extracted into ./dataset.
movies = pd.read_csv('./dataset/movies_sample.csv')
ratings = pd.read_csv('./dataset/ratings_sample.csv')

# Keep only the (user, item, rating) triple and attach each movie's title.
# movieId is the only column the two frames share, so it is named explicitly.
df = ratings[['userId', 'movieId', 'rating']]
df = df.merge(movies[['movieId', 'title']], on='movieId')

# Re-index the raw ids to contiguous integers, numbered by first appearance.
map_users = {user: idx for idx, user in enumerate(df.userId.unique())}
map_items = {item: idx for idx, item in enumerate(df.movieId.unique())}
df['userId'] = df['userId'].map(map_users)
df['movieId'] = df['movieId'].map(map_items)

# Lookup from the re-indexed movieId to its title. Built in one vectorized pass
# instead of a per-row iterrows() loop; later rows still overwrite earlier ones,
# so the resulting mapping is the same.
map_title = dict(zip(df['movieId'], df['title']))

## Avaliação no cenário de predição de notas (rating prediction)

### Cross-Validation

In [None]:
# Install Case Recommender (the `caserecommender` package, imported further down
# as `caserec`). Invoking pip through sys.executable uses the same Python
# interpreter that is running this kernel.
import sys
!{sys.executable} -m pip install caserecommender



In [None]:
from caserec.utils.cross_validation import CrossValidation
from caserec.recommenders.rating_prediction.itemknn import ItemKNN

# Persist the interactions as a tab-separated file WITHOUT a header row, which is
# the input handed to Case Recommender below.
df.to_csv('ratings.dat', index=False, header=False, sep='\t')

# 5-fold cross-validation of the rating-prediction ItemKNN; fold files are
# written to the current directory.
# The `header` argument is intentionally left at its default: ratings.dat has no
# header line, so asking the reader to consume one would discard real ratings.
# NOTE(review): confirm the default (`header=None`) against the installed
# caserec version.
recommender = ItemKNN()
CrossValidation(input_file='ratings.dat', recommender=recommender, dir_folds='./', k_folds=5).compute()

[Case Recommender: Cross Validation]

Database:: ratings.dat 
Recommender Algorithm:: ItemKNN Algorithm | K Folds: 5

Eval:: MAE: 0.801819 RMSE: 1.064265 
Eval:: MAE: 0.800534 RMSE: 1.064435 
Eval:: MAE: 0.796414 RMSE: 1.060053 
Eval:: MAE: 0.803863 RMSE: 1.068898 
Eval:: MAE: 0.793535 RMSE: 1.056095 
Mean:: MAE: 0.799233 RMSE: 1.062749 
STD:: MAE: 0.003748 RMSE: 0.004347 


### Hold-Out

In [None]:
from sklearn.model_selection import train_test_split

# Hold-out protocol: a fixed-seed 80/20 split of the interactions, each part
# written as a headerless tab-separated file for Case Recommender.
train, test = train_test_split(df, random_state=2, test_size=.2)
for split_frame, split_path in ((train, 'train.dat'), (test, 'test.dat')):
    split_frame.to_csv(split_path, sep='\t', header=False, index=False)

# Fit ItemKNN on the training part and write the predictions for the test pairs
# to rp_iknn.dat.
holdout_model = ItemKNN('train.dat', 'test.dat', 'rp_iknn.dat', as_similar_first=True)
holdout_model.compute()

[Case Recommender: Rating Prediction > ItemKNN Algorithm]

train data:: 11090 users and 403 items (152496 interactions) | sparsity:: 96.59%
test data:: 10503 users and 340 items (38125 interactions) | sparsity:: 98.93%

training_time:: 13.652960 sec
prediction_time:: 2.255743 sec
Eval:: MAE: 0.665964 RMSE: 0.876739 


### Explorando as predições

In [None]:
# Read the predictions written by the hold-out run and pull out the predicted
# ratings for user 0.
preds = pd.read_csv('./rp_iknn.dat', names=['userId', 'movieId', 'rating'], sep='\t')
is_first_user = preds['userId'] == 0
preds_user = preds.loc[is_first_user, 'rating'].tolist()
print(preds_user)

[4.26384, 3.824934]


In [None]:
# True ratings of user 0 from the test split, listed in the same item order as
# that user's rows in `preds`. `test` keeps the shuffled order of the split, so
# taking its rows as-is would not guarantee that position i of the predictions
# and position i of the ratings refer to the same movie.
# A left merge keeps the order of the prediction rows; a predicted item with no
# test rating would show up as NaN.
user_pred_items = preds.loc[preds.userId == 0, ['movieId']]
user_test_ratings = test.loc[test.userId == 0, ['movieId', 'rating']]
ratings_user = user_pred_items.merge(user_test_ratings, on='movieId', how='left')['rating'].tolist()
print(ratings_user)

[5.0, 5.0]


In [None]:
from math import sqrt

def rmse_user(preds, ratings):
    """Root-mean-square error between paired predictions and true ratings.

    Args:
        preds: Sequence of predicted ratings.
        ratings: Sequence of true ratings; element i is compared with preds[i].

    Returns:
        The RMSE as a float. Returns -1 when the two sequences differ in length
        (sentinel kept for backward compatibility) and NaN when both are empty,
        since the error over zero pairs is undefined.
    """
    if len(preds) != len(ratings):
        return -1
    if len(preds) == 0:
        # Avoid dividing by zero when there is nothing to compare.
        return float('nan')
    # Accumulate under a dedicated name so the builtin sum() is not shadowed.
    squared_error = 0
    for predicted, actual in zip(preds, ratings):
        squared_error += pow(predicted - actual, 2)
    return sqrt(squared_error / len(preds))

# RMSE between the predicted and the true ratings collected above.
user_rmse = rmse_user(preds_user, ratings_user)
print(user_rmse)

0.9804875445297611


## Avaliação no cenário de recomendação de itens (item recommendation)

In [None]:
from caserec.recommenders.item_recommendation.bprmf import BprMF

# Train BPR matrix factorization with 3 latent factors on the hold-out split and
# write the ranked recommendations to ir_bprmf.dat.
# The seed makes the run repeatable (same value as the train/test split);
# without it the reported ranking metrics change from run to run.
# NOTE(review): `random_seed` is the parameter name in the Case Recommender
# releases I am aware of — confirm against the installed version.
BprMF('train.dat', 'test.dat', 'ir_bprmf.dat', factors=3, random_seed=2).compute()

[Case Recommender: Item Recommendation > BPRMF]

train data:: 11090 users and 403 items (152496 interactions) | sparsity:: 96.59%
test data:: 10503 users and 340 items (38125 interactions) | sparsity:: 98.93%

training_time:: 150.159013 sec
prediction_time:: 2.853613 sec


Eval:: PREC@1: 0.371418 PREC@3: 0.272779 PREC@5: 0.232257 PREC@10: 0.175645 RECALL@1: 0.116727 RECALL@3: 0.246369 RECALL@5: 0.342788 RECALL@10: 0.508531 MAP@1: 0.371418 MAP@3: 0.463098 MAP@5: 0.469203 MAP@10: 0.445974 NDCG@1: 0.371418 NDCG@3: 0.550143 NDCG@5: 0.571434 NDCG@10: 0.572217 


In [None]:
# Import the item-recommendation ItemKNN under an alias. An earlier cell already
# binds the name `ItemKNN` to the rating-prediction class; re-importing under the
# same name would silently change what that earlier cell uses when re-run.
from caserec.recommenders.item_recommendation.itemknn import ItemKNN as ItemKNNRanking

# Rank items for the hold-out split and write the recommendations to
# ir_itemknn.dat.
ItemKNNRanking('train.dat', 'test.dat', 'ir_itemknn.dat').compute()

[Case Recommender: Item Recommendation > ItemKNN Algorithm]

train data:: 11090 users and 403 items (152496 interactions) | sparsity:: 96.59%
test data:: 10503 users and 340 items (38125 interactions) | sparsity:: 98.93%

training_time:: 1.303467 sec
prediction_time:: 60.527441 sec


Eval:: PREC@1: 0.419023 PREC@3: 0.307975 PREC@5: 0.254384 PREC@10: 0.187261 RECALL@1: 0.134438 RECALL@3: 0.281617 RECALL@5: 0.378529 RECALL@10: 0.546924 MAP@1: 0.419023 MAP@3: 0.513599 MAP@5: 0.516788 MAP@10: 0.487732 NDCG@1: 0.419023 NDCG@3: 0.603503 NDCG@5: 0.620406 NDCG@10: 0.613585 


### Explorando as recomendações

In [None]:
# Read the BPR-MF recommendation file and list the items recommended to user 1,
# in file order.
recs = pd.read_csv('./ir_bprmf.dat', names=['userId', 'movieId', 'score'], sep='\t')
is_target_user = recs['userId'] == 1
recs_user = recs.loc[is_target_user, 'movieId'].tolist()
print(recs_user)

[12, 21, 17, 20, 10, 22, 28, 8, 43, 61]


In [None]:
# Items that user 1 actually has in the test split; these are the relevant
# items the recommendations are compared against.
user_test_rows = test[test['userId'] == 1]
ground_truth = user_test_rows['movieId'].tolist()
print(ground_truth)

[6, 106, 21, 30, 12]


In [None]:
# Hits are the items that appear both in the recommendation list and in the
# user's test items.
intersec = list(set(recs_user) & set(ground_truth))

# Precision = hits / recommended items; recall = hits / relevant items.
# Each ratio falls back to 0.0 when its denominator is empty (user without
# recommendations or without test items) instead of raising ZeroDivisionError.
precision = len(intersec) / len(recs_user) if recs_user else 0.0
recall = len(intersec) / len(ground_truth) if ground_truth else 0.0
print('Precisão: ' + str(precision))
print('Revocação: ' + str(recall))

Precisão: 0.2
Revocação: 0.4
