In [1]:
!pip install transformers



In [2]:
from transformers import AutoModel, AutoTokenizer
import torch
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [31]:
book = pd.read_csv('/content/final_cleaned_books.csv')
review = pd.read_csv('/content/new_cleaned_reviews_v2.csv')

review = review.drop_duplicates(subset=['customer_id', 'product_id'])

In [32]:
review_book_description = pd.merge(review, book[['product_id', 'description']], how='inner', on='product_id')


In [33]:
review_book_description.head()

Unnamed: 0,customer_id,product_id,rating,content,description
0,2119875,12416734,5,,Đọc sách - đó là bạn đang trải nghiệm. Điểm sá...
1,22628730,12416734,4,Đóng gói ok . Chất lượng OK. Nội dung ko hay lắm,Đọc sách - đó là bạn đang trải nghiệm. Điểm sá...
2,21351313,12416734,5,cực hài lòng. giao sớm hơn dự kiến. sách chất ...,Đọc sách - đó là bạn đang trải nghiệm. Điểm sá...
3,14148210,12416734,5,,Đọc sách - đó là bạn đang trải nghiệm. Điểm sá...
4,522878,12416734,4,,Đọc sách - đó là bạn đang trải nghiệm. Điểm sá...


In [34]:
list_user_id10 = []
list_user_id5 = []
for id in review_book_description['customer_id'].unique():
    user_reviews = review_book_description[review_book_description['customer_id'] == id]
    if len(user_reviews) > 10:
        # print(f'Customer ID: {id}, Number of reviews: {len(user_reviews)}')
        list_user_id10.append(id)

for id in review_book_description['customer_id'].unique():
    user_reviews = review_book_description[review_book_description['customer_id'] == id]
    if len(user_reviews) > 5:
        # print(f'Customer ID: {id}, Number of reviews: {len(user_reviews)}')
        list_user_id5.append(id)

In [35]:
model= AutoModel.from_pretrained('uitnlp/visobert')
tokenizer = AutoTokenizer.from_pretrained('uitnlp/visobert')


Some weights of XLMRobertaModel were not initialized from the model checkpoint at uitnlp/visobert and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
mapping = pd.Series(book['product_name'], index=book.index)

In [37]:
all_book_matrix = {}
product_ids = list(book['product_id'][book['description'].notnull()])
description_book = list(book['description'][book['product_id'].isin(product_ids)])
batch_size = 16
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for i in range(0, len(description_book), batch_size):
    batch_product_ids = product_ids[i:i+batch_size]
    batch_descriptions = description_book[i:i+batch_size]

    encoding = tokenizer(batch_descriptions,
                        return_tensors='pt',
                        padding=True,
                        truncation=True,
                        max_length=512).to(device)
    with torch.no_grad():
        output = model(**encoding)
        attention_mask = encoding['attention_mask'].unsqueeze(-1).expand(output.last_hidden_state.size()).float()

        sum_embeddings = torch.sum(output.last_hidden_state * attention_mask, 1)

        sum_mask = torch.clamp(attention_mask.sum(1), min=1e-9)

        all_book_description_matrix = sum_embeddings / sum_mask

        for idx, product_id in enumerate(batch_product_ids):
          embedding = all_book_description_matrix[idx].cpu()

          all_book_matrix[product_id] = embedding
    if i % (batch_size * 10) == 0:
        print(f"Đã xử lý {i} / {len(description_book)} sách...")


Đã xử lý 0 / 1086 sách...
Đã xử lý 160 / 1086 sách...
Đã xử lý 320 / 1086 sách...
Đã xử lý 480 / 1086 sách...
Đã xử lý 640 / 1086 sách...
Đã xử lý 800 / 1086 sách...
Đã xử lý 960 / 1086 sách...


In [38]:
from sklearn.metrics import ndcg_score

def calculate_ap_at_k(user_recommendations, relevant_items, k):
    precision_sum = 0
    relevant_count = 0

    recommend_list = list(user_recommendations.keys())[:k]
    real_list = list(relevant_items.keys())
    for i, book_id in enumerate(recommend_list):
        if book_id in real_list:
            relevant_count += 1
            precision_sum += relevant_count / (i + 1)
    if relevant_count == 0:
        return 0
    return precision_sum / relevant_count

# def calculate_map_at_k(user_recommendations, user_relevant_items, k):
#     ap_scores = []
#     ap_scores.append(calculate_ap_at_k(list(user_recommendations.keys()), list(user_relevant_items.keys()), k))
#     return sum(ap_scores) / len(ap_scores)

# def calculate_ndcg_at_k(y_true, y_pred, k):
#     sorted_pred_ids = list(y_pred.keys())[:k]
#     relevance_scores = [y_true.get(item_id, 0) for item_id in sorted_pred_ids]

#     def get_dcg(scores):
#         return sum(score / np.log2(idx + 2)
#                    for idx, score in enumerate(scores))

#     dcg = get_dcg(relevance_scores)

#     ideal_relevance = sorted(y_true.values(), reverse=True)[:k]
#     idcg = get_dcg(ideal_relevance)

#     if idcg == 0:
#         return 0.0

#     return dcg / idcg

def calculate_ndcg_at_k(y_true, y_pred, k):
    y_true_scores = list(y_true.values())[:k]
    y_pred_scores = list(y_pred.values())[:k]

    return ndcg_score([y_true_scores], [y_pred_scores], k=k)

def precision_at_k(recommended_books, relevant_books, k):
    pred_k = list(recommended_books.keys())[:k]
    relevant = set(relevant_books.keys())

    hit = len([item for item in pred_k if item in relevant])
    return hit / k


def recall_at_k(recommended_books, relevant_books, k):
    pred_k = list(recommended_books.keys())[:k]
    relevant = set(relevant_books.keys())

    hit = len([item for item in pred_k if item in relevant])
    total_relevant = len(relevant_books)
    if total_relevant == 0:
        return 0.0
    return hit / len(relevant)



In [42]:
topk = 5
precision = 0
recall = 0
ap = 0
ndcg = 0
count = 0

for user_id in list_user_id5:
    user_book = review_book_description[['product_id', 'description', 'rating']][(review_book_description['customer_id'] == user_id) & (~review_book_description['description'].isna())]

    X_test = user_book[:topk]
    X_train = user_book[topk:]

    user_book = list(X_train['description'])
    if len(user_book) != 0:
      encoding = tokenizer(user_book,
                          return_tensors='pt',
                          padding=True,
                          truncation=True,
                          max_length=512).to(device)

      with torch.no_grad():
          output = model(**encoding)
          attention_mask = encoding['attention_mask'].unsqueeze(-1).expand(output.last_hidden_state.size()).float()

          sum_embeddings = torch.sum(output.last_hidden_state * attention_mask, 1)

          sum_mask = torch.clamp(attention_mask.sum(1), min=1e-9)

          user_embedding = sum_embeddings / sum_mask
          user_embedding = user_embedding.cpu().mean(dim=0, keepdim=True)

      book_user_read_before = list(X_train['product_id'])
      book_comparision = {book_id : all_book_matrix[book_id] for book_id in all_book_matrix.keys() if book_id not in book_user_read_before}
      book_embeddings = torch.stack(tuple(book_comparision.values()))

      similarity = cosine_similarity(user_embedding.numpy(), book_embeddings.numpy())
      similar_indices = [(book_id, similarity[0][idx]) for idx, book_id in enumerate(list(book_comparision.keys()))]

      sortlist = sorted(similar_indices, key=lambda x: x[1], reverse=True)[:topk]

      recommend_book = {book: float(score) for book, score in sortlist}
      y_true = dict(zip(X_test['product_id'], X_test['rating']))

    # print(user_id)
    # print(X_test)
    # print(len(recommend_book))
    # print(len(y_true))
    # raise

      precision += precision_at_k(recommend_book, y_true, k=topk)
      recall += recall_at_k(recommend_book, y_true, k=topk)
      ap += calculate_ap_at_k(recommend_book, y_true, k=topk)
      ndcg += calculate_ndcg_at_k(y_true, recommend_book, k=topk)
      count += 1
      print(f"User {user_id} | Pre@{topk}: {precision_at_k(recommend_book, y_true, k=topk)} | Rec@{topk}: {recall_at_k(recommend_book, y_true, k=topk)} | AP@{topk}: {calculate_ap_at_k(recommend_book, y_true, k=topk)} | NDCG@{topk}: {calculate_ndcg_at_k(y_true, recommend_book, k=topk)} \n")

print("========Final========")
print(f"Pre@{topk}: {precision / count}")
print(f"Rec@{topk}: {recall / count}")
print(f"MAP@{topk}: {ap / count}")
print(f"NDCG@{topk}: {ndcg / count}")

User 120954 | Pre@5: 0.2 | Rec@5: 0.2 | AP@5: 0.5 | NDCG@5: 0.9999999999999999 

User 21654527 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 15188098 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 9927825 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 1.0 

User 11195564 | Pre@5: 0.2 | Rec@5: 0.2 | AP@5: 0.25 | NDCG@5: 0.9999999999999999 

User 15225634 | Pre@5: 0.2 | Rec@5: 0.2 | AP@5: 0.3333333333333333 | NDCG@5: 1.0 

User 5917418 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 620317 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9829976023759304 

User 10025671 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 317483 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9650534596995627 

User 6301744 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 21558077 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9572881727261122 

User 14361212 | Pre@5: 0.0 | Rec@5: 0.0 | AP@

In [43]:
topk = 10
precision = 0
recall = 0
ap = 0
ndcg = 0
count = 0

for user_id in list_user_id10:
    user_book = review_book_description[['product_id', 'description', 'rating']][(review_book_description['customer_id'] == user_id) & (~review_book_description['description'].isna())]

    X_test = user_book[:topk]
    X_train = user_book[topk:]

    user_book = list(X_train['description'])
    encoding = tokenizer(user_book,
                         return_tensors='pt',
                         padding=True,
                         truncation=True,
                         max_length=512).to(device)

    with torch.no_grad():
        output = model(**encoding)
        attention_mask = encoding['attention_mask'].unsqueeze(-1).expand(output.last_hidden_state.size()).float()

        sum_embeddings = torch.sum(output.last_hidden_state * attention_mask, 1)

        sum_mask = torch.clamp(attention_mask.sum(1), min=1e-9)

        user_embedding = sum_embeddings / sum_mask
        user_embedding = user_embedding.cpu().mean(dim=0, keepdim=True)

    book_user_read_before = list(X_train['product_id'])
    book_comparision = {book_id : all_book_matrix[book_id] for book_id in all_book_matrix.keys() if book_id not in book_user_read_before}
    book_embeddings = torch.stack(tuple(book_comparision.values()))

    similarity = cosine_similarity(user_embedding.numpy(), book_embeddings.numpy())
    similar_indices = [(book_id, similarity[0][idx]) for idx, book_id in enumerate(list(book_comparision.keys()))]

    sortlist = sorted(similar_indices, key=lambda x: x[1], reverse=True)[:topk]

    recommend_book = {book: float(score) for book, score in sortlist}
    y_true = dict(zip(X_test['product_id'], X_test['rating']))

    # print(recommend_book)
    # print(y_true)
    # raise

    precision += precision_at_k(recommend_book, y_true, k=10)
    recall += recall_at_k(recommend_book, y_true, k=10)
    ap += calculate_ap_at_k(recommend_book, y_true, k=10)
    ndcg += calculate_ndcg_at_k(y_true, recommend_book, k=10)
    count += 1

    print(f"User {user_id} | Pre@{topk}: {precision_at_k(recommend_book, y_true, k=10)} | Rec@{topk}: {recall_at_k(recommend_book, y_true, k=10)} | AP@{topk}: {calculate_ap_at_k(recommend_book, y_true, k=10)} | NDCG@{topk}: {calculate_ndcg_at_k(y_true, recommend_book, k=10)} \n")

print("========Final========")
print(f"Pre@{topk}: {precision / count}")
print(f"Rec@{topk}: {recall / count}")
print(f"MAP@{topk}: {ap / count}")
print(f"NDCG@{topk}: {ndcg / count}")

User 15188098 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.9999999999999999 

User 9927825 | Pre@10: 0.1 | Rec@10: 0.1 | AP@10: 0.14285714285714285 | NDCG@10: 0.908405193796093 

User 15225634 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.8956185994036242 

User 10025671 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.9999999999999999 

User 356525 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.889079673248842 

User 16757512 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.8956185994036242 

User 17506038 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.9999999999999999 

User 5996998 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.908405193796093 

User 7261494 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.9084051937960927 

User 13513176 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.9904673709481427 

User 1621495 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.908405193796093 

User 20073 | Pre@10: 0.2 | Rec@10: 0.2 | AP@10: 0.2678571