In [1]:
!pip install transformers



In [28]:
from transformers import AutoModel, AutoTokenizer
import torch
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [7]:
book = pd.read_csv('/content/final_cleaned_books.csv')
review = pd.read_csv('/content/final_interactions.csv')

In [12]:
review_book_description = pd.merge(review, book[['product_id', 'description']], how='inner', on='product_id')


In [13]:
review_book_description.head()

Unnamed: 0,customer_id,product_id,rating,content,title,thank_count,customer_index,product_index,description
0,522878,12416734,4,,Hài lòng,0,34,205,Đọc sách - đó là bạn đang trải nghiệm. Điểm sá...
1,11736484,12416734,5,,Cực kì hài lòng,0,229,205,Đọc sách - đó là bạn đang trải nghiệm. Điểm sá...
2,120954,12416734,5,.,Cực kì hài lòng,0,13,205,Đọc sách - đó là bạn đang trải nghiệm. Điểm sá...
3,14088052,10752430,5,,Cực kì hài lòng,0,253,195,THAY ĐỔI bàn về cách thức thực hiện sự thay đổ...
4,22555105,10752430,5,,Cực kì hài lòng,0,331,195,THAY ĐỔI bàn về cách thức thực hiện sự thay đổ...


In [14]:
list_user_id = []
for id in review_book_description['customer_id'].unique():
    user_reviews = review_book_description[review_book_description['customer_id'] == id]
    if len(user_reviews) > 10:
        print(f'Customer ID: {id}, Number of reviews: {len(user_reviews)}')
        list_user_id.append(id)

Customer ID: 17506038, Number of reviews: 22
Customer ID: 13513176, Number of reviews: 12
Customer ID: 18278431, Number of reviews: 11
Customer ID: 5253296, Number of reviews: 11
Customer ID: 18132503, Number of reviews: 14


In [15]:
model= AutoModel.from_pretrained('uitnlp/visobert')
tokenizer = AutoTokenizer.from_pretrained('uitnlp/visobert')


config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/390M [00:00<?, ?B/s]

Some weights of XLMRobertaModel were not initialized from the model checkpoint at uitnlp/visobert and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/390M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/471k [00:00<?, ?B/s]

In [16]:
mapping = pd.Series(book['product_name'], index=book.index)

In [17]:
all_book_matrix = {}
product_ids = list(book['product_id'][book['description'].notnull()])
description_book = list(book['description'][book['product_id'].isin(product_ids)])
batch_size = 16
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for i in range(0, len(description_book), batch_size):
    batch_product_ids = product_ids[i:i+batch_size]
    batch_descriptions = description_book[i:i+batch_size]

    encoding = tokenizer(batch_descriptions,
                        return_tensors='pt',
                        padding=True,
                        truncation=True,
                        max_length=512).to(device)
    with torch.no_grad():
        output = model(**encoding)
        attention_mask = encoding['attention_mask'].unsqueeze(-1).expand(output.last_hidden_state.size()).float()

        sum_embeddings = torch.sum(output.last_hidden_state * attention_mask, 1)

        sum_mask = torch.clamp(attention_mask.sum(1), min=1e-9)

        all_book_description_matrix = sum_embeddings / sum_mask

        for idx, product_id in enumerate(batch_product_ids):
          embedding = all_book_description_matrix[idx].cpu()

          all_book_matrix[product_id] = embedding
    if i % (batch_size * 10) == 0:
        print(f"Đã xử lý {i} / {len(description_book)} sách...")


Đã xử lý 0 / 1379 sách...
Đã xử lý 160 / 1379 sách...
Đã xử lý 320 / 1379 sách...
Đã xử lý 480 / 1379 sách...
Đã xử lý 640 / 1379 sách...
Đã xử lý 800 / 1379 sách...
Đã xử lý 960 / 1379 sách...
Đã xử lý 1120 / 1379 sách...
Đã xử lý 1280 / 1379 sách...


In [59]:
def calculate_ap_at_k(user_recommendations, relevant_items, k):
    precision_sum = 0
    relevant_count = 0
    for i, book_id in enumerate(user_recommendations):
        if book_id in relevant_items:
            relevant_count += 1
            precision_sum += relevant_count / (i + 1)
    if relevant_count == 0:
        return 0
    return precision_sum / len(relevant_items)

def calculate_map_at_k(user_recommendations, user_relevant_items, k):
    ap_scores = []
    ap_scores.append(calculate_ap_at_k(user_recommendations, user_relevant_items, k))
    return sum(ap_scores) / len(ap_scores)

def get_relevance(df, item_id, user_id):
    result = df.loc[(df['product_id'] == item_id) & (df['customer_id'] == user_id), 'rating']

    if result.empty:
        return 0
    else:
        return result.iloc[0]

def calculate_ndcg_at_k(y_pred, y_true, user_id, k):
    sorted_pred_ids = y_pred[:k]
    relevance_scores = [get_relevance(review_book_description, item_id, user_id) for item_id in sorted_pred_ids]

    def get_dcg(scores):
        return sum([score / np.log2(idx + 2) for idx, score in enumerate(scores)])

    dcg = get_dcg(relevance_scores)

    ideal_relevance = sorted(relevance_scores, reverse=True)
    idcg = get_dcg(ideal_relevance)

    if idcg == 0:
        return 0.0

    return dcg / idcg

def precision_at_k(recommended_books, relevant_books, k):
    pred_k = recommended_books[:k]
    relevant = relevant_books[:k]

    hit = len([item for item in pred_k if item in relevant])
    return hit / k


def recall_at_k(recommended_books, relevant_books, k):
    pred_k = recommended_books[:k]
    relevant = relevant_books

    hit = len([item for item in pred_k if item in relevant])
    total_relevant = len(relevant_books)
    if total_relevant == 0:
        return 0.0
    return hit / len(relevant)

In [62]:
topk = 10
precision = 0
recall = 0
map = 0
ndcg = 0
count = 0

for user_id in list_user_id:
    user_book = review_book_description[['product_id', 'description']][(review_book_description['customer_id'] == user_id) & (~review_book_description['description'].isna())]

    X_test = user_book[:topk]
    X_train = user_book[topk:]

    user_book = list(X_train['description'])
    encoding = tokenizer(user_book,
                         return_tensors='pt',
                         padding=True,
                         truncation=True,
                         max_length=512).to(device)

    with torch.no_grad():
        output = model(**encoding)
        attention_mask = encoding['attention_mask'].unsqueeze(-1).expand(output.last_hidden_state.size()).float()

        sum_embeddings = torch.sum(output.last_hidden_state * attention_mask, 1)

        sum_mask = torch.clamp(attention_mask.sum(1), min=1e-9)

        user_embedding = sum_embeddings / sum_mask
        user_embedding = user_embedding.cpu().mean(dim=0, keepdim=True)

    book_user_read_before = list(X_train['product_id'])
    book_comparision = {book_id : all_book_matrix[book_id] for book_id in all_book_matrix.keys() if book_id not in book_user_read_before}
    book_embeddings = torch.stack(tuple(book_comparision.values()))

    similarity = cosine_similarity(user_embedding.numpy(), book_embeddings.numpy())
    similar_indices = [(book_id, similarity[0][idx]) for idx, book_id in enumerate(list(book_comparision.keys()))]
    get_topk_book = sorted(similar_indices, key=lambda x: x[1], reverse=True)[:topk]

    recommend_book = [book_id for book_id, score in get_topk_book]
    y_true = list(X_test['product_id'])

    # print(recommend_book)
    # print(y_true)
    precision += precision_at_k(recommend_book, y_true, k=10)
    recall += recall_at_k(recommend_book, y_true, k=10)
    map += calculate_map_at_k(recommend_book, y_true, k=10)
    ndcg += calculate_ndcg_at_k(recommend_book, y_true, user_id, k=10)
    count += 1

print(f"Pre@{topk}: {precision / count}")
print(f"Rec@{topk}: {recall / count}")
print(f"MAP@{topk}: {map / count}")
print(f"NDCG@{topk}: {ndcg / count}")

Pre@10: 0.02
Rec@10: 0.02
MAP@10: 0.0025
NDCG@10: 0.06309297535714575
