In [None]:
!pip install transformers



In [None]:
from transformers import AutoModel, AutoTokenizer
import torch
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Book catalogue.
book = pd.read_csv('/content/final_cleaned_books.csv')

# Interaction table, reduced to one row per (customer_id, product_id) pair.
review = pd.read_csv('/content/final_interactions.csv').drop_duplicates(
    subset=['customer_id', 'product_id']
)

In [None]:
# Attach each book's description to its interactions. The inner join keeps only
# interactions whose product_id also appears in the book table.
review_book_description = review.merge(
    book[['product_id', 'description']],
    on='product_id',
    how='inner',
)


In [None]:
# Preview the first five merged rows.
review_book_description.head(5)

Unnamed: 0,customer_id,product_id,rating,content,customer_index,product_index,description
0,522878,12416734,4,,70,179,Đọc sách - đó là bạn đang trải nghiệm. Điểm sá...
1,11736484,12416734,5,,441,179,Đọc sách - đó là bạn đang trải nghiệm. Điểm sá...
2,120954,12416734,5,.,27,179,Đọc sách - đó là bạn đang trải nghiệm. Điểm sá...
3,14088052,10752430,5,,503,169,THAY ĐỔI bàn về cách thức thực hiện sự thay đổ...
4,22555105,10752430,5,,660,169,THAY ĐỔI bàn về cách thức thực hiện sự thay đổ...


In [None]:
# Count interactions per customer in one pass instead of re-filtering the whole
# frame once per customer (and doing that twice). sort=False keeps customers in
# order of first appearance, which is the order .unique() would have produced.
reviews_per_user = review_book_description.groupby('customer_id', sort=False).size()

# Customers with strictly more than 10 / strictly more than 5 interactions.
list_user_id10 = reviews_per_user.index[reviews_per_user > 10].tolist()
list_user_id5 = reviews_per_user.index[reviews_per_user > 5].tolist()

In [None]:
# Encoder and tokenizer are loaded from the same pretrained checkpoint.
VISOBERT_CHECKPOINT = 'uitnlp/visobert'
tokenizer = AutoTokenizer.from_pretrained(VISOBERT_CHECKPOINT)
model = AutoModel.from_pretrained(VISOBERT_CHECKPOINT)


Some weights of XLMRobertaModel were not initialized from the model checkpoint at uitnlp/visobert and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Series of product names taken from `book`, keyed by the book table's row index.
# NOTE(review): `mapping` is not referenced again in the visible cells — confirm it is still needed.
mapping = pd.Series(book['product_name'], index=book.index)

In [None]:
# Precompute one embedding per book description by mean-pooling the encoder's
# last hidden state. Result: all_book_matrix[product_id] -> 1-D CPU tensor.
all_book_matrix = {}

# Take ids and descriptions from the SAME filtered rows so position i in both
# lists always refers to the same book. Selecting the descriptions through
# isin(product_ids) could pull in a null-description row that shares its
# product_id with a described row, shifting every later (id, text) pair.
books_with_description = book.loc[book['description'].notnull(), ['product_id', 'description']]
product_ids = books_with_description['product_id'].tolist()
description_book = books_with_description['description'].tolist()

batch_size = 16
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()  # inference only: make sure dropout is disabled

for i in range(0, len(description_book), batch_size):
    batch_product_ids = product_ids[i:i+batch_size]
    batch_descriptions = description_book[i:i+batch_size]

    # Descriptions longer than 512 tokens are truncated; shorter ones are padded
    # to the longest sequence in the batch.
    encoding = tokenizer(batch_descriptions,
                         return_tensors='pt',
                         padding=True,
                         truncation=True,
                         max_length=512).to(device)
    with torch.no_grad():
        output = model(**encoding)

        # Broadcast the attention mask over the hidden dimension so padded
        # positions contribute nothing to the sum.
        attention_mask = encoding['attention_mask'].unsqueeze(-1).expand(output.last_hidden_state.size()).float()
        sum_embeddings = torch.sum(output.last_hidden_state * attention_mask, 1)

        # Number of real tokens per sequence; clamped to avoid division by zero.
        sum_mask = torch.clamp(attention_mask.sum(1), min=1e-9)

        all_book_description_matrix = sum_embeddings / sum_mask

        # Move the batch to the CPU once, then store one row per product id.
        for product_id, embedding in zip(batch_product_ids, all_book_description_matrix.cpu()):
            all_book_matrix[product_id] = embedding

    # Progress message every 10 batches.
    if i % (batch_size * 10) == 0:
        print(f"Đã xử lý {i} / {len(description_book)} sách...")


Đã xử lý 0 / 1086 sách...
Đã xử lý 160 / 1086 sách...
Đã xử lý 320 / 1086 sách...
Đã xử lý 480 / 1086 sách...
Đã xử lý 640 / 1086 sách...
Đã xử lý 800 / 1086 sách...
Đã xử lý 960 / 1086 sách...


In [None]:
from sklearn.metrics import ndcg_score

def calculate_ap_at_k(user_recommendations, relevant_items, k):
    """Average precision over the first ``k`` recommended items.

    Args:
        user_recommendations: dict whose key order is the ranking (best first);
            only the keys are used.
        relevant_items: dict whose keys are the relevant item ids.
        k: number of top recommendations to inspect.

    Returns:
        0 when none of the top-``k`` items is relevant; otherwise the mean of
        precision@rank taken at each rank where a relevant item appears (the
        sum is normalised by the number of hits, not by the number of relevant
        items).
    """
    relevant_ids = set(relevant_items)
    hits = 0
    running_precision = 0
    for rank, item in enumerate(list(user_recommendations)[:k], start=1):
        if item not in relevant_ids:
            continue
        hits += 1
        running_precision += hits / rank
    return running_precision / hits if hits else 0

def calculate_ndcg_at_k(y_true, y_pred, k):
    """NDCG@k of a recommendation list against held-out ratings, matched by item id.

    The two dicts generally describe DIFFERENT items (held-out books vs.
    recommended books), so their values cannot be compared position by
    position: doing so rewards any ranking whose scores merely happen to be
    ordered like the ratings, even when no recommended item is relevant.
    Here each recommended item is looked up in ``y_true`` instead.

    Args:
        y_true: dict mapping item id -> true relevance (e.g. the rating).
        y_pred: dict mapping recommended item id -> predicted score.
        k: cut-off rank.

    Returns:
        float in [0, 1] for non-negative relevances. The gain of a recommended
        item is its relevance in ``y_true`` (0 if absent), discounted by
        ``log2(rank + 1)``; the ideal ranking places the highest relevances of
        ``y_true`` first. Returns 0.0 when the ideal DCG is not positive (for
        example when ``y_true`` is empty).
    """
    # Rank by predicted score. sorted() is stable, so ties keep insertion order.
    ranked_items = sorted(y_pred, key=y_pred.get, reverse=True)[:k]
    gains = [float(y_true.get(item, 0.0)) for item in ranked_items]
    ideal_gains = sorted((float(rel) for rel in y_true.values()), reverse=True)[:k]

    def _dcg(values):
        # Position 0 is rank 1, hence the discount log2(position + 2).
        return sum(gain / np.log2(position + 2) for position, gain in enumerate(values))

    ideal_dcg = _dcg(ideal_gains)
    if ideal_dcg <= 0:
        return 0.0
    return float(_dcg(gains) / ideal_dcg)

def precision_at_k(recommended_books, relevant_books, k):
    """Fraction of the ``k`` recommendation slots filled by relevant items.

    Args:
        recommended_books: dict whose key order is the ranking (best first).
        relevant_books: dict whose keys are the relevant item ids.
        k: cut-off rank; also the denominator, even when fewer than ``k``
            items were recommended.

    Returns:
        Number of relevant items among the first ``k`` keys, divided by ``k``.
    """
    relevant_ids = set(relevant_books)
    top_k = list(recommended_books)[:k]
    hits = sum(1 for item in top_k if item in relevant_ids)
    return hits / k

def recall_at_k(recommended_books, relevant_books, k):
    """Share of the relevant items that appear in the top ``k`` recommendations.

    Args:
        recommended_books: dict whose key order is the ranking (best first).
        relevant_books: dict whose keys are the relevant item ids.
        k: cut-off rank.

    Returns:
        0.0 when there are no relevant items; otherwise the number of relevant
        items among the first ``k`` keys divided by the number of relevant items.
    """
    relevant_ids = set(relevant_books)
    if not relevant_ids:
        return 0.0
    top_k = list(recommended_books)[:k]
    hits = sum(1 for item in top_k if item in relevant_ids)
    return hits / len(relevant_ids)



# Cosine

In [None]:
# Cosine-similarity evaluation at k=5 over users with more than 5 interactions.
topk = 5
precision = 0
recall = 0
ap = 0
ndcg = 0
count = 0  # number of users that contributed to the running sums

for user_id in list_user_id5:
    # This user's interactions, restricted to books that have a description.
    is_user = review_book_description['customer_id'] == user_id
    has_description = ~review_book_description['description'].isna()
    user_book = review_book_description.loc[is_user & has_description,
                                            ['product_id', 'description', 'rating']]

    # Fixed-seed split: 20% of the history builds the user profile, the rest is held out.
    # NOTE(review): a 20/80 profile/hold-out split is unusual — confirm it is intended.
    X_train = user_book.sample(frac=0.2, random_state=42)
    X_test = user_book.drop(index=X_train.index)

    train_descriptions = list(X_train['description'])
    if len(train_descriptions) == 0:
        # sample() selected no profile item for this user; nothing to evaluate.
        continue

    encoding = tokenizer(train_descriptions,
                         return_tensors='pt',
                         padding=True,
                         truncation=True,
                         max_length=512).to(device)

    with torch.no_grad():
        output = model(**encoding)
        # Mean-pool token vectors; the attention mask zeroes out padded positions.
        attention_mask = encoding['attention_mask'].unsqueeze(-1).expand(output.last_hidden_state.size()).float()
        sum_embeddings = torch.sum(output.last_hidden_state * attention_mask, 1)
        sum_mask = torch.clamp(attention_mask.sum(1), min=1e-9)
        # Average the per-book vectors into a single (1, hidden) user profile.
        user_embedding = (sum_embeddings / sum_mask).cpu().mean(dim=0, keepdim=True)

    # Candidates: every embedded book except those used to build the profile.
    book_user_read_before = set(X_train['product_id'])
    book_comparision = {book_id: embedding
                        for book_id, embedding in all_book_matrix.items()
                        if book_id not in book_user_read_before}
    book_embeddings = torch.stack(tuple(book_comparision.values()))

    # Row 0 of the result holds the user-vs-candidate similarities, in dict order.
    similarity = cosine_similarity(user_embedding.numpy(), book_embeddings.numpy())
    similar_indices = list(zip(book_comparision.keys(), similarity[0]))

    # Best first; the ranking is cut to the number of held-out items.
    sortlist = sorted(similar_indices, key=lambda pair: pair[1], reverse=True)[:len(X_test)]

    recommend_book = {book_id: float(score) for book_id, score in sortlist}
    y_true = dict(zip(X_test['product_id'], X_test['rating']))

    # Compute each metric once and reuse it for both the totals and the log line.
    user_precision = precision_at_k(recommend_book, y_true, k=topk)
    user_recall = recall_at_k(recommend_book, y_true, k=topk)
    user_ap = calculate_ap_at_k(recommend_book, y_true, k=topk)
    user_ndcg = calculate_ndcg_at_k(y_true, recommend_book, k=topk)

    precision += user_precision
    recall += user_recall
    ap += user_ap
    ndcg += user_ndcg
    count += 1
    print(f"User {user_id} | Pre@{topk}: {user_precision} | Rec@{topk}: {user_recall} | AP@{topk}: {user_ap} | NDCG@{topk}: {user_ndcg} \n")

print("========Final========")
print(f"Pre@{topk}: {precision / count}")
print(f"Rec@{topk}: {recall / count}")
print(f"MAP@{topk}: {ap / count}")
print(f"NDCG@{topk}: {ndcg / count}")

User 120954 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 21654527 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 15188098 | Pre@5: 0.2 | Rec@5: 0.1 | AP@5: 0.25 | NDCG@5: 0.9999999999999999 

User 9927825 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.33014761778423596 

User 11195564 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 15225634 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.3183671393962403 

User 5917418 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 620317 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9572881727261122 

User 10025671 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 317483 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9122105819346807 

User 6301744 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 21558077 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9321679589452766 

User 14361212 | Pre@5: 0.0 | Rec@

In [None]:
# Cosine-similarity evaluation at k=10 over users with more than 10 interactions.
topk = 10
precision = 0
recall = 0
ap = 0
ndcg = 0
count = 0  # number of users that contributed to the running sums

for user_id in list_user_id10:
    # This user's interactions, restricted to books that have a description.
    is_user = review_book_description['customer_id'] == user_id
    has_description = ~review_book_description['description'].isna()
    user_book = review_book_description.loc[is_user & has_description,
                                            ['product_id', 'description', 'rating']]

    # Fixed-seed split: 20% of the history builds the user profile, the rest is held out.
    # NOTE(review): a 20/80 profile/hold-out split is unusual — confirm it is intended.
    X_train = user_book.sample(frac=0.2, random_state=42)
    X_test = user_book.drop(index=X_train.index)

    train_descriptions = list(X_train['description'])
    if len(train_descriptions) == 0:
        # sample() selected no profile item for this user; nothing to evaluate.
        continue

    encoding = tokenizer(train_descriptions,
                         return_tensors='pt',
                         padding=True,
                         truncation=True,
                         max_length=512).to(device)

    with torch.no_grad():
        output = model(**encoding)
        # Mean-pool token vectors; the attention mask zeroes out padded positions.
        attention_mask = encoding['attention_mask'].unsqueeze(-1).expand(output.last_hidden_state.size()).float()
        sum_embeddings = torch.sum(output.last_hidden_state * attention_mask, 1)
        sum_mask = torch.clamp(attention_mask.sum(1), min=1e-9)
        # Average the per-book vectors into a single (1, hidden) user profile.
        user_embedding = (sum_embeddings / sum_mask).cpu().mean(dim=0, keepdim=True)

    # Candidates: every embedded book except those used to build the profile.
    book_user_read_before = set(X_train['product_id'])
    book_comparision = {book_id: embedding
                        for book_id, embedding in all_book_matrix.items()
                        if book_id not in book_user_read_before}
    book_embeddings = torch.stack(tuple(book_comparision.values()))

    # Row 0 of the result holds the user-vs-candidate similarities, in dict order.
    similarity = cosine_similarity(user_embedding.numpy(), book_embeddings.numpy())
    similar_indices = list(zip(book_comparision.keys(), similarity[0]))

    # Best first; the ranking is cut to the number of held-out items.
    sortlist = sorted(similar_indices, key=lambda pair: pair[1], reverse=True)[:len(X_test)]

    recommend_book = {book_id: float(score) for book_id, score in sortlist}
    y_true = dict(zip(X_test['product_id'], X_test['rating']))

    # Use `topk` (not a literal 10) so the cut-off and the printed labels cannot
    # drift apart; compute each metric once and reuse it for totals and logging.
    user_precision = precision_at_k(recommend_book, y_true, k=topk)
    user_recall = recall_at_k(recommend_book, y_true, k=topk)
    user_ap = calculate_ap_at_k(recommend_book, y_true, k=topk)
    user_ndcg = calculate_ndcg_at_k(y_true, recommend_book, k=topk)

    precision += user_precision
    recall += user_recall
    ap += user_ap
    ndcg += user_ndcg
    count += 1

    print(f"User {user_id} | Pre@{topk}: {user_precision} | Rec@{topk}: {user_recall} | AP@{topk}: {user_ap} | NDCG@{topk}: {user_ndcg} \n")

print("========Final========")
print(f"Pre@{topk}: {precision / count}")
print(f"Rec@{topk}: {recall / count}")
print(f"MAP@{topk}: {ap / count}")
print(f"NDCG@{topk}: {ndcg / count}")

User 15188098 | Pre@10: 0.1 | Rec@10: 0.1 | AP@10: 0.25 | NDCG@10: 0.9999999999999999 

User 9927825 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.5252520457236445 

User 15225634 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.6311028238422782 

User 10025671 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.9999999999999998 

User 356525 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.6199792816779409 

User 16757512 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.5960626287014779 

User 17506038 | Pre@10: 0.1 | Rec@10: 0.05 | AP@10: 0.3333333333333333 | NDCG@10: 0.9999999999999999 

User 5996998 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.5686433555798904 

User 7261494 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.5607449928684298 

User 13513176 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.9904673709481427 

User 1621495 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.5550009559248589 

User 11999970 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10:

# Pearson

In [None]:
# Pearson-correlation evaluation at k=5 over users with more than 5 interactions.
topk = 5
precision = 0
recall = 0
ap = 0
ndcg = 0
count = 0  # number of users that contributed to the running sums

for user_id in list_user_id5:
    # This user's interactions, restricted to books that have a description.
    is_user = review_book_description['customer_id'] == user_id
    has_description = ~review_book_description['description'].isna()
    user_book = review_book_description.loc[is_user & has_description,
                                            ['product_id', 'description', 'rating']]

    # Fixed-seed split: 20% of the history builds the user profile, the rest is held out.
    # NOTE(review): a 20/80 profile/hold-out split is unusual — confirm it is intended.
    X_train = user_book.sample(frac=0.2, random_state=42)
    X_test = user_book.drop(index=X_train.index)

    train_descriptions = list(X_train['description'])
    if len(train_descriptions) == 0:
        # sample() selected no profile item for this user; nothing to evaluate.
        continue

    encoding = tokenizer(train_descriptions,
                         return_tensors='pt',
                         padding=True,
                         truncation=True,
                         max_length=512).to(device)

    with torch.no_grad():
        output = model(**encoding)
        # Mean-pool token vectors; the attention mask zeroes out padded positions.
        attention_mask = encoding['attention_mask'].unsqueeze(-1).expand(output.last_hidden_state.size()).float()
        sum_embeddings = torch.sum(output.last_hidden_state * attention_mask, 1)
        sum_mask = torch.clamp(attention_mask.sum(1), min=1e-9)
        # Average the per-book vectors into a single (1, hidden) user profile.
        user_embedding = (sum_embeddings / sum_mask).cpu().mean(dim=0, keepdim=True)

    # Candidates: every embedded book except those used to build the profile.
    book_user_read_before = set(X_train['product_id'])
    book_comparision = {book_id: embedding
                        for book_id, embedding in all_book_matrix.items()
                        if book_id not in book_user_read_before}
    book_embeddings = torch.stack(tuple(book_comparision.values()))

    # np.corrcoef stacks the user vector (row 0) on top of the book vectors and
    # returns the full (1 + n_books) x (1 + n_books) matrix. In row 0, column 0
    # is the user's correlation with itself (always 1) and columns 1.. are the
    # user-vs-book correlations. Column 0 must be skipped: indexing row 0 from
    # column 0 would hand every book the score of its predecessor.
    similarity = np.corrcoef(user_embedding.numpy(), book_embeddings.numpy())[0, 1:]
    similar_indices = list(zip(book_comparision.keys(), similarity))

    # Best first; the ranking is cut to the number of held-out items.
    sortlist = sorted(similar_indices, key=lambda pair: pair[1], reverse=True)[:len(X_test)]

    recommend_book = {book_id: float(score) for book_id, score in sortlist}
    y_true = dict(zip(X_test['product_id'], X_test['rating']))

    # Compute each metric once and reuse it for both the totals and the log line.
    user_precision = precision_at_k(recommend_book, y_true, k=topk)
    user_recall = recall_at_k(recommend_book, y_true, k=topk)
    user_ap = calculate_ap_at_k(recommend_book, y_true, k=topk)
    user_ndcg = calculate_ndcg_at_k(y_true, recommend_book, k=topk)

    precision += user_precision
    recall += user_recall
    ap += user_ap
    ndcg += user_ndcg
    count += 1
    print(f"User {user_id} | Pre@{topk}: {user_precision} | Rec@{topk}: {user_recall} | AP@{topk}: {user_ap} | NDCG@{topk}: {user_ndcg} \n")

print("========Final========")
print(f"Pre@{topk}: {precision / count}")
print(f"Rec@{topk}: {recall / count}")
print(f"MAP@{topk}: {ap / count}")
print(f"NDCG@{topk}: {ndcg / count}")

User 120954 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 21654527 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 15188098 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 9927825 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.33014761778423596 

User 11195564 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 15225634 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.3183671393962403 

User 5917418 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 620317 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9572881727261122 

User 10025671 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 317483 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9122105819346807 

User 6301744 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 21558077 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9321679589452766 

User 14361212 | Pre@5: 0.2 | Rec@5: 

In [None]:
# Pearson-correlation evaluation at k=10 over users with more than 10 interactions.
topk = 10
precision = 0
recall = 0
ap = 0
ndcg = 0
count = 0  # number of users that contributed to the running sums

for user_id in list_user_id10:
    # This user's interactions, restricted to books that have a description.
    is_user = review_book_description['customer_id'] == user_id
    has_description = ~review_book_description['description'].isna()
    user_book = review_book_description.loc[is_user & has_description,
                                            ['product_id', 'description', 'rating']]

    # Fixed-seed split: 20% of the history builds the user profile, the rest is held out.
    # NOTE(review): a 20/80 profile/hold-out split is unusual — confirm it is intended.
    X_train = user_book.sample(frac=0.2, random_state=42)
    X_test = user_book.drop(index=X_train.index)

    train_descriptions = list(X_train['description'])
    if len(train_descriptions) == 0:
        # sample() selected no profile item for this user; nothing to evaluate.
        continue

    encoding = tokenizer(train_descriptions,
                         return_tensors='pt',
                         padding=True,
                         truncation=True,
                         max_length=512).to(device)

    with torch.no_grad():
        output = model(**encoding)
        # Mean-pool token vectors; the attention mask zeroes out padded positions.
        attention_mask = encoding['attention_mask'].unsqueeze(-1).expand(output.last_hidden_state.size()).float()
        sum_embeddings = torch.sum(output.last_hidden_state * attention_mask, 1)
        sum_mask = torch.clamp(attention_mask.sum(1), min=1e-9)
        # Average the per-book vectors into a single (1, hidden) user profile.
        user_embedding = (sum_embeddings / sum_mask).cpu().mean(dim=0, keepdim=True)

    # Candidates: every embedded book except those used to build the profile.
    book_user_read_before = set(X_train['product_id'])
    book_comparision = {book_id: embedding
                        for book_id, embedding in all_book_matrix.items()
                        if book_id not in book_user_read_before}
    book_embeddings = torch.stack(tuple(book_comparision.values()))

    # np.corrcoef stacks the user vector (row 0) on top of the book vectors and
    # returns the full (1 + n_books) x (1 + n_books) matrix. In row 0, column 0
    # is the user's correlation with itself (always 1) and columns 1.. are the
    # user-vs-book correlations. Column 0 must be skipped: indexing row 0 from
    # column 0 would hand every book the score of its predecessor.
    similarity = np.corrcoef(user_embedding.numpy(), book_embeddings.numpy())[0, 1:]
    similar_indices = list(zip(book_comparision.keys(), similarity))

    # Best first; the ranking is cut to the number of held-out items.
    sortlist = sorted(similar_indices, key=lambda pair: pair[1], reverse=True)[:len(X_test)]

    recommend_book = {book_id: float(score) for book_id, score in sortlist}
    y_true = dict(zip(X_test['product_id'], X_test['rating']))

    # Compute each metric once and reuse it for both the totals and the log line.
    user_precision = precision_at_k(recommend_book, y_true, k=topk)
    user_recall = recall_at_k(recommend_book, y_true, k=topk)
    user_ap = calculate_ap_at_k(recommend_book, y_true, k=topk)
    user_ndcg = calculate_ndcg_at_k(y_true, recommend_book, k=topk)

    precision += user_precision
    recall += user_recall
    ap += user_ap
    ndcg += user_ndcg
    count += 1
    print(f"User {user_id} | Pre@{topk}: {user_precision} | Rec@{topk}: {user_recall} | AP@{topk}: {user_ap} | NDCG@{topk}: {user_ndcg} \n")

print("========Final========")
print(f"Pre@{topk}: {precision / count}")
print(f"Rec@{topk}: {recall / count}")
print(f"MAP@{topk}: {ap / count}")
print(f"NDCG@{topk}: {ndcg / count}")

User 15188098 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.9999999999999999 

User 9927825 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.5252520457236445 

User 15225634 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.6311028238422782 

User 10025671 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.9999999999999998 

User 356525 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.6199792816779409 

User 16757512 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.5960626287014779 

User 17506038 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 1.0 

User 5996998 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.5686433555798904 

User 7261494 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.5607449928684298 

User 13513176 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.9904673709481427 

User 1621495 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.5550009559248589 

User 11999970 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.7280655461900787 

U

# BM25

In [None]:
! pip install rank-bm25

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


In [None]:
! pip install vncorenlp

Collecting vncorenlp
  Downloading vncorenlp-1.0.3.tar.gz (2.6 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━[0m [32m2.2/2.6 MB[0m [31m65.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: vncorenlp
  Building wheel for vncorenlp (setup.py) ... [?25l[?25hdone
  Created wheel for vncorenlp: filename=vncorenlp-1.0.3-py3-none-any.whl size=2645933 sha256=75f02110264e125a816b48a9a7f168245f5c280fc9a6f9addd491e36727f3a6d
  Stored in directory: /root/.cache/pip/wheels/6f/19/20/ec7083125fd06db1a19d0d3ca18806ecf4e8ed1464713b4efa
Successfully built vncorenlp
Installing collected packages: vncorenlp
Successfully installed vncorenlp-1.0.3


In [None]:
# Create vncorenlp/models/wordsegmenter and download the VnCoreNLP 1.1.1 jar into vncorenlp/.
!mkdir -p vncorenlp/models/wordsegmenter
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!mv VnCoreNLP-1.1.1.jar vncorenlp/

# Word-segmenter files (vi-vocab, wordsegmenter.rdr) -> vncorenlp/models/wordsegmenter/.
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
!mv vi-vocab vncorenlp/models/wordsegmenter/
!mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/


# POS-tagger file (vi-tagger) -> vncorenlp/models/postagger/.
# NOTE(review): VnCoreNLP is later created with annotators="wseg" only, so the
# postagger and ner downloads below may be unnecessary — confirm before removing.
!mkdir -p vncorenlp/models/postagger
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/postagger/vi-tagger
!mv vi-tagger vncorenlp/models/postagger/


# NER files (three .xz archives) -> vncorenlp/models/ner/.
!mkdir -p vncorenlp/models/ner
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/ner/vi-500brownclusters.xz
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/ner/vi-ner.xz
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/ner/vi-pretrainedembeddings.xz
!mv vi-500brownclusters.xz vncorenlp/models/ner/
!mv vi-ner.xz vncorenlp/models/ner/
!mv vi-pretrainedembeddings.xz vncorenlp/models/ner/

--2025-12-13 04:25:05--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27412575 (26M) [application/octet-stream]
Saving to: ‘VnCoreNLP-1.1.1.jar’


2025-12-13 04:25:06 (136 MB/s) - ‘VnCoreNLP-1.1.1.jar’ saved [27412575/27412575]

--2025-12-13 04:25:06--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 526544 (514K) [application/octet-stream]
Saving to: ‘vi-voc

In [None]:
from rank_bm25 import BM25Okapi
from vncorenlp import VnCoreNLP
# Create the VnCoreNLP wrapper around the downloaded jar, requesting only the
# "wseg" annotator. The instance is used as a module-level global by tokennize_vn.
vncorenlp = VnCoreNLP("vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg")

def tokennize_vn(text):
    """Segment ``text`` with the global ``vncorenlp`` instance and flatten the result.

    ``vncorenlp.tokenize(text)`` is consumed as an iterable of token lists.
    The tokens of each list are joined with single spaces, and every list is
    followed by one trailing space.

    Args:
        text: raw text to segment.

    Returns:
        A single string containing all tokens in order; ``''`` when the
        tokenizer yields nothing.
    """
    segmented = vncorenlp.tokenize(text)
    return ''.join(' '.join(words) + ' ' for words in segmented)

# Build the BM25 corpus from ONE filtered, de-duplicated view of the book table
# so that product_ids[i] and description_book[i] always describe the same book.
# Taking unique() ids but selecting descriptions through isin() yields lists of
# different lengths (or shifted pairs) as soon as a product_id repeats, and any
# positional index -> product_id lookup built on them then breaks.
bm25_books = (
    book.loc[book['description'].notnull(), ['product_id', 'description']]
    .drop_duplicates(subset='product_id')
)
product_ids = bm25_books['product_id'].tolist()
description_book = bm25_books['description'].tolist()

# One lower-cased, word-segmented token list per description, in product_ids order.
corpus_tokens = [tokennize_vn(desc).lower().split() for desc in description_book]
bm25 = BM25Okapi(corpus_tokens)

In [None]:
# Position in the product_ids list -> product id.
index_to_book_id = dict(enumerate(product_ids))

In [None]:
# BM25 evaluation at k=5 over users with more than 5 interactions.
topk = 5
precision = 0
recall = 0
ap = 0
ndcg = 0
count = 0  # number of users that contributed to the running sums

for user_id in list_user_id5:
    # This user's interactions, restricted to books that have a description.
    is_user = review_book_description['customer_id'] == user_id
    has_description = ~review_book_description['description'].isna()
    user_book = review_book_description.loc[is_user & has_description,
                                            ['product_id', 'description', 'rating']]

    # Fixed-seed split: 20% of the history builds the query, the rest is held out.
    # NOTE(review): a 20/80 profile/hold-out split is unusual — confirm it is intended.
    X_train = user_book.sample(frac=0.2, random_state=42)
    X_test = user_book.drop(index=X_train.index)

    train_descriptions = list(X_train['description'])
    if len(train_descriptions) == 0:
        # sample() selected no profile item for this user; nothing to evaluate.
        continue

    # The query is the concatenation of the tokens of all profile descriptions.
    # The loop variable is deliberately NOT called `book`: at cell level that
    # would overwrite the global `book` DataFrame with a string and break any
    # later re-run of the cells that read it.
    user_query_tokens = []
    for description in train_descriptions:
        user_query_tokens.extend(tokennize_vn(description).lower().split())

    # One BM25 score per corpus document, mapped back to its product id.
    doc_scores = bm25.get_scores(user_query_tokens)
    all_candidates = [(index_to_book_id[i], score) for i, score in enumerate(doc_scores)]

    # Never recommend a book that was used to build the query.
    books_read_in_train = set(X_train['product_id'])
    filtered_candidates = [
        item for item in all_candidates
        if item[0] not in books_read_in_train
    ]

    # Best first; the ranking is cut to the number of held-out items.
    sortlist = sorted(filtered_candidates, key=lambda pair: pair[1], reverse=True)[:len(X_test)]

    # Convert to the dictionary format expected by the evaluation functions.
    recommend_book = {book_id: float(score) for book_id, score in sortlist}
    y_true = dict(zip(X_test['product_id'], X_test['rating']))

    # Compute each metric once and reuse it for both the totals and the log line.
    user_precision = precision_at_k(recommend_book, y_true, k=topk)
    user_recall = recall_at_k(recommend_book, y_true, k=topk)
    user_ap = calculate_ap_at_k(recommend_book, y_true, k=topk)
    user_ndcg = calculate_ndcg_at_k(y_true, recommend_book, k=topk)

    precision += user_precision
    recall += user_recall
    ap += user_ap
    ndcg += user_ndcg
    count += 1
    print(f"User {user_id} | Pre@{topk}: {user_precision} | Rec@{topk}: {user_recall} | AP@{topk}: {user_ap} | NDCG@{topk}: {user_ndcg} \n")

print("========Final========")
print(f"Pre@{topk}: {precision / count}")
print(f"Rec@{topk}: {recall / count}")
print(f"MAP@{topk}: {ap / count}")
print(f"NDCG@{topk}: {ndcg / count}")

User 120954 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 21654527 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 15188098 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 9927825 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.33014761778423596 

User 11195564 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 15225634 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.3183671393962403 

User 5917418 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 620317 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9572881727261122 

User 10025671 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9999999999999999 

User 317483 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9122105819346807 

User 6301744 | Pre@5: 0.2 | Rec@5: 0.16666666666666666 | AP@5: 1.0 | NDCG@5: 0.9999999999999999 

User 21558077 | Pre@5: 0.0 | Rec@5: 0.0 | AP@5: 0 | NDCG@5: 0.9321679589452766 

User 14361212 | Pr

In [None]:
# BM25 evaluation at k=10 over users with more than 10 interactions.
topk = 10
precision = 0
recall = 0
ap = 0
ndcg = 0
count = 0  # number of users that contributed to the running sums

for user_id in list_user_id10:
    # This user's interactions, restricted to books that have a description.
    is_user = review_book_description['customer_id'] == user_id
    has_description = ~review_book_description['description'].isna()
    user_book = review_book_description.loc[is_user & has_description,
                                            ['product_id', 'description', 'rating']]

    # Fixed-seed split: 20% of the history builds the query, the rest is held out.
    # NOTE(review): a 20/80 profile/hold-out split is unusual — confirm it is intended.
    X_train = user_book.sample(frac=0.2, random_state=42)
    X_test = user_book.drop(index=X_train.index)

    train_descriptions = list(X_train['description'])
    if len(train_descriptions) == 0:
        # sample() selected no profile item for this user; nothing to evaluate.
        continue

    # The query is the concatenation of the tokens of all profile descriptions.
    # The loop variable is deliberately NOT called `book`: at cell level that
    # would overwrite the global `book` DataFrame with a string and break any
    # later re-run of the cells that read it.
    user_query_tokens = []
    for description in train_descriptions:
        user_query_tokens.extend(tokennize_vn(description).lower().split())

    # One BM25 score per corpus document, mapped back to its product id.
    doc_scores = bm25.get_scores(user_query_tokens)
    all_candidates = [(index_to_book_id[i], score) for i, score in enumerate(doc_scores)]

    # Never recommend a book that was used to build the query.
    books_read_in_train = set(X_train['product_id'])
    filtered_candidates = [
        item for item in all_candidates
        if item[0] not in books_read_in_train
    ]

    # Best first; the ranking is cut to the number of held-out items.
    sortlist = sorted(filtered_candidates, key=lambda pair: pair[1], reverse=True)[:len(X_test)]

    # Convert to the dictionary format expected by the evaluation functions.
    recommend_book = {book_id: float(score) for book_id, score in sortlist}
    y_true = dict(zip(X_test['product_id'], X_test['rating']))

    # Compute each metric once and reuse it for both the totals and the log line.
    user_precision = precision_at_k(recommend_book, y_true, k=topk)
    user_recall = recall_at_k(recommend_book, y_true, k=topk)
    user_ap = calculate_ap_at_k(recommend_book, y_true, k=topk)
    user_ndcg = calculate_ndcg_at_k(y_true, recommend_book, k=topk)

    precision += user_precision
    recall += user_recall
    ap += user_ap
    ndcg += user_ndcg
    count += 1
    print(f"User {user_id} | Pre@{topk}: {user_precision} | Rec@{topk}: {user_recall} | AP@{topk}: {user_ap} | NDCG@{topk}: {user_ndcg} \n")

print("========Final========")
print(f"Pre@{topk}: {precision / count}")
print(f"Rec@{topk}: {recall / count}")
print(f"MAP@{topk}: {ap / count}")
print(f"NDCG@{topk}: {ndcg / count}")

User 15188098 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.9999999999999999 

User 9927825 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.5252520457236445 

User 15225634 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.6311028238422782 

User 10025671 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.9999999999999998 

User 356525 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.6199792816779409 

User 16757512 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.5960626287014779 

User 17506038 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 1.0 

User 5996998 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.5686433555798904 

User 7261494 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.5607449928684298 

User 13513176 | Pre@10: 0.1 | Rec@10: 0.1 | AP@10: 1.0 | NDCG@10: 0.9904673709481427 

User 1621495 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.5550009559248589 

User 11999970 | Pre@10: 0.0 | Rec@10: 0.0 | AP@10: 0 | NDCG@10: 0.7263950715057684 
