In [None]:
! pip install rank-bm25 --quiet
! pip install vncorenlp --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.6/2.6 MB[0m [31m105.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for vncorenlp (setup.py) ... [?25l[?25hdone


In [None]:
# Create the local directory layout that will hold the VnCoreNLP model files.
!mkdir -p vncorenlp/models/wordsegmenter
!mkdir -p vncorenlp/models/postagger
!mkdir -p vncorenlp/models/ner

# Download the VnCoreNLP 1.1.1 jar from the project's GitHub repository
# and move it into the vncorenlp/ directory.
!wget -q https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!mv VnCoreNLP-1.1.1.jar vncorenlp/

# Word-segmenter files: downloaded into the working directory, then moved
# into vncorenlp/models/wordsegmenter/.
!wget -q https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget -q https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
!mv vi-vocab vncorenlp/models/wordsegmenter/
!mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/

# POS-tagger file, moved into vncorenlp/models/postagger/.
!wget -q https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/postagger/vi-tagger
!mv vi-tagger vncorenlp/models/postagger/

# NER files, moved into vncorenlp/models/ner/.
# `wget -q` is quiet, so a failed download only surfaces when the following
# `mv` cannot find the file.
!wget -q https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/ner/vi-500brownclusters.xz
!wget -q https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/ner/vi-ner.xz
!wget -q https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/ner/vi-pretrainedembeddings.xz
!mv vi-500brownclusters.xz vncorenlp/models/ner/
!mv vi-ner.xz vncorenlp/models/ner/
!mv vi-pretrainedembeddings.xz vncorenlp/models/ner/

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import scipy.sparse as sp
from sklearn.metrics import ndcg_score
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import kagglehub

In [None]:
# Fetch the multilingual stop-word dataset through kagglehub; `path` is the
# location of the downloaded dataset.
path = kagglehub.dataset_download("heeraldedhia/stop-words-in-28-languages")

# Point at the Vietnamese list and normalise any backslashes to forward slashes.
path_stop_word = (path + "/vietnamese.txt").replace("\\", "/")
with open(path_stop_word, "r", encoding="utf-8") as f:
    stopwords = f.read().splitlines()

# Spaces inside multi-word entries become underscores
# (presumably to match the word segmenter's token format -- confirm).
stopwords_set = {word.replace(' ', '_') for word in stopwords}

In [None]:
from rank_bm25 import BM25Okapi
from vncorenlp import VnCoreNLP

# Instantiate the VnCoreNLP wrapper from the downloaded jar, requesting only the
# "wseg" annotator. `tokennize_vn` below reads this module-level object.
# NOTE(review): presumably this needs a Java runtime on the machine -- confirm.
vncorenlp = VnCoreNLP("vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg")

def tokennize_vn(text):
    """Segment Vietnamese text and remove stop words.

    The text is passed to the module-level ``vncorenlp`` object, whose
    ``tokenize`` result is iterated as a sequence of sentences, each a
    sequence of tokens. Tokens found in the module-level ``stopwords_set``
    are dropped.

    The stop-word check is case-insensitive: a token is removed when either
    the token itself or its lowercased form is in ``stopwords_set``. Callers
    lowercase the returned string only afterwards, so an exact-case check
    alone would let capitalised stop words (e.g. at the start of a sentence)
    through.

    Parameters
    ----------
    text : str
        Raw text to segment.

    Returns
    -------
    str
        The surviving tokens, in their original casing and order, joined by
        single spaces. Empty string when nothing survives.
    """
    sentences = vncorenlp.tokenize(text)
    return ' '.join(
        token
        for sentence in sentences
        for token in sentence
        if token not in stopwords_set and token.lower() not in stopwords_set
    )

In [None]:
# Directory holding the input artefacts. Change this one value when the
# notebook is run somewhere other than the default location.
DATA_DIR = '/content'

# Interaction table and book metadata.
reviews_df = pd.read_csv(f'{DATA_DIR}/final_interactions.csv')
books_df = pd.read_csv(f'{DATA_DIR}/final_cleaned_books.csv')

# Precomputed feature arrays for book descriptions and book types.
# NOTE(review): later cells index these arrays by `product_index`, so row i is
# presumably the embedding of the book with product_index == i -- confirm.
desc_feat = np.load(f'{DATA_DIR}/text_descbook_feat_VisoBert.npy')
typebook_feat = np.load(f'{DATA_DIR}/text_typebook_feat_VisoBert.npy')

In [None]:
# Select ids and descriptions with the SAME row mask so that position i of the
# BM25 corpus always corresponds to index_to_product_indexes[i]. Looking the
# descriptions up again via `isin(product_indexes)` could return extra rows
# (e.g. a repeated product_index whose description is missing) and shift the
# alignment.
has_description = books_df['description'].notna()
books_with_desc = books_df.loc[has_description, ['product_index', 'description']]

product_indexes = books_with_desc['product_index']
# Position in the BM25 corpus -> product_index of the book at that position.
index_to_product_indexes = dict(enumerate(product_indexes))
description_book = books_with_desc['description'].tolist()

# Tokenise exactly as the BM25 recommender tokenises its queries:
# str() -> segmentation / stop-word removal -> lowercase -> whitespace split.
corpus_tokens = [tokennize_vn(str(desc)).lower().split() for desc in description_book]
assert len(corpus_tokens) == len(index_to_product_indexes), "corpus / id mapping misaligned"
bm25_corpus = BM25Okapi(corpus_tokens)

In [None]:
def pearson_score(ratingsPivot, id1, id2):
    """Pearson correlation between two rows of a ratings pivot table.

    Only the columns where both rows have a value are used, and each row is
    centred on its mean over those shared columns.

    Parameters
    ----------
    ratingsPivot : pandas.DataFrame
        Pivot table whose index holds the ids being compared; missing
        ratings are NaN.
    id1, id2 : hashable
        Index labels of the two rows.

    Returns
    -------
    float
        The correlation, or 0.0 when either id is absent from the index,
        fewer than two columns are shared, or either centred vector has
        zero norm.
    """
    known_ids = ratingsPivot.index
    if not (id1 in known_ids and id2 in known_ids):
        return 0.0

    row_a, row_b = ratingsPivot.loc[id1], ratingsPivot.loc[id2]
    shared = row_a.notna() & row_b.notna()
    if shared.sum() < 2:
        return 0.0

    shared_a, shared_b = row_a[shared], row_b[shared]
    dev_a = (shared_a - shared_a.mean()).to_numpy()
    dev_b = (shared_b - shared_b.mean()).to_numpy()

    scale = np.linalg.norm(dev_a) * np.linalg.norm(dev_b)
    # A zero norm means one row is constant on the shared columns.
    return float(np.dot(dev_a, dev_b) / scale) if scale != 0 else 0.0


def cosine_score(ratingsPivot, id1, id2):
    """Cosine similarity between two rows of a ratings pivot table.

    Only the columns where both rows have a value are used.

    Parameters
    ----------
    ratingsPivot : pandas.DataFrame
        Pivot table whose index holds the ids being compared; missing
        ratings are NaN.
    id1, id2 : hashable
        Index labels of the two rows.

    Returns
    -------
    float
        The cosine similarity, or 0.0 when either id is absent from the
        index, fewer than two columns are shared, or either vector has zero
        norm.
    """
    # Same unknown-id behaviour as pearson_score: an id that is not in the
    # table yields 0.0 instead of a KeyError.
    if id1 not in ratingsPivot.index or id2 not in ratingsPivot.index:
        return 0.0

    vec1 = ratingsPivot.loc[id1]
    vec2 = ratingsPivot.loc[id2]
    co_mask = vec1.notna() & vec2.notna()

    if co_mask.sum() < 2:
        return 0.0

    a1 = vec1[co_mask].to_numpy()
    a2 = vec2[co_mask].to_numpy()
    denominator = np.linalg.norm(a1) * np.linalg.norm(a2)
    if denominator == 0:
        return 0.0
    return float(np.dot(a1, a2) / denominator)


def pearson_similarity_vector(v1, v2_matrix):
    """Pearson correlation between one vector and every row of a matrix.

    Parameters
    ----------
    v1 : numpy.ndarray
        Query vector; it is flattened before use.
    v2_matrix : numpy.ndarray
        2-D array; each row is correlated with ``v1``.

    Returns
    -------
    numpy.ndarray
        One correlation value per row of ``v2_matrix``. Where the product of
        the two norms is zero, the denominator is replaced by 1e-9 instead
        of dividing by zero.
    """
    # Centre the query vector on its own mean.
    query = v1.flatten()
    query_dev = query - np.mean(query)
    query_scale = np.linalg.norm(query_dev)

    # Centre every row of the matrix on that row's mean.
    row_dev = v2_matrix - np.mean(v2_matrix, axis=1, keepdims=True)
    row_scale = np.linalg.norm(row_dev, axis=1)

    scale = query_scale * row_scale
    scale = np.where(scale == 0, 1e-9, scale)

    return np.dot(row_dev, query_dev) / scale


def get_topK_neighbors(ratingsPivot, target_id, k, similarity_name):
    """Return the ``k`` rows of the pivot table most similar to ``target_id``.

    Parameters
    ----------
    ratingsPivot : pandas.DataFrame
        Ratings pivot table; every index label other than ``target_id`` is a
        candidate neighbour.
    target_id : hashable
        Index label of the row to find neighbours for.
    k : int
        Maximum number of neighbours to return.
    similarity_name : {'Pearson', 'Cosine'}
        Which pairwise similarity to use.

    Returns
    -------
    list[tuple]
        ``(neighbor_id, similarity)`` pairs sorted by similarity, highest
        first, truncated to ``k`` entries.

    Raises
    ------
    ValueError
        If ``similarity_name`` is not 'Pearson' or 'Cosine'.
    """
    if similarity_name == 'Pearson':
        score_fn = pearson_score
    elif similarity_name == 'Cosine':
        score_fn = cosine_score
    else:
        raise ValueError("similarity_name must be in ['Pearson', 'Cosine']")

    similarity_score = [
        (other_id, score_fn(ratingsPivot, target_id, other_id))
        for other_id in ratingsPivot.index
        if other_id != target_id
    ]
    # sorted() is stable, so ties keep the index order of the pivot table.
    return sorted(similarity_score, key=lambda pair: pair[1], reverse=True)[:k]

def average_precision_at_k(predictions, true_interactions, k):
    """Average precision over the first ``k`` ranked predictions.

    Parameters
    ----------
    predictions : sequence
        Ranked entries; element ``[0]`` of each entry is the item id.
    true_interactions : sequence
        Relevant item ids.
    k : int
        Cut-off rank.

    Returns
    -------
    float
        Sum of precision values at each hit position, divided by
        ``min(k, len(true_interactions))``. 0.0 when ``k <= 0`` or either
        input is empty.
    """
    if k <= 0 or not len(true_interactions) or not len(predictions):
        return 0.0

    hits = 0
    precision_sum = 0.0
    for rank, entry in enumerate(predictions[:k], start=1):
        if entry[0] in true_interactions:
            hits += 1
            precision_sum += hits / rank

    # Normalise by the largest number of hits achievable within the cut-off.
    return precision_sum / min(k, len(true_interactions))

def normal_discounted_cumulative_gain_at_k(predictions, true_interactions, k):
    """Binary-relevance NDCG over the first ``k`` ranked predictions.

    Parameters
    ----------
    predictions : sequence
        Ranked entries; element ``[0]`` of each entry is the item id.
    true_interactions : sequence
        Relevant item ids.
    k : int
        Cut-off rank.

    Returns
    -------
    float
        DCG of the hits divided by the ideal DCG for
        ``min(k, len(true_interactions))`` hits. 0.0 when ``k <= 0`` or
        either input is empty.
    """
    if k <= 0 or not len(true_interactions) or not len(predictions):
        return 0.0

    # Each hit at 0-based position p contributes 1 / log2(p + 2).
    dcg = 0.0
    for position, entry in enumerate(predictions[:k]):
        if entry[0] in true_interactions:
            dcg += 1 / np.log2(position + 2)

    # Ideal ranking: every relevant item (up to k) sits at the top.
    ideal_hits = min(k, len(true_interactions))
    idcg = sum(1 / np.log2(position + 2) for position in range(ideal_hits))

    if idcg > 0:
        return dcg / idcg
    return 0.0


def precision_at_k(predictions, true_interactions, k):
    """Fraction of the top-``k`` slots filled by relevant items.

    Parameters
    ----------
    predictions : sequence
        Ranked entries; element ``[0]`` of each entry is the item id.
    true_interactions : sequence
        Relevant item ids.
    k : int
        Cut-off rank.

    Returns
    -------
    float
        Number of hits among the first ``k`` predictions divided by ``k``
        (the divisor stays ``k`` even when fewer predictions exist).
        0.0 when ``k <= 0`` or there are no predictions.
    """
    if k <= 0 or not len(predictions):
        return 0.0

    hit_count = 0
    for entry in predictions[:k]:
        if entry[0] in true_interactions:
            hit_count += 1
    return hit_count / k

def recall_at_k(predictions, true_interactions, k):
    """Fraction of the relevant items that appear in the top ``k`` predictions.

    Parameters
    ----------
    predictions : sequence
        Ranked entries; element ``[0]`` of each entry is the item id.
    true_interactions : sequence
        Relevant item ids.
    k : int
        Cut-off rank.

    Returns
    -------
    float
        Number of hits among the first ``k`` predictions divided by
        ``len(true_interactions)``. 0.0 when ``k <= 0`` or either input is
        empty.
    """
    # Guard k <= 0 like the other metrics: a negative k would otherwise slice
    # predictions[:-|k|] and count hits from almost the whole list.
    if k <= 0 or len(true_interactions) == 0 or len(predictions) == 0:
        return 0.0
    top_k = [pred[0] for pred in predictions[:k]]
    return sum(item in true_interactions for item in top_k) / len(true_interactions)


def get_recommendation_cf(ratingsPivot, user_id, k_neighbors, similarity_name):
    """Rank the items a user has not rated using neighbourhood collaborative filtering.

    For each of the ``k_neighbors`` most similar rows, every item that the
    neighbour rated and the target user did not receives a similarity-weighted,
    mean-centred contribution. The predicted rating of an item is the target
    user's mean rating plus the weighted average of those deviations.

    Parameters
    ----------
    ratingsPivot : pandas.DataFrame
        Ratings pivot table (rows are the ids being compared, columns are
        items, NaN where no rating exists).
    user_id : hashable
        Index label of the target user.
    k_neighbors : int
        Number of nearest neighbours to aggregate.
    similarity_name : {'Pearson', 'Cosine'}
        Similarity passed on to ``get_topK_neighbors``.

    Returns
    -------
    list[tuple]
        ``(item, predicted_rating)`` pairs sorted by predicted rating,
        highest first. Empty when the user is not in the table or has no
        ratings at all (cold start).
    """
    # Cold start: without a row in the pivot table there is nothing to compare.
    if user_id not in ratingsPivot.index:
        return []

    target_ratings = ratingsPivot.loc[user_id]
    mean_target = target_ratings.mean(skipna=True)
    # A row with no ratings has a NaN mean, which would make every predicted
    # rating NaN and the resulting order meaningless.
    if pd.isna(mean_target):
        return []

    # Computed once instead of looking up the target's rating per item.
    target_unrated = target_ratings.isna()

    topK_neighbors = get_topK_neighbors(ratingsPivot, user_id, k_neighbors, similarity_name)

    total, den = {}, {}
    for neighbor_id, score in topK_neighbors:
        neighbor_ratings = ratingsPivot.loc[neighbor_id]
        mean_neighbor = neighbor_ratings.mean(skipna=True)

        # Items the neighbour rated that the target user has not rated yet.
        candidate_ratings = neighbor_ratings[neighbor_ratings.notna() & target_unrated]

        for item, rating in candidate_ratings.items():
            total[item] = total.get(item, 0.0) + score * (rating - mean_neighbor)
            den[item] = den.get(item, 0.0) + abs(score)

    # Items whose neighbours all have zero similarity carry no signal; skip them.
    ranking = [
        (item, float(mean_target + num / den[item]))
        for item, num in total.items()
        if den[item] != 0
    ]
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking


def get_recommendation_cb(train_ratingsDF, desc_feat, user_id, similarity_name):
    """Rank unseen items by similarity to a rating-weighted user profile.

    The user profile is the rating-weighted average of the feature rows of
    the items the user interacted with. Candidates are all other
    ``product_index`` values present in the training data.

    Parameters
    ----------
    train_ratingsDF : pandas.DataFrame
        Training interactions with ``customer_index``, ``product_index`` and
        ``rating`` columns.
    desc_feat : numpy.ndarray
        Item feature matrix, indexed directly with ``product_index`` values.
    user_id : hashable
        Value of ``customer_index`` identifying the target user.
    similarity_name : {'Cosine', 'Pearson'}
        Similarity between the user profile and each candidate.

    Returns
    -------
    list[tuple]
        ``(product_index, similarity)`` pairs sorted by similarity, highest
        first. Empty when the user has no training interactions, the user's
        ratings sum to zero, or no candidate item remains.

    Raises
    ------
    ValueError
        If ``similarity_name`` is not 'Cosine' or 'Pearson'.
    """
    if similarity_name not in ('Cosine', 'Pearson'):
        raise ValueError("Similarity Name must be in ['Cosine', 'Pearson']")

    user_data = train_ratingsDF[train_ratingsDF['customer_index'] == user_id]
    # Cold start: no history means no profile can be built.
    if user_data.empty:
        return []

    interacted_idx = user_data['product_index'].tolist()
    user_ratings = user_data['rating'].values.reshape(-1, 1)
    rating_total = np.sum(user_ratings)
    # A zero weight sum would turn the weighted average into 0/0 (NaN).
    if rating_total == 0:
        return []

    interacted_vecs = desc_feat[interacted_idx]
    user_profile_vec = np.sum(interacted_vecs * user_ratings, axis=0) / rating_total
    user_profile_vec = user_profile_vec.reshape(1, -1)

    # Set membership keeps candidate filtering linear in the number of items.
    interacted_set = set(interacted_idx)
    candidate_indices = [
        product_index
        for product_index in train_ratingsDF['product_index'].unique()
        if product_index not in interacted_set
    ]
    if not candidate_indices:
        return []
    candidate_vecs = desc_feat[candidate_indices]

    if similarity_name == 'Cosine':
        sim_scores = cosine_similarity(user_profile_vec, candidate_vecs).flatten()
    else:
        sim_scores = pearson_similarity_vector(user_profile_vec, candidate_vecs).flatten()

    return sorted(zip(candidate_indices, sim_scores), key=lambda pair: pair[1], reverse=True)

def get_recommendation_cb_bm25(train_ratingsDF, user_id, bm25_corpus):
    """Rank unseen books by summed BM25 score against the user's read descriptions.

    Each description of a book the user interacted with is tokenised and used
    as a BM25 query; the per-document scores of all queries are added up.
    Relies on the module-level ``books_df``, ``tokennize_vn`` and
    ``index_to_product_indexes``.

    Parameters
    ----------
    train_ratingsDF : pandas.DataFrame
        Training interactions with ``customer_index`` and ``product_index``
        columns.
    user_id : hashable
        Value of ``customer_index`` identifying the target user.
    bm25_corpus : object
        BM25 index exposing ``get_scores(tokens)``, which returns one score
        per corpus document.

    Returns
    -------
    list[tuple]
        ``(product_index, score)`` pairs sorted by score, highest first.
        Empty when none of the user's books has a description.
    """
    user_data = train_ratingsDF[train_ratingsDF['customer_index'] == user_id]
    interacted_idx = user_data['product_index'].tolist()
    is_user_book = books_df['product_index'].isin(interacted_idx)
    description_data = books_df.loc[is_user_book & books_df['description'].notna(), 'description'].tolist()
    if not description_data:
        return []

    # Accumulate the score vectors of all queries (one entry per corpus document).
    total_scores = None
    for desc in description_data:
        tokens = tokennize_vn(str(desc)).lower().split()
        scores = np.asarray(bm25_corpus.get_scores(tokens), dtype=float)
        total_scores = scores if total_scores is None else total_scores + scores

    interacted_set = set(interacted_idx)
    already_added = set()
    recommendations = []
    for idx, score in enumerate(total_scores):
        real_product_id = index_to_product_indexes.get(idx)
        # Compare against None explicitly: product_index 0 is a valid id and
        # must not be discarded by a truthiness check.
        if real_product_id is None:
            continue
        # Only keep books the user has not read, and each book once.
        if real_product_id in interacted_set or real_product_id in already_added:
            continue
        already_added.add(real_product_id)
        recommendations.append((real_product_id, float(score)))

    recommendations.sort(key=lambda pair: pair[1], reverse=True)
    return recommendations

def get_recommendation_test(train_ratingsDF, test_ratingsDF, K, type_rec, similarity_name=None, text_feat=None, typebook_feat=None, bm25_corpus=None, k_neighbors=10):
    """Generate recommendations for every test user and print ranking metrics.

    Parameters
    ----------
    train_ratingsDF : pandas.DataFrame
        Training interactions (``customer_index``, ``product_index``, ``rating``).
    test_ratingsDF : pandas.DataFrame
        Held-out interactions; one recommendation list is produced per row's
        ``customer_index``.
    K : int
        Cut-off used by all metrics.
    type_rec : {'cf', 'cb', 'cb_bm25', 'typebook_based'}
        Which recommender to evaluate.
    similarity_name : str, optional
        Similarity for the 'cf', 'cb' and 'typebook_based' recommenders.
    text_feat : numpy.ndarray, optional
        Item features used when ``type_rec == 'cb'``.
    typebook_feat : numpy.ndarray, optional
        Item features used when ``type_rec == 'typebook_based'``.
    bm25_corpus : object, optional
        BM25 index used when ``type_rec == 'cb_bm25'``.
    k_neighbors : int, default 10
        Neighbourhood size used when ``type_rec == 'cf'``.

    Returns
    -------
    list[list[tuple]]
        One ranked recommendation list per test row, in test order. Empty
        when the test set is empty (no metrics are printed in that case).

    Raises
    ------
    ValueError
        If ``type_rec`` is not one of the supported values.
    """
    if type_rec not in ('cf', 'cb', 'cb_bm25', 'typebook_based'):
        raise ValueError("Type Recommendation system must be in ['cf', 'cb', 'cb_bm25', 'typebook_based']")

    user_ids = test_ratingsDF['customer_index'].tolist()
    if not user_ids:
        # Nothing to average over; avoid a ZeroDivisionError further down.
        print("No test users to evaluate; skipping metric computation.")
        return []

    # Only the CF recommender needs the pivot table, so build it only then.
    train_ratingsPivot = None
    if type_rec == 'cf':
        train_ratingsPivot = train_ratingsDF.pivot(index='customer_index', columns='product_index', values='rating')

    # Ground truth per user, computed once instead of re-filtering per iteration.
    true_items_by_user = test_ratingsDF.groupby('customer_index')['product_index'].apply(list).to_dict()

    preds = []
    ap_list, ndcg_list, precision_list, recall_list = [], [], [], []

    for user_id in tqdm(user_ids, desc='Recommend items for User'):
        if type_rec == 'cf':
            rec_items = get_recommendation_cf(
                train_ratingsPivot,
                user_id=user_id,
                k_neighbors=k_neighbors,
                similarity_name=similarity_name
            )
        elif type_rec == 'cb':
            rec_items = get_recommendation_cb(
                train_ratingsDF,
                text_feat,
                user_id=user_id,
                similarity_name=similarity_name
            )
        elif type_rec == 'typebook_based':
            rec_items = get_recommendation_cb(
                train_ratingsDF,
                typebook_feat,
                user_id=user_id,
                similarity_name=similarity_name
            )
        else:  # 'cb_bm25'
            rec_items = get_recommendation_cb_bm25(
                train_ratingsDF,
                user_id=user_id,
                bm25_corpus=bm25_corpus
            )

        preds.append(rec_items)
        true_interactions = true_items_by_user.get(user_id, [])

        # Compute the metrics for this user's recommendation list.
        ap_list.append(average_precision_at_k(rec_items, true_interactions, K))
        ndcg_list.append(normal_discounted_cumulative_gain_at_k(rec_items, true_interactions, K))
        precision_list.append(precision_at_k(rec_items, true_interactions, K))
        recall_list.append(recall_at_k(rec_items, true_interactions, K))

    # Average every metric over all evaluated users.
    mean_ap = sum(ap_list) / len(ap_list)
    mean_ndcg = sum(ndcg_list) / len(ndcg_list)
    mean_precision = sum(precision_list) / len(precision_list)
    mean_recall = sum(recall_list) / len(recall_list)

    print(f"MAP@{K}: {mean_ap}")
    print(f"NDCG@{K}: {mean_ndcg}")
    print(f"Precision@{K}: {mean_precision}")
    print(f"Recall@{K}: {mean_recall}")

    return preds

In [None]:
# Leave-one-out split: for every customer, the last row of their group goes to
# the test set and all earlier rows go to the training set.
# NOTE(review): "last" means last in the DataFrame's current row order;
# presumably the file is sorted chronologically -- confirm.
reviews_by_customer = reviews_df.groupby('customer_index')
train_parts = [group.iloc[:-1] for _, group in reviews_by_customer]
test_parts = [group.iloc[[-1]] for _, group in reviews_by_customer]

train_reviews_df = pd.concat(train_parts).reset_index(drop=True)
test_reviews_df = pd.concat(test_parts).reset_index(drop=True)

print(f"Training Size: {train_reviews_df.shape}, Testing Size: {test_reviews_df.shape}")

Training Size: (4655, 6), Testing Size: (707, 6)


In [None]:
# Evaluate collaborative filtering with Pearson similarity at each cut-off.
# `preds` is overwritten on every pass, so it ends up holding the last run.
for K in (5, 10):
    preds = get_recommendation_test(
        train_reviews_df,
        test_reviews_df,
        K,
        type_rec='cf',
        similarity_name='Pearson',
    )

Recommend items for User: 100%|██████████| 707/707 [02:07<00:00,  5.57it/s]


MAP@5: 0.000825082508250825
NDCG@5: 0.0013163741981236112
Precision@5: 0.0005657708628005659
Recall@5: 0.002828854314002829


Recommend items for User: 100%|██████████| 707/707 [02:04<00:00,  5.67it/s]

MAP@10: 0.0010608203677510608
NDCG@10: 0.001820203317088282
Precision@10: 0.00042432814710042436
Recall@10: 0.004243281471004243





In [None]:
# Evaluate collaborative filtering with cosine similarity at each cut-off.
# `preds` is overwritten on every pass, so it ends up holding the last run.
for K in (5, 10):
    preds = get_recommendation_test(
        train_reviews_df,
        test_reviews_df,
        K,
        type_rec='cf',
        similarity_name='Cosine',
    )

Recommend items for User: 100%|██████████| 707/707 [02:02<00:00,  5.77it/s]


MAP@5: 0.0027109853842527108
NDCG@5: 0.0034380149336257324
Precision@5: 0.0011315417256011317
Recall@5: 0.005657708628005658


Recommend items for User: 100%|██████████| 707/707 [02:01<00:00,  5.82it/s]

MAP@10: 0.003926719202532498
NDCG@10: 0.0065323073706363075
Precision@10: 0.001555869872701556
Recall@10: 0.015558698727015558





In [None]:
# Evaluate the content-based recommender on the description features with
# Pearson similarity at each cut-off.
for K in (5, 10):
    preds = get_recommendation_test(
        train_reviews_df,
        test_reviews_df,
        K,
        type_rec='cb',
        similarity_name='Pearson',
        text_feat=desc_feat,
    )

Recommend items for User: 100%|██████████| 707/707 [00:05<00:00, 123.36it/s]


MAP@5: 0.002593116454502593
NDCG@5: 0.003384426111335746
Precision@5: 0.0011315417256011317
Recall@5: 0.005657708628005658


Recommend items for User: 100%|██████████| 707/707 [00:06<00:00, 108.78it/s]

MAP@10: 0.004052446060932624
NDCG@10: 0.006996109306402555
Precision@10: 0.0016973125884016975
Recall@10: 0.016973125884016973





In [None]:
# Evaluate the content-based recommender on the description features with
# cosine similarity at each cut-off.
for K in (5, 10):
    preds = get_recommendation_test(
        train_reviews_df,
        test_reviews_df,
        K,
        type_rec='cb',
        similarity_name='Cosine',
        text_feat=desc_feat,
    )

Recommend items for User: 100%|██████████| 707/707 [00:06<00:00, 112.02it/s]


MAP@5: 0.002593116454502593
NDCG@5: 0.003384426111335746
Precision@5: 0.0011315417256011317
Recall@5: 0.005657708628005658


Recommend items for User: 100%|██████████| 707/707 [00:07<00:00, 94.85it/s] 


MAP@10: 0.004052446060932624
NDCG@10: 0.006996109306402555
Precision@10: 0.0016973125884016975
Recall@10: 0.016973125884016973


In [None]:
# Evaluate the content-based recommender on the book-type features with
# cosine similarity at each cut-off.
for K in (5, 10):
    preds = get_recommendation_test(
        train_reviews_df,
        test_reviews_df,
        K,
        type_rec='typebook_based',
        similarity_name='Cosine',
        typebook_feat=typebook_feat,
    )

Recommend items for User: 100%|██████████| 707/707 [00:07<00:00, 100.10it/s]


MAP@5: 0.005233380480905233
NDCG@5: 0.007423204983566234
Precision@5: 0.002828854314002829
Recall@5: 0.014144271570014143


Recommend items for User: 100%|██████████| 707/707 [00:09<00:00, 78.43it/s] 

MAP@10: 0.005812622078534384
NDCG@10: 0.008807370962009294
Precision@10: 0.0018387553041018388
Recall@10: 0.018387553041018388





In [None]:
# Evaluate the content-based recommender on the book-type features with
# Pearson similarity at each cut-off.
for K in (5, 10):
    preds = get_recommendation_test(
        train_reviews_df,
        test_reviews_df,
        K,
        type_rec='typebook_based',
        similarity_name='Pearson',
        typebook_feat=typebook_feat,
    )

Recommend items for User: 100%|██████████| 707/707 [00:05<00:00, 131.51it/s]


MAP@5: 0.002168788307402169
NDCG@5: 0.003990264276389321
Precision@5: 0.0019801980198019802
Recall@5: 0.009900990099009901


Recommend items for User: 100%|██████████| 707/707 [00:03<00:00, 180.28it/s]

MAP@10: 0.0027048112974562763
NDCG@10: 0.0053337270851347846
Precision@10: 0.0014144271570014145
Recall@10: 0.014144271570014143





In [None]:
# Evaluate the BM25 description-based recommender at each cut-off.
for K in (5, 10):
    preds = get_recommendation_test(
        train_reviews_df,
        test_reviews_df,
        K,
        type_rec='cb_bm25',
        bm25_corpus=bm25_corpus,
    )

Recommend items for User: 100%|██████████| 707/707 [03:31<00:00,  3.35it/s]


MAP@5: 0.02524752475247525
NDCG@5: 0.029877282528691894
Precision@5: 0.00876944837340877
Recall@5: 0.04384724186704385


Recommend items for User: 100%|██████████| 707/707 [03:28<00:00,  3.38it/s]

MAP@10: 0.027747468624413463
NDCG@10: 0.03613841938688594
Precision@10: 0.006364922206506365
Recall@10: 0.06364922206506365



