In [1]:
import pandas as pd
import numpy as np

In [2]:
review = pd.read_csv(r'data\final\new_cleaned_reviews_v2.csv')

In [3]:
review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26050 entries, 0 to 26049
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   customer_id  26050 non-null  int64 
 1   product_id   26050 non-null  int64 
 2   rating       26050 non-null  int64 
 3   content      12049 non-null  object
dtypes: int64(3), object(1)
memory usage: 814.2+ KB


In [4]:
review_gr = review.groupby('product_id').count()
review_gr[review_gr['customer_id'] >= 10]

Unnamed: 0_level_0,customer_id,rating,content
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
316018,32,32,23
316890,12,12,4
322063,29,29,29
322074,48,48,21
324406,12,12,12
...,...,...,...
274364986,10,10,5
274612423,17,17,8
274927641,17,17,6
275613258,13,13,9


# User Based

In [5]:
user_item_matrix = review.pivot_table(index='customer_id', columns='product_id', values='rating')
user_item_matrix.fillna(0, inplace=True)
user_item_matrix

product_id,316018,316890,320455,322063,322074,324406,324681,324684,324688,335315,...,278025229,278037699,278045554,278045559,278045562,278060816,278228992,278237797,278247094,278424386
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
58,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
295,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
351,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30430171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30434934,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30435318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30476627,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# test_size = 0.2

# X_train = user_item_matrix.sample(frac=1 - test_size, random_state=42)
# X_test = user_item_matrix.drop(X_train.index)

# X_train.shape, X_test.shape

In [6]:
def get_ratings(matrix, user_id, movie_id):
    return matrix.loc[user_id,movie_id]

In [7]:
def get_cosine_similarity(user1_ratings, user2_ratings):

    """
    Get user1_ratings is vector of user1 ratings
    Get user2_ratings is vector of user2 ratings
    """

    dot_product = np.dot(user1_ratings, user2_ratings)
    norm_user1 = np.linalg.norm(user1_ratings)
    norm_user2 = np.linalg.norm(user2_ratings)

    result = dot_product / (norm_user1 * norm_user2)
    return result

def get_pearson_similarity(user1_ratings, user2_ratings):
    """
    Get user1_ratings is vector of user1 ratings
    Get user2_ratings is vector of user2 ratings
    """

    mean_user1 = np.mean(user1_ratings)
    mean_user2 = np.mean(user2_ratings)

    user1_ratings_centered = user1_ratings - mean_user1
    user2_ratings_centered = user2_ratings - mean_user2

    dot_product = np.dot(user1_ratings_centered, user2_ratings_centered)

    norm_user1 = np.linalg.norm(user1_ratings_centered)
    norm_user2 = np.linalg.norm(user2_ratings_centered)

    result = dot_product / (norm_user1 * norm_user2)
    return result


In [8]:
def get_top_k_similar_users(target_user, k, matrix, similarity_metric):
    """
    target_user: vector dimension of target user ratings
    k: number of similar users to return
    similarity_metric: 'cosine' or 'pearson'
    matrix: user-item train rating matrix 
    """
    similarity = []
    target_user_embedding = matrix.loc[target_user].to_numpy()

    for user in matrix.index:
        user_rating_matrix = matrix.loc[user].to_numpy()
        if similarity_metric == 'cosine':
            sim = get_cosine_similarity(target_user_embedding, user_rating_matrix)
        elif similarity_metric == 'pearson':
            sim = get_pearson_similarity(target_user_embedding, user_rating_matrix)
        else:
            raise ValueError("Invalid similarity metric. Choose 'cosine' or 'pearson'.")
        similarity.append((user, sim))

    # Sort users by similarity score and select top k
    top_k = sorted(similarity, key=lambda x: x[1], reverse=True)[:k]
    return top_k


In [9]:
def predict_rating_film(target_user, target_film, matrix, topk_users, list_books=None):

    min_rating = 1
    
    book_user_watched = matrix.loc[target_user][matrix.loc[target_user].index.isin(list_books)]
    mean_rating_target_user = book_user_watched.mean()
    dot_product = 0
    for k in range(len(topk_users)):
        userid, sim_score = topk_users[k]
        mean_userid = np.mean(matrix.loc[userid][matrix.loc[userid] != 0])

        result = sim_score * (get_ratings(matrix, movie_id=target_film, user_id=userid) - mean_userid)
        dot_product += result
    total_similarity = sum(sim_score for _, sim_score in topk_users)

    rating = mean_rating_target_user + (dot_product / total_similarity)

    rating = max(min_rating, rating)
    
    return (target_film, rating)

In [10]:
def recommended_books_topk(target_user,
                            topk_similar, 
                            user_item_matrix, 
                            similarity_metric,
                            number_of_item_recommend,
                            X_train):
    topk_users = get_top_k_similar_users(target_user=target_user, k=topk_similar, matrix=user_item_matrix, similarity_metric=similarity_metric)

    rating_predict = {}
    for book in user_item_matrix.columns:
        if book not in X_train.keys():
            predicted_rating = predict_rating_film(target_user=target_user, target_film=book, matrix=user_item_matrix, topk_users=topk_users, list_books=X_train.keys())
            if predicted_rating[1] > 0:
                rating_predict[book] = predicted_rating[1]

    recommended_books = dict(sorted(rating_predict.items(), key=lambda x: x[1], reverse=True)[:number_of_item_recommend])
    return recommended_books

In [11]:
from sklearn.metrics import ndcg_score

def calculate_ap_at_k(user_recommendations, relevant_items, k):
    precision_sum = 0
    relevant_count = 0

    recommend_list = list(user_recommendations.keys())[:k]
    real_list = list(relevant_items.keys())
    for i, book_id in enumerate(recommend_list):
        if book_id in real_list:
            relevant_count += 1
            precision_sum += relevant_count / (i + 1)
    if relevant_count == 0:
        return 0
    return precision_sum / relevant_count

def calculate_map_at_k(user_recommendations, user_relevant_items, k):
    ap_scores = []
    ap_scores.append(calculate_ap_at_k(list(user_recommendations.keys()), list(user_relevant_items.keys()), k))
    return sum(ap_scores) / len(ap_scores)

# def calculate_ndcg_at_k(y_true, y_pred, k):
#     sorted_pred_ids = list(y_pred.keys())[:k]
#     relevance_scores = [y_true.get(item_id, 0) for item_id in sorted_pred_ids]
    
#     def get_dcg(scores):
#         return sum(score / np.log2(idx + 2) 
#                    for idx, score in enumerate(scores))

#     dcg = get_dcg(relevance_scores)
    
#     ideal_relevance = sorted(y_true.values(), reverse=True)[:k]
#     idcg = get_dcg(ideal_relevance)
    
#     if idcg == 0:
#         return 0.0
    
#     return dcg / idcg

def calculate_ndcg_at_k(y_true, y_pred, k):
    y_true_scores = list(y_true.values())[:k]
    y_pred_scores = list(y_pred.values())[:k]

    return ndcg_score([y_true_scores], [y_pred_scores], k=k)

def precision_at_k(recommended_books, relevant_books, k):
    pred_k = list(recommended_books.keys())[:k]
    relevant = set(relevant_books.keys())

    hit = len([item for item in pred_k if item in relevant])
    return hit / k


def recall_at_k(recommended_books, relevant_books, k):
    pred_k = list(recommended_books.keys())[:k]
    relevant = set(relevant_books.keys())

    hit = len([item for item in pred_k if item in relevant])
    total_relevant = len(relevant_books)
    if total_relevant == 0:
        return 0.0
    return hit / len(relevant)



# Cosine

In [78]:
# Calculate metric top 5
import numpy as np
topk = 5
recall = []
precision = []
ap = []
ndcg = []
for user_id in list(user_item_matrix.index):
    if len(user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0]) > topk:
        X_train = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0].iloc[topk:].to_dict()
        X_test = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0].iloc[:topk].to_dict()

        recommended_books = recommended_books_topk(user_id,
                                           topk_similar=10,
                                           user_item_matrix=user_item_matrix,
                                           similarity_metric='cosine',
                                           number_of_item_recommend=topk,
                                           X_train=X_train)
        
        print(f"""User: {user_id} => 
              rec@{topk}: {recall_at_k(recommended_books, X_test, k=topk)}, pre@{topk} {precision_at_k(recommended_books, X_test, k=topk)}, ap@{topk}: {calculate_ap_at_k(recommended_books, X_test, k=topk)}, ndcg@{topk}: {calculate_ndcg_at_k(X_test, recommended_books, k=topk)}""")
        recall.append(recall_at_k(recommended_books, X_test, k=topk))
        precision.append(precision_at_k(recommended_books, X_test, k=topk))
        ap.append(calculate_ap_at_k(recommended_books, X_test, k=topk))
        ndcg.append(calculate_ndcg_at_k(X_test, recommended_books, k=topk))

print(f"Rec@{topk}: {(np.array(recall)).mean()}")
print(f"Pre@{topk}: {(np.array(precision)).mean()}")
print(f"MAP@{topk}: {(np.array(ap)).mean()}")
print(f"NDCG@{topk}: {(np.array(ndcg)).mean()}")



User: 2795 => 
              rec@5: 0.2, pre@5 0.2, ap@5: 1.0, ndcg@5: 0.7191589133038784
User: 10174 => 
              rec@5: 0.2, pre@5 0.2, ap@5: 1.0, ndcg@5: 0.911650072459567
User: 20073 => 
              rec@5: 1.0, pre@5 1.0, ap@5: 1.0, ndcg@5: 1.0
User: 20485 => 
              rec@5: 0.2, pre@5 0.2, ap@5: 1.0, ndcg@5: 0.9535409183388576
User: 26273 => 
              rec@5: 0.0, pre@5 0.0, ap@5: 0, ndcg@5: 0.8787836414936849
User: 39397 => 
              rec@5: 0.2, pre@5 0.2, ap@5: 1.0, ndcg@5: 0.9733469251140208
User: 60751 => 
              rec@5: 0.2, pre@5 0.2, ap@5: 1.0, ndcg@5: 0.800211520857695
User: 70533 => 
              rec@5: 0.2, pre@5 0.2, ap@5: 0.25, ndcg@5: 0.8729868289941036
User: 78901 => 
              rec@5: 0.4, pre@5 0.4, ap@5: 1.0, ndcg@5: 0.8111909098753992
User: 98653 => 
              rec@5: 1.0, pre@5 1.0, ap@5: 1.0, ndcg@5: 1.0
User: 118270 => 
              rec@5: 0.2, pre@5 0.2, ap@5: 1.0, ndcg@5: 0.9779901751999253
User: 120954 => 
              r

In [77]:
# Calculate metric top 10
import numpy as np
topk = 10
recall = []
precision = []
ap = []
ndcg = []
for user_id in list(user_item_matrix.index):
    if len(user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0]) > topk:
        X_train = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0].iloc[topk:].to_dict()
        X_test = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0].iloc[:topk].to_dict()


        recommended_books = recommended_books_topk(user_id,
                                           topk_similar=10,
                                           user_item_matrix=user_item_matrix,
                                           similarity_metric='cosine',
                                           number_of_item_recommend=topk,
                                           X_train=X_train)
        
        print(f"""User: {user_id} => 
              rec@{topk}: {recall_at_k(recommended_books, X_test, k=topk)}, pre@{topk} {precision_at_k(recommended_books, X_test, k=topk)}, ap@{topk}: {calculate_ap_at_k(recommended_books, X_test, k=topk)}, ndcg@{topk}: {calculate_ndcg_at_k(X_test, recommended_books, k=topk)}""")
        recall.append(recall_at_k(recommended_books, X_test, k=topk))
        precision.append(precision_at_k(recommended_books, X_test, k=topk))
        ap.append(calculate_ap_at_k(recommended_books, X_test, k=topk))
        ndcg.append(calculate_ndcg_at_k(X_test, recommended_books, k=topk))

print(f"Rec@{topk}: {(np.array(recall)).mean()}")
print(f"Pre@{topk}: {(np.array(precision)).mean()}")
print(f"MAP@{topk}: {(np.array(ap)).mean()}")
print(f"NDCG@{topk}: {(np.array(ndcg)).mean()}")



User: 2795 => 
              rec@10: 0.1, pre@10 0.1, ap@10: 1.0, ndcg@10: 0.7409102438138027
User: 10174 => 
              rec@10: 0.1, pre@10 0.1, ap@10: 1.0, ndcg@10: 0.897498986072591
User: 20073 => 
              rec@10: 0.6, pre@10 0.6, ap@10: 1.0, ndcg@10: 0.9489283159232522
User: 20485 => 
              rec@10: 0.1, pre@10 0.1, ap@10: 1.0, ndcg@10: 0.9412790812378725
User: 26273 => 
              rec@10: 0.1, pre@10 0.1, ap@10: 1.0, ndcg@10: 0.7593584164583503
User: 39397 => 
              rec@10: 0.1, pre@10 0.1, ap@10: 1.0, ndcg@10: 0.9355879004362353
User: 60751 => 
              rec@10: 0.1, pre@10 0.1, ap@10: 1.0, ndcg@10: 0.7919601108321268
User: 70533 => 
              rec@10: 0.1, pre@10 0.1, ap@10: 0.25, ndcg@10: 0.8686555389977063
User: 78901 => 
              rec@10: 0.2, pre@10 0.2, ap@10: 1.0, ndcg@10: 0.8194777118549064
User: 118270 => 
              rec@10: 0.1, pre@10 0.1, ap@10: 1.0, ndcg@10: 0.9485245922123161
User: 174783 => 
              rec@10: 0.1, pre@10

# Pearson

In [12]:
# Calculate metric top 5
import numpy as np
topk = 5
recall = []
precision = []
ap = []
ndcg = []
for user_id in list(user_item_matrix.index):
    if len(user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0]) > topk:
        X_train = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0].iloc[topk:].to_dict()
        X_test = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0].iloc[:topk].to_dict()

        recommended_books = recommended_books_topk(user_id,
                                           topk_similar=10,
                                           user_item_matrix=user_item_matrix,
                                           similarity_metric='pearson',
                                           number_of_item_recommend=topk,
                                           X_train=X_train)
        
        print(f"""User: {user_id} => 
              rec@{topk}: {recall_at_k(recommended_books, X_test, k=topk)}, pre@{topk} {precision_at_k(recommended_books, X_test, k=topk)}, ap@{topk}: {calculate_ap_at_k(recommended_books, X_test, k=topk)}, ndcg@{topk}: {calculate_ndcg_at_k(X_test, recommended_books, k=topk)}""")
        recall.append(recall_at_k(recommended_books, X_test, k=topk))
        precision.append(precision_at_k(recommended_books, X_test, k=topk))
        ap.append(calculate_ap_at_k(recommended_books, X_test, k=topk))
        ndcg.append(calculate_ndcg_at_k(X_test, recommended_books, k=topk))

print(f"Rec@{topk}: {(np.array(recall)).mean()}")
print(f"Pre@{topk}: {(np.array(precision)).mean()}")
print(f"MAP@{topk}: {(np.array(ap)).mean()}")
print(f"NDCG@{topk}: {(np.array(ndcg)).mean()}")



User: 2795 => 
              rec@5: 0.2, pre@5 0.2, ap@5: 1.0, ndcg@5: 0.7191589133038784
User: 10174 => 
              rec@5: 0.2, pre@5 0.2, ap@5: 1.0, ndcg@5: 0.911650072459567
User: 20073 => 
              rec@5: 1.0, pre@5 1.0, ap@5: 1.0, ndcg@5: 1.0
User: 20485 => 
              rec@5: 0.2, pre@5 0.2, ap@5: 1.0, ndcg@5: 0.9535409183388576
User: 26273 => 
              rec@5: 0.0, pre@5 0.0, ap@5: 0, ndcg@5: 0.8787836414936849
User: 39397 => 
              rec@5: 0.2, pre@5 0.2, ap@5: 1.0, ndcg@5: 0.9733469251140208
User: 60751 => 
              rec@5: 0.2, pre@5 0.2, ap@5: 1.0, ndcg@5: 0.800211520857695
User: 70533 => 
              rec@5: 0.2, pre@5 0.2, ap@5: 0.25, ndcg@5: 0.8729868289941036
User: 78901 => 
              rec@5: 0.4, pre@5 0.4, ap@5: 1.0, ndcg@5: 0.8111909098753992
User: 98653 => 
              rec@5: 1.0, pre@5 1.0, ap@5: 1.0, ndcg@5: 1.0
User: 118270 => 
              rec@5: 0.2, pre@5 0.2, ap@5: 1.0, ndcg@5: 0.9779901751999253
User: 120954 => 
              r

In [13]:
# Calculate metric top 10
import numpy as np
topk = 10
recall = []
precision = []
ap = []
ndcg = []
for user_id in list(user_item_matrix.index):
    if len(user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0]) > topk:
        X_train = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0].iloc[topk:].to_dict()
        X_test = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0].iloc[:topk].to_dict()


        recommended_books = recommended_books_topk(user_id,
                                           topk_similar=10,
                                           user_item_matrix=user_item_matrix,
                                           similarity_metric='pearson',
                                           number_of_item_recommend=topk,
                                           X_train=X_train)
        
        print(f"""User: {user_id} => 
              rec@{topk}: {recall_at_k(recommended_books, X_test, k=topk)}, pre@{topk} {precision_at_k(recommended_books, X_test, k=topk)}, ap@{topk}: {calculate_ap_at_k(recommended_books, X_test, k=topk)}, ndcg@{topk}: {calculate_ndcg_at_k(X_test, recommended_books, k=topk)}""")
        recall.append(recall_at_k(recommended_books, X_test, k=topk))
        precision.append(precision_at_k(recommended_books, X_test, k=topk))
        ap.append(calculate_ap_at_k(recommended_books, X_test, k=topk))
        ndcg.append(calculate_ndcg_at_k(X_test, recommended_books, k=topk))

print(f"Rec@{topk}: {(np.array(recall)).mean()}")
print(f"Pre@{topk}: {(np.array(precision)).mean()}")
print(f"MAP@{topk}: {(np.array(ap)).mean()}")
print(f"NDCG@{topk}: {(np.array(ndcg)).mean()}")



User: 2795 => 
              rec@10: 0.1, pre@10 0.1, ap@10: 1.0, ndcg@10: 0.7409102438138027
User: 10174 => 
              rec@10: 0.1, pre@10 0.1, ap@10: 1.0, ndcg@10: 0.897498986072591
User: 20073 => 
              rec@10: 0.5, pre@10 0.5, ap@10: 0.925, ndcg@10: 0.951008807975839
User: 20485 => 
              rec@10: 0.1, pre@10 0.1, ap@10: 1.0, ndcg@10: 0.9412790812378725
User: 26273 => 
              rec@10: 0.1, pre@10 0.1, ap@10: 1.0, ndcg@10: 0.7593584164583503
User: 39397 => 
              rec@10: 0.1, pre@10 0.1, ap@10: 1.0, ndcg@10: 0.9355879004362353
User: 60751 => 
              rec@10: 0.1, pre@10 0.1, ap@10: 1.0, ndcg@10: 0.7919601108321268
User: 70533 => 
              rec@10: 0.1, pre@10 0.1, ap@10: 0.25, ndcg@10: 0.8686555389977063
User: 78901 => 
              rec@10: 0.2, pre@10 0.2, ap@10: 1.0, ndcg@10: 0.8194777118549064
User: 118270 => 
              rec@10: 0.1, pre@10 0.1, ap@10: 1.0, ndcg@10: 0.9485245922123161
User: 174783 => 
              rec@10: 0.1, pre@1