In [1]:
import pandas as pd
import numpy as np

In [2]:
review = pd.read_csv(r'data\final\final_interactions.csv')

In [3]:
review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1336 entries, 0 to 1335
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   customer_id     1336 non-null   int64 
 1   product_id      1336 non-null   int64 
 2   rating          1336 non-null   int64 
 3   content         451 non-null    object
 4   title           1336 non-null   object
 5   thank_count     1336 non-null   int64 
 6   customer_index  1336 non-null   int64 
 7   product_index   1336 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 83.6+ KB


# User Based

In [4]:
user_item_matrix = review.pivot_table(index='customer_id', columns='product_id', values='rating')
user_item_matrix.fillna(0, inplace=True)
user_item_matrix

product_id,442424,477566,486536,560565,561056,578262,599037,734872,807719,1353611,...,272877866,273273943,273386447,273690134,273822779,274927641,275337800,275866978,276289479,278095411
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2415,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11359,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15312,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42438,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28001953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
28056883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
28059010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28420073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
test_size = 0.2

X_train = user_item_matrix.sample(frac=1 - test_size, random_state=42)
X_test = user_item_matrix.drop(X_train.index)

X_train.shape, X_test.shape

((278, 177), (70, 177))

In [6]:
def get_ratings(matrix, user_id, movie_id):
    return matrix.loc[user_id,movie_id]

In [7]:
def get_cosine_similarity(user1_ratings, user2_ratings):

    """
    Get user1_ratings is vector of user1 ratings
    Get user2_ratings is vector of user2 ratings
    """

    dot_product = np.dot(user1_ratings, user2_ratings)
    norm_user1 = np.linalg.norm(user1_ratings)
    norm_user2 = np.linalg.norm(user2_ratings)

    result = dot_product / (norm_user1 * norm_user2)
    return result

def get_pearson_similarity(user1_ratings, user2_ratings):
    """
    Get user1_ratings is vector of user1 ratings
    Get user2_ratings is vector of user2 ratings
    """

    mean_user1 = np.mean(user1_ratings)
    mean_user2 = np.mean(user2_ratings)

    user1_ratings_centered = user1_ratings - mean_user1
    user2_ratings_centered = user2_ratings - mean_user2

    dot_product = np.dot(user1_ratings_centered, user2_ratings_centered)

    norm_user1 = np.linalg.norm(user1_ratings_centered)
    norm_user2 = np.linalg.norm(user2_ratings_centered)

    result = dot_product / (norm_user1 * norm_user2)
    return result


In [8]:
def get_top_k_similar_users(target_user, k, matrix, similarity_metric):
    """
    target_user: vector dimension of target user ratings
    k: number of similar users to return
    similarity_metric: 'cosine' or 'pearson'
    matrix: user-item train rating matrix 
    """
    similarity = []
    target_user_embedding = matrix.loc[target_user].to_numpy()

    for user in matrix.index:
        user_rating_matrix = matrix.loc[user].to_numpy()
        if similarity_metric == 'cosine':
            sim = get_cosine_similarity(target_user_embedding, user_rating_matrix)
        elif similarity_metric == 'pearson':
            sim = get_pearson_similarity(target_user_embedding, user_rating_matrix)
        else:
            raise ValueError("Invalid similarity metric. Choose 'cosine' or 'pearson'.")
        similarity.append((user, sim))

    # Sort users by similarity score and select top k
    top_k = sorted(similarity, key=lambda x: x[1], reverse=True)[:k]
    return top_k


In [9]:
def predict_rating_film(target_user, target_film, matrix, topk_users, list_books=None):
    
    book_user_watched = matrix.loc[target_user][matrix.loc[target_user].index.isin(list_books)]
    mean_rating_target_user = book_user_watched.mean()
    dot_product = 0
    for k in range(len(topk_users)):
        userid, sim_score = topk_users[k]
        mean_userid = np.mean(matrix.loc[userid][matrix.loc[userid] != 0])

        result = sim_score * (get_ratings(matrix, movie_id=target_film, user_id=userid) - mean_userid)
        dot_product += result
    total_similarity = sum(sim_score for _, sim_score in topk_users)

    rating = mean_rating_target_user + (dot_product / total_similarity)
    
    return (target_film, rating)

In [10]:
def recommended_books_topk(target_user,
                            topk_similar, 
                            user_item_matrix, 
                            similarity_metric,
                            number_of_item_recommend,
                            X_train):
    topk_users = get_top_k_similar_users(target_user=target_user, k=topk_similar, matrix=user_item_matrix, similarity_metric=similarity_metric)

    rating_predict = {}
    for book in user_item_matrix.columns:
        if book not in X_train.keys():
            predicted_rating = predict_rating_film(target_user=target_user, target_film=book, matrix=user_item_matrix, topk_users=topk_users, list_books=X_train.keys())
            if predicted_rating[1] > 0:
                rating_predict[book] = predicted_rating[1]

    recommended_books = dict(sorted(rating_predict.items(), key=lambda x: x[1], reverse=True)[:number_of_item_recommend])

    return recommended_books

In [15]:
def calculate_ap_at_k(user_recommendations, relevant_items, k):
    precision_sum = 0
    relevant_count = 0
    for i, book_id in enumerate(user_recommendations):
        if book_id in relevant_items:
            relevant_count += 1
            precision_sum += relevant_count / (i + 1)
    if relevant_count == 0:
        return 0
    return precision_sum / len(relevant_items)

def calculate_map_at_k(user_recommendations, user_relevant_items, k):
    ap_scores = []
    ap_scores.append(calculate_ap_at_k(list(user_recommendations.keys()), list(user_relevant_items.keys()), k))
    return sum(ap_scores) / len(ap_scores)

def calculate_ndcg_at_k(y_true, y_pred, k):
    sorted_pred_ids = list(y_pred.keys())[:k]
    relevance_scores = [y_pred.get(item_id, 0) for item_id in sorted_pred_ids]
    
    def get_dcg(scores):
        return sum(score / np.log2(idx + 2) 
                   for idx, score in enumerate(scores))

    dcg = get_dcg(relevance_scores)
    
    ideal_relevance = sorted(y_pred.values(), reverse=True)[:k]
    idcg = get_dcg(ideal_relevance)
    
    if idcg == 0:
        return 0.0
    
    return dcg / idcg

def precision_at_k(recommended_books, relevant_books, k):
    pred_k = list(recommended_books.keys())[:k]
    relevant = list(relevant_books.keys())[:k]

    hit = len([item for item in pred_k if item in relevant])
    return hit / k


def recall_at_k(recommended_books, relevant_books, k):
    pred_k = list(recommended_books.keys())[:k]
    relevant = list(relevant_books.keys())

    hit = len([item for item in pred_k if item in relevant])
    total_relevant = len(relevant_books)
    if total_relevant == 0:
        return 0.0
    return hit / len(relevant)



In [16]:
# Calculate metric top 5
import numpy as np
topk = 5
recall = []
precision = []
map = []
ndcg = []
for user_id in list(user_item_matrix.index):
    if len(user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0]) > topk:
        X_train = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0].iloc[topk:].to_dict()
        X_test = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0].iloc[:topk].to_dict()

        recommended_books = recommended_books_topk(user_id,
                                           topk_similar=10,
                                           user_item_matrix=user_item_matrix,
                                           similarity_metric='cosine',
                                           number_of_item_recommend=topk,
                                           X_train=X_train)
        
        print(f"""User: {user_id} => 
              rec@{topk}: {recall_at_k(recommended_books, X_test, k=topk)}, pre@{topk} {precision_at_k(recommended_books, X_test, k=topk)}, map@{topk}: {calculate_map_at_k(recommended_books, X_test, k=topk)}, ndcg@{topk}: {calculate_ndcg_at_k(X_test, recommended_books, k=topk)}""")
        recall.append(recall_at_k(recommended_books, X_test, k=topk))
        precision.append(precision_at_k(recommended_books, X_test, k=topk))
        map.append(calculate_map_at_k(recommended_books, X_test, k=topk))
        ndcg.append(calculate_ndcg_at_k(X_test, recommended_books, k=topk))

print(f"Rec@{topk}: {(np.array(recall)).mean()}")
print(f"Pre@{topk}: {(np.array(precision)).mean()}")
print(f"MAP@{topk}: {(np.array(map)).mean()}")
print(f"NDCG@{topk}: {(np.array(ndcg)).mean()}")



User: 98653 => 
              rec@5: 1.0, pre@5 1.0, map@5: 1.0, ndcg@5: 1.0
User: 306044 => 
              rec@5: 1.0, pre@5 1.0, map@5: 1.0, ndcg@5: 1.0
User: 642183 => 
              rec@5: 1.0, pre@5 1.0, map@5: 1.0, ndcg@5: 1.0
User: 1039840 => 
              rec@5: 1.0, pre@5 1.0, map@5: 1.0, ndcg@5: 1.0
User: 1118879 => 
              rec@5: 1.0, pre@5 1.0, map@5: 1.0, ndcg@5: 1.0
User: 1428415 => 
              rec@5: 1.0, pre@5 1.0, map@5: 1.0, ndcg@5: 1.0
User: 1503284 => 
              rec@5: 1.0, pre@5 1.0, map@5: 1.0, ndcg@5: 1.0
User: 2052582 => 
              rec@5: 1.0, pre@5 1.0, map@5: 1.0, ndcg@5: 1.0
User: 2065541 => 
              rec@5: 1.0, pre@5 1.0, map@5: 1.0, ndcg@5: 1.0
User: 5253296 => 
              rec@5: 1.0, pre@5 1.0, map@5: 1.0, ndcg@5: 1.0
User: 5917418 => 
              rec@5: 1.0, pre@5 1.0, map@5: 1.0, ndcg@5: 1.0
User: 6301744 => 
              rec@5: 1.0, pre@5 1.0, map@5: 1.0, ndcg@5: 1.0
User: 7333002 => 
              rec@5: 1.0, pre@5 1.0, m

In [17]:
# Calculate metric top 10
import numpy as np
topk = 10
recall = []
precision = []
map = []
ndcg = []
for user_id in list(user_item_matrix.index):
    if len(user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0]) > topk:
        X_train = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0].iloc[topk:].to_dict()
        X_test = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0].iloc[:topk].to_dict()

        recommended_books = recommended_books_topk(user_id,
                                           topk_similar=10,
                                           user_item_matrix=user_item_matrix,
                                           similarity_metric='cosine',
                                           number_of_item_recommend=topk,
                                           X_train=X_train)
        
        print(f"""User: {user_id} => 
              rec@{topk}: {recall_at_k(recommended_books, X_test, k=topk)}, pre@{topk} {precision_at_k(recommended_books, X_test, k=topk)}, map@{topk}: {calculate_map_at_k(recommended_books, X_test, k=topk)}, ndcg@{topk}: {calculate_ndcg_at_k(X_test, recommended_books, k=topk)}""")
        recall.append(recall_at_k(recommended_books, X_test, k=topk))
        precision.append(precision_at_k(recommended_books, X_test, k=topk))
        map.append(calculate_map_at_k(recommended_books, X_test, k=topk))
        ndcg.append(calculate_ndcg_at_k(X_test, recommended_books, k=topk))

print(f"Rec@{topk}: {(np.array(recall)).mean()}")
print(f"Pre@{topk}: {(np.array(precision)).mean()}")
print(f"MAP@{topk}: {(np.array(map)).mean()}")
print(f"NDCG@{topk}: {(np.array(ndcg)).mean()}")



User: 5253296 => 
              rec@10: 1.0, pre@10 1.0, map@10: 1.0, ndcg@10: 1.0
User: 13513176 => 
              rec@10: 1.0, pre@10 1.0, map@10: 1.0, ndcg@10: 1.0
User: 17506038 => 
              rec@10: 1.0, pre@10 1.0, map@10: 1.0, ndcg@10: 1.0
User: 18132503 => 
              rec@10: 1.0, pre@10 1.0, map@10: 1.0, ndcg@10: 1.0
User: 18278431 => 
              rec@10: 1.0, pre@10 1.0, map@10: 1.0, ndcg@10: 1.0
Rec@10: 1.0
Pre@10: 1.0
MAP@10: 1.0
NDCG@10: 1.0


In [18]:
# Calculate metric top 5
import numpy as np
topk = 15
recall = []
precision = []
map = []
ndcg = []
for user_id in list(user_item_matrix.index):
    if len(user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0]) > topk:
        X_train = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0].iloc[topk:].to_dict()
        X_test = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0].iloc[:topk].to_dict()

        recommended_books = recommended_books_topk(user_id,
                                           topk_similar=10,
                                           user_item_matrix=user_item_matrix,
                                           similarity_metric='cosine',
                                           number_of_item_recommend=topk,
                                           X_train=X_train)
        
        print(f"""User: {user_id} => 
              rec@{topk}: {recall_at_k(recommended_books, X_test, k=topk)}, pre@{topk} {precision_at_k(recommended_books, X_test, k=topk)}, map@{topk}: {calculate_map_at_k(recommended_books, X_test, k=topk)}, ndcg@{topk}: {calculate_ndcg_at_k(X_test, recommended_books, k=topk)}""")
        recall.append(recall_at_k(recommended_books, X_test, k=topk))
        precision.append(precision_at_k(recommended_books, X_test, k=topk))
        map.append(calculate_map_at_k(recommended_books, X_test, k=topk))
        ndcg.append(calculate_ndcg_at_k(X_test, recommended_books, k=topk))

print(f"Rec@{topk}: {(np.array(recall)).mean()}")
print(f"Pre@{topk}: {(np.array(precision)).mean()}")
print(f"MAP@{topk}: {(np.array(map)).mean()}")
print(f"NDCG@{topk}: {(np.array(ndcg)).mean()}")



User: 17506038 => 
              rec@15: 1.0, pre@15 1.0, map@15: 1.0, ndcg@15: 1.0
Rec@15: 1.0
Pre@15: 1.0
MAP@15: 1.0
NDCG@15: 1.0
