In [1]:
import pandas as pd
import numpy as np

In [2]:
review = pd.read_csv(r'data\final\final_interactions.csv')

In [3]:
review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5362 entries, 0 to 5361
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   customer_id     5362 non-null   int64 
 1   product_id      5362 non-null   int64 
 2   rating          5362 non-null   int64 
 3   content         4009 non-null   object
 4   customer_index  5362 non-null   int64 
 5   product_index   5362 non-null   int64 
dtypes: int64(5), object(1)
memory usage: 251.5+ KB


In [4]:
review_gr = review.groupby('product_id').count()
review_gr[review_gr['customer_id'] >= 5]

Unnamed: 0_level_0,customer_id,rating,content,customer_index,product_index
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
316018,10,10,9,10,10
322063,5,5,5,5,5
322074,7,7,5,7,7
324406,5,5,5,5,5
324681,6,6,5,6,6
...,...,...,...,...,...
278023802,5,5,5,5,5
278023804,5,5,5,5,5
278023824,6,6,6,6,6
278037699,5,5,5,5,5


# Item Based

In [5]:
user_item_matrix = review.pivot_table(index='product_id', columns='customer_id', values='rating')
user_item_matrix.fillna(0, inplace=True)
user_item_matrix

customer_id,2415,2795,10174,11359,15195,15312,20073,20485,26273,39397,...,28217026,28420073,28441861,28499196,28756461,28876741,29550533,29990552,30035160,30200011
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
316018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
316890,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
322063,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
322074,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
324406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278045562,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278060816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278228992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278237797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0


In [6]:
def get_ratings(matrix, user_id, movie_id):
    return matrix.loc[movie_id,user_id]

In [7]:
def get_cosine_similarity(item1_ratings, item2_ratings):

    """
    Get item1_ratings is vector of item1 ratings
    Get item2_ratings is vector of item2 ratings
    """

    dot_product = np.dot(item1_ratings, item2_ratings)
    norm_item1 = np.linalg.norm(item1_ratings)
    norm_item2 = np.linalg.norm(item2_ratings)

    result = dot_product / (norm_item1 * norm_item2)
    return result

def get_pearson_similarity(item1_ratings, item2_ratings):
    """
    Get item1_ratings is vector of item1 ratings
    Get item2_ratings is vector of item2 ratings
    """

    mean_item1 = np.mean(item1_ratings)
    mean_item2 = np.mean(item2_ratings)

    item1_ratings_centered = item1_ratings - mean_item1
    item2_ratings_centered = item2_ratings - mean_item2

    dot_product = np.dot(item1_ratings_centered, item2_ratings_centered)

    norm_item1 = np.linalg.norm(item1_ratings_centered)
    norm_item2 = np.linalg.norm(item2_ratings_centered)

    result = dot_product / (norm_item1 * norm_item2)
    return result


In [8]:
def get_top_k_similar_items(target_item, k, matrix, similarity_metric):
    """
    target_item: vector dimension of target user ratings
    k: number of similar items to return
    similarity_metric: 'cosine' or 'pearson'
    matrix: user-item train rating matrix 
    """
    similarity = []
    target_item_embedding = matrix.loc[target_item].to_numpy()

    for item in matrix.index:
        item_rating_matrix = matrix.loc[item].to_numpy()
        if similarity_metric == 'cosine':
            sim = get_cosine_similarity(target_item_embedding, item_rating_matrix)
        elif similarity_metric == 'pearson':
            sim = get_pearson_similarity(target_item_embedding, item_rating_matrix)
        else:
            raise ValueError("Invalid similarity metric. Choose 'cosine' or 'pearson'.")
        similarity.append((item, sim))

    # Sort items by similarity score and select top k
    top_k = sorted(similarity, key=lambda x: x[1], reverse=True)[:k]
    return top_k


In [14]:
def predict_rating_film(target_user, target_film, matrix, topk_items, list_users=None):

    min_rating = 1
    
    book_user_watched = matrix.loc[target_film][matrix.loc[target_film].index.isin(list_users)]
    mean_rating_target_item = book_user_watched.mean()
    dot_product = 0
    for k in range(len(topk_items)):
        itemid, sim_score = topk_items[k]
        mean_itemid = np.mean(matrix.loc[itemid][matrix.loc[itemid] != 0])

        result = sim_score * (get_ratings(matrix, movie_id=itemid, user_id=target_user) - mean_itemid)
        dot_product += result
    total_similarity = sum(sim_score for _, sim_score in topk_items)

    rating = mean_rating_target_item + (dot_product / total_similarity)

    rating = max(min_rating, rating)
    
    return (target_film, rating)

In [15]:
def recommended_users_topk(target_item,
                            topk_similar, 
                            user_item_matrix, 
                            similarity_metric,
                            number_of_user_recommend,
                            X_train):
    topk_items = get_top_k_similar_items(target_item=target_item, k=topk_similar, matrix=user_item_matrix, similarity_metric=similarity_metric)

    rating_predict = {}
    for user in user_item_matrix.columns:
        if user not in X_train.keys():
            predicted_rating = predict_rating_film(target_user=user, target_film=target_item, matrix=user_item_matrix, topk_items=topk_items, list_users=X_train.keys())
            if predicted_rating[1] > 0:
                rating_predict[user] = predicted_rating[1]

    recommended_users = dict(sorted(rating_predict.items(), key=lambda x: x[1], reverse=True)[:number_of_user_recommend])
    return recommended_users

In [16]:
from sklearn.metrics import ndcg_score

def calculate_ap_at_k(user_recommendations, relevant_items, k):
    precision_sum = 0
    relevant_count = 0

    recommend_list = list(user_recommendations.keys())[:k]
    real_list = list(relevant_items.keys())
    for i, book_id in enumerate(recommend_list):
        if book_id in real_list:
            relevant_count += 1
            precision_sum += relevant_count / (i + 1)
    if relevant_count == 0:
        return 0
    return precision_sum / relevant_count

def calculate_ndcg_at_k(y_true, y_pred, k):
    y_true_scores = list(y_true.values())
    y_pred_scores = list(y_pred.values())

    return ndcg_score([y_true_scores], [y_pred_scores], k=k)

def precision_at_k(recommended_books, relevant_books, k):
    pred_k = list(recommended_books.keys())[:k]
    relevant = set(relevant_books.keys())

    hit = len([item for item in pred_k if item in relevant])
    return hit / k

def recall_at_k(recommended_books, relevant_books, k):
    pred_k = list(recommended_books.keys())[:k]
    relevant = set(relevant_books.keys())

    hit = len([item for item in pred_k if item in relevant])
    total_relevant = len(relevant_books)
    if total_relevant == 0:
        return 0.0
    return hit / len(relevant)



# Cosine

In [17]:
# Calculate metric top 5
import numpy as np
topk = 5
recall = []
precision = []
ap = []
ndcg = []
for book_id in list(user_item_matrix.index):
    if len(user_item_matrix.loc[book_id][user_item_matrix.loc[book_id] != 0]) > topk:
        X_train = user_item_matrix.loc[book_id][user_item_matrix.loc[book_id] != 0].sample(frac=0.2, random_state=42).to_dict()
        X_test = user_item_matrix.loc[book_id][user_item_matrix.loc[book_id] != 0].drop(X_train.keys()).to_dict()

        recommended_users = recommended_users_topk(book_id,
                                           topk_similar=10,
                                           user_item_matrix=user_item_matrix,
                                           similarity_metric='cosine',
                                           number_of_user_recommend=len(X_test),
                                           X_train=X_train)
        
        print(f"""Book: {book_id} => 
              rec@{topk}: {recall_at_k(recommended_users, X_test, k=topk)}, pre@{topk} {precision_at_k(recommended_users, X_test, k=topk)}, ap@{topk}: {calculate_ap_at_k(recommended_users, X_test, k=topk)}, ndcg@{topk}: {calculate_ndcg_at_k(X_test, recommended_users, k=topk)}""")
        
        recall.append(recall_at_k(recommended_users, X_test, k=topk))
        precision.append(precision_at_k(recommended_users, X_test, k=topk))
        ap.append(calculate_ap_at_k(recommended_users, X_test, k=topk))
        ndcg.append(calculate_ndcg_at_k(X_test, recommended_users, k=topk))

print(f"Rec@{topk}: {(np.array(recall)).mean()}")
print(f"Pre@{topk}: {(np.array(precision)).mean()}")
print(f"MAP@{topk}: {(np.array(ap)).mean()}")
print(f"NDCG@{topk}: {(np.array(ndcg)).mean()}")



Book: 316018 => 
              rec@5: 0.625, pre@5 1.0, ap@5: 1.0, ndcg@5: 0.55024105647494
Book: 322074 => 
              rec@5: 0.8333333333333334, pre@5 1.0, ap@5: 1.0, ndcg@5: 0.8531417781514009
Book: 324681 => 
              rec@5: 0.6, pre@5 0.6, ap@5: 1.0, ndcg@5: 0.9845228870423038
Book: 324688 => 
              rec@5: 0.5555555555555556, pre@5 1.0, ap@5: 1.0, ndcg@5: 0.9663037991936972
Book: 335315 => 
              rec@5: 0.2, pre@5 0.2, ap@5: 1.0, ndcg@5: 0.9557802964634485
Book: 337881 => 
              rec@5: 1.0, pre@5 1.0, ap@5: 1.0, ndcg@5: 0.9493428345359353
Book: 340161 => 
              rec@5: 0.7142857142857143, pre@5 1.0, ap@5: 1.0, ndcg@5: 0.8956102901191864
Book: 340167 => 
              rec@5: 0.4166666666666667, pre@5 1.0, ap@5: 1.0, ndcg@5: 0.9660839794726382
Book: 342428 => 
              rec@5: 0.2777777777777778, pre@5 1.0, ap@5: 1.0, ndcg@5: 0.8678320410547231
Book: 342896 => 
              rec@5: 0.5, pre@5 0.6, ap@5: 0.8666666666666667, ndcg@5: 0.7782749

In [18]:
# Calculate metric top 5
import numpy as np
topk = 10
recall = []
precision = []
ap = []
ndcg = []
for book_id in list(user_item_matrix.index):
    if len(user_item_matrix.loc[book_id][user_item_matrix.loc[book_id] != 0]) > topk:
        X_train = user_item_matrix.loc[book_id][user_item_matrix.loc[book_id] != 0].sample(frac=0.2, random_state=42).to_dict()
        X_test = user_item_matrix.loc[book_id][user_item_matrix.loc[book_id] != 0].drop(X_train.keys()).to_dict()

        recommended_users = recommended_users_topk(book_id,
                                           topk_similar=10,
                                           user_item_matrix=user_item_matrix,
                                           similarity_metric='cosine',
                                           number_of_user_recommend=len(X_test),
                                           X_train=X_train)
        
        print(f"""Book: {book_id} => 
              rec@{topk}: {recall_at_k(recommended_users, X_test, k=topk)}, pre@{topk} {precision_at_k(recommended_users, X_test, k=topk)}, ap@{topk}: {calculate_ap_at_k(recommended_users, X_test, k=topk)}, ndcg@{topk}: {calculate_ndcg_at_k(X_test, recommended_users, k=topk)}""")
        
        recall.append(recall_at_k(recommended_users, X_test, k=topk))
        precision.append(precision_at_k(recommended_users, X_test, k=topk))
        ap.append(calculate_ap_at_k(recommended_users, X_test, k=topk))
        ndcg.append(calculate_ndcg_at_k(X_test, recommended_users, k=topk))

print(f"Rec@{topk}: {(np.array(recall)).mean()}")
print(f"Pre@{topk}: {(np.array(precision)).mean()}")
print(f"MAP@{topk}: {(np.array(ap)).mean()}")
print(f"NDCG@{topk}: {(np.array(ndcg)).mean()}")



Book: 324688 => 
              rec@10: 0.7777777777777778, pre@10 0.7, ap@10: 1.0, ndcg@10: 0.9845070500021217
Book: 340167 => 
              rec@10: 0.5, pre@10 0.6, ap@10: 1.0, ndcg@10: 0.9623037420197135
Book: 342428 => 
              rec@10: 0.5555555555555556, pre@10 1.0, ap@10: 1.0, ndcg@10: 0.8440183532596159
Book: 353323 => 
              rec@10: 0.8333333333333334, pre@10 1.0, ap@10: 1.0, ndcg@10: 0.8826535758118728
Book: 367635 => 
              rec@10: 0.5454545454545454, pre@10 0.6, ap@10: 1.0, ndcg@10: 0.959465064009097
Book: 599037 => 
              rec@10: 0.5, pre@10 1.0, ap@10: 1.0, ndcg@10: 0.9296310976946672
Book: 614801 => 
              rec@10: 0.3225806451612903, pre@10 1.0, ap@10: 1.0, ndcg@10: 0.9829713764716758
Book: 619275 => 
              rec@10: 0.7692307692307693, pre@10 1.0, ap@10: 1.0, ndcg@10: 0.890432314852872
Book: 828647 => 
              rec@10: 0.3, pre@10 0.3, ap@10: 1.0, ndcg@10: 0.9473191165900232
Book: 1353611 => 
              rec@10: 0.666666

# Pearson

In [19]:
# Calculate metric top 5
import numpy as np
topk = 5
recall = []
precision = []
ap = []
ndcg = []
for book_id in list(user_item_matrix.index):
    if len(user_item_matrix.loc[book_id][user_item_matrix.loc[book_id] != 0]) > topk:
        X_train = user_item_matrix.loc[book_id][user_item_matrix.loc[book_id] != 0].sample(frac=0.2, random_state=42).to_dict()
        X_test = user_item_matrix.loc[book_id][user_item_matrix.loc[book_id] != 0].drop(X_train.keys()).to_dict()

        recommended_users = recommended_users_topk(book_id,
                                           topk_similar=10,
                                           user_item_matrix=user_item_matrix,
                                           similarity_metric='pearson',
                                           number_of_user_recommend=len(X_test),
                                           X_train=X_train)
        
        print(f"""Book: {book_id} => 
              rec@{topk}: {recall_at_k(recommended_users, X_test, k=topk)}, pre@{topk} {precision_at_k(recommended_users, X_test, k=topk)}, ap@{topk}: {calculate_ap_at_k(recommended_users, X_test, k=topk)}, ndcg@{topk}: {calculate_ndcg_at_k(X_test, recommended_users, k=topk)}""")
        
        recall.append(recall_at_k(recommended_users, X_test, k=topk))
        precision.append(precision_at_k(recommended_users, X_test, k=topk))
        ap.append(calculate_ap_at_k(recommended_users, X_test, k=topk))
        ndcg.append(calculate_ndcg_at_k(X_test, recommended_users, k=topk))

print(f"Rec@{topk}: {(np.array(recall)).mean()}")
print(f"Pre@{topk}: {(np.array(precision)).mean()}")
print(f"MAP@{topk}: {(np.array(ap)).mean()}")
print(f"NDCG@{topk}: {(np.array(ndcg)).mean()}")



Book: 316018 => 
              rec@5: 0.625, pre@5 1.0, ap@5: 1.0, ndcg@5: 0.55024105647494
Book: 322074 => 
              rec@5: 0.8333333333333334, pre@5 1.0, ap@5: 1.0, ndcg@5: 0.8531417781514009
Book: 324681 => 
              rec@5: 0.6, pre@5 0.6, ap@5: 1.0, ndcg@5: 0.9845228870423038
Book: 324688 => 
              rec@5: 0.5555555555555556, pre@5 1.0, ap@5: 1.0, ndcg@5: 0.9663037991936972
Book: 335315 => 
              rec@5: 0.2, pre@5 0.2, ap@5: 1.0, ndcg@5: 0.9557802964634485
Book: 337881 => 
              rec@5: 1.0, pre@5 1.0, ap@5: 1.0, ndcg@5: 0.9493428345359353
Book: 340161 => 
              rec@5: 0.7142857142857143, pre@5 1.0, ap@5: 1.0, ndcg@5: 0.8956102901191864
Book: 340167 => 
              rec@5: 0.4166666666666667, pre@5 1.0, ap@5: 1.0, ndcg@5: 0.9660839794726382
Book: 342428 => 
              rec@5: 0.2777777777777778, pre@5 1.0, ap@5: 1.0, ndcg@5: 0.7791605959910606
Book: 342896 => 
              rec@5: 0.5, pre@5 0.6, ap@5: 0.8666666666666667, ndcg@5: 0.7782749

In [20]:
# Calculate metric top 5
import numpy as np
topk = 10
recall = []
precision = []
ap = []
ndcg = []
for book_id in list(user_item_matrix.index):
    if len(user_item_matrix.loc[book_id][user_item_matrix.loc[book_id] != 0]) > topk:
        X_train = user_item_matrix.loc[book_id][user_item_matrix.loc[book_id] != 0].sample(frac=0.2, random_state=42).to_dict()
        X_test = user_item_matrix.loc[book_id][user_item_matrix.loc[book_id] != 0].drop(X_train.keys()).to_dict()

        recommended_users = recommended_users_topk(book_id,
                                           topk_similar=10,
                                           user_item_matrix=user_item_matrix,
                                           similarity_metric='pearson',
                                           number_of_user_recommend=len(X_test),
                                           X_train=X_train)
        
        print(f"""Book: {book_id} => 
              rec@{topk}: {recall_at_k(recommended_users, X_test, k=topk)}, pre@{topk} {precision_at_k(recommended_users, X_test, k=topk)}, ap@{topk}: {calculate_ap_at_k(recommended_users, X_test, k=topk)}, ndcg@{topk}: {calculate_ndcg_at_k(X_test, recommended_users, k=topk)}""")
        
        recall.append(recall_at_k(recommended_users, X_test, k=topk))
        precision.append(precision_at_k(recommended_users, X_test, k=topk))
        ap.append(calculate_ap_at_k(recommended_users, X_test, k=topk))
        ndcg.append(calculate_ndcg_at_k(X_test, recommended_users, k=topk))

print(f"Rec@{topk}: {(np.array(recall)).mean()}")
print(f"Pre@{topk}: {(np.array(precision)).mean()}")
print(f"MAP@{topk}: {(np.array(ap)).mean()}")
print(f"NDCG@{topk}: {(np.array(ndcg)).mean()}")



Book: 324688 => 
              rec@10: 0.7777777777777778, pre@10 0.7, ap@10: 1.0, ndcg@10: 0.9845070500021217
Book: 340167 => 
              rec@10: 0.5, pre@10 0.6, ap@10: 1.0, ndcg@10: 0.9623037420197135
Book: 342428 => 
              rec@10: 0.5555555555555556, pre@10 1.0, ap@10: 1.0, ndcg@10: 0.8176827349202269
Book: 353323 => 
              rec@10: 0.8333333333333334, pre@10 1.0, ap@10: 1.0, ndcg@10: 0.8826535758118728
Book: 367635 => 
              rec@10: 0.5454545454545454, pre@10 0.6, ap@10: 1.0, ndcg@10: 0.959465064009097
Book: 599037 => 
              rec@10: 0.5, pre@10 1.0, ap@10: 1.0, ndcg@10: 0.922604807212581
Book: 614801 => 
              rec@10: 0.3225806451612903, pre@10 1.0, ap@10: 1.0, ndcg@10: 0.9829713764716758
Book: 619275 => 
              rec@10: 0.7692307692307693, pre@10 1.0, ap@10: 1.0, ndcg@10: 0.890432314852872
Book: 828647 => 
              rec@10: 0.3, pre@10 0.3, ap@10: 1.0, ndcg@10: 0.9473191165900232
Book: 1353611 => 
              rec@10: 0.7777777