# Personalized PageRank on multipartite graph

In [1]:
import sys
sys.path.append("..")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import math
import networkx
from networkx.algorithms import bipartite

from data.data_helper_functions import *

In [2]:
%load_ext autoreload
%autoreload 2

Load a Multipartite Graph

In [3]:
data_path = '../data/'
books_df, users_df, ratings_df = load_data(data_path)
# G = load_multipartite_graph(books_df, users_df, ratings_df)

train_df, test_df = split_ratings_by_user(ratings_df, test_size = 0.2, random_state=0)
G = load_multipartite_graph(books_df, users_df, train_df, create=True)

### Personalized PageRank

In [4]:
query_used_id='u-10'
personalization = {n: 0 for n in G.nodes()}
personalization[query_used_id] = 1

pagerank_scores = nx.pagerank(G, alpha=0.85, weight='ppr_weight', personalization=personalization) 

book_recommendations = [(node, score) for node, score in pagerank_scores.items() if node.startswith('b')]
book_recommendations.sort(key=lambda x: x[1], reverse=True)
book_recommendations = [(node, score) for (node, score) in book_recommendations if node not in G[query_used_id]]

# Display the top rated books by the user
user_book_ratings = ratings_df[ratings_df['user_id'] == 10]
user_book_ratings = user_book_ratings.merge(books_df, left_on='book_id', right_index=True, how='inner')
user_book_ratings = user_book_ratings[['book_id', 'title', 'authors', 'average_rating', 'rating', 'genres', 'year', 'language', 'pages']].sort_values('rating', ascending=False)
print("Top rated books by user:")
display(user_book_ratings.head())

Top rated books by user:


Unnamed: 0,book_id,title,authors,average_rating,rating,genres,year,language,pages
339570,3409,O vencedor está só,Paulo Coelho,3.37,5,"['christian', 'nonfiction', 'religion', 'spiri...",2008.0,eng,224.0
392628,3946,Matterhorn: A Novel of the Vietnam War,Karl Marlantes,4.25,5,"['young-adult', 'horror', 'fiction', 'fantasy'...",2009.0,eng,458.0
686883,7002,A Mercy,Toni Morrison,3.65,5,"['historical-fiction', 'fiction', 'romance']",2008.0,eng,368.0
150393,1506,O Zahir,"Paulo Coelho, Margaret Jull Costa",3.56,4,"['classics', 'historical-fiction', 'fiction', ...",2005.0,eng,541.0
282522,2833,El prisionero del cielo,Carlos Ruiz Zafón,3.96,4,"['fiction', 'historical-fiction', 'mystery', '...",2011.0,eng,417.0


In [5]:
book_ids = [int(book_id[2:]) for book_id, score in book_recommendations if book_id not in G[query_used_id]][:10]
selected_books = books_df.loc[book_ids]
display(selected_books.head(5))

Unnamed: 0_level_0,authors,year,title,average_rating,num_ratings,num_1,num_2,num_3,num_4,num_5,genres,language,pages
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
5812,Paulo Coelho,2000.0,Ser Como o Rio que Flui,3.93,12708,303,882,3391,4983,5064,"['poetry', 'classics', 'fiction']",eng,736.0
5318,"Paulo Coelho, Alan R. Clarke",1988.0,As Valkírias,3.3,14364,1099,2817,5768,4426,2844,"['fiction', 'contemporary', 'young-adult']",eng,661.0
2902,Paulo Coelho,1996.0,O Monte Cinco,3.61,26060,1220,3366,9197,9601,7593,"['nonfiction', 'memoir', 'classics', 'spiritua...",eng,130.0
3913,Milan Kundera,1990.0,Nesmrtelnost,4.12,18574,206,859,4186,8619,9244,"['mystery', 'fiction', 'crime', 'classics', 'h...",eng,212.0
1663,"Paulo Coelho, Montserrat Mira",1990.0,Brida,3.46,45905,2936,7653,16869,14785,11874,"['classics', 'fiction', 'romance', 'historical...",eng,576.0


## Evaluating pagerank approaches

In [70]:
def precision_recall_at_k(user_id, top_n_recommendations):
    relevant_items  = set(test_df[(test_df.user_id == user_id) & (test_df.rating>=3)]['book_id'])
    if len (relevant_items ) == 0: return -1, -1 # Can not evaluate this user if no relavant items in test set
    true_positives = len(relevant_items.intersection(set(top_n_recommendations)))

    false_positives = len(top_n_recommendations) - true_positives
    false_negatives = len(relevant_items) - true_positives

    tp, fp, fn = true_positives, false_positives, false_negatives

    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    return precision, recall

Get top n recommendations for each user in a sample

In [72]:
from tqdm import tqdm
import random

user_ids = ratings_df['user_id'].unique()
random.seed(42)
sample_user_ids = random.sample(list(user_ids), 500)
sample_user_ids = ["u-" + str(user_id) for user_id in sample_user_ids]

#### Personalized pagerank

In [73]:
k = 200

all_book_ids = set(ratings_df['book_id'].unique())
ppr_recommendations = {}
for user_id in tqdm(sample_user_ids, desc="Getting recommendations for users in a sample"):
    personalization = {n: 0 for n in G.nodes()}
    personalization[user_id] = 1
    pagerank_scores = nx.pagerank(G, alpha=0.85, weight='ppr_weight', personalization=personalization) 

    book_recommendations = [(node, score) for node, score in pagerank_scores.items() if node.startswith('b') and  node not in G[user_id]]
    book_recommendations.sort(key=lambda x: x[1], reverse=True)
    ppr_recommendations[user_id] = [int(node[2:]) for node, score in book_recommendations[:k]]

Getting recommendations for users in a sample: 100%|██████████| 500/500 [23:39<00:00,  2.84s/it]


In [81]:
total_precision = 0
total_recall = 0
count = len(user_ids)

for user_id in tqdm(sample_user_ids, desc="Evaluating recommendations"):
    top_n_recommendations = ppr_recommendations[user_id]
    precision, recall = precision_recall_at_k(int(user_id[2:]), top_n_recommendations)

    if precision > 0:
        total_precision += precision
        total_recall += recall
    elif precision < 0: # There were no relavant items
        count -=1

average_precision = total_precision / count
average_recall = total_recall / count
print(f"Precision@{k}: {average_precision:.8f}")
print(f"Recall@{k}: {average_recall:.8f}")

Evaluating recommendations: 100%|██████████| 500/500 [00:00<00:00, 894.92it/s]

Precision@200: 0.00017493
Recall@200: 0.00698128





#### Topic Specific Pagerank

In [79]:
k = 200

all_book_ids = set(ratings_df['book_id'].unique())
tppr_recommendations = {}
for user_id in tqdm(sample_user_ids, desc="Getting recommendations for users in a sample"):
    # Create personalization vector
    personalization_vector = {node: 0 for node in G.nodes}
    for book_node, edge_data in G[user_id].items(): 
        personalization_vector[book_node] = edge_data['ppr_weight']

    pagerank_scores = nx.pagerank(G, alpha=0.85, weight='ppr_weight', personalization=personalization_vector)
    book_recommendations = [(node, score) for node, score in pagerank_scores.items() if node.startswith('b') and  node not in G[user_id]]
    book_recommendations.sort(key=lambda x: x[1], reverse=True)
    tppr_recommendations[user_id] = [int(node[2:]) for node, score in book_recommendations[:k]]

Getting recommendations for users in a sample: 100%|██████████| 500/500 [20:56<00:00,  2.51s/it]


In [80]:
total_precision = 0
total_recall = 0
count = len(user_ids)
k=200

for user_id in tqdm(sample_user_ids, desc="Evaluating recommendations"):
    top_n_recommendations = tppr_recommendations[user_id]
    precision, recall = precision_recall_at_k(int(user_id[2:]), top_n_recommendations)

    if precision > 0:
        total_precision += precision
        total_recall += recall
    elif precision < 0: # There were no relavant items
        count -=1

average_precision = total_precision / count
average_recall = total_recall / count
print(f"Precision@{k}: {average_precision:.8f}")
print(f"Recall@{k}: {average_recall:.8f}")

Evaluating recommendations: 100%|██████████| 500/500 [00:00<00:00, 918.73it/s]

Precision@200: 0.00017493
Recall@200: 0.00698128



