In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from networkx.algorithms import bipartite
import torch
import torch.nn as nn
import torch.nn.functional as F
import random

In [2]:
import pandas as pd

# Load the data
# All three CSVs are read from the same Kaggle-mounted MovieLens 20M directory.
DATA_DIR = '/kaggle/input/movielens-20m-dataset'
data = pd.read_csv(f'{DATA_DIR}/rating.csv')
movies_df = pd.read_csv(f'{DATA_DIR}/movie.csv')
tags_df = pd.read_csv(f'{DATA_DIR}/tag.csv')

# Convert the timestamp column to datetime format for sorting
data['timestamp'] = pd.to_datetime(data['timestamp'])

# Sampling / split parameters (previously inline magic numbers)
MIN_RATINGS_PER_USER = 30    # a user needs at least this many ratings to be considered
TEST_RATINGS_PER_USER = 3    # each user's most recent ratings held out for testing
MIN_TRAIN_ROWS = 10000       # stop adding users once the train set is inside
MAX_TRAIN_ROWS = 50000       # the [MIN_TRAIN_ROWS, MAX_TRAIN_ROWS] window
MIN_RATINGS_PER_MOVIE = 10   # a movie is "popular" with at least this many train ratings

# Filter users who have rated at least MIN_RATINGS_PER_USER movies.
# value_counts() orders by count, largest first, so the loop below consumes
# the most active users first.
user_counts = data['userId'].value_counts()
eligible_users = user_counts[user_counts >= MIN_RATINGS_PER_USER].index.tolist()

# Collect the per-user slices in lists and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies every
# previously collected row on each iteration.
train_parts = []
test_parts = []
train_row_count = 0

# Iterate over eligible users and split their data
for user_id in eligible_users:
    user_data = data[data['userId'] == user_id].sort_values(by='timestamp', ascending=False)

    # The latest ratings go to the test set, everything older to the train set
    test_parts.append(user_data.iloc[:TEST_RATINGS_PER_USER])
    user_train = user_data.iloc[TEST_RATINGS_PER_USER:]
    train_parts.append(user_train)
    train_row_count += len(user_train)

    # Check if the train set has reached the desired size.
    # NOTE(review): the upper bound is only tested, not enforced - if one user
    # pushes the total from below MIN_TRAIN_ROWS to above MAX_TRAIN_ROWS this
    # never becomes true and every eligible user is processed.
    if MIN_TRAIN_ROWS <= train_row_count <= MAX_TRAIN_ROWS:
        break

# With no eligible users fall back to empty frames that still carry the
# rating columns, so the column lookups below keep working.
train_data = pd.concat(train_parts) if train_parts else data.iloc[0:0].copy()
test_data = pd.concat(test_parts) if test_parts else data.iloc[0:0].copy()

# Filter the train_data to keep only popular movies
# (test_data is left unfiltered here).
movie_num_user_rated_counts = train_data['movieId'].value_counts()
popular_movies = movie_num_user_rated_counts[movie_num_user_rated_counts >= MIN_RATINGS_PER_MOVIE].index.tolist()
train_data = train_data[train_data['movieId'].isin(popular_movies)]


In [3]:
ratings_df = train_data
# Create a new graph
G = nx.Graph()

# Node keys are strings with a 'u_' (user) or 'm_' (movie) prefix so that a
# user id and a movie id with the same number cannot collide.

# Add user nodes (bipartite side 0)
for user_id in ratings_df['userId'].unique():
    G.add_node('u_' + str(user_id), bipartite=0)

# Add movie nodes (bipartite side 1) with title and genres as attributes.
# Columns are zipped instead of using iterrows(): iterrows() builds a Series
# for every row, which is slow and does not preserve dtypes across the row.
for movie_id, title, genres in zip(movies_df['movieId'], movies_df['title'], movies_df['genres']):
    G.add_node('m_' + str(movie_id), bipartite=1, title=title, genres=genres.split('|'))

# Add edges based on ratings with rating and timestamp as attributes
for user_id, movie_id, rating, timestamp in zip(
    ratings_df['userId'], ratings_df['movieId'], ratings_df['rating'], ratings_df['timestamp']
):
    G.add_edge('u_' + str(user_id), 'm_' + str(movie_id), rating=rating, timestamp=timestamp)

# Add tag data as an attribute to the movie nodes
for user_id, movie_id, tag, timestamp in zip(
    tags_df['userId'], tags_df['movieId'], tags_df['tag'], tags_df['timestamp']
):
    movie_node = 'm_' + str(movie_id)
    if movie_node not in G:
        # A tag for a movie that has no node would otherwise raise KeyError.
        continue
    G.nodes[movie_node].setdefault('tags', []).append(
        {'tag': tag, 'timestamp': timestamp, 'userId': user_id}
    )

# Ensure the graph is bipartite
assert bipartite.is_bipartite(G)

In [4]:
def predict_rating(G, user, movie):
    """Predict the rating `user` would give `movie` from similar users' ratings.

    Every node adjacent to `movie` is treated as a user who rated it. Each such
    rater is weighted by the Jaccard similarity between its neighbourhood and
    `user`'s neighbourhood, and the prediction is the similarity-weighted
    average of the raters' 'rating' edge attributes.

    Args:
        G: graph whose edges carry a 'rating' attribute.
        user: node key of the target user, exactly as stored in G.
        movie: node key of the movie, exactly as stored in G.

    Returns:
        The similarity-weighted mean rating. If `movie` has no neighbours the
        mean of all 'rating' edge attributes in G is returned. If no rater
        shares a neighbour with `user` (all similarities are zero) the plain
        mean of the raters' ratings is returned instead of dividing by zero.
    """
    raters = list(G.neighbors(movie))
    if not raters:
        # Cold-start movie: fall back to the global mean rating.
        return np.mean([attr['rating'] for _, _, attr in G.edges(data=True) if 'rating' in attr])

    # The target user's neighbourhood and degree do not change inside the
    # loop, so compute them once instead of once per rater.
    user_items = set(G.neighbors(user))
    user_degree = G.degree(user)

    sim_weights = []
    rater_ratings = []
    for neighbor in raters:
        # Jaccard similarity as an example, but can be changed
        common_movies = (user_items & set(G.neighbors(neighbor))) - {user, neighbor}
        union_size = user_degree + G.degree(neighbor) - len(common_movies)
        sim = len(common_movies) / union_size if union_size else 0.0

        sim_weights.append(sim)
        rater_ratings.append(G[neighbor][movie]['rating'])

    total_weight = sum(sim_weights)
    if total_weight == 0:
        # No rater overlaps with the user; a weighted average would be 0/0.
        return np.mean(rater_ratings)

    return np.dot(rater_ratings, sim_weights) / total_weight


In [5]:
def random_walk_legacy(G, start_node, alpha=0.85, walk_length=10):
    """Unweighted random walk with restart, starting at start_node.

    On each of the walk_length steps a random number is drawn; if it is below
    alpha and the current node has neighbours, the walk moves to a uniformly
    chosen neighbour, otherwise it jumps back to start_node.

    Returns the list of visited nodes, start_node first (walk_length + 1 items).
    """
    here = start_node
    trail = [here]

    for _step in range(walk_length):
        options = list(G.neighbors(here))
        # The draw happens on every step, even at a dead end.
        keep_walking = random.random() < alpha
        here = random.choice(options) if keep_walking and options else start_node
        trail.append(here)

    return trail
def weighted_choice(neighbors, weights):
    """Pick one entry of `neighbors` at random, in proportion to `weights`.

    A threshold is drawn uniformly between 0 and the sum of the weights; the
    first neighbour whose cumulative weight reaches the threshold is returned.
    Returns None when no neighbour reaches it (for example, empty input).
    """
    threshold = random.uniform(0, sum(weights))
    cumulative = 0
    for candidate, weight in zip(neighbors, weights):
        cumulative += weight
        if cumulative >= threshold:
            return candidate
    return None

def random_walk(G, start_node, alpha=0.85, walk_length=10):
    """Rating-weighted random walk with restart, starting at start_node.

    On each of the walk_length steps:
      * a node without neighbours sends the walk back to start_node;
      * otherwise, with probability alpha the walk moves to a neighbour picked
        by weighted_choice using each edge's 'rating' attribute as weight
        (1 when the edge has no rating), and with probability 1 - alpha it
        jumps back to start_node.

    Returns the list of visited nodes, start_node first (walk_length + 1 items).
    """
    position = start_node
    visited = [position]

    for _step in range(walk_length):
        adjacent = list(G.neighbors(position))

        if not adjacent:
            # Dead end: restart without drawing a random number.
            position = start_node
            visited.append(position)
            continue

        # Edge ratings act as transition weights.
        edge_weights = [G[position][nbr].get('rating', 1) for nbr in adjacent]

        if random.random() < alpha:
            position = weighted_choice(adjacent, edge_weights)
        else:
            position = start_node
        visited.append(position)

    return visited
def personalized_pagerank_recommendations(G, user, alpha=0.85, num_walks=50, walk_length=5, top_k=10):
    """Recommend movies for `user` with a random-walk estimate of Personalized PageRank.

    `num_walks` walks are started from the user's node via random_walk(); every
    node's share of all visits is used as its score. Nodes with bipartite == 1
    that are not already adjacent to the user are ranked by that score.

    Args:
        G: graph whose nodes carry a 'bipartite' attribute (1 for movies).
        user: raw user id; the 'u_' prefix is added here.
        alpha: probability of continuing a walk, passed to random_walk().
        num_walks: number of walks to run.
        walk_length: number of steps per walk.
        top_k: how many recommendations to return (default 10).

    Returns:
        A tuple (top_movies, contributing_paths_all, significant_neighbors):
          * top_movies: up to top_k movie node keys, best score first;
          * contributing_paths_all: movie -> list of walks that visited it;
          * significant_neighbors: movie -> the non-user node that most often
            was the first step of a walk visiting that movie. Movies for which
            no such node exists are omitted.
    """
    user = 'u_' + str(user)

    # G.neighbors() returns an iterator; build the set once instead of
    # re-creating and linearly scanning it for every node in the graph.
    already_rated = set(G.neighbors(user))

    # Perform random walks and keep track of visit counts
    visit_counts = {node: 0 for node in G.nodes()}
    all_paths = []
    for _ in range(num_walks):
        path = random_walk(G, user, alpha, walk_length)
        all_paths.append(path)
        for node in path:
            visit_counts[node] += 1

    # Normalize visit counts to get a probability distribution
    # (guarded so num_walks=0 yields all-zero scores instead of dividing by zero).
    total_visits = sum(visit_counts.values())
    ppr = {
        node: (count / total_visits if total_visits else 0.0)
        for node, count in visit_counts.items()
    }

    # Filter for movies the user has not rated and sort by PPR score.
    # sorted() is stable, so equal scores keep the graph's node order.
    movies = [
        node for node in ppr
        if G.nodes[node].get('bipartite') == 1 and node not in already_rated
    ]
    top_movies = sorted(movies, key=ppr.get, reverse=True)[:top_k]

    # Generate explanations for the top movies
    significant_neighbors = {}
    contributing_paths_all = {}
    for movie in top_movies:
        # Find paths that contributed to the movie's score
        contributing_paths = [path for path in all_paths if movie in path]

        # Credit the first step of each walk once per visit to the movie.
        # NOTE(review): path[1] is the first node reached from the user, not
        # the node immediately before the movie; when a walk restarts midway
        # it may not be the node the walk actually passed through on its way
        # to the movie.
        neighbor_counts = {}
        for path in contributing_paths:
            hits = path[1:].count(movie)
            if hits:
                first_step = path[1]
                neighbor_counts[first_step] = neighbor_counts.get(first_step, 0) + hits

        # Identify the most significant neighbor (skip user nodes and the movie itself)
        sorted_neighbors = sorted(neighbor_counts, key=neighbor_counts.get, reverse=True)
        most_significant_neighbor = next(
            (n for n in sorted_neighbors if not n.startswith('u_') and not n == movie),
            None,
        )
        if most_significant_neighbor:
            significant_neighbors[movie] = most_significant_neighbor
        contributing_paths_all[movie] = contributing_paths

    return top_movies, contributing_paths_all, significant_neighbors


In [6]:
# Seed both the user sampling and the `random`-based walks so that re-running
# the notebook reproduces the same user and the same recommendations.
SEED = 42
random.seed(SEED)
user = train_data['userId'].sample(random_state=SEED).iloc[0]
# `neigbors` (sic) keeps its original spelling because the printing cell below reads it.
movies, paths, neigbors = personalized_pagerank_recommendations(G, user)


In [7]:
def id_to_name(movie_id, movies_df):
    """Return the title of the first row whose 'movieId' equals movie_id, or None if there is none."""
    matches = movies_df[movies_df['movieId'] == movie_id]
    if matches.empty:
        return None
    return matches['title'].iloc[0]

In [8]:
# For every recommended movie print "<recommended id> : <explaining id>" and
# then the same pair as titles. Node keys carry a two-character 'm_' prefix,
# which is stripped to recover the numeric movie id.
for recommended_node, explaining_node in neigbors.items():
    recommended_id = recommended_node[2:]
    explaining_id = explaining_node[2:]
    recommended_title = id_to_name(int(recommended_id), movies_df)
    explaining_title = id_to_name(int(explaining_id), movies_df)
    print(f"{recommended_id} : {explaining_id}")
    print(f"{recommended_title} : {explaining_title}")

2571 : 595
Matrix, The (1999) : Beauty and the Beast (1991)
1676 : 788
Starship Troopers (1997) : Nutty Professor, The (1996)
21 : 370
Get Shorty (1995) : Naked Gun 33 1/3: The Final Insult (1994)
39 : 595
Clueless (1995) : Beauty and the Beast (1991)
47 : 1198
Seven (a.k.a. Se7en) (1995) : Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
165 : 316
Die Hard: With a Vengeance (1995) : Stargate (1994)
173 : 1136
Judge Dredd (1995) : Monty Python and the Holy Grail (1975)
292 : 110
Outbreak (1995) : Braveheart (1995)
296 : 593
Pulp Fiction (1994) : Silence of the Lambs, The (1991)
349 : 11
Clear and Present Danger (1994) : American President, The (1995)
