In [5]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from networkx.algorithms import bipartite
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_add_pool
from torch_geometric.data import Data, DataLoader
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder

ModuleNotFoundError: No module named 'torch_geometric'

In [None]:
# Tunable constants for this cell.
MIN_RATINGS_PER_MOVIE = 10  # a movie must have at least this many ratings to be kept
SPLIT_SEED = 42             # fixed seed so the leave-one-out split is reproducible

# Only the first 10,000 ratings are used; `nrows` avoids parsing the full ratings file.
data = pd.read_csv('/kaggle/input/movielens-20m-dataset/rating.csv', nrows=10000)
movies_df = pd.read_csv("/kaggle/input/movielens-20m-dataset/movie.csv")
tags_df = pd.read_csv("/kaggle/input/movielens-20m-dataset/tag.csv")

# Keep only the ratings of movies that have at least MIN_RATINGS_PER_MOVIE ratings
# within the loaded sample.
movie_num_user_rated_counts = data['movieId'].value_counts()
popular_movies = movie_num_user_rated_counts[
    movie_num_user_rated_counts >= MIN_RATINGS_PER_MOVIE
].index.tolist()
train_data = data[data['movieId'].isin(popular_movies)].copy()

# Leave-one-out split: every user with more than one remaining rating contributes
# exactly one randomly chosen rating to the test set. Users with a single rating stay
# entirely in the training set so they still appear in the training graph.
ratings_per_user = train_data.groupby('userId')['userId'].transform('size')
test_data = (
    train_data[ratings_per_user > 1]
    .groupby('userId')
    .sample(n=1, random_state=SPLIT_SEED)
)
# The index is unique here, so dropping by index label removes exactly the sampled
# test rows.
train_data = train_data.drop(index=test_data.index)

# Sanity checks on the split: one test rating per user, no row in both sets.
assert test_data['userId'].is_unique
assert train_data.index.intersection(test_data.index).empty


In [None]:

def makeGraph(ratings_df):
    """Build the bipartite user-movie rating graph.

    Users become nodes keyed by their integer ``userId`` (``bipartite=0``); movies
    become nodes keyed by the string ``'m_<movieId>'`` (``bipartite=1``) carrying
    ``title`` and ``genres`` (a list of strings). Every rating becomes an edge with
    ``rating`` and ``timestamp`` attributes. Tags are attached to movie nodes as a
    ``tags`` list of ``{'tag', 'timestamp', 'userId'}`` dicts.

    Movie and tag metadata are read from the module-level ``movies_df`` and
    ``tags_df`` frames.

    Parameters
    ----------
    ratings_df : pandas.DataFrame
        Must provide the columns ``userId``, ``movieId``, ``rating`` and
        ``timestamp``.

    Returns
    -------
    networkx.Graph
        The bipartite graph described above.

    Raises
    ------
    KeyError
        If a tag refers to a movie that is not present in ``movies_df``.
    AssertionError
        If the resulting graph is not bipartite.
    """
    G = nx.Graph()

    # Columns are iterated directly instead of using ``iterrows()``. ``iterrows``
    # upcasts every row to a single dtype, so an integer movieId can come back as a
    # float and produce the key 'm_2.0', which does not match the 'm_2' node created
    # from ``movies_df``. Column iteration keeps each column's own dtype (and is
    # considerably faster).

    # User nodes.
    G.add_nodes_from(
        (int(user_id) for user_id in ratings_df['userId'].unique()),
        bipartite=0,
    )

    # Movie nodes with title and genres as attributes.
    for movie_id, title, genres in zip(
        movies_df['movieId'], movies_df['title'], movies_df['genres']
    ):
        G.add_node(
            'm_' + str(int(movie_id)),
            bipartite=1,
            title=title,
            genres=genres.split('|'),
        )

    # Rating edges with rating and timestamp as attributes.
    for user_id, movie_id, rating, timestamp in zip(
        ratings_df['userId'],
        ratings_df['movieId'],
        ratings_df['rating'],
        ratings_df['timestamp'],
    ):
        G.add_edge(
            int(user_id),
            'm_' + str(int(movie_id)),
            rating=rating,
            timestamp=timestamp,
        )

    # Tags are accumulated per movie node.
    for user_id, movie_id, tag, timestamp in zip(
        tags_df['userId'], tags_df['movieId'], tags_df['tag'], tags_df['timestamp']
    ):
        movie_node = 'm_' + str(int(movie_id))
        G.nodes[movie_node].setdefault('tags', []).append(
            {'tag': tag, 'timestamp': timestamp, 'userId': user_id}
        )

    # Ensure the graph is bipartite
    assert bipartite.is_bipartite(G), "user-movie graph is not bipartite"
    return G
G = makeGraph(train_data)


In [None]:
def encode_graph(G):
    """Return a copy of ``G`` with integer node ids and integer-encoded attributes.

    Node ids are assigned from two disjoint, contiguous ranges so that a user and a
    movie can never share an id:

    * users  -> ``0 .. n_users - 1`` (label-encoded, i.e. rank of the original id)
    * movies -> ``n_users .. n_users + n_movies - 1`` (in node insertion order)

    Movie attributes are encoded as follows: ``title`` becomes the label-encoded
    title, ``genres`` becomes a list of genre codes, and every entry of ``tags``
    gets its ``tag`` replaced by a tag code (tags are compared as strings).

    The input graph is not modified. All nodes are relabelled in a single
    ``nx.relabel_nodes`` call; relabelling node by node would copy the whole graph
    once per node and, because user codes and movie codes would both start at 0,
    would merge unrelated nodes.

    Parameters
    ----------
    G : networkx.Graph
        Graph whose nodes all carry a ``bipartite`` attribute (0 for users, 1 for
        movies) and whose movie nodes carry ``title`` and ``genres``.

    Returns
    -------
    networkx.Graph
        The relabelled, encoded copy.
    """
    user_nodes = [node for node, attrs in G.nodes(data=True) if attrs['bipartite'] == 0]
    movie_nodes = [node for node, attrs in G.nodes(data=True) if attrs['bipartite'] == 1]

    # Create and fit the encoders.
    user_encoder = LabelEncoder()
    movie_encoder = LabelEncoder()
    genre_encoder = LabelEncoder()
    tag_encoder = LabelEncoder()

    user_codes = user_encoder.fit_transform(user_nodes)
    title_codes = movie_encoder.fit_transform([G.nodes[node]['title'] for node in movie_nodes])
    genre_encoder.fit([genre for node in movie_nodes for genre in G.nodes[node]['genres']])
    # Tags are cast to str so that missing values (NaN) and text are encoded uniformly.
    tag_encoder.fit([
        str(tag_data['tag'])
        for _, attrs in G.nodes(data=True) if 'tags' in attrs
        for tag_data in attrs['tags']
    ])

    # One bijective old-id -> new-id mapping covering every user and movie node.
    n_users = len(user_nodes)
    mapping = {node: int(code) for node, code in zip(user_nodes, user_codes)}
    mapping.update((node, n_users + position) for position, node in enumerate(movie_nodes))

    # copy=True builds a fresh graph, which is safe even though old and new ids overlap.
    encoded = nx.relabel_nodes(G, mapping, copy=True)

    # Encode the movie attributes on the copy. New lists/dicts are assigned rather
    # than edited in place, because the copy shares its attribute values with G.
    for node, title_code in zip(movie_nodes, title_codes):
        attrs = encoded.nodes[mapping[node]]
        attrs['title'] = int(title_code)
        attrs['genres'] = genre_encoder.transform(attrs['genres']).tolist()
        if 'tags' in attrs:
            tag_codes = tag_encoder.transform([str(tag_data['tag']) for tag_data in attrs['tags']])
            attrs['tags'] = [
                {**tag_data, 'tag': int(tag_code)}
                for tag_data, tag_code in zip(attrs['tags'], tag_codes)
            ]

    encoded_users = [node for node, data in encoded.nodes(data=True) if data['bipartite'] == 0]
    encoded_movies = [data['title'] for node, data in encoded.nodes(data=True) if data['bipartite'] == 1]
    encoded_genres = [genre for node, data in encoded.nodes(data=True) if data['bipartite'] == 1 for genre in data['genres']]
    encoded_tags = [tag_data['tag'] for node, data in encoded.nodes(data=True) if 'tags' in data for tag_data in data['tags']]

    print("\nEncoded Users:", encoded_users[:5])  # Print first 5 encoded users
    print("Encoded Movies:", encoded_movies[:5])  # Print first 5 encoded movies
    print("Encoded Genres:", list(set(encoded_genres)))  # Print unique encoded genres
    print("Encoded Tags:", list(set(encoded_tags)))  # Print unique encoded tags

    return encoded
G_encoded = encode_graph(G)

In [None]:
# 1. Convert the graph into PyG format
def convert_to_pyg_data(G):
    """Convert a rated networkx graph into a PyG ``Data`` object.

    Parameters
    ----------
    G : networkx.Graph
        Undirected graph whose edges all carry a numeric ``rating`` attribute.

    Returns
    -------
    torch_geometric.data.Data
        * ``x``          - identity matrix ``[num_nodes, num_nodes]`` (one-hot node
          features).
        * ``edge_index`` - ``[2, E]`` long tensor. Each undirected edge is stored in
          both directions, since PyG treats ``edge_index`` as directed and messages
          would otherwise only flow one way.
        * ``edge_attr``  - ``[E, 1]`` float tensor with the rating of each directed
          edge (both directions of an edge carry the same rating).
    """
    # Map node labels to row indices 0..num_nodes-1 instead of using the labels as
    # indices. When the labels are already exactly 0..num_nodes-1, sorting makes this
    # the identity mapping; labels that cannot be ordered fall back to insertion order.
    try:
        ordered_nodes = sorted(G.nodes)
    except TypeError:
        ordered_nodes = list(G.nodes)
    node_index = {node: idx for idx, node in enumerate(ordered_nodes)}
    num_nodes = len(node_index)

    sources, targets, ratings = [], [], []
    for u, v, rating in G.edges(data='rating'):
        i, j = node_index[u], node_index[v]
        sources.append(i)
        targets.append(j)
        ratings.append(rating)
        if i != j:  # a self-loop needs no reverse copy
            sources.append(j)
            targets.append(i)
            ratings.append(rating)

    # Building from two lists keeps the [2, E] shape even when there are no edges.
    edge_index = torch.tensor([sources, targets], dtype=torch.long)

    # Extract node features (for simplicity, we'll use one-hot encoding for users and movies)
    x = torch.eye(num_nodes)

    # Extract edge features (ratings)
    edge_attr = torch.tensor(ratings, dtype=torch.float).unsqueeze(-1)

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

data = convert_to_pyg_data(G_encoded)

# 2. Define the GNN model
class BipartiteGNN(torch.nn.Module):
    """Two-layer GCN encoder with an edge-level rating head.

    Ratings live on edges, so the model scores every edge in ``data.edge_index``
    from the embeddings of its two endpoints rather than pooling the whole graph
    into a single value.
    """

    def __init__(self, num_node_features):
        """Create the layers; ``num_node_features`` is the width of ``data.x``."""
        super().__init__()
        self.conv1 = GCNConv(num_node_features, 128)
        self.conv2 = GCNConv(128, 64)
        self.fc = torch.nn.Linear(64, 1)  # Predicting a single rating value per edge

    def forward(self, data):
        """Return predicted ratings of shape ``[E, 1]``, one per column of ``edge_index``."""
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr

        # GCNConv takes one scalar weight per edge as a 1-D tensor, whereas edge_attr
        # is stored as [E, 1].
        edge_weight = edge_attr.view(-1) if edge_attr is not None else None

        # Node representation
        x = F.relu(self.conv1(x, edge_index, edge_weight))
        x = F.relu(self.conv2(x, edge_index, edge_weight))

        # Edge readout: the element-wise product of the endpoint embeddings is
        # symmetric, so both directions of an edge get the same prediction.
        src, dst = edge_index
        return self.fc(x[src] * x[dst])

torch.manual_seed(0)  # reproducible weight initialisation
model = BipartiteGNN(data.num_node_features)

# 3. Training the model
# For simplicity, we'll use Mean Squared Error (MSE) loss and Adam optimizer
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
print(data)

# True ratings, one per edge. `data.y` is used when present; otherwise the ratings
# stored in `data.edge_attr` are the targets.
target = data.y if data.y is not None else data.edge_attr
target = target.view(-1, 1).float()

# NOTE(review): the ratings being predicted are also used as edge weights during
# message passing, and there is no held-out set, so this loss only measures fit to
# the training edges. Mask the target edges / evaluate on a test split before
# reading anything into these numbers.
# Dummy training loop (you'll need to split your data into train and test)
model.train()
for epoch in range(100):
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out, target)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")
