<a href="https://colab.research.google.com/github/zoebatz/EEL6878/blob/main/graphSAGE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install torch-geometric

Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1


In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from networkx.algorithms import bipartite
import random

In [3]:
import torch
from torch_geometric.data import HeteroData
from torch_geometric.utils import from_networkx

In [4]:
import torch.nn as nn
from torch_geometric.nn import SAGEConv, to_hetero
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.utils import remove_isolated_nodes, train_test_split_edges
import torch.nn.functional as F
from torch_geometric.data import Data

In [5]:
# Load the ratings and movie-metadata CSV files.
# DATA_DIR is the single place to edit when the dataset lives somewhere else.
# NOTE(review): the default assumes Google Drive is mounted at /content/drive.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/AI_project/ml-100k'

df_ratings = pd.read_csv(f'{DATA_DIR}/ratings.csv')
df_movies = pd.read_csv(f'{DATA_DIR}/movies.csv')

# Later cells only read userId, movieId and rating, so the timestamp is dropped.
df_ratings = df_ratings.drop(columns='timestamp')
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [6]:
# Attach each movie's title and genres to its ratings by joining on movieId.
df = pd.merge(df_ratings, df_movies, on='movieId')
df.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [7]:
# Map raw userIds and movieIds to consecutive node indices in one shared id
# space: users take [0, num_users) and movies take
# [num_users, num_users + num_movies).
user_mapping = {raw_id: idx for idx, raw_id in enumerate(df['userId'].unique())}
movie_mapping = {
    raw_id: idx + len(user_mapping)
    for idx, raw_id in enumerate(df['movieId'].unique())
}

# Translate every rating row to node indices in one vectorised pass instead of
# a Python-level iterrows() loop over the whole frame.
user_nodes = torch.tensor(df['userId'].map(user_mapping).to_numpy(), dtype=torch.long)
movie_nodes = torch.tensor(df['movieId'].map(movie_mapping).to_numpy(), dtype=torch.long)

# Each rating contributes two directed edges (user -> movie, then movie -> user)
# so the graph is undirected. Stacking on dim=1 and flattening interleaves the
# pair per rating: u0, m0, u1, m1, ... for sources and m0, u0, m1, u1, ... for
# targets.
sources = torch.stack([user_nodes, movie_nodes], dim=1).reshape(-1)
targets = torch.stack([movie_nodes, user_nodes], dim=1).reshape(-1)
edge_index = torch.stack([sources, targets]).contiguous()

# The rating is the edge weight; it is repeated so both directions carry it.
edge_weight = torch.tensor(df['rating'].to_numpy(), dtype=torch.float).repeat_interleave(2)

# Create Graph
G = Data(edge_index=edge_index, edge_attr=edge_weight)

# No node feature matrix is attached here, so the node count is set explicitly.
G.num_nodes = len(user_mapping) + len(movie_mapping)

print(G)


Data(edge_index=[2, 201672], edge_attr=[201672], num_nodes=10334)


In [8]:
from torch_geometric import seed_everything

# The split and its negative sampling are random; seeding the Python, NumPy and
# PyTorch generators makes the train/val/test partition repeatable across runs.
SPLIT_SEED = 42
seed_everything(SPLIT_SEED)

transform = RandomLinkSplit(
    num_val=0.1,    # library default, spelled out so the split sizes are visible
    num_test=0.2,   # library default, spelled out so the split sizes are visible
    is_undirected=True,
    split_labels=True,
    add_negative_train_samples=True,
    neg_sampling_ratio=1.0,  # 1:1 ratio
)

train_data, val_data, test_data = transform(G)

print(train_data)


Data(edge_index=[2, 141172], edge_attr=[141172], num_nodes=10334, pos_edge_label=[70586], pos_edge_label_index=[2, 70586], neg_edge_label=[70586], neg_edge_label_index=[2, 70586])


In [9]:
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

class GNNEncoder(torch.nn.Module):
    """Two-layer SAGEConv encoder that maps node features to node embeddings.

    Args:
        hidden_channels: Output width of both convolution layers.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        # (-1, -1) leaves the input widths unspecified; per the PyG docs they
        # are inferred lazily from the first batch passed through the layer.
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), hidden_channels)

    def forward(self, x, edge_index):
        """Run both convolutions with a ReLU in between.

        Args:
            x: Node feature tensor passed to the first convolution.
            edge_index: Edge index tensor passed to both convolutions.

        Returns:
            The output of the second convolution (no final activation).
        """
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)


In [10]:
# Random node features: the graph built above has no node attributes, so every
# node gets a random vector. A dedicated seeded generator keeps the features
# identical across kernel restarts without touching the global RNG state.
FEATURE_DIM = 64
FEATURE_SEED = 0
x = torch.randn(
    (G.num_nodes, FEATURE_DIM),
    generator=torch.Generator().manual_seed(FEATURE_SEED),
)

In [11]:
from sklearn.metrics import roc_auc_score

@torch.no_grad()
def evaluate(model, x, data):
    """Compute the link-prediction ROC AUC of `model` on one data split.

    Each candidate edge is scored by the dot product of its two endpoint
    embeddings. Edges in `data.pos_edge_label_index` are labelled 1 and edges
    in `data.neg_edge_label_index` are labelled 0.

    Args:
        model: Callable as `model(x, edge_index)` returning one embedding row
            per node. It is switched to eval mode and left in that mode.
        x: Node feature tensor passed to the model.
        data: Object exposing `edge_index`, `pos_edge_label_index` and
            `neg_edge_label_index`.

    Returns:
        The value returned by `roc_auc_score` for the scored edges.
    """
    model.eval()

    z = model(x, data.edge_index)

    pos_edge = data.pos_edge_label_index
    neg_edge = data.neg_edge_label_index

    # Dot-product score for each positive and negative candidate edge.
    pos_score = (z[pos_edge[0]] * z[pos_edge[1]]).sum(dim=1)
    neg_score = (z[neg_edge[0]] * z[neg_edge[1]]).sum(dim=1)

    # Stack predictions and labels; labels are created on the CPU directly
    # because scikit-learn consumes host arrays anyway.
    y_pred = torch.cat([pos_score, neg_score]).cpu()
    y_true = torch.cat([torch.ones(pos_score.size(0)), torch.zeros(neg_score.size(0))])

    # ROC AUC depends only on the ranking of the scores, so the raw logits are
    # passed as-is. A sigmoid would add nothing and, in float32, saturates to
    # exactly 1.0 for large logits, turning distinct scores into ties.
    auc = roc_auc_score(y_true.numpy(), y_pred.numpy())

    return auc


In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training hyper-parameters, gathered here so they are easy to tune.
NUM_EPOCHS = 100
LOG_EVERY = 10
HIDDEN_CHANNELS = 64
LEARNING_RATE = 0.01

# Seed before the model is built so weight initialisation is repeatable.
torch.manual_seed(0)

model = GNNEncoder(hidden_channels=HIDDEN_CHANNELS).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

x = x.to(device)
train_data = train_data.to(device)
val_data = val_data.to(device)

# The supervision edges and their labels do not change between epochs, so they
# are prepared once instead of being rebuilt on every iteration.
pos_edge = train_data.pos_edge_label_index
neg_edge = train_data.neg_edge_label_index
labels = torch.cat([
    torch.ones(pos_edge.size(1), device=device),
    torch.zeros(neg_edge.size(1), device=device),
])

for epoch in range(1, NUM_EPOCHS + 1):
    # evaluate() leaves the model in eval mode, so switch back every epoch.
    model.train()
    optimizer.zero_grad()

    # Encode nodes
    z = model(x, train_data.edge_index)

    # Dot-product logits for the positive and negative training edges.
    pos_score = (z[pos_edge[0]] * z[pos_edge[1]]).sum(dim=1)
    neg_score = (z[neg_edge[0]] * z[neg_edge[1]]).sum(dim=1)

    # Loss: push positive logits up and negative logits down.
    loss = F.binary_cross_entropy_with_logits(
        torch.cat([pos_score, neg_score]),
        labels,
    )

    loss.backward()
    optimizer.step()

    # The AUCs are only reported on logging epochs, so only compute them then.
    if epoch % LOG_EVERY == 0:
        train_auc = evaluate(model, x, train_data)
        val_auc = evaluate(model, x, val_data)
        print(f"Epoch {epoch:03d}, Loss: {loss.item():.4f}, Train AUC: {train_auc:.4f}, Val AUC: {val_auc:.4f}")


Epoch 010, Loss: 0.6718, Train AUC: 0.7695, Val AUC: 0.7364
Epoch 020, Loss: 0.5971, Train AUC: 0.8794, Val AUC: 0.8527
Epoch 030, Loss: 0.5319, Train AUC: 0.8978, Val AUC: 0.8661
Epoch 040, Loss: 0.4915, Train AUC: 0.9275, Val AUC: 0.8836
Epoch 050, Loss: 0.4687, Train AUC: 0.9418, Val AUC: 0.8843
Epoch 060, Loss: 0.4524, Train AUC: 0.9573, Val AUC: 0.8862
Epoch 070, Loss: 0.4403, Train AUC: 0.9657, Val AUC: 0.8848
Epoch 080, Loss: 0.4306, Train AUC: 0.9713, Val AUC: 0.8845
Epoch 090, Loss: 0.4227, Train AUC: 0.9752, Val AUC: 0.8839
Epoch 100, Loss: 0.4163, Train AUC: 0.9776, Val AUC: 0.8823


In [53]:
# Inference with the trained model: embed every node, this time running
# message passing over the full graph G rather than the training split.
G = G.to(device)
model.eval()
with torch.no_grad():
    z = model(x, G.edge_index)

In [54]:
# NOTE(review): this cell is only a string literal holding an earlier draft of
# the user/movie embedding split. It executes no statements and binds no names;
# the notebook just echoes the string. The cell below performs the real split.
'''num_users = len(user_mapping)
num_movies = len(movie_mapping)

user_embeddings = z[:num_users]
movie_embeddings = z[num_users:]
'''

'num_users = len(user_mapping)\nnum_movies = len(movie_mapping)\n\nuser_embeddings = z[:num_users]\nmovie_embeddings = z[num_users:]\n'

In [57]:
num_users = len(user_mapping)

# Node indices place users first and movies after them, so the embedding
# matrix splits cleanly at num_users.
user_emb, movie_emb = z[:num_users], z[num_users:]

# Raw userIds to produce recommendations for, and their node indices.
user_ids = [100, 200, 300]
internal_user_ids = [user_mapping[u] for u in user_ids]

batch_user_emb = user_emb[internal_user_ids]

# Dot-product similarity between each batch user and every movie.
scores = torch.matmul(batch_user_emb, movie_emb.T)

# Raw movieIds each batch user has already rated.
user_to_seen_movies = {
    u: set(df.loc[df['userId'] == u, 'movieId'])
    for u in user_ids
}

# Boolean mask with one row per batch user and one column per movie; True marks
# a movie the user has already rated.
mask = torch.zeros_like(scores, dtype=torch.bool)

for i, user_id in enumerate(user_ids):
    seen_movies = user_to_seen_movies.get(user_id, set())
    # Movie node indices are offset by num_users; subtract to get the column.
    internal_seen = [movie_mapping[movieId] - num_users for movieId in seen_movies]
    mask[i, internal_seen] = True

# Already-rated movies can never win the top-k.
scores.masked_fill_(mask, float('-inf'))

In [58]:
top_k = 10

# Best-scoring movie columns for each user in the batch.
top_k_scores, top_k_indices = torch.topk(scores, k=top_k, dim=1)

# Reverse lookup: movie node index -> raw movieId.
inv_movie_mapping = {node: movie_id for movie_id, node in movie_mapping.items()}

# One list of raw movieIds per batch user. Columns are relative to the movie
# block, so num_users is added back to recover the node index.
recommendations = [
    [inv_movie_mapping[col + num_users] for col in row]
    for row in top_k_indices.tolist()
]

In [59]:
# Lookup table from raw movieId to title, used when printing recommendations.
movieId_to_title = df_movies.set_index('movieId')['title'].to_dict()


In [60]:
# Print each batch user's recommended movies alongside their similarity scores.
for user_id, rec_movieIds, rec_scores in zip(user_ids, recommendations, top_k_scores.tolist()):
    print(f"\nUser {user_id} Recommendations:")
    for movieId, score in zip(rec_movieIds, rec_scores):
        title = movieId_to_title.get(movieId, "Unknown Title")
        print(f"  {movieId}: {title} (score: {score:.4f})")


User 100 Recommendations:
  339: While You Were Sleeping (1995) (score: 5.4932)
  595: Beauty and the Beast (1991) (score: 5.2533)
  355: Flintstones, The (1994) (score: 5.0455)
  110: Braveheart (1995) (score: 4.9926)
  3793: X-Men (2000) (score: 4.8592)
  4440: Big Boss, The (Fists of Fury) (Tang shan da xiong) (1971) (score: 4.8134)
  25: Leaving Las Vegas (1995) (score: 4.7515)
  347: Bitter Moon (1992) (score: 4.7403)
  161: Crimson Tide (1995) (score: 4.7298)
  733: Rock, The (1996) (score: 4.7235)

User 200 Recommendations:
  4440: Big Boss, The (Fists of Fury) (Tang shan da xiong) (1971) (score: 5.4695)
  161: Crimson Tide (1995) (score: 5.2790)
  160: Congo (1995) (score: 5.1406)
  454: Firm, The (1993) (score: 5.1147)
  434: Cliffhanger (1993) (score: 5.0804)
  292: Outbreak (1995) (score: 5.0450)
  380: True Lies (1994) (score: 4.8974)
  592: Batman (1989) (score: 4.7725)
  153: Batman Forever (1995) (score: 4.7589)
  177: Lord of Illusions (1995) (score: 4.6614)

User 300 

In [61]:
grouped = df.groupby('userId')

# For each batch user, list the movies they gave their own highest rating, to
# compare by eye against the recommendations printed above.
for user_id in user_ids:
    if user_id not in grouped.groups:
        print(f"\nUser {user_id} has no ratings.")
        continue

    group = grouped.get_group(user_id)
    max_rating = group['rating'].max()
    top_movies = group[group['rating'] == max_rating]

    print(f"\nUser {user_id}'s Top Rated Movies (Rating {max_rating}):")

    for movie_id, title in zip(top_movies['movieId'], top_movies['title']):
        print(f"  {movie_id}: {title}")


User 100's Top Rated Movies (Rating 5.0):
  1101: Top Gun (1986)
  1958: Terms of Endearment (1983)
  2423: Christmas Vacation (National Lampoon's Christmas Vacation) (1989)
  4041: Officer and a Gentleman, An (1982)
  5620: Sweet Home Alabama (2002)

User 200's Top Rated Movies (Rating 5.0):
  39: Clueless (1995)
  260: Star Wars: Episode IV - A New Hope (1977)
  296: Pulp Fiction (1994)
  318: Shawshank Redemption, The (1994)
  441: Dazed and Confused (1993)
  597: Pretty Woman (1990)
  1020: Cool Runnings (1993)
  1042: That Thing You Do! (1996)
  1196: Star Wars: Episode V - The Empire Strikes Back (1980)
  1197: Princess Bride, The (1987)
  1210: Star Wars: Episode VI - Return of the Jedi (1983)
  1380: Grease (1978)
  1500: Grosse Pointe Blank (1997)
  1777: Wedding Singer, The (1998)
  2144: Sixteen Candles (1984)
  2396: Shakespeare in Love (1998)
  2571: Matrix, The (1999)
  2572: 10 Things I Hate About You (1999)
  2918: Ferris Bueller's Day Off (1986)
  2959: Fight Club (19