<a href="https://colab.research.google.com/github/Frederik-Roeckle/xwines_recom/blob/feat%2Fgraph/GraphSAGE_XWines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

!pip install pyg-lib==0.4.0+pt26cu124 \
            torch-scatter==2.1.2+pt26cu124 \
            torch-sparse==0.6.18+pt26cu124 \
            torch-cluster==1.6.3+pt26cu124 \
            torch-spline-conv==1.2.2+pt26cu124 \
            -f https://data.pyg.org/whl/torch-2.6.0+cu124.html

!pip install torch-geometric==2.5.0

Looking in indexes: https://download.pytorch.org/whl/cu124
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading https://download.pytorch.org/whl/cu124/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (24.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m99.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading https://download.pytorch.org/whl/cu124/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (883 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading https://download.pytorch.org/whl/cu124/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (13.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m128.1 MB/s[0m eta [36m0:00:00[0m
[?25

In [2]:
from google.colab import drive
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import HeteroData
from torch_geometric.nn import SAGEConv, to_hetero
from torch_geometric.loader import LinkNeighborLoader
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from tqdm import tqdm
import pathlib as pl

In [3]:
# Get PyTorch version
pytorch_version = torch.__version__
print(f"PyTorch version: {pytorch_version}")

# Check if CUDA is available
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")

# Get CUDA version (if available)
if cuda_available:
    cuda_version = torch.version.cuda
    print(f"CUDA version: {cuda_version}")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


PyTorch version: 2.6.0+cu124
CUDA available: True
CUDA version: 12.4


In [4]:
csv_wines = pl.Path("/content/drive/MyDrive/01_Master/WebMiningProject/XWines_Full_100K_wines.csv")
csv_reviews = pl.Path("/content/drive/MyDrive/01_Master/WebMiningProject/XWines_Full_21M_ratings.csv")

In [5]:
# 1. Load your data
reviews = pd.read_csv(csv_reviews)  # user_id, wine_id, rating, timestamp
wines = pd.read_csv(csv_wines)      # wine_id, country, grape_type, etc.

split = 0.3
split_index = len(reviews) * split
split_index = int(split_index)
reviews = reviews.iloc[:split_index]
unique_wines = set(reviews["WineID"])
wines = wines[wines["WineID"].isin(unique_wines)]

# 2. Encode categorical IDs
user_encoder = LabelEncoder()
wine_encoder = LabelEncoder()
reviews['UserID'] = user_encoder.fit_transform(reviews['UserID'])
wines['WineID'] = wine_encoder.fit_transform(wines['WineID'])
reviews['WineID'] = wine_encoder.transform(reviews['WineID'])

# reviews = reviews.iloc[:split_index]

  reviews = pd.read_csv(csv_reviews)  # user_id, wine_id, rating, timestamp


In [6]:
# 3. Build Heterogeneous Graph
data = HeteroData()
data['user'].num_nodes = reviews['UserID'].nunique()
data['wine'].num_nodes = reviews['WineID'].nunique()

data['user', 'rates', 'wine'].edge_index = torch.tensor([
    reviews['UserID'].values,
    reviews['WineID'].values
], dtype=torch.long)

# Add edge features (ratings)
data['user', 'rates', 'wine'].edge_attr = torch.tensor(reviews['Rating'].values, dtype=torch.float).unsqueeze(1)

# 4. Add node features
# Dummy features for users
data['user'].x = torch.ones((data['user'].num_nodes, 1))  # simple dummy vector

# Use selected wine metadata as features
selected_columns = ['Type', 'Elaborate', 'Body', 'Acidity', 'Country']
wine_metadata = wines.set_index('WineID')[selected_columns]
wine_metadata_encoded = pd.get_dummies(wine_metadata).astype(float)

# Align features to the wine node index order
aligned_wine_feats = wine_metadata_encoded.reindex(range(data['wine'].num_nodes)).fillna(0).values
data['wine'].x = torch.tensor(aligned_wine_feats, dtype=torch.float)


  data['user', 'rates', 'wine'].edge_index = torch.tensor([


In [7]:
data

HeteroData(
  user={
    num_nodes=806081,
    x=[806081, 1],
  },
  wine={
    num_nodes=81617,
    x=[81617, 96],
  },
  (user, rates, wine)={
    edge_index=[2, 6304060],
    edge_attr=[6304060, 1],
  }
)

In [8]:
# 5. Define GNN Model
class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.relu = nn.ReLU()
        self.conv2 = SAGEConv((-1, -1), hidden_channels)

    def forward(self, x_dict, edge_index_dict):
        x = self.conv1(x_dict, edge_index_dict)
        x = self.relu(x)
        x = self.conv2(x, edge_index_dict)
        return x

# 6. Dot Product Predictor for link prediction
class DotProductPredictor(nn.Module):
    def forward(self, z_user, z_wine, edge_label_index):
        user_vecs = z_user[edge_label_index[0]]
        wine_vecs = z_wine[edge_label_index[1]]
        return (user_vecs * wine_vecs).sum(dim=1)

In [9]:
# 7. Prepare train/val split (temporal or random)
edge_index = data['user', 'rates', 'wine'].edge_index
num_edges = edge_index.size(1)
split_idx = int(0.9 * num_edges)

train_edge_index = edge_index[:, :split_idx]
val_edge_index = edge_index[:, split_idx:]

data['user', 'rates', 'wine'].edge_index = train_edge_index

# Opt
data['wine', 'rev_rates', 'user'].edge_index = data['user', 'rates', 'wine'].edge_index[[1, 0]]

data['user', 'rates', 'wine'].train_edge_index = train_edge_index

data['user', 'rates', 'wine'].val_edge_index = val_edge_index

In [10]:
# 8. Create loader for training
train_loader = LinkNeighborLoader(
    data,
    num_neighbors=[20, 10],
    batch_size=1024,
    edge_label_index=("user", "rates", "wine"),
    edge_label=torch.tensor(reviews['Rating'].values[:split_idx], dtype=torch.float),
    shuffle=True
)

# 9. Training loop
encoder = GNNEncoder(hidden_channels=64)
model = to_hetero(encoder, data.metadata(), aggr='sum').to(device)#
predictor = DotProductPredictor().to(device)#

optimizer = torch.optim.Adam(list(model.parameters()) + list(predictor.parameters()), lr=0.01)

model.train()
predictor.train()

for epoch in range(5):
    total_loss = 0
    for batch in tqdm(train_loader):
        batch = batch.to(device)
        optimizer.zero_grad()
        z_dict = model(batch.x_dict, batch.edge_index_dict)
        pred = predictor(z_dict['user'], z_dict['wine'], batch["user", "rates", "wine"].edge_label_index)
        loss = F.mse_loss(pred, batch["user", "rates", "wine"].edge_label.squeeze())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

100%|██████████| 5541/5541 [06:35<00:00, 14.02it/s]


Epoch 0, Loss: 3166.3939


100%|██████████| 5541/5541 [06:35<00:00, 14.01it/s]


Epoch 1, Loss: 2809.6993


100%|██████████| 5541/5541 [06:36<00:00, 13.97it/s]


Epoch 2, Loss: 2784.2397


100%|██████████| 5541/5541 [06:38<00:00, 13.92it/s]


Epoch 3, Loss: 2774.9840


100%|██████████| 5541/5541 [06:37<00:00, 13.96it/s]

Epoch 4, Loss: 2783.0751





In [16]:
# 10. Evaluation on validation set (with GPU support)
model.eval()
predictor.eval()

with torch.no_grad():
    # Move data to device
    data = data.to(device)

    # Forward pass
    z_dict = model(data.x_dict, data.edge_index_dict)

    val_preds = predictor(
        z_dict['user'], z_dict['wine'], data['user', 'rates', 'wine'].val_edge_index
    )

    true_ratings = torch.tensor(
        reviews['Rating'].values[split_idx:], dtype=torch.float, device=device
    )

    val_loss = F.mse_loss(val_preds, true_ratings)
    print(f"Validation MSE: {val_loss.item():.4f}")

print("Training & evaluation complete. Ready for test set integration.")

Validation MSE: 0.5926
✅ Training & evaluation complete. Ready for test set integration.


In [20]:
# Add concept that predictions equal the format of the ratings, 5.0, 4.5, 4.0, ...

In [21]:
# Add evaluation for the test segments and the following evaluation metrices
# - Precision@K
# - Recall@K
# - nDCG@K
# - Hit Rate
# - Coverage