In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch_geometric import nn
from torch_geometric.data import Data, HeteroData
from torch_geometric.loader import DataLoader

In [34]:
import torch_geometric.transforms as T

In [3]:
# Load the edge, feature and label tables for each split (test, val, train).
# Every split is stored as three parquet files under ../data/graphs/.
test_edges, test_features, test_labels = map(pd.read_parquet, (
    '../data/graphs/test_edges.parquet',
    '../data/graphs/test_features.parquet',
    '../data/graphs/test_labels_neg.parquet',
))

val_edges, val_features, val_labels = map(pd.read_parquet, (
    '../data/graphs/val_edges.parquet',
    '../data/graphs/val_features.parquet',
    '../data/graphs/val_labels_neg.parquet',
))

train_edges, train_features, train_labels = map(pd.read_parquet, (
    '../data/graphs/train_edges.parquet',
    '../data/graphs/train_features.parquet',
    '../data/graphs/train_labels_neg.parquet',
))

In [10]:
# Node feature matrices stored as NumPy arrays on disk:
# product embeddings first, then the normalised per-user training features.
prod_features, user_features = (
    np.load('../data/feature_emb/products.npy'),
    np.load('../data/feature_agg/train_user_features_norm.npy'),
)

In [12]:
# Build the user_id -> dense id table from the training edges:
# distinct user_ids, sorted ascending, numbered 0..n-1 in that order.
enc_user_id = (
    train_edges[["user_id"]]
    .drop_duplicates()
    .sort_values("user_id")
    .reset_index(drop=True)
    # Name the fresh 0..n-1 index so it becomes the `enc_user_id` column.
    .rename_axis("enc_user_id")
    .reset_index()
)
print(enc_user_id.shape)
enc_user_id.head()

(101696, 2)


Unnamed: 0,enc_user_id,user_id
0,0,1
1,1,2
2,2,3
3,3,7
4,4,13


In [17]:
# Lookup table: raw user_id -> encoded (dense) user id.
enc_user_id_dict = {
    raw_id: dense_id
    for raw_id, dense_id in zip(enc_user_id["user_id"], enc_user_id["enc_user_id"])
}
# Spot check a single key.
enc_user_id_dict[7]

3

In [19]:
# Add an `enc_user_id` column to every edge/label table, in the same order as
# before: train/val/test edges, then train/val/test labels.
# NOTE(review): the mapping is built from train_edges only, so any user_id that
# does not occur there maps to NaN in the val/test frames — confirm all
# val/test users are present in train.
for split_frame in (
    train_edges,
    val_edges,
    test_edges,
    train_labels,
    val_labels,
    test_labels,
):
    split_frame["enc_user_id"] = split_frame["user_id"].map(enc_user_id_dict)


In [20]:
# Display the training edge table to confirm the new `enc_user_id` column is present.
train_edges

Unnamed: 0,user_id,product_id,weight,enc_user_id
0,1,196,0.700000,0
1,1,10258,0.600000,0
2,1,10326,0.100000,0
3,1,12427,0.700000,0
4,1,13032,0.200000,0
...,...,...,...,...
8675716,206209,41665,0.076923,101695
8675717,206209,43961,0.153846,101695
8675718,206209,44325,0.076923,101695
8675719,206209,48697,0.076923,101695


In [57]:
# Edge index of shape (2, num_edges): row 0 holds the encoded user id,
# row 1 the product id of each training edge.
# NOTE(review): product_id is used as-is as the product node index; this
# assumes every product_id is a valid row position in the product feature
# matrix — confirm.
train_edge_index = (
    torch.tensor(
        train_edges[["enc_user_id", "product_id"]].to_numpy(),
        dtype=torch.long,
    )
    .t()
    .contiguous()
)

In [58]:
# Assemble the heterogeneous user/product training graph.
train_data = HeteroData()

# Save node indices: one id per row of the corresponding feature matrix.
train_data["user"].node_id = torch.arange(user_features.shape[0])
train_data["prod"].node_id = torch.arange(prod_features.shape[0])


# Add the node features. The feature matrices are loaded with np.load, so
# convert them to float tensors; PyG loaders and layers expect torch tensors
# rather than NumPy arrays.
train_data["user"].x = torch.from_numpy(user_features).to(torch.float)
train_data["prod"].x = torch.from_numpy(prod_features).to(torch.float)

# Add the edge indices. The tensor built from train_edges is named
# `train_edge_index`; the bare name `edge_index` is not defined by any earlier
# cell shown here, so the previous assignment only worked through leftover
# kernel state and fails on Restart & Run All.
train_data["user", "buy", "prod"].edge_index = train_edge_index


In [59]:
# Add the reverse ("prod", "rev_buy", "user") edge type to the graph.
# NOTE: this rebinds `train_data`, so run the cell only once per freshly built
# graph; re-running it applies the transform a second time.
make_undirected = T.ToUndirected()
train_data = make_undirected(train_data)
print(train_data)

HeteroData(
  [1muser[0m={
    node_id=[101696],
    x=[101696, 22]
  },
  [1mprod[0m={
    node_id=[49689],
    x=[49689, 768]
  },
  [1m(user, buy, prod)[0m={ edge_index=[2, 8675721] },
  [1m(prod, rev_buy, user)[0m={ edge_index=[2, 8675721] }
)


In [61]:
# Attach the supervision edges and their labels to the ("user", "buy", "prod")
# edge type.
# edge_label_index has shape (2, num_label_edges): encoded user id / product id.
train_data["user", "buy", "prod"].edge_label_index = (
    torch.tensor(
        train_labels[["enc_user_id", "product_id"]].to_numpy(),
        dtype=torch.long,
    )
    .t()
    .contiguous()
)
# edge_label is 1-D, so no transpose is needed (t() returns 1-D tensors as is).
train_data["user", "buy", "prod"].edge_label = torch.tensor(
    train_labels["label"].to_numpy(),
    dtype=torch.long,
)
train_data

HeteroData(
  [1muser[0m={
    node_id=[101696],
    x=[101696, 22]
  },
  [1mprod[0m={
    node_id=[49689],
    x=[49689, 768]
  },
  [1m(user, buy, prod)[0m={
    edge_index=[2, 8675721],
    edge_label_index=[2, 661052],
    edge_label=[661052]
  },
  [1m(prod, rev_buy, user)[0m={ edge_index=[2, 8675721] }
)

In [62]:
# Mini-batch loader over the supervision edges of ("user", "buy", "prod").
# Neighbours are sampled over three hops: at most 20 in the first hop and at
# most 10 in each of the second and third hops.
# In addition, during training, we want to sample negative edges on-the-fly with
# a ratio of 2:1 (two negatives per seed edge).
# We can make use of the `loader.LinkNeighborLoader` from PyG.
# NOTE: the neighbour sampler behind this loader needs either `pyg-lib` or
# `torch-sparse` installed; without one of them it raises ImportError.
from torch_geometric.loader import LinkNeighborLoader

# Sampling configuration, named so the checks below stay in sync with it.
NUM_NEIGHBORS = [20, 10, 10]   # max sampled neighbours per hop
NEG_SAMPLING_RATIO = 2.0       # negatives sampled per seed edge
BATCH_SIZE = 128               # seed edges per mini-batch

# Define seed edges:
edge_label_index = train_data["user", "buy", "prod"].edge_label_index
edge_label = train_data["user", "buy", "prod"].edge_label

# NOTE(review): per the LinkNeighborLoader docs, when `edge_label` is passed
# together with neg_sampling_ratio > 0 the labels are treated as categorical
# and 0 is reserved for the sampled negatives (existing labels are shifted up
# by one). If train_labels["label"] already contains 0s — the `_neg` file name
# suggests pre-sampled negatives — the `max() == 1` check below will not hold.
# Confirm, then either drop on-the-fly negative sampling or seed only with the
# positive edges.
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=NUM_NEIGHBORS,
    neg_sampling_ratio=NEG_SAMPLING_RATIO,
    edge_label_index=(("user", "buy", "prod"), edge_label_index),
    edge_label=edge_label,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# Inspect a sample:
sampled_data = next(iter(train_loader))

print("Sampled mini-batch:")
print("===================")
print(sampled_data)

# Each batch holds the seed edges plus NEG_SAMPLING_RATIO negatives per seed.
expected_num_label_edges = int((1 + NEG_SAMPLING_RATIO) * BATCH_SIZE)
sampled_buy = sampled_data["user", "buy", "prod"]
assert sampled_buy.edge_label_index.size(1) == expected_num_label_edges
assert sampled_buy.edge_label.min() == 0
assert sampled_buy.edge_label.max() == 1

ImportError: 'NeighborSampler' requires either 'pyg-lib' or 'torch-sparse'