In [1]:
import pandas as pd
import torch
import os
# Set working directory

try:
    data_train = pd.read_csv('dataset/ogbn_mag/split/time/paper/train.csv.gz', compression='gzip',header = None)
    data_valid = pd.read_csv('dataset/ogbn_mag/split/time/paper/valid.csv.gz', compression='gzip',header = None)
    data_test = pd.read_csv('dataset/ogbn_mag/split/time/paper/test.csv.gz', compression='gzip',header = None)
except FileNotFoundError:
    os.chdir("..")
    data_train = pd.read_csv('dataset/ogbn_mag/split/time/paper/train.csv.gz', compression='gzip',header = None)
    data_valid = pd.read_csv('dataset/ogbn_mag/split/time/paper/valid.csv.gz', compression='gzip',header = None)
    data_test = pd.read_csv('dataset/ogbn_mag/split/time/paper/test.csv.gz', compression='gzip',header = None)

data, _ = torch.load(r"dataset/ogbn_mag/processed/geometric_data_processed.pt", weights_only=False)

# Extract edges for "paper" -> "cites" -> "paper"
paper_c_paper = data.edge_index_dict[('paper', 'cites', 'paper')]

# Unique paper IDs to keep (Ensure it's a PyTorch tensor)
nums_valid = torch.tensor(data_valid[0])
nums_test = torch.tensor(data_test[0])
nums_train = torch.tensor(data_train[0])

mask_train = torch.isin(paper_c_paper[0], nums_train) | torch.isin(paper_c_paper[1], nums_train)
mask_valid = torch.isin(paper_c_paper[0], nums_valid) | torch.isin(paper_c_paper[1], nums_valid)
mask_test = torch.isin(paper_c_paper[0], nums_test) | torch.isin(paper_c_paper[1], nums_test)

paper_c_paper_train = paper_c_paper.clone()
paper_c_paper_valid = paper_c_paper.clone()
paper_c_paper_test = paper_c_paper.clone()

# Combine the conditions into a single mask that selects only the train edges
mask_train_done = mask_train & ~mask_valid & ~mask_test
mask_valid_done = mask_valid & ~mask_test

# Apply the combined mask to paper_c_paper_train
paper_c_paper_train = paper_c_paper_train[:, mask_train_done]
paper_c_paper_valid = paper_c_paper_valid[:, mask_valid_done]
paper_c_paper_test = paper_c_paper_test[:, mask_test]

len(paper_c_paper_train[1]) + len(paper_c_paper_valid[1]) + len(paper_c_paper_test[1]), paper_c_paper.shape[1]


(5416271, 5416271)

In [2]:
author_w_paper = data.edge_index_dict[('author', 'writes', 'paper')]

author_w_paper_train = author_w_paper.clone()
author_w_paper_valid = author_w_paper.clone()
author_w_paper_test = author_w_paper.clone()

# Unique paper IDs to keep (Ensure it's a PyTorch tensor)
nums_valid = torch.tensor(data_valid[0])
nums_test = torch.tensor(data_test[0])
nums_train = torch.tensor(data_train[0])

mask_train = torch.isin(author_w_paper[1], nums_train)
mask_valid = torch.isin(author_w_paper[1], nums_valid)
mask_test = torch.isin(author_w_paper[1], nums_test)

# Combine the conditions into a single mask that selects only the train edges
mask_train_done = mask_train & ~mask_valid & ~mask_test
mask_valid_done = mask_valid & ~mask_test

# Apply the combined mask to paper_c_paper_train
author_w_paper_train = author_w_paper_train[:, mask_train_done]
author_w_paper_valid = author_w_paper_valid[:, mask_valid_done]
author_w_paper_test = author_w_paper_test[:, mask_test]

len(author_w_paper_train[1]) + len(author_w_paper_valid[1]) + len(author_w_paper_test[1]), author_w_paper.shape[1]


(7145660, 7145660)

In [3]:
paper_t_field = data.edge_index_dict[('paper', 'has_topic', 'field_of_study')]

paper_t_field_train = paper_t_field.clone()
paper_t_field_valid = paper_t_field.clone()
paper_t_field_test = paper_t_field.clone()

# Unique paper IDs to keep (Ensure it's a PyTorch tensor)
nums_valid = torch.tensor(data_valid[0])
nums_test = torch.tensor(data_test[0])
nums_train = torch.tensor(data_train[0])

mask_train = torch.isin(paper_t_field[0], nums_train)
mask_valid = torch.isin(paper_t_field[0], nums_valid)
mask_test = torch.isin(paper_t_field[0], nums_test)

# Combine the conditions into a single mask that selects only the train edges
mask_train_done = mask_train & ~mask_valid & ~mask_test
mask_valid_done = mask_valid & ~mask_test

# Apply the combined mask to paper_c_paper_train
paper_t_field_train = paper_t_field_train[:, mask_train_done]
paper_t_field_valid = paper_t_field_valid[:, mask_valid_done]
paper_t_field_test = paper_t_field_test[:, mask_test]

len(paper_t_field_train[1]) + len(paper_t_field_valid[1]) + len(paper_t_field_test[1]), paper_t_field.shape[1]


(7505078, 7505078)

In [4]:
data

Data(
  num_nodes_dict={
    author=1134649,
    field_of_study=59965,
    institution=8740,
    paper=736389,
  },
  edge_index_dict={
    (author, affiliated_with, institution)=[2, 1043998],
    (author, writes, paper)=[2, 7145660],
    (paper, cites, paper)=[2, 5416271],
    (paper, has_topic, field_of_study)=[2, 7505078],
  },
  x_dict={ paper=[736389, 128] },
  node_year={ paper=[736389, 1] },
  edge_reltype={
    (author, affiliated_with, institution)=[1043998, 1],
    (author, writes, paper)=[7145660, 1],
    (paper, cites, paper)=[5416271, 1],
    (paper, has_topic, field_of_study)=[7505078, 1],
  },
  y_dict={ paper=[736389, 1] }
)

In [5]:
import numpy as np

paper_c_paper_train_0 = np.array(paper_c_paper_train[0])
paper_c_paper_train_1 = np.array(paper_c_paper_train[1])

datamatrix = np.column_stack([
    (paper_c_paper_train_0 == np.arange(len(paper_c_paper_train_0))).astype(int), paper_c_paper_train_0, paper_c_paper_train_1, 
])

# np.argwhere(d[0, 1] == 0)
len(np.argwhere(datamatrix[:, 0] == 0)), len(np.argwhere(datamatrix[:, 0] == 1))

  paper_c_paper_train_0 = np.array(paper_c_paper_train[0])
  paper_c_paper_train_1 = np.array(paper_c_paper_train[1])


(3879967, 1)

In [40]:
len(paper_c_paper_train_0)

3879968

In [7]:
np.unique(paper_c_paper_train_0), np.unique(paper_c_paper_train_1)

(array([     0,      2,      4, ..., 736386, 736387, 736388],
       shape=(512046,)),
 array([     0,      1,      2, ..., 736383, 736386, 736388],
       shape=(505777,)))

In [8]:
paper_c_paper_train

tensor([[     0,      0,      0,  ..., 736388, 736388, 736388],
        [    88,  27449, 121051,  ..., 421711, 427339, 439864]])

In [9]:
len(paper_c_paper_train[0]), len(paper_c_paper_train[1])

(3879968, 3879968)

In [None]:
import numpy as np
from scipy.sparse import coo_matrix
import torch

# Example citation data: Replace with your actual data
# For example:
# paper_ids = [0, 88, 27449, 121051, ...]  # Citing papers
# cited_ids = [88, 27449, 121051, ..., 421711, 427339, 439864]  # Cited papers

# Example tensor as you mentioned (2 rows, multiple columns)
# tensor[0] represents paper_ids (papers that cite others)
# tensor[1] represents cited paper_ids (papers being cited)
tensor = paper_c_paper_train 

# Extract the data
paper_ids = tensor[0].numpy()  # Citing papers
cited_ids = tensor[1].numpy()  # Cited papers

# Create a sparse matrix in COO format:
citation_values = np.ones(len(paper_ids))  # All citations will have value 1

# Create the sparse matrix with the correct shape
citation_matrix_coo = coo_matrix((citation_values, (paper_ids, cited_ids)), shape=(max(paper_ids) + 1, max(cited_ids) + 1))

# Now you have a sparse matrix with 3 columns (paper_id, cited_paper_id, 1 for citation)
# Optionally, you can convert it to CSR format for better performance:
citation_matrix_csr = citation_matrix_coo.tocsr()

# If you want to save this matrix to disk (e.g., in HDF5 format), you can do so:
# import h5py
# with h5py.File('citation_matrix.h5', 'w') as f:
#     f.create_dataset('rows', data=citation_matrix_coo.row)
#     f.create_dataset('cols', data=citation_matrix_coo.col)
#     f.create_dataset('data', data=citation_matrix_coo.data)


# use in pytorch
citation_matrix = torch.sparse_coo_tensor((citation_matrix_coo.row, citation_matrix_coo.col), citation_matrix_coo.data)


  citation_matrix = torch.sparse_coo_tensor((citation_matrix_coo.row, citation_matrix_coo.col), citation_matrix_coo.data)


NotImplementedError: Could not run 'aten::as_strided' with arguments from the 'SparseCPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'aten::as_strided' is only available for these backends: [CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradHIP, AutogradXLA, AutogradMPS, AutogradIPU, AutogradXPU, AutogradHPU, AutogradVE, AutogradLazy, AutogradMTIA, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, AutogradMeta, AutogradNestedTensor, Tracer, AutocastCPU, AutocastXPU, AutocastMPS, AutocastCUDA, FuncTorchBatched, BatchedNestedTensor, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PreDispatch, PythonDispatcher].

CPU: registered at /pytorch/build/aten/src/ATen/RegisterCPU.cpp:30477 [kernel]
CUDA: registered at /pytorch/build/aten/src/ATen/RegisterCUDA.cpp:44731 [kernel]
Meta: registered at /pytorch/build/aten/src/ATen/RegisterMeta.cpp:27006 [kernel]
QuantizedCPU: registered at /pytorch/build/aten/src/ATen/RegisterQuantizedCPU.cpp:955 [kernel]
QuantizedCUDA: registered at /pytorch/build/aten/src/ATen/RegisterQuantizedCUDA.cpp:463 [kernel]
BackendSelect: fallthrough registered at /pytorch/aten/src/ATen/core/BackendSelectFallbackKernel.cpp:3 [backend fallback]
Python: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:194 [backend fallback]
FuncTorchDynamicLayerBackMode: registered at /pytorch/aten/src/ATen/functorch/DynamicLayer.cpp:503 [backend fallback]
Functionalize: registered at /pytorch/build/aten/src/ATen/RegisterFunctionalization_0.cpp:23301 [kernel]
Named: fallthrough registered at /pytorch/aten/src/ATen/core/NamedRegistrations.cpp:11 [kernel]
Conjugate: fallthrough registered at /pytorch/aten/src/ATen/ConjugateFallback.cpp:21 [kernel]
Negative: fallthrough registered at /pytorch/aten/src/ATen/native/NegateFallback.cpp:22 [kernel]
ZeroTensor: registered at /pytorch/build/aten/src/ATen/RegisterZeroTensor.cpp:165 [kernel]
ADInplaceOrView: registered at /pytorch/torch/csrc/autograd/generated/ADInplaceOrViewType_0.cpp:4942 [kernel]
AutogradOther: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18082 [autograd kernel]
AutogradCPU: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18082 [autograd kernel]
AutogradCUDA: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18082 [autograd kernel]
AutogradHIP: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18082 [autograd kernel]
AutogradXLA: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18082 [autograd kernel]
AutogradMPS: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18082 [autograd kernel]
AutogradIPU: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18082 [autograd kernel]
AutogradXPU: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18082 [autograd kernel]
AutogradHPU: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18082 [autograd kernel]
AutogradVE: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18082 [autograd kernel]
AutogradLazy: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18082 [autograd kernel]
AutogradMTIA: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18082 [autograd kernel]
AutogradPrivateUse1: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18082 [autograd kernel]
AutogradPrivateUse2: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18082 [autograd kernel]
AutogradPrivateUse3: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18082 [autograd kernel]
AutogradMeta: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18082 [autograd kernel]
AutogradNestedTensor: registered at /pytorch/torch/csrc/autograd/generated/VariableType_0.cpp:18082 [autograd kernel]
Tracer: registered at /pytorch/torch/csrc/autograd/generated/TraceType_0.cpp:17100 [kernel]
AutocastCPU: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:322 [backend fallback]
AutocastXPU: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:465 [backend fallback]
AutocastMPS: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:209 [backend fallback]
AutocastCUDA: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:165 [backend fallback]
FuncTorchBatched: registered at /pytorch/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:735 [kernel]
BatchedNestedTensor: registered at /pytorch/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:758 [backend fallback]
FuncTorchVmapMode: fallthrough registered at /pytorch/aten/src/ATen/functorch/VmapModeRegistrations.cpp:27 [backend fallback]
Batched: registered at /pytorch/aten/src/ATen/LegacyBatchingRegistrations.cpp:1079 [kernel]
VmapMode: fallthrough registered at /pytorch/aten/src/ATen/VmapModeRegistrations.cpp:33 [backend fallback]
FuncTorchGradWrapper: registered at /pytorch/aten/src/ATen/functorch/TensorWrapper.cpp:207 [backend fallback]
PythonTLSSnapshot: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:202 [backend fallback]
FuncTorchDynamicLayerFrontMode: registered at /pytorch/aten/src/ATen/functorch/DynamicLayer.cpp:499 [backend fallback]
PreDispatch: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:206 [backend fallback]
PythonDispatcher: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:198 [backend fallback]


tensor([1., 1., 1.,  ..., 1., 1., 1.], dtype=torch.float64)

In [None]:
import torch

class LossFunction:
    def __init__(self, alpha=1.0, eps=1e-8, use_regularization=False, lam=0.001):
        """
        Initialize the loss function with given parameters.
        
        Args:
            alpha (float): Scaling parameter for edge probability.
            eps (float): Small value to prevent log(0).
            use_regularization (bool): Whether to include Gaussian regularization.
        """
        self.alpha = alpha
        self.eps = eps
        self.lam = lam
        self.use_regularization = use_regularization

    def edge_probability(self, z_i, z_j):
        """Compute the probability of an edge existing between two embeddings."""
        dist = torch.norm(z_i - z_j) ** 2  # Squared Euclidean distance
        return 1 / (1 + torch.exp(-self.alpha + dist))  # Logistic function

    def link_loss(self, label, z_u, z_v):
        """Compute the loss for a single edge."""
        prob = self.edge_probability(z_u, z_v)
        prob = torch.clamp(prob, self.eps, 1 - self.eps)  # Numerical stability

        return label.float() * torch.log(prob) + (1 - label.float()) * torch.log(1 - prob)

    def compute_loss(self, z, datamatrix_tensor):
        """Compute the total loss for the dataset using sparse tensor."""
        
        # Extract indices and values from the sparse tensor
        indices = datamatrix_tensor.coalesce().indices()  # Shape: [2, nnz]
        values = datamatrix_tensor.coalesce().values()    # Shape: [nnz]
        
        # Assume the indices are in the form of (u_idx, v_idx) pairs
        u_idx = indices[0]  # The citing papers
        v_idx = indices[1]  # The cited papers
        
        # Compute the link loss for the pairs (u_idx, v_idx) using vectorized operations
        # self.link_loss should support batch operations for efficiency
        losses = self.link_loss(values, z[u_idx], z[v_idx])
        
        # Sum the losses and compute the final loss
        sum_loss = losses.sum()  # Sum over all citation events
        loss = -sum_loss / len(values)  # Divide by the number of non-zero values (citations)
        
        return loss
    

# loss_fn = LossFunction(alpha=1.0, use_regularization=True)
# loss_value = loss_fn.compute_loss(z, datamatrix_tensor)

In [56]:
print(citation_matrix.coalesce().indices()[0][0])
print(citation_matrix.coalesce().indices()[1][0])
print(citation_matrix.coalesce().values()[0])

tensor(0)
tensor(88)
tensor(1., dtype=torch.float64)


In [61]:
embedding_dim = 2
node_embeddings = torch.nn.Embedding(len(paper_c_paper_train_0), embedding_dim)


loss_function = LossFunction(alpha=1.0, eps=1e-10, use_regularization=False)

optimizer = torch.optim.Adam(node_embeddings.parameters(), lr=0.01)

alpha = 3
num_epochs = 5
for epoch in range(num_epochs):
    optimizer.zero_grad()
    z = node_embeddings.weight  # Get embeddings
    loss = loss_function.compute_loss(z, citation_matrix)  # Compute loss
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch}: Loss = {loss.item():.4f}")

Epoch 0: Loss = nan
Epoch 1: Loss = nan
Epoch 2: Loss = nan
Epoch 3: Loss = nan
Epoch 4: Loss = nan


In [None]:
paper_c_paper_train_1[:5]

array([    88,  27449, 121051, 151667, 308499])

In [None]:
paper_c_paper_train[:,88]

tensor([    28, 333019])

In [None]:
print(np.argwhere(paper_c_paper_train[0] == 2))
print(paper_c_paper_train[1][5])

tensor([[5, 6, 7]])
tensor(186851)


In [None]:
# creating datamatrix where 1 if link 0 if not, also add the 1st and 2nd column to the datamatrix
from tqdm import tqdm
datamatrix = []
for i in tqdm(range(len(paper_c_paper_train[0]))):
    label = 1 if i == paper_c_paper_train[0][i] else 0
    datamatrix.append([paper_c_paper_train[0][i], paper_c_paper_train[1][i], label])

print(datamatrix)

  0%|          | 0/3879968 [00:00<?, ?it/s]

100%|██████████| 3879968/3879968 [00:57<00:00, 67341.46it/s]


KeyboardInterrupt: 