# Read data

In [2]:
import pandas as pd
import torch
import os
# Set working directory

try:
    data_train = pd.read_csv('dataset/ogbn_mag/split/time/paper/train.csv.gz', compression='gzip',header = None)
    data_valid = pd.read_csv('dataset/ogbn_mag/split/time/paper/valid.csv.gz', compression='gzip',header = None)
    data_test = pd.read_csv('dataset/ogbn_mag/split/time/paper/test.csv.gz', compression='gzip',header = None)
except FileNotFoundError:
    os.chdir("..")
    data_train = pd.read_csv('dataset/ogbn_mag/split/time/paper/train.csv.gz', compression='gzip',header = None)
    data_valid = pd.read_csv('dataset/ogbn_mag/split/time/paper/valid.csv.gz', compression='gzip',header = None)
    data_test = pd.read_csv('dataset/ogbn_mag/split/time/paper/test.csv.gz', compression='gzip',header = None)

data, _ = torch.load(r"dataset/ogbn_mag/processed/geometric_data_processed.pt", weights_only=False)

# Extract edges for "paper" -> "cites" -> "paper"
paper_c_paper = data.edge_index_dict[('paper', 'cites', 'paper')]

# Unique paper IDs to keep (Ensure it's a PyTorch tensor)
nums_valid = torch.tensor(data_valid[0])
nums_test = torch.tensor(data_test[0])
nums_train = torch.tensor(data_train[0])

mask_train = torch.isin(paper_c_paper[0], nums_train) | torch.isin(paper_c_paper[1], nums_train)
mask_valid = torch.isin(paper_c_paper[0], nums_valid) | torch.isin(paper_c_paper[1], nums_valid)
mask_test = torch.isin(paper_c_paper[0], nums_test) | torch.isin(paper_c_paper[1], nums_test)

paper_c_paper_train = paper_c_paper.clone()
paper_c_paper_valid = paper_c_paper.clone()
paper_c_paper_test = paper_c_paper.clone()

# Combine the conditions into a single mask that selects only the train edges
mask_train_done = mask_train & ~mask_valid & ~mask_test
mask_valid_done = mask_valid & ~mask_test

# Apply the combined mask to paper_c_paper_train
paper_c_paper_train = paper_c_paper_train[:, mask_train_done]
paper_c_paper_valid = paper_c_paper_valid[:, mask_valid_done]
paper_c_paper_test = paper_c_paper_test[:, mask_test]

#Venues
venues_values = torch.unique(data['y_dict']['paper'])

len(paper_c_paper_train[1]) + len(paper_c_paper_valid[1]) + len(paper_c_paper_test[1]), paper_c_paper.shape[1]



(5416271, 5416271)

In [21]:

if not os.path.exists("dataset/ogbn_mag/processed/paper_embeddings.pt"):
    venue_embeddings = {}
    embdding_dim = 2

    embed = torch.nn.Embedding(len(venues_values), embdding_dim)

    venue_id_to_idx = {venue_id.item(): idx for idx, venue_id in enumerate(venues_values)}

    indices = torch.tensor([venue_id_to_idx[venue_id.item()] for venue_id in venues_values], dtype=torch.long)

    embeddings = embed(indices)

    venue_embeddings = {venue_id.item(): embeddings[venue_id_to_idx[venue_id.item()]].tolist() for venue_id in venues_values}

    # Save the embeddings to a file
    torch.save(venue_embeddings, "dataset/ogbn_mag/processed/paper_embeddings.pt")

In [20]:
import torch

if not os.path.exists("dataset/ogbn_mag/processed/paper_embeddings.pt"):
    paper_embeddings = {}
    embedding_dim = 2

    # Get unique paper IDs
    unique_paper_ids = torch.unique(paper_c_paper_train)

    # Define the embedding layer (one embedding per unique paper)
    embed = torch.nn.Embedding(len(unique_paper_ids), embedding_dim)

    # Create a mapping: paper ID → index in embedding layer
    paper_id_to_idx = {pid.item(): idx for idx, pid in enumerate(unique_paper_ids)}

    # Convert paper_c_paper_train to indices using vectorized operations
    indices = torch.tensor([paper_id_to_idx[pid.item()] for pid in paper_c_paper_train.flatten()])

    # Compute embeddings
    embeddings = embed(indices)

    # Convert to dictionary with original paper IDs
    paper_embeddings = {pid.item(): emb for pid, emb in zip(paper_c_paper_train.flatten(), embeddings)}

    torch.save(venue_embeddings, "dataset/ogbn_mag/processed/paper_embeddings.pt")

# Classes

In [2]:
import random
import itertools
import torch

class mini_batches_code:
    def __init__(self,data, unique_list, sample_size,edge_type):
        self.data = data
        self.sample_size = sample_size
        self.edge_type = edge_type
        self.unique_list = unique_list

    def get_batch(self):
        random.seed(99) 
        torch.manual_seed(99)
        list_pcp = self.unique_list
        random_sample = random.sample(list_pcp, self.sample_size)
        print(random_sample)
        for value in random_sample:
            list_pcp.remove(value)
        mask = torch.isin(self.data[0], torch.tensor(random_sample))
        filtered_data = self.data[:,mask]
        return filtered_data, random_sample, list_pcp
    
    def data_matrix(self):
        data, _ = torch.load(r"dataset/ogbn_mag/processed/geometric_data_processed.pt", weights_only=False)
        edge_entities = {
            'paper': 0,
            'author': 1,
            'institution': 2,
            'field_of_study': 3,
            'venue': 4,
        }
        # Get batch and initialize tensors
        tensor, random_sample, unique_list = self.get_batch()

        # Create result tensor from input batch
        result_tensor = torch.stack([torch.tensor([1, tensor[0, i], tensor[1, i],edge_entities[self.edge_type[0]],edge_entities[self.edge_type[2]]]) for i in range(tensor.shape[1])])

        # Initialize lists for non_edges and venues
        non_edges, venues = [], []

        # Add venue links for sampled nodes
        for i in random_sample:
            venues.append(torch.tensor([1, i.item(), data['y_dict']['paper'][i], edge_entities[self.edge_type[0]],edge_entities['venue']]))

            # Find non-existing edges
            for j in tensor[1].unique():
                if i != j and not torch.any((result_tensor[:, 1] == i) & (result_tensor[:, 2] == j)): 
                    non_edges.append(torch.tensor([0, i.item(), j.item(),edge_entities[self.edge_type[0]],edge_entities[self.edge_type[2]]]))

        for r, j in itertools.combinations(random_sample, 2):  # itertools generates all unique pairs
            if data['y_dict']['paper'][r] != data['y_dict']['paper'][j]:
                venues.append(torch.tensor([0, r, data['y_dict']['paper'][j],edge_entities['paper'],edge_entities['venue']]))
                venues.append(torch.tensor([0, j, data['y_dict']['paper'][r],edge_entities['paper'],edge_entities['venue']]))

        # Convert lists to tensors only once to optimize memory usage
        non_edges_tensor = torch.stack(non_edges) if non_edges else torch.empty((0, 5), dtype=torch.long)
        venues_tensor = torch.stack(venues) if venues else torch.empty((0, 5), dtype=torch.long)

        # Merge all tensors
        data_matrix = torch.cat((result_tensor, non_edges_tensor, venues_tensor), dim=0)
        return data_matrix, unique_list
    
    def node_mapping(self):

        datamatrix_tensor,ul = self.data_matrix()

        lm1 = torch.unique(torch.stack((datamatrix_tensor[:, 1], datamatrix_tensor[:, 3]), dim=1), dim=0)
        lm2 = torch.unique(torch.stack((datamatrix_tensor[:, 2], datamatrix_tensor[:, 4]), dim=1), dim=0)

        unique_global_node_ids = torch.unique(torch.cat([lm1, lm2], dim=0), dim=0)

        # Step 2: Create a mapping from global node IDs to local node indices
        node_mapping = {(global_id.item(), type_id.item()): idx 
                            for idx, (global_id, type_id) in enumerate(unique_global_node_ids)}

        # Step 3: Remap the indices in the datamatrix_tensor using the node_mapping
        # We are remapping columns 1 and 2 in the datamatrix (i.e., the source and destination node indices)
        remapped_datamatrix_tensor = datamatrix_tensor.clone()  # Clone the tensor to avoid modifying the original
        # Extract the global_id and type_id for remapping
        remapped_datamatrix_tensor[:, 1] = torch.tensor([
            node_mapping[(global_id.item(), type_id.item())]  
            for global_id, type_id in zip(datamatrix_tensor[:, 1], datamatrix_tensor[:, 3])  # Use both columns
        ])

        remapped_datamatrix_tensor[:, 2] = torch.tensor([
            node_mapping[(global_id.item(), type_id.item())]  
            for global_id, type_id in zip(datamatrix_tensor[:, 2], datamatrix_tensor[:, 4])  # Use both columns
        ])

        return datamatrix_tensor, ul, remapped_datamatrix_tensor


# mini_b = mini_batches_code(paper_c_paper_train, list(paper_c_paper.unique().numpy()), 10,('paper', 'cites', 'paper'))
# dm,l1 = mini_b.data_matrix()
# mini_b1 = mini_batches_code(paper_c_paper_train, l1, 10,('paper', 'cites', 'paper'))

In [3]:
import torch

class LossFunction:
    def __init__(self, alpha=1.0, eps=1e-8, use_regularization=False, lam=0.01):
        """
        Initialize the loss function with given parameters.
        
        Args:
            alpha (float): Scaling parameter for edge probability.
            eps (float): Small value to prevent log(0).
            use_regularization (bool): Whether to include Gaussian regularization.
        """
        self.alpha = alpha
        self.eps = eps
        self.use_regularization = use_regularization
        self.lam = lam

    # def edge_probability(self, z_i, z_j):
    #     """Compute the probability of an edge existing between two embeddings."""
    #     dist = torch.norm(z_i - z_j) ** 2  # Squared Euclidean distance
    #     return 1 / (1 + torch.exp(-self.alpha + dist))  # Logistic function

    # def link_loss(self, label, z_u, z_v):
    #     """Compute the loss for a single edge."""
    #     prob = self.edge_probability(z_u, z_v)
    #     prob = torch.clamp(prob, self.eps, 1 - self.eps)  # Numerical stability

    #     return label.float() * torch.log(prob) + (1 - label.float()) * torch.log(1 - prob)        

    # def compute_loss(self, z, datamatrix_tensor):
    #     """Compute the total loss for the dataset."""
    #     sum_loss = sum(
    #         self.link_loss(label, z[u_idx], z[v_idx])
    #         for label, u_idx, v_idx in datamatrix_tensor
    #     )

    #     loss = -sum_loss / len(datamatrix_tensor)

    #     if self.use_regularization:
    #         regularization = -0.5 * torch.sum(z ** 2)
    #         loss += regularization

    #     return loss



    def edge_probability(self, z_i, z_j, type_i, type_j):
        """Compute the probability of an edge existing between two nodes, considering embeddings and types."""
        type_i = (type_i.view(1, -1).float())
        type_j = (type_j.view(1, -1).float())

        # Combine the node embeddings and types
        z_i = z_i.view(1, -1).float()  # Ensure z_i is a float tensor
        z_j = z_j.view(1, -1).float()  # Ensure z_j is a float tensor
        
        combined_i = torch.cat((z_i, type_i), dim=-1)  # Concatenate embedding and type for node i
        combined_j = torch.cat((z_j, type_j), dim=-1)  # Concatenate embedding and type for node j
        
        dist = torch.norm(combined_i - combined_j) ** 2  # Squared Euclidean distance
        return 1 / (1 + torch.exp(-self.alpha + dist))  # Logistic function

    def link_loss(self, label, z_u, z_v, type_u, type_v):
        """Compute the loss for a single edge, considering node types."""
        prob = self.edge_probability(z_u, z_v, type_u, type_v)
        prob = torch.clamp(prob, self.eps, 1 - self.eps)  # Numerical stability

        return label.float() * torch.log(prob) + (1 - label.float()) * torch.log(1 - prob)

    def compute_loss(self, z, types, datamatrix_tensor):
        """Compute the total loss for the dataset, considering node types."""
        sum_loss = sum(
            self.link_loss(label, z[u_idx], z[v_idx], types[u_idx][0], types[v_idx][1])
            for label, u_idx, v_idx in datamatrix_tensor)
        

        loss = -sum_loss / len(datamatrix_tensor)

        if self.use_regularization:
            regularization = self.lam * torch.sum(z ** 2)
            loss += regularization

        return loss
    

# loss_fn = LossFunction(alpha=1.0, use_regularization=True)
# loss_value = loss_fn.compute_loss(z, datamatrix_tensor)

# Training

In [4]:
mini_b = mini_batches_code(paper_c_paper_train, list(paper_c_paper.unique().numpy()), 10,('paper', 'cites', 'paper'))
dm,l1,remapped_datamatrix_tensor = mini_b.node_mapping()

[423601, 399248, 209794, 628557, 187487, 241447, 260499, 139716, 90784, 263350]


In [5]:
len(remapped_datamatrix_tensor), len(dm)

(588, 588)

In [6]:
import matplotlib.pyplot as plt
import numpy as np
# dm1 = dm[torch.all(dm[:, 1:] != 90784, dim=1)]

datamatrix_tensor = dm
num_nodes = len(np.unique(dm[:, 1])) + len(np.unique(dm[:, 2]))
# 2️ Define Embeddings
embedding_dim = 2
node_embeddings = torch.nn.Embedding(num_nodes, embedding_dim)
optimizer = torch.optim.Adam(node_embeddings.parameters(), lr=0.01)

loss_function = LossFunction(alpha=1.0, eps=1e-10, use_regularization=True)

# 3️ Train Embeddings
alpha = 3
num_epochs = 100
for epoch in range(num_epochs):
    optimizer.zero_grad()
    z = node_embeddings.weight  # Get embeddings
    types = dm[:,3:]
    loss = loss_function.compute_loss(z, types, remapped_datamatrix_tensor[:,:3])  # Compute loss
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}: Loss = {loss.item():.4f}")

Epoch 0: Loss = 2.5113
Epoch 10: Loss = 2.2348
Epoch 20: Loss = 1.9979
Epoch 30: Loss = 1.7974
Epoch 40: Loss = 1.6259
Epoch 50: Loss = 1.4764
Epoch 60: Loss = 1.3451
Epoch 70: Loss = 1.2307
Epoch 80: Loss = 1.1324
Epoch 90: Loss = 1.0487


# Predicting a sample - needs changes

In [None]:
node_embeddings.weight.requires_grad = False
emb_matrix = node_embeddings.weight.detach().clone()

sample = 1
# mini_b_new = mini_batches_code(paper_c_paper_train, l1, sample,('paper', 'cites', 'paper'))
# dm_new,l2 = mini_b_new.data_matrix()
# new_datamatrix = dm_new[torch.all(dm_new[:, 4:] != 4, dim=1)] #fjerner venues

dm_new = dm[(dm[:, 0] == 1) & torch.any(dm == 90784, dim=1)]
new_datamatrix = dm_new[torch.all(dm_new[:, 4:] != 4, dim=1)]


new_embedding = torch.nn.Embedding(sample, embedding_dim)

optimizer = torch.optim.Adam(new_embedding.parameters(), lr=0.01)

# Step 1: Extract unique global node IDs from the datamatrix
unique_global_node_ids_new = torch.unique(new_datamatrix[:, 1],new_datamatrix[:, 3])

# Step 2: Create a mapping from global node IDs to local node indices
node_mapping_new = {global_id.item(): idx for idx, global_id in enumerate(unique_global_node_ids_new)}

# Step 3: Remap the indices in the datamatrix_tensor using the node_mapping_new
# We are remapping columns 1 and 2 in the datamatrix (i.e., the source and destination node indices)
remapped_datamatrix_tensor_new = new_datamatrix.clone()  # Clone the tensor to avoid modifying the original
remapped_datamatrix_tensor_new[:, 1] = torch.tensor([node_mapping_new[global_id.item()] for global_id in new_datamatrix[:, 1]])
remapped_datamatrix_tensor_new[:, 2] = torch.tensor([node_mapping_new[global_id.item()] for global_id in new_datamatrix[:, 2]])

print(new_datamatrix)
print(remapped_datamatrix_tensor_new)

alpha = 3
num_epochs = 100
for epoch in range(num_epochs):
    optimizer.zero_grad()

    types = dm[:,3:]
    temp_embeddings = torch.cat([emb_matrix, new_embedding.weight], dim=0)
    loss = loss_function.compute_loss(temp_embeddings, types, remapped_datamatrix_tensor_new[:,:3])  # Compute loss
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}: Loss = {loss.item():.4f}")


tensor([[     1,  90784,  44029,      0,      0],
        [     1,  90784, 112686,      0,      0],
        [     1,  90784, 227830,      0,      0],
        [     1,  90784, 267759,      0,      0],
        [     1,  90784, 286919,      0,      0],
        [     1,  90784, 391810,      0,      0]])
tensor([[1, 1, 0, 0, 0],
        [1, 1, 2, 0, 0],
        [1, 1, 3, 0, 0],
        [1, 1, 4, 0, 0],
        [1, 1, 5, 0, 0],
        [1, 1, 6, 0, 0]])
Epoch 0: Loss = 1.9824
Epoch 10: Loss = 1.9780
Epoch 20: Loss = 1.9740
Epoch 30: Loss = 1.9704
Epoch 40: Loss = 1.9674
Epoch 50: Loss = 1.9648
Epoch 60: Loss = 1.9626
Epoch 70: Loss = 1.9607
Epoch 80: Loss = 1.9591
Epoch 90: Loss = 1.9578


In [60]:
test1 = dm1[dm1[:, 4] == 4]
for i in list(torch.unique(test1[:,2]).numpy()):
    global_node_id = i  # Example global node ID
    local_index = node_mapping.get(global_node_id, None)
    if local_index is not None:
        embedding = emb_matrix[local_index]
        print(embedding)
    else:
        print("Node ID not found in mapping.")

tensor([ 0.2999, -1.7468])
tensor([0.0827, 0.2788])
tensor([-0.1822,  0.9507])
tensor([ 0.2537, -0.2698])
tensor([0.2911, 0.1724])
tensor([-1.4072,  1.7272])
tensor([ 0.2714, -0.4648])
tensor([ 0.2601, -0.0051])
tensor([-0.1245, -1.1417])


In [61]:
distances = []
for i in list(torch.unique(test1[:,2]).numpy()):
    global_node_id = i  # Example global node ID
    local_index = node_mapping.get(global_node_id, None)
    dist = torch.norm(new_embedding.weight - emb_matrix[local_index])  # Euclidean distance
    distances.append((dist.item(), i))

# 8️ Assign B_new to closest C node
min(distances, key=lambda x: x[0])[1], distances

(207,
 [(1.0306346416473389, 49),
  (1.0256558656692505, 51),
  (1.713150143623352, 108),
  (0.5174657702445984, 109),
  (0.9493674635887146, 128),
  (2.871931314468384, 134),
  (0.3568853437900543, 207),
  (0.7699180245399475, 214),
  (0.4329391419887543, 219)])

In [62]:
import torch
import torch.nn.functional as F

alpha = 0.001
logi_f = []

for i in list(torch.unique(test1[:, 2]).numpy()):
    global_node_id = i  # Example global node ID
    local_index = node_mapping.get(global_node_id, None)
    
    if local_index is not None:  # Ensure the index exists
        dist = torch.norm(new_embedding.weight - emb_matrix[local_index])**2  # Euclidean distance
        logi = 1 / (1 + torch.exp(alpha + dist))  # Logistic function
        logi_f.append((logi.item(), i))  # Store tuple (probability, node ID)

# Separate values for softmax computation
logits, node_ids = zip(*logi_f)  # Unzips into two lists

# Convert logits to a tensor and apply softmax
logi_f_tensor = torch.tensor(logits)
softma = F.softmax(logi_f_tensor, dim=0)

# Get the index of the highest probability
high_prob_idx = torch.argmax(softma).item()

# Get the corresponding node ID and its softmax probability
predicted_node_id = node_ids[high_prob_idx]
highest_prob_value = softma[high_prob_idx].item()

# Print the results
print(f"Predicted Node ID: {predicted_node_id}")
print(f"Highest Softmax Probability: {highest_prob_value}")


Predicted Node ID: 207
Highest Softmax Probability: 0.13181227445602417


# Multi-batches

In [7]:
mini_b_multi1 = mini_batches_code(paper_c_paper_train, list(paper_c_paper.unique().numpy()), 10,('paper', 'cites', 'paper'))
dm_multi1,l2,remapped_datamatrix_tensor_multi1 = mini_b_multi1.node_mapping()

[423601, 399248, 209794, 628557, 187487, 241447, 260499, 139716, 90784, 263350]


In [8]:
mini_b_multi2 = mini_batches_code(paper_c_paper_train, l2, 10,('paper', 'cites', 'paper'))
dm_multi2,l3,remapped_datamatrix_tensor_multi2 = mini_b_multi2.node_mapping()

[423610, 399256, 209798, 628567, 187490, 241452, 260505, 139718, 90785, 263357]


In [9]:
import matplotlib.pyplot as plt
import numpy as np
# dm1 = dm_multi1[torch.all(dm_multi1[:, 1:] != 90784, dim=1)]

datamatrix_tensor_multi1 = dm_multi1
num_nodes = len(np.unique(dm_multi1[:, 1])) + len(np.unique(dm_multi1[:, 2]))
# 2️ Define Embeddings
embedding_dim = 2
node_embeddings_multi1 = torch.nn.Embedding(num_nodes, embedding_dim)
optimizer = torch.optim.Adam(node_embeddings_multi1.parameters(), lr=0.01)

loss_function = LossFunction(alpha=1.0, eps=1e-10, use_regularization=True)

# 3️ Train Embeddings
alpha = 3
num_epochs = 50
for epoch in range(num_epochs):
    optimizer.zero_grad()
    z = node_embeddings_multi1.weight  # Get embeddings
    types = dm_multi1[:,3:]
    loss = loss_function.compute_loss(z, types, remapped_datamatrix_tensor_multi1[:,:3])  # Compute loss
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}: Loss = {loss.item():.4f}")

Epoch 0: Loss = 2.5113
Epoch 10: Loss = 2.2348
Epoch 20: Loss = 1.9979
Epoch 30: Loss = 1.7974
Epoch 40: Loss = 1.6259


In [10]:
import matplotlib.pyplot as plt
import numpy as np
# dm1 = dm_multi2[torch.all(dm_multi2[:, 1:] != 90784, dim=1)]

datamatrix_tensor_multi2 = dm_multi2
num_nodes = len(np.unique(dm_multi2[:, 1])) + len(np.unique(dm_multi2[:, 2]))
# 2️ Define Embeddings
embedding_dim = 2
node_embeddings_multi2 = torch.nn.Embedding(num_nodes, embedding_dim)
optimizer = torch.optim.Adam(node_embeddings_multi2.parameters(), lr=0.01)

loss_function = LossFunction(alpha=1.0, eps=1e-10, use_regularization=True)

# 3️ Train Embeddings
alpha = 3
num_epochs = 50
for epoch in range(num_epochs):
    optimizer.zero_grad()
    z = node_embeddings_multi2.weight  # Get embeddings
    types = dm_multi2[:,3:]
    loss = loss_function.compute_loss(z, types, remapped_datamatrix_tensor_multi2[:,:3])  # Compute loss
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}: Loss = {loss.item():.4f}")

Epoch 0: Loss = 2.8094
Epoch 10: Loss = 2.4676
Epoch 20: Loss = 2.1712
Epoch 30: Loss = 1.9151
Epoch 40: Loss = 1.6951
