# Read data

In [1]:
import pandas as pd
import torch
import os
# Set working directory

try:
    data_train = pd.read_csv('dataset/ogbn_mag/split/time/paper/train.csv.gz', compression='gzip',header = None)
    data_valid = pd.read_csv('dataset/ogbn_mag/split/time/paper/valid.csv.gz', compression='gzip',header = None)
    data_test = pd.read_csv('dataset/ogbn_mag/split/time/paper/test.csv.gz', compression='gzip',header = None)
except FileNotFoundError:
    os.chdir("..")
    data_train = pd.read_csv('dataset/ogbn_mag/split/time/paper/train.csv.gz', compression='gzip',header = None)
    data_valid = pd.read_csv('dataset/ogbn_mag/split/time/paper/valid.csv.gz', compression='gzip',header = None)
    data_test = pd.read_csv('dataset/ogbn_mag/split/time/paper/test.csv.gz', compression='gzip',header = None)

data, _ = torch.load(r"dataset/ogbn_mag/processed/geometric_data_processed.pt", weights_only=False)

# Extract edges for "paper" -> "cites" -> "paper"
paper_c_paper = data.edge_index_dict[('paper', 'cites', 'paper')]

# Unique paper IDs to keep (Ensure it's a PyTorch tensor)
nums_valid = torch.tensor(data_valid[0])
nums_test = torch.tensor(data_test[0])
nums_train = torch.tensor(data_train[0])

mask_train = torch.isin(paper_c_paper[0], nums_train) | torch.isin(paper_c_paper[1], nums_train)
mask_valid = torch.isin(paper_c_paper[0], nums_valid) | torch.isin(paper_c_paper[1], nums_valid)
mask_test = torch.isin(paper_c_paper[0], nums_test) | torch.isin(paper_c_paper[1], nums_test)

paper_c_paper_train = paper_c_paper.clone()
paper_c_paper_valid = paper_c_paper.clone()
paper_c_paper_test = paper_c_paper.clone()

# Combine the conditions into a single mask that selects only the train edges
mask_train_done = mask_train & ~mask_valid & ~mask_test
mask_valid_done = mask_valid & ~mask_test

# Apply the combined mask to paper_c_paper_train
paper_c_paper_train = paper_c_paper_train[:, mask_train_done]
paper_c_paper_valid = paper_c_paper_valid[:, mask_valid_done]
paper_c_paper_test = paper_c_paper_test[:, mask_test]

#Venues
venues_values = torch.unique(data['y_dict']['paper'])

len(paper_c_paper_train[1]) + len(paper_c_paper_valid[1]) + len(paper_c_paper_test[1]), paper_c_paper.shape[1]



(5416271, 5416271)

In [2]:

# if not os.path.exists("dataset/ogbn_mag/processed/venue_embeddings.pt"):
#     venue_embeddings = {}
#     embdding_dim = 2

#     embed = torch.nn.Embedding(len(venues_values), embdding_dim)

#     venue_id_to_idx = {venue_id.item(): idx for idx, venue_id in enumerate(venues_values)}

#     indices = torch.tensor([venue_id_to_idx[venue_id.item()] for venue_id in venues_values], dtype=torch.long)

#     embeddings = embed(indices)

#     venue_embeddings = {venue_id.item(): embeddings[venue_id_to_idx[venue_id.item()]] for venue_id in venues_values}

#     # Save the embeddings to a file
#     torch.save(venue_embeddings, "dataset/ogbn_mag/processed/venue_embeddings.pt")

In [3]:
# import torch

# if not os.path.exists("dataset/ogbn_mag/processed/paper_embeddings.pt"):
#     paper_embeddings = {}
#     embedding_dim = 2

#     # Get unique paper IDs
#     unique_paper_ids = torch.unique(paper_c_paper_train)

#     # Define the embedding layer (one embedding per unique paper)
#     embed = torch.nn.Embedding(len(unique_paper_ids), embedding_dim)

#     # Create a mapping: paper ID → index in embedding layer
#     paper_id_to_idx = {pid.item(): idx for idx, pid in enumerate(unique_paper_ids)}

#     # Convert paper_c_paper_train to indices using vectorized operations
#     indices = torch.tensor([paper_id_to_idx[pid.item()] for pid in paper_c_paper_train.flatten()])

#     # Compute embeddings
#     embeddings = embed(indices)

#     # Convert to dictionary with original paper IDs
#     paper_embeddings = {pid.item(): emb for pid, emb in zip(paper_c_paper_train.flatten(), embeddings)}

#     torch.save(venue_embeddings, "dataset/ogbn_mag/processed/paper_embeddings.pt")

# Classes

In [4]:
import random
import itertools
import torch

class mini_batches_code:
    def __init__(self,data, unique_list, sample_size,edge_type):
        self.data = data
        self.sample_size = sample_size
        self.edge_type = edge_type
        self.unique_list = unique_list

    def get_batch(self):
        random.seed(99) 
        torch.manual_seed(99)
        list_pcp = self.unique_list
        random_sample = random.sample(list_pcp, self.sample_size)
        print(random_sample)
        for value in random_sample:
            list_pcp.remove(value)
        mask = torch.isin(self.data[0], torch.tensor(random_sample))
        filtered_data = self.data[:,mask]
        return filtered_data, random_sample, list_pcp
    
    def data_matrix(self):
        data, _ = torch.load(r"dataset/ogbn_mag/processed/geometric_data_processed.pt", weights_only=False)
        edge_entities = {
            'paper': 0,
            'author': 1,
            'institution': 2,
            'field_of_study': 3,
            'venue': 4,
        }
        # Get batch and initialize tensors
        tensor, random_sample, unique_list = self.get_batch()

        # Create result tensor from input batch
        result_tensor = torch.stack([torch.tensor([1, tensor[0, i], tensor[1, i],edge_entities[self.edge_type[0]],edge_entities[self.edge_type[2]]]) for i in range(tensor.shape[1])])

        # Initialize lists for non_edges and venues
        non_edges, venues = [], []

        # Add venue links for sampled nodes
        for i in random_sample:
            venues.append(torch.tensor([1, i.item(), data['y_dict']['paper'][i], edge_entities[self.edge_type[0]],edge_entities['venue']]))

            # Find non-existing edges
            for j in tensor[1].unique():
                if i != j and not torch.any((result_tensor[:, 1] == i) & (result_tensor[:, 2] == j)): 
                    non_edges.append(torch.tensor([0, i.item(), j.item(),edge_entities[self.edge_type[0]],edge_entities[self.edge_type[2]]]))

        for r, j in itertools.combinations(random_sample, 2):  # itertools generates all unique pairs
            if data['y_dict']['paper'][r] != data['y_dict']['paper'][j]:
                venues.append(torch.tensor([0, r, data['y_dict']['paper'][j],edge_entities['paper'],edge_entities['venue']]))
                venues.append(torch.tensor([0, j, data['y_dict']['paper'][r],edge_entities['paper'],edge_entities['venue']]))

        # Convert lists to tensors only once to optimize memory usage
        non_edges_tensor = torch.stack(non_edges) if non_edges else torch.empty((0, 5), dtype=torch.long)
        venues_tensor = torch.stack(venues) if venues else torch.empty((0, 5), dtype=torch.long)

        # Merge all tensors
        data_matrix = torch.cat((result_tensor, non_edges_tensor, venues_tensor), dim=0)
        return data_matrix, unique_list
    
    def node_mapping(self):

        datamatrix_tensor,ul = self.data_matrix()

        lm1 = torch.unique(torch.stack((datamatrix_tensor[:, 1], datamatrix_tensor[:, 3]), dim=1), dim=0)
        lm2 = torch.unique(torch.stack((datamatrix_tensor[:, 2], datamatrix_tensor[:, 4]), dim=1), dim=0)

        unique_global_node_ids = torch.unique(torch.cat([lm1, lm2], dim=0), dim=0)

        # Step 2: Create a mapping from global node IDs to local node indices
        node_mapping = {(global_id.item(), type_id.item()): idx 
                            for idx, (global_id, type_id) in enumerate(unique_global_node_ids)}

        # Step 3: Remap the indices in the datamatrix_tensor using the node_mapping
        # We are remapping columns 1 and 2 in the datamatrix (i.e., the source and destination node indices)
        remapped_datamatrix_tensor = datamatrix_tensor.clone()  # Clone the tensor to avoid modifying the original
        # Extract the global_id and type_id for remapping
        remapped_datamatrix_tensor[:, 1] = torch.tensor([
            node_mapping[(global_id.item(), type_id.item())]  
            for global_id, type_id in zip(datamatrix_tensor[:, 1], datamatrix_tensor[:, 3])  # Use both columns
        ])

        remapped_datamatrix_tensor[:, 2] = torch.tensor([
            node_mapping[(global_id.item(), type_id.item())]  
            for global_id, type_id in zip(datamatrix_tensor[:, 2], datamatrix_tensor[:, 4])  # Use both columns
        ])

        return datamatrix_tensor, ul, remapped_datamatrix_tensor


# mini_b = mini_batches_code(paper_c_paper_train, list(paper_c_paper.unique().numpy()), 10,('paper', 'cites', 'paper'))
# dm,l1 = mini_b.data_matrix()
# mini_b1 = mini_batches_code(paper_c_paper_train, l1, 10,('paper', 'cites', 'paper'))

In [11]:
import torch

class LossFunction:
    def __init__(self, alpha=1.0, eps=1e-8, use_regularization=False, lam=0.01):
        """
        Initialize the loss function with given parameters.
        
        Args:
            alpha (float): Scaling parameter for edge probability.
            eps (float): Small value to prevent log(0).
            use_regularization (bool): Whether to include Gaussian regularization.
        """
        self.alpha = alpha
        self.eps = eps
        self.use_regularization = use_regularization
        self.lam = lam

    def edge_probability(self, z_i, z_j):
        """Compute the probability of an edge existing between two embeddings."""
        dist = torch.norm(z_i - z_j) ** 2  # Squared Euclidean distance
        return 1 / (1 + torch.exp(-self.alpha + dist))  # Logistic function

    def link_loss(self, label, z_u, z_v):
        """Compute the loss for a single edge."""
        prob = self.edge_probability(z_u, z_v)
        prob = torch.clamp(prob, self.eps, 1 - self.eps)  # Numerical stability

        return label.float() * torch.log(prob) + (1 - label.float()) * torch.log(1 - prob)        

    def compute_loss(self, z, datamatrix_tensor):
        """Compute the total loss for the dataset."""
        sum_loss = sum(
            self.link_loss(label, z[u_idx], z[v_idx])
            for label, u_idx, v_idx in datamatrix_tensor
        )

        loss = -sum_loss / len(datamatrix_tensor)

        if self.use_regularization:
            regularization = -self.lam * torch.sum(z ** 2)
            loss += regularization

        return loss



#     def edge_probability(self, z_i, z_j, type_i, type_j):
#         """Compute the probability of an edge existing between two nodes, considering embeddings and types."""
#         type_i = (type_i.view(1, -1).float())*0
#         type_j = (type_j.view(1, -1).float())*0

#         # Combine the node embeddings and types
#         z_i = z_i.view(1, -1).float()  # Ensure z_i is a float tensor
#         z_j = z_j.view(1, -1).float()  # Ensure z_j is a float tensor
        
#         combined_i = torch.cat((z_i, type_i), dim=-1)  # Concatenate embedding and type for node i
#         combined_j = torch.cat((z_j, type_j), dim=-1)  # Concatenate embedding and type for node j
        
#         dist = torch.norm(combined_i - combined_j) ** 2  # Squared Euclidean distance
#         return 1 / (1 + torch.exp(-self.alpha + dist))  # Logistic function

#     def link_loss(self, label, z_u, z_v, type_u, type_v):
#         """Compute the loss for a single edge, considering node types."""
#         prob = self.edge_probability(z_u, z_v, type_u, type_v)
#         prob = torch.clamp(prob, self.eps, 1 - self.eps)  # Numerical stability

#         return label.float() * torch.log(prob) + (1 - label.float()) * torch.log(1 - prob)

#     def compute_loss(self, z, types, datamatrix_tensor):
#         """Compute the total loss for the dataset, considering node types."""
#         sum_loss = sum(
#             self.link_loss(label, z[u_idx], z[v_idx], types[u_idx][0], types[v_idx][1])
#             for label, u_idx, v_idx in datamatrix_tensor)
        

#         loss = -sum_loss / len(datamatrix_tensor)

#         if self.use_regularization:
#             regularization = self.lam * torch.sum(z ** 2)
#             loss += regularization

#         return loss
    

# # loss_fn = LossFunction(alpha=1.0, use_regularization=True)
# # loss_value = loss_fn.compute_loss(z, datamatrix_tensor)

In [6]:
import torch
import matplotlib.pyplot as plt
import numpy as np
# from LossFunction import LossFunction

class NodeEmbeddingTrainer:
    def __init__(self, dm, remapped_datamatrix_tensor, paper_dict, venue_dict, embedding_dim=2, num_epochs=50, lr=0.01, alpha=3):
        # Initialize input data, parameters, and setup
        self.dm = dm
        self.remapped_datamatrix_tensor = remapped_datamatrix_tensor
        self.paper_dict = paper_dict
        self.venue_dict = venue_dict
        self.embedding_dim = embedding_dim
        self.num_epochs = num_epochs
        self.alpha = alpha
        self.lr = lr

        # Process data
        self.dm1 = dm[dm[:, 4] != 4]
        self.dm2 = dm[dm[:, 4] == 4]

        # Get node indices
        self.specific_papernode_indices = torch.cat([torch.unique(self.dm1[:, 1]), torch.unique(self.dm1[:, 2])], dim=0)
        self.specific_venuenode_indices = torch.unique(self.dm2[:, 2], dim=0)

        # Create embeddings
        self.papernode_embeddings = torch.nn.Embedding(len(self.specific_papernode_indices), self.embedding_dim)
        self.venuenode_embeddings = torch.nn.Embedding(len(self.specific_venuenode_indices), self.embedding_dim)

        # Optimizers
        self.paper_optimizer = torch.optim.Adam(self.papernode_embeddings.parameters(), lr=self.lr)
        self.venue_optimizer = torch.optim.Adam(self.venuenode_embeddings.parameters(), lr=self.lr)

        # Loss function (assumed to be defined elsewhere)
        self.loss_function = LossFunction(alpha=1.0, eps=1e-10, use_regularization=True)

    def train(self):
        venue_dict = self.venue_dict
        paper_dict = self.paper_dict
        # Training loop
        for epoch in range(self.num_epochs):
            self.paper_optimizer.zero_grad()
            self.venue_optimizer.zero_grad()

            # Concatenate the embeddings
            z = torch.cat((self.papernode_embeddings.weight, self.venuenode_embeddings.weight), dim=0)
            types = self.dm[:, 3:]
            loss = self.loss_function.compute_loss(z, types, self.remapped_datamatrix_tensor[:, :3])  # Compute loss
            
            # Backpropagation and optimization
            loss.backward()
            self.paper_optimizer.step()
            self.venue_optimizer.step()

            # Print loss every 10 epochs
            if epoch % 10 == 0:
                print(f"Epoch {epoch}: Loss = {loss.item():.4f}")

        print(self.specific_venuenode_indices)

        for idx, node in enumerate(self.specific_papernode_indices):
            paper_dict[int(node)] = self.papernode_embeddings.weight[idx]

        for idx, node in enumerate(self.specific_venuenode_indices):
            venue_dict[int(node)] = self.venuenode_embeddings.weight[idx]
            
        return paper_dict, venue_dict


# # Example usage:

# # Assuming 'dm' is your data matrix
# trainer = NodeEmbeddingTrainer(dm, embedding_dim=2, num_epochs=10, lr=0.01, alpha=3)
# trainer.train()  # Train embeddings

# # Get the resulting dictionaries with embeddings
# paper_dict, venue_dict = trainer.get_embeddings()

# # Optionally: Print the dictionaries
# print("Paper Embeddings:", paper_dict)
# print("Venue Embeddings:", venue_dict)

# Training

In [7]:
# Load initial embeddings
embed_venue = torch.load("/mnt/c/Users/Bruger/Desktop/Bachelor/GraphML_Bachelorprojekt/dataset/ogbn_mag/processed/venue_embeddings.pt")
embed_paper = torch.load("/mnt/c/Users/Bruger/Desktop/Bachelor/GraphML_Bachelorprojekt/dataset/ogbn_mag/processed/paper_embeddings.pt")

  embed_venue = torch.load("/mnt/c/Users/Bruger/Desktop/Bachelor/GraphML_Bachelorprojekt/dataset/ogbn_mag/processed/venue_embeddings.pt")
  embed_paper = torch.load("/mnt/c/Users/Bruger/Desktop/Bachelor/GraphML_Bachelorprojekt/dataset/ogbn_mag/processed/paper_embeddings.pt")


In [8]:
import torch
import copy

# # Load initial embeddings
# embed_venue = torch.load("dataset/ogbn_mag/processed/venue_embeddings.pt")
# embed_paper = torch.load("dataset/ogbn_mag/processed/paper_embeddings.pt")

# Initialize dictionaries to store embeddings
paper_dict = copy.deepcopy(embed_paper)  # Ensure we don't modify the original embeddings
venue_dict = copy.deepcopy(embed_venue)
l_prev = list(paper_c_paper_train.unique().numpy())  # Initial list of nodes

# Number of iterations (adjust as needed)
num_iterations = 1 

for i in range(num_iterations):
    print(f"Iteration {i+1}")

    # Generate mini-batches
    mini_b = mini_batches_code(paper_c_paper_train, l_prev, 5, ('paper', 'cites', 'paper'))
    dm, l_next, remapped_datamatrix_tensor = mini_b.node_mapping()

    # Train embeddings and update dictionaries **in place**
    N_emb = NodeEmbeddingTrainer(
        dm=dm,
        remapped_datamatrix_tensor=remapped_datamatrix_tensor,
        paper_dict=paper_dict,  # Pass reference (no copy)
        venue_dict=venue_dict
    )
    paper_dict, venue_dict = N_emb.train()  # Directly update original dictionaries

    # Update node list for the next iteration
    l_prev = l_next


Iteration 1
[427176, 402674, 211669, 189176, 243585]
Epoch 0: Loss = 2.4848
Epoch 10: Loss = 2.1904
Epoch 20: Loss = 1.9335
Epoch 30: Loss = 1.7108
Epoch 40: Loss = 1.5196
tensor([  1, 100, 134, 250, 277])


# Predicting a sample - needs changes

In [9]:
for key in paper_dict:
    paper_dict[key] = paper_dict[key].detach().clone()
    paper_dict[key].requires_grad = False  # Ensure no gradients are tracked

for key in venue_dict:
    venue_dict[key] = venue_dict[key].detach().clone()
    venue_dict[key].requires_grad = False  # Ensure no gradients are tracked

emb_matrix = torch.stack(list(paper_dict.values()) + list(venue_dict.values()))

In [None]:
sample = 1
mini_b_new = mini_batches_code(paper_c_paper_train, list(paper_c_paper_train.unique().numpy()), sample,('paper', 'cites', 'paper'))
dm_new,l_new,remapped_datamatrix_tensor_new = mini_b_new.node_mapping()
new_datamatrix = dm_new[torch.all(dm_new[:, 4:] != 4, dim=1)]
new_remapped_datamatrix_tensor_new = remapped_datamatrix_tensor_new[torch.all(remapped_datamatrix_tensor_new[:, 4:] != 4, dim=1)]

loss_function = LossFunction(alpha=1.0, eps=1e-10, use_regularization=True)

new_embedding = torch.nn.Embedding(sample, 2)
print(new_embedding.weight)

new_optimizer = torch.optim.Adam(new_embedding.parameters(), lr=0.01)

venue_dict = venue_dict.copy()
paper_dict = paper_dict.copy()
num_epochs = 20

# Training loop
for epoch in range(num_epochs):
    new_optimizer.zero_grad()

    # Concatenate the embeddings
    temp_embed = torch.cat([emb_matrix, new_embedding.weight], dim=0)
    # types = new_datamatrix[:, 3:]
    print(len(temp_embed),len(new_remapped_datamatrix_tensor_new))
    loss = loss_function.compute_loss(temp_embed, new_remapped_datamatrix_tensor_new[:, :3])  # Compute loss
    
    # Backpropagation and optimization
    loss.backward()
    new_optimizer.step()

    # Print loss every 10 epochs
    if epoch % 10 == 0:
        print(f"Epoch {epoch}: Loss = {loss.item():.4f}")

[427176]
Parameter containing:
tensor([[ 0.6127, -1.1754]], requires_grad=True)
623918 1
Epoch 0: Loss = -12479.9619
623918 1
623918 1
623918 1
623918 1
623918 1
623918 1
623918 1
623918 1
623918 1
623918 1
Epoch 10: Loss = -12479.9648
623918 1
623918 1
623918 1
623918 1
623918 1
623918 1
623918 1
623918 1
623918 1


: 

tensor(0)

In [None]:
new_embedding.weight

Parameter containing:
tensor([[ 0.8159, -1.3772]], requires_grad=True)

In [None]:
import torch
import torch.nn.functional as F

alpha = 0.001
logi_f = []

for i in range(len(venue_dict)):
        dist = torch.norm(new_embedding.weight - venue_dict[i])**2  # Euclidean distance
        logi = 1 / (1 + torch.exp(alpha + dist))  # Logistic function
        logi_f.append((logi.item(), i))  # Store tuple (probability, node ID)

# Separate values for softmax computation
logits, node_ids = zip(*logi_f)  # Unzips into two lists

# Convert logits to a tensor and apply softmax
logi_f_tensor = torch.tensor(logits)
softma = F.softmax(logi_f_tensor, dim=0)

# Get the index of the highest probability
high_prob_idx = torch.argmax(softma).item()

# Get the corresponding node ID and its softmax probability
predicted_node_id = node_ids[high_prob_idx]
highest_prob_value = softma[high_prob_idx].item()

# Print the results
print(f"Predicted Node ID: {predicted_node_id}")
print(f"Highest Softmax Probability: {highest_prob_value}")


Predicted Node ID: 322
Highest Softmax Probability: 0.004016751889139414


In [None]:
softma[232]

tensor(0.0026)