In [101]:
import pandas as pd
import torch
import os
# Set working directory

try:
    data_train = pd.read_csv('dataset/ogbn_mag/split/time/paper/train.csv.gz', compression='gzip',header = None)
    data_valid = pd.read_csv('dataset/ogbn_mag/split/time/paper/valid.csv.gz', compression='gzip',header = None)
    data_test = pd.read_csv('dataset/ogbn_mag/split/time/paper/test.csv.gz', compression='gzip',header = None)
except FileNotFoundError:
    os.chdir("..")
    os.chdir("..")
    data_train = pd.read_csv('dataset/ogbn_mag/split/time/paper/train.csv.gz', compression='gzip',header = None)
    data_valid = pd.read_csv('dataset/ogbn_mag/split/time/paper/valid.csv.gz', compression='gzip',header = None)
    data_test = pd.read_csv('dataset/ogbn_mag/split/time/paper/test.csv.gz', compression='gzip',header = None)

data, _ = torch.load(r"dataset/ogbn_mag/processed/geometric_data_processed.pt", weights_only=False)

# Extract edges for "paper" -> "cites" -> "paper"
paper_c_paper = data.edge_index_dict[('paper', 'cites', 'paper')]

# Unique paper IDs to keep (Ensure it's a PyTorch tensor)
nums_valid = torch.tensor(data_valid[0])
nums_test = torch.tensor(data_test[0])
nums_train = torch.tensor(data_train[0])

mask_train = torch.isin(paper_c_paper[0], nums_train) | torch.isin(paper_c_paper[1], nums_train)
mask_valid = torch.isin(paper_c_paper[0], nums_valid) | torch.isin(paper_c_paper[1], nums_valid)
mask_test = torch.isin(paper_c_paper[0], nums_test) | torch.isin(paper_c_paper[1], nums_test)

paper_c_paper_train = paper_c_paper.clone()
paper_c_paper_valid = paper_c_paper.clone()
paper_c_paper_test = paper_c_paper.clone()

# Combine the conditions into a single mask that selects only the train edges
mask_train_done = mask_train & ~mask_valid & ~mask_test
mask_valid_done = mask_valid & ~mask_test

# Apply the combined mask to paper_c_paper_train
paper_c_paper_train = paper_c_paper_train[:, mask_train_done]
paper_c_paper_valid = paper_c_paper_valid[:, mask_valid_done]
paper_c_paper_test = paper_c_paper_test[:, mask_test]

len(paper_c_paper_train[1]) + len(paper_c_paper_valid[1]) + len(paper_c_paper_test[1]), paper_c_paper.shape[1]


(5416271, 5416271)

In [102]:
tensor_y = data['y_dict']['paper']

# Get the indices of the tensor
indices = torch.arange(tensor_y.size(0)).view(-1, 1)  # Create a tensor of indices

# Concatenate the indices with the original tensor
tensor_y = torch.cat((indices, tensor_y), dim=1)
tensor_y


tensor([[     0,    246],
        [     1,    131],
        [     2,    189],
        ...,
        [736386,    266],
        [736387,    289],
        [736388,      1]])

In [103]:
import random
import itertools
import torch
random.seed(99)
torch.manual_seed(99)

class mini_batches_code:
    def __init__(self,data, unique_list, sample_size,edge_type):
        self.data = data
        self.sample_size = sample_size
        self.edge_type = edge_type
        self.unique_list = unique_list

    def get_batch(self):
        # random.seed(99) 
        # torch.manual_seed(99)
        list_pcp = self.unique_list
        random_sample = random.sample(list_pcp, self.sample_size)
        print(random_sample)
        for value in random_sample:
            list_pcp.remove(value)
        mask = torch.isin(self.data[0], torch.tensor(random_sample))
        filtered_data = self.data[:,mask]
        return filtered_data, random_sample, list_pcp
    
    def data_matrix(self):
        data, _ = torch.load(r"dataset/ogbn_mag/processed/geometric_data_processed.pt", weights_only=False)
        edge_entities = {
            'paper': 0,
            'author': 1,
            'institution': 2,
            'field_of_study': 3,
            'venue': 4,
        }
        # Get batch and initialize tensors
        tensor, random_sample, unique_list = self.get_batch()

        # Create result tensor from input batch
        result_tensor = torch.stack([torch.tensor([1, tensor[0, i], tensor[1, i],edge_entities[self.edge_type[0]],edge_entities[self.edge_type[2]]]) for i in range(tensor.shape[1])])

        # Initialize lists for non_edges and venues
        non_edges, venues = [], []

        # Add venue links for sampled nodes
        for i in random_sample:
            venues.append(torch.tensor([1, i.item(), data['y_dict']['paper'][i], edge_entities[self.edge_type[0]],edge_entities['venue']]))

            # Find non-existing edges
            for j in tensor[1].unique():
                if i != j and not torch.any((result_tensor[:, 1] == i) & (result_tensor[:, 2] == j)): 
                    non_edges.append(torch.tensor([0, i.item(), j.item(),edge_entities[self.edge_type[0]],edge_entities[self.edge_type[2]]]))

        for r, j in itertools.combinations(random_sample, 2):  # itertools generates all unique pairs
            if data['y_dict']['paper'][r] != data['y_dict']['paper'][j]:
                venues.append(torch.tensor([0, r, data['y_dict']['paper'][j],1,0]))
                venues.append(torch.tensor([0, j, data['y_dict']['paper'][r],1,0]))

        # Convert lists to tensors only once to optimize memory usage
        non_edges_tensor = torch.stack(non_edges) if non_edges else torch.empty((0, 5), dtype=torch.long)
        venues_tensor = torch.stack(venues) if venues else torch.empty((0, 5), dtype=torch.long)

        # Merge all tensors
        data_matrix = torch.cat((result_tensor, non_edges_tensor, venues_tensor), dim=0)
        return data_matrix, unique_list


# mini_b = mini_batches_code(paper_c_paper_train, list(paper_c_paper.unique().numpy()), 10,('paper', 'cites', 'paper'))
# dm,l1 = mini_b.data_matrix()
# mini_b1 = mini_batches_code(paper_c_paper_train, l1, 10,('paper', 'cites', 'paper'))

In [104]:
import torch

class LossFunction:
    def __init__(self, alpha=1.0, eps=1e-8, use_regularization=False):
        """
        Initialize the loss function with given parameters.
        
        Args:
            alpha (float): Scaling parameter for edge probability.
            eps (float): Small value to prevent log(0).
            use_regularization (bool): Whether to include Gaussian regularization.
        """
        self.alpha = alpha
        self.eps = eps
        self.use_regularization = use_regularization

    # def edge_probability(self, z_i, z_j):
    #     """Compute the probability of an edge existing between two embeddings."""
    #     dist = torch.norm(z_i - z_j) ** 2  # Squared Euclidean distance
    #     return 1 / (1 + torch.exp(-self.alpha + dist))  # Logistic function

    # def link_loss(self, label, z_u, z_v):
    #     """Compute the loss for a single edge."""
    #     prob = self.edge_probability(z_u, z_v)
    #     prob = torch.clamp(prob, self.eps, 1 - self.eps)  # Numerical stability

    #     return label.float() * torch.log(prob) + (1 - label.float()) * torch.log(1 - prob)        

    # def compute_loss(self, z, datamatrix_tensor):
    #     """Compute the total loss for the dataset."""
    #     sum_loss = sum(
    #         self.link_loss(label, z[u_idx], z[v_idx])
    #         for label, u_idx, v_idx in datamatrix_tensor
    #     )

    #     loss = -sum_loss / len(datamatrix_tensor)

    #     if self.use_regularization:
    #         regularization = -0.5 * torch.sum(z ** 2)
    #         loss += regularization

    #     return loss



    def edge_probability(self, z_i, z_j, type_i, type_j):
        """Compute the probability of an edge existing between two nodes, considering embeddings and types."""
        # Convert types to one-hot encoded vectors
        type_i = type_i.view(1, -1).float()
        type_j = type_j.view(1, -1).float()
        # Ensure types are float tensors for concatenation

        # Combine the node embeddings and types (one-hot encoded)
        z_i = z_i.view(1, -1).float()  # Ensure z_i is a float tensor
        z_j = z_j.view(1, -1).float()  # Ensure z_j is a float tensor
        
        combined_i = torch.cat((z_i, type_i), dim=-1)  # Concatenate embedding and type for node i
        combined_j = torch.cat((z_j, type_j), dim=-1)  # Concatenate embedding and type for node j
        
        dist = torch.norm(combined_i - combined_j) ** 2  # Squared Euclidean distance
        return 1 / (1 + torch.exp(-self.alpha + dist))  # Logistic function

    def link_loss(self, label, z_u, z_v, type_u, type_v):
        """Compute the loss for a single edge, considering node types."""
        prob = self.edge_probability(z_u, z_v, type_u, type_v)
        prob = torch.clamp(prob, self.eps, 1 - self.eps)  # Numerical stability

        return label.float() * torch.log(prob) + (1 - label.float()) * torch.log(1 - prob)

    def compute_loss(self, z, types, datamatrix_tensor):
        """Compute the total loss for the dataset, considering node types."""
        sum_loss = sum(
            self.link_loss(label, z[idx][0], z[idx][1], types[idx][0], types[idx][1])
            for idx, (label, _, _) in enumerate(datamatrix_tensor)
        )

        loss = -sum_loss / len(datamatrix_tensor)

        if self.use_regularization:
            regularization = self.lam * torch.sum(z ** 2)
            loss += regularization

        return loss
    

# loss_fn = LossFunction(alpha=1.0, use_regularization=True)
# loss_value = loss_fn.compute_loss(z, datamatrix_tensor)

In [105]:
mini_b = mini_batches_code(paper_c_paper_train, list(paper_c_paper.unique().numpy()), 10,('paper', 'cites', 'paper'))
dm,l1 = mini_b.data_matrix()
dm[:,3:]

[np.int64(423601), np.int64(399248), np.int64(209794), np.int64(628557), np.int64(187487), np.int64(241447), np.int64(260499), np.int64(139716), np.int64(90784), np.int64(263350)]


tensor([[0, 0],
        [0, 0],
        [0, 0],
        ...,
        [1, 0],
        [1, 0],
        [1, 0]])

In [106]:
import matplotlib.pyplot as plt
datamatrix_tensor = dm
num_nodes = dm.shape[0]
# 2️ Define Embeddings
embedding_dim = 2
node_embeddings = torch.nn.Embedding(num_nodes, embedding_dim)
optimizer = torch.optim.Adam(node_embeddings.parameters(), lr=0.01)

loss_function = LossFunction(alpha=1.0, eps=1e-10, use_regularization=False)

# 3️ Train Embeddings
alpha = 3
num_epochs = 50
for epoch in range(num_epochs):
    optimizer.zero_grad()
    z = node_embeddings.weight  # Get embeddings
    types = dm[:,3:]
    loss = loss_function.compute_loss(z, types, datamatrix_tensor[:,:3])  # Compute loss
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}: Loss = {loss.item():.4f}")

Epoch 0: Loss = 1.0530
Epoch 10: Loss = 0.9286
Epoch 20: Loss = 0.8058
Epoch 30: Loss = 0.6884
Epoch 40: Loss = 0.5831


# starter på dataloader her