In [1]:
import pandas as pd
import torch
import os
# Set working directory

try:
    data_train = pd.read_csv('dataset/ogbn_mag/split/time/paper/train.csv.gz', compression='gzip',header = None)
    data_valid = pd.read_csv('dataset/ogbn_mag/split/time/paper/valid.csv.gz', compression='gzip',header = None)
    data_test = pd.read_csv('dataset/ogbn_mag/split/time/paper/test.csv.gz', compression='gzip',header = None)
except FileNotFoundError:
    os.chdir("..")
    os.chdir("..")
    data_train = pd.read_csv('dataset/ogbn_mag/split/time/paper/train.csv.gz', compression='gzip',header = None)
    data_valid = pd.read_csv('dataset/ogbn_mag/split/time/paper/valid.csv.gz', compression='gzip',header = None)
    data_test = pd.read_csv('dataset/ogbn_mag/split/time/paper/test.csv.gz', compression='gzip',header = None)

data, _ = torch.load(r"dataset/ogbn_mag/processed/geometric_data_processed.pt", weights_only=False)

# Extract edges for "paper" -> "cites" -> "paper"
paper_c_paper = data.edge_index_dict[('paper', 'cites', 'paper')]

# Unique paper IDs to keep (Ensure it's a PyTorch tensor)
nums_valid = torch.tensor(data_valid[0])
nums_test = torch.tensor(data_test[0])
nums_train = torch.tensor(data_train[0])

mask_train = torch.isin(paper_c_paper[0], nums_train) | torch.isin(paper_c_paper[1], nums_train)
mask_valid = torch.isin(paper_c_paper[0], nums_valid) | torch.isin(paper_c_paper[1], nums_valid)
mask_test = torch.isin(paper_c_paper[0], nums_test) | torch.isin(paper_c_paper[1], nums_test)

paper_c_paper_train = paper_c_paper.clone()
paper_c_paper_valid = paper_c_paper.clone()
paper_c_paper_test = paper_c_paper.clone()

# Combine the conditions into a single mask that selects only the train edges
mask_train_done = mask_train & ~mask_valid & ~mask_test
mask_valid_done = mask_valid & ~mask_test

# Apply the combined mask to paper_c_paper_train
paper_c_paper_train = paper_c_paper_train[:, mask_train_done]
paper_c_paper_valid = paper_c_paper_valid[:, mask_valid_done]
paper_c_paper_test = paper_c_paper_test[:, mask_test]

len(paper_c_paper_train[1]) + len(paper_c_paper_valid[1]) + len(paper_c_paper_test[1]), paper_c_paper.shape[1]


(5416271, 5416271)

In [2]:
tensor_y = data['y_dict']['paper']

# Get the indices of the tensor
indices = torch.arange(tensor_y.size(0)).view(-1, 1)  # Create a tensor of indices

# Concatenate the indices with the original tensor
tensor_y = torch.cat((indices, tensor_y), dim=1)
tensor_y


tensor([[     0,    246],
        [     1,    131],
        [     2,    189],
        ...,
        [736386,    266],
        [736387,    289],
        [736388,      1]])

In [3]:
import numpy as np
from scipy.sparse import coo_matrix
import torch

# Example citation data: Replace with your actual data
# For example:
# paper_ids = [0, 88, 27449, 121051, ...]  # Citing papers
# cited_ids = [88, 27449, 121051, ..., 421711, 427339, 439864]  # Cited papers

# Example tensor as you mentioned (2 rows, multiple columns)
# tensor[0] represents paper_ids (papers that cite others)
# tensor[1] represents cited paper_ids (papers being cited)
tensor = paper_c_paper_train 

# Extract the data
paper_ids = tensor[0].numpy()  # Citing papers
cited_ids = tensor[1].numpy()  # Cited papers

# Create a sparse matrix in COO format:
citation_values = np.ones(len(paper_ids))  # All citations will have value 1

# Create the sparse matrix with the correct shape
citation_matrix_coo = coo_matrix((citation_values, (paper_ids, cited_ids)), shape=(max(paper_ids) + 1, max(cited_ids) + 1))

# Now you have a sparse matrix with 3 columns (paper_id, cited_paper_id, 1 for citation)
# Optionally, you can convert it to CSR format for better performance:
citation_matrix_csr = citation_matrix_coo.tocsr()

# If you want to save this matrix to disk (e.g., in HDF5 format), you can do so:
# import h5py
# with h5py.File('citation_matrix.h5', 'w') as f:
#     f.create_dataset('rows', data=citation_matrix_coo.row)
#     f.create_dataset('cols', data=citation_matrix_coo.col)
#     f.create_dataset('data', data=citation_matrix_coo.data)


# use in pytorch
citation_matrix = torch.sparse_coo_tensor((citation_matrix_coo.row, citation_matrix_coo.col), citation_matrix_coo.data)
citation_matrix

  citation_matrix = torch.sparse_coo_tensor((citation_matrix_coo.row, citation_matrix_coo.col), citation_matrix_coo.data)


tensor(indices=tensor([[     0,      0,      0,  ..., 736388, 736388, 736388],
                       [    88,  27449, 121051,  ..., 421711, 427339, 439864]]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
       size=(736389, 736389), nnz=3879968, dtype=torch.float64,
       layout=torch.sparse_coo)

In [4]:
# Ensure the sparse tensor is coalesced
citation_matrix = citation_matrix.coalesce()

# Extract first 10 indices and values
indices = citation_matrix.indices()[:, :100]  
values = citation_matrix.values()[:100]       

new_indices = []
new_values = []

# Check for missing entries
for i in indices[0].unique():
    for j in indices[1].unique():
        if citation_matrix[i, j] == 1:
            continue
        else:
            new_indices.append(torch.tensor([[i], [j]]))  # Store new index
            new_values.append(torch.tensor([0]))  # Store new value

# If new indices exist, add them all at once
if new_indices:
    new_indices = torch.cat(new_indices, dim=1)
    new_values = torch.cat(new_values, dim=0)

    # Append to existing data
    updated_indices = torch.cat((indices, new_indices), dim=1)
    updated_values = torch.cat((values, new_values), dim=0)

    # Create new sparse tensor
    citation_matrix = torch.sparse_coo_tensor(updated_indices, updated_values)

import torch

# Ensure the sparse tensor is coalesced
citation_matrix = citation_matrix.coalesce()

# Extract indices and values
indices = citation_matrix.indices()  # Shape: (2, nnz) where nnz = number of nonzero elements
values = citation_matrix.values()    # Shape: (nnz,)

# Stack values, row indices, and column indices into a single array
tensor_array = torch.stack((values, indices[0], indices[1]), dim=1)  # Shape: (nnz, 3)
tensor_array = tensor_array.to(torch.long)
# Print the transformed tensor
print(len(tensor_array))
all_nodes = torch.cat((indices[0], indices[1]), dim=0).unique()
nodes_set_1 = indices[0].unique()
nodes_set_2 = indices[1].unique()
max_node_id = torch.max(indices[1])  # Assuming the second column of `indices` represents node IDs
num_nodes = max_node_id + 1

citation_matrix,tensor_array

2300


(tensor(indices=tensor([[     0,      0,      0,  ...,     32,     32,     32],
                        [    61,     88,    115,  ..., 584314, 610792, 721561]]),
        values=tensor([0., 1., 0.,  ..., 0., 0., 0.]),
        size=(33, 721562), nnz=2300, dtype=torch.float64, layout=torch.sparse_coo),
 tensor([[     0,      0,     61],
         [     1,      0,     88],
         [     0,      0,    115],
         ...,
         [     0,     32, 584314],
         [     0,     32, 610792],
         [     0,     32, 721561]]))

In [5]:
import torch

class LossFunction:
    def __init__(self, alpha=1.0, eps=1e-8, use_regularization=False):
        """
        Initialize the loss function with given parameters.
        
        Args:
            alpha (float): Scaling parameter for edge probability.
            eps (float): Small value to prevent log(0).
            use_regularization (bool): Whether to include Gaussian regularization.
        """
        self.alpha = alpha
        self.eps = eps
        self.use_regularization = use_regularization

    def edge_probability(self, z_i, z_j):
        """Compute the probability of an edge existing between two embeddings."""
        dist = torch.norm(z_i - z_j) ** 2  # Squared Euclidean distance
        return 1 / (1 + torch.exp(-self.alpha + dist))  # Logistic function

    def link_loss(self, label, z_u, z_v):
        """Compute the loss for a single edge."""
        prob = self.edge_probability(z_u, z_v)
        prob = torch.clamp(prob, self.eps, 1 - self.eps)  # Numerical stability

        return label.float() * torch.log(prob) + (1 - label.float()) * torch.log(1 - prob)        

    def compute_loss(self, z, datamatrix_tensor):
        """Compute the total loss for the dataset."""
        sum_loss = sum(
            self.link_loss(label, z[u_idx], z[v_idx])
            for label, u_idx, v_idx in datamatrix_tensor
        )

        loss = -sum_loss / len(datamatrix_tensor)

        if self.use_regularization:
            regularization = -0.5 * torch.sum(z ** 2)
            loss += regularization

        return loss
    

# loss_fn = LossFunction(alpha=1.0, use_regularization=True)
# loss_value = loss_fn.compute_loss(z, datamatrix_tensor)

In [6]:
import matplotlib.pyplot as plt
datamatrix_tensor = tensor_array

# 2️ Define Embeddings
embedding_dim = 2
node_embeddings = torch.nn.Embedding(num_nodes, embedding_dim)
optimizer = torch.optim.Adam(node_embeddings.parameters(), lr=0.01)

loss_function = LossFunction(alpha=1.0, eps=1e-10, use_regularization=False)

# 3️ Train Embeddings
alpha = 3
num_epochs = 5
for epoch in range(num_epochs):
    optimizer.zero_grad()
    z = node_embeddings.weight  # Get embeddings
    loss = loss_function.compute_loss(z, datamatrix_tensor)  # Compute loss
    loss.backward()
    optimizer.step()

    if epoch % 1 == 0:
        print(f"Epoch {epoch}: Loss = {loss.item():.4f}")

Epoch 0: Loss = 0.4476
Epoch 1: Loss = 0.4441
Epoch 2: Loss = 0.4407
Epoch 3: Loss = 0.4373
Epoch 4: Loss = 0.4339


# starter på dataloader her

In [46]:
import random
paper_c_paper_train[0],paper_c_paper_train[1]
list_pcp = list(paper_c_paper.unique().numpy())
random_sample = random.sample(list_pcp, 1)
print(random_sample)

for value in random_sample:
    list_pcp.remove(value)

mask = torch.isin(paper_c_paper_train[0], torch.tensor(random_sample))

# Use the mask to select the rows that match the random sample
filtered_data = paper_c_paper_train[:,mask]

filtered_data

[662957]


tensor([], size=(2, 0), dtype=torch.int64)

In [135]:
import random
random.seed(99) 
torch.manual_seed(99)
class mini_batches_code:
    def __init__(self,data,sample_size):
        self.data = data
        self.sample_size = sample_size

    def get_batch(self):
        random.seed(99) 
        torch.manual_seed(99)
        list_pcp = list(self.data[0].unique().numpy())
        random_sample = random.sample(list_pcp, self.sample_size)
        print(random_sample)
        for value in random_sample:
            list_pcp.remove(value)
        mask = torch.isin(self.data[0], torch.tensor(random_sample))
        filtered_data = self.data[:,mask]
        return filtered_data, random_sample
    
    def data_matrix(self):
        tensor, random_sample = self.get_batch()
        new_arrays = []

        for i in range(tensor.shape[1]):
            # For each column, create an array starting with 1 and followed by the values from the column
            new_array = torch.tensor([1, tensor[0, i], tensor[1, i]])
            new_arrays.append(new_array)

        # Stack the arrays to form the final tensor
        result_tensor = torch.stack(new_arrays)

        non_edges = []

        for i in random_sample:
            for j in tensor[1].unique():
                if i == j:
                    continue
                if not (torch.any((result_tensor[:, 1] == i) & (result_tensor[:, 2] == j))): # Check if the edge exists
                    non_edges.append(torch.tensor([0, i.item(), j.item()]))

        non_edges_tensor = torch.stack(non_edges)
        data_matrix = torch.cat((result_tensor, non_edges_tensor), dim=0)
        return data_matrix


In [138]:
mini_b = mini_batches_code(paper_c_paper_train[:,:10], 2)
mini_b.data_matrix()

[5, 2]


tensor([[     1,      2, 186851],
        [     1,      2, 376347],
        [     1,      2, 410167],
        [     1,      5,  13032],
        [     0,      5, 186851],
        [     0,      5, 376347],
        [     0,      5, 410167],
        [     0,      2,  13032]])

In [146]:
import torch
mini_b = mini_batches_code(paper_c_paper_train[:,:10], 3)

# Original tensor
tensor, random_sample = mini_b.get_batch()

# Initialize an empty list to collect the new arrays
new_arrays = []

# Iterate over each column of the tensor
for i in range(tensor.shape[1]):
    # For each column, create an array starting with 1 and followed by the values from the column
    new_array = torch.tensor([1, tensor[0, i], tensor[1, i]])
    new_arrays.append(new_array)

# Stack the arrays to form the final tensor
result_tensor = torch.stack(new_arrays)

non_edges = []
venues = []

for i in random_sample:
    venues.append(torch.tensor([1,i.item(),data['y_dict']['paper'][i]]))
    for j in tensor[1].unique():
        if i == j:
            continue
        if not (torch.any((result_tensor[:, 1] == i) & (result_tensor[:, 2] == j))): # Check if the edge exists
            non_edges.append(torch.tensor([0, i.item(), j.item()]))
            print(data['y_dict']['paper'][i])

for idx,r in enumerate(random_sample[:-1]):
    if data['y_dict']['paper'][r] != data['y_dict']['paper'][random_sample[idx+1]]:
        venues.append(torch.tensor([0,r.item(),data['y_dict']['paper'][random_sample[idx+1]]]))

non_edges_tensor = torch.stack(non_edges)
venues_tensor = torch.stack(venues)
data_matrix_1 = torch.cat((result_tensor, non_edges_tensor), dim=0)
data_matrix = torch.cat((data_matrix_1, venues_tensor), dim=0)
data_matrix

# tensor_y = data['y_dict']['paper'][i]

# # Get the indices of the tensor
# indices = torch.arange(tensor_y.size(0)).view(-1, 1)  # Create a tensor of indices

# # Concatenate the indices with the original tensor
# tensor_y = torch.cat((indices, tensor_y), dim=1)
# tensor_y


[5, 2, 0]
tensor([9])
tensor([9])
tensor([9])
tensor([9])
tensor([9])
tensor([9])
tensor([9])
tensor([9])
tensor([189])
tensor([189])
tensor([189])
tensor([189])
tensor([189])
tensor([189])
tensor([246])
tensor([246])
tensor([246])
tensor([246])


tensor([[     1,      0,     88],
        [     1,      0,  27449],
        [     1,      0, 121051],
        [     1,      0, 151667],
        [     1,      0, 308499],
        [     1,      2, 186851],
        [     1,      2, 376347],
        [     1,      2, 410167],
        [     1,      5,  13032],
        [     0,      5,     88],
        [     0,      5,  27449],
        [     0,      5, 121051],
        [     0,      5, 151667],
        [     0,      5, 186851],
        [     0,      5, 308499],
        [     0,      5, 376347],
        [     0,      5, 410167],
        [     0,      2,     88],
        [     0,      2,  13032],
        [     0,      2,  27449],
        [     0,      2, 121051],
        [     0,      2, 151667],
        [     0,      2, 308499],
        [     0,      0,  13032],
        [     0,      0, 186851],
        [     0,      0, 376347],
        [     0,      0, 410167],
        [     1,      5,      9],
        [     1,      2,    189],
        [     

In [None]:
random_sample

tensor([[     0,    246],
        [     1,    131],
        [     2,    189],
        ...,
        [736386,    266],
        [736387,    289],
        [736388,      1]])