In [1]:
import pickle as pkl
import os 
import sys
import numpy as np
from xopen import xopen
import json
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops

def simMatrix(A: torch.Tensor, B: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    """Return the pairwise cosine-similarity matrix between rows of A and B.

    Args:
        A: 2-D tensor of shape (N, d).
        B: 2-D tensor of shape (M, d).
        eps: Lower bound applied to each row norm before dividing, so that
            an all-zero row produces similarities of 0 instead of NaN.

    Returns:
        Tensor of shape (N, M) whose entry (i, j) is the cosine similarity
        between A[i] and B[j]; values lie in [-1, 1] up to rounding.
    """
    # Step 1: scale every row to unit length. The norm is clamped from below
    # because a zero-norm row would otherwise divide by zero and poison the
    # whole output row/column with NaN.
    A_norm = A / A.norm(dim=1, keepdim=True).clamp_min(eps)
    B_norm = B / B.norm(dim=1, keepdim=True).clamp_min(eps)

    # Step 2: dot products of unit vectors are cosine similarities.
    cosine_similarity_matrix = torch.mm(A_norm, B_norm.transpose(0, 1))

    return cosine_similarity_matrix

# Directory and base file name of the pickled graph object.
DATA_PATH = "/home/ubuntu/proj/data/graph/node_pubmed"
DATA_NAME = "text_graph_pubmed"  # another name seen in earlier runs: "text_graph_aids"

# NOTE(review): unpickling can execute arbitrary code -- only load files
# that come from a trusted source.
with open(os.path.join(DATA_PATH, DATA_NAME + ".pkl"), mode='rb') as f:
    graph = pkl.load(f)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Loop-invariant setup, hoisted so it runs once rather than once per
# relevance type.
# build 0-order textual-graph
text_nodes = graph.text_nodes
edge_index = graph.edge_index
k = 2  # highest order to load; files for orders 0..k are read below

# Nested mapping: all_levels_embedding[relevance_type][order] -> object
# loaded from "<DATA_PATH>/<relevance_type>/order-<order>-bert.pt".
all_levels_embedding = dict()
for relevance_type in ['pos','neg','random_1','random_2','random_3']:
    all_levels_embedding[relevance_type] = dict()

    for order in range(0, k+1):
        # map_location="cpu": the downstream analysis converts results with
        # .numpy(), which only works on CPU tensors, and this also lets the
        # notebook run on machines without the device the files were saved on.
        all_levels_embedding[relevance_type][order] = torch.load(
            os.path.join(DATA_PATH, relevance_type, f"order-{order}-bert.pt"),
            map_location="cpu",
        )

In [3]:
# For every unordered pair of relevance types and every order, compare the
# two embedding sets row by row and report the mean cosine similarity.
types_choice = ['pos','neg','random_1','random_2','random_3']
for i in range(len(types_choice)):
    for j in range(i+1,len(types_choice)):
        relevance_type_1 = types_choice[i]
        relevance_type_2 = types_choice[j]
        for k in range(3):
            # Only the similarity of row n with row n is needed, so compute it
            # row-wise instead of materialising the full N x N similarity
            # matrix and discarding everything but its diagonal.
            cosine_values = F.cosine_similarity(
                all_levels_embedding[relevance_type_1][k].squeeze(-2),
                all_levels_embedding[relevance_type_2][k].squeeze(-2),
                dim=1,
            ).numpy()
            average_value = np.mean(cosine_values)
            # Spread of the per-row similarities; kept for ad-hoc inspection,
            # not included in the printed line.
            std_value = np.std(cosine_values)
            # The f-string is already fully formatted; no .format() call needed.
            print(f"{k=}, {relevance_type_1=}, {relevance_type_2=}, {average_value:.4f}")


k=0, relevance_type_1='pos', relevance_type_2='neg', 1.0000
k=1, relevance_type_1='pos', relevance_type_2='neg', 0.9997
k=2, relevance_type_1='pos', relevance_type_2='neg', 0.9998
k=0, relevance_type_1='pos', relevance_type_2='random_1', 1.0000
k=1, relevance_type_1='pos', relevance_type_2='random_1', 0.9987
k=2, relevance_type_1='pos', relevance_type_2='random_1', 0.9970
k=0, relevance_type_1='pos', relevance_type_2='random_2', 1.0000
k=1, relevance_type_1='pos', relevance_type_2='random_2', 0.9987
k=2, relevance_type_1='pos', relevance_type_2='random_2', 0.9970
k=0, relevance_type_1='pos', relevance_type_2='random_3', 1.0000
k=1, relevance_type_1='pos', relevance_type_2='random_3', 0.9987
k=2, relevance_type_1='pos', relevance_type_2='random_3', 0.9970
k=0, relevance_type_1='neg', relevance_type_2='random_1', 1.0000
k=1, relevance_type_1='neg', relevance_type_2='random_1', 0.9987
k=2, relevance_type_1='neg', relevance_type_2='random_1', 0.9970
k=0, relevance_type_1='neg', relevance_t

In [4]:
# Mean of all pairwise cosine similarities among the 'neg' embeddings,
# printed once per order.
for k in range(3):
    neg_embedding = all_levels_embedding['neg'][k].squeeze(-2)
    print(simMatrix(neg_embedding, neg_embedding).mean())

tensor(0.9493)
tensor(0.9241)
tensor(0.9285)
