In [2]:
from tqdm import tqdm
import json
import networkx as nx

# Co-authorship graph: one node per author, one edge per pair of authors who
# share at least one paper.
#   node attrs: affiliated_institution (str), citation_count (sum over the author's papers)
#   edge attrs: title (list of titles of the papers the two authors share)
G = nx.Graph()
# Largest per-author citation_count seen so far; later cells divide by it to scale the feature.
max_citations = -1

# This json file has all the filtered data provided by OpenAlex API
with open("../data/openalex_cs_papers.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Collecting author and co-authorship features
for work in tqdm(data["results"]):
    # `or` also covers an explicit null, which `.get(key, default)` would pass through.
    cited_by = work.get("cited_by_count") or 0
    # A null/missing title becomes "" so the title-embedding step still receives a string.
    title = work.get("title") or ""

    # Distinct author ids of this work, in first-seen order.
    authors = []
    seen_in_work = set()

    # Adding/updating attributes for each author
    for author_data in work["authorships"]:
        author_id = author_data["author"]["id"]
        if author_id in seen_in_work:
            # The same author listed twice on one work must not be credited
            # twice nor produce a self-loop edge below.
            continue
        seen_in_work.add(author_id)
        authors.append(author_id)

        institutions = author_data.get("institutions")
        affiliation = institutions[0]["display_name"] if institutions else "Unknown"

        # Custom attributes for author nodes
        if author_id not in G:
            G.add_node(
                author_id,
                affiliated_institution=affiliation,
                citation_count=cited_by,
            )
        else:
            node_attrs = G.nodes[author_id]
            node_attrs["citation_count"] += cited_by
            # Keep the first known affiliation, but let a later work fill in
            # a placeholder left by a work that listed no institution.
            if node_attrs["affiliated_institution"] == "Unknown":
                node_attrs["affiliated_institution"] = affiliation

        max_citations = max(max_citations, G.nodes[author_id]["citation_count"])

    # Adding co-authorship edges (every unordered pair of distinct authors)
    for i, id_1 in enumerate(authors):
        for id_2 in authors[i + 1:]:
            if G.has_edge(id_1, id_2):
                G[id_1][id_2]["title"].append(title)
            else:
                G.add_edge(id_1, id_2, title=[title])


100%|██████████| 200/200 [00:00<00:00, 1712.75it/s]


In [None]:
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

from sentence_transformers import SentenceTransformer
# Sentence-embedding model shared by the cells below: it encodes institution
# names (node features) and paper titles (edge features).
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Node features (CPU): [scaled citation count] followed by the embedding of the
# author's institution name.
node_ids = list(G.nodes())

# max_citations is 0 when no paper has any citation (and -1 when there are no
# authors at all); clamp the divisor so scaling never divides by zero.
citation_scale = max(max_citations, 1)

scaled_citations = torch.tensor(
    [G.nodes[node_id]["citation_count"] / citation_scale for node_id in node_ids],
    dtype=torch.float,
).unsqueeze(1)

institution_names = [G.nodes[node_id]['affiliated_institution'] for node_id in node_ids]
if institution_names:
    # One batched encode call instead of one model call per node.
    institution_embeddings = torch.as_tensor(
        sentence_model.encode(institution_names, show_progress_bar=True),
        dtype=torch.float,
    )
else:
    institution_embeddings = torch.empty(
        (0, sentence_model.get_sentence_embedding_dimension()), dtype=torch.float
    )

node_features = torch.cat((scaled_citations, institution_embeddings), dim=1)
# Per-node rows, kept for any later cell that still reads the list form.
node_features_list = list(node_features.unbind(0))

100%|██████████| 2070/2070 [00:06<00:00, 302.70it/s]


In [9]:
# Node features: [scaled citation count] followed by the embedding of the
# author's institution name, built on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
node_ids = list(G.nodes())

institution_names = [G.nodes[node_id]['affiliated_institution'] for node_id in node_ids]
if institution_names:
    institution_embeddings = sentence_model.encode(institution_names, convert_to_tensor=True).to(device)
else:
    institution_embeddings = torch.empty(
        (0, sentence_model.get_sentence_embedding_dimension()), dtype=torch.float, device=device
    )

# max_citations is 0 when no paper has any citation (and -1 when there are no
# authors at all); clamp the divisor so scaling never divides by zero.
citation_scale = max(max_citations, 1)

# Build the citation column once and move it in a single transfer, rather than
# creating and copying a one-element tensor per node.
scaled_citations = torch.tensor(
    [G.nodes[node_id]["citation_count"] / citation_scale for node_id in node_ids],
    dtype=torch.float,
    device=device,
).unsqueeze(1)

node_features = torch.cat((scaled_citations, institution_embeddings), dim=1)
# Per-node rows, kept for any later cell that still reads the list form.
node_features_list = list(node_features.unbind(0))

In [None]:
# Edge tensors (CPU): edge_indices is the 2 x E index tensor, edge_features holds
# one row per edge = mean embedding of the titles stored on that edge.
edge_list = list(G.edges())
embedding_dim = sentence_model.get_sentence_embedding_dimension()

node_to_idx = {node: i for i, node in enumerate(G.nodes())}
mapped_edges = [[node_to_idx[u], node_to_idx[v]] for u, v in edge_list]
if mapped_edges:
    edge_indices = torch.tensor(mapped_edges, dtype=torch.long).t().contiguous()
else:
    # torch.tensor([]) would be 1-D; keep the documented (2, 0) shape for an edgeless graph.
    edge_indices = torch.empty((2, 0), dtype=torch.long)

# The same title sits on every co-author pair of a paper, so assign each
# distinct title a row number and encode it only once. Dicts keep insertion
# order, so list(title_to_row) lines up with the assigned row numbers.
title_to_row = {}
edge_title_rows = []
for u, v in edge_list:
    titles = G[u][v].get('title', [])
    edge_title_rows.append(
        [title_to_row.setdefault(title, len(title_to_row)) for title in titles]
    )

if title_to_row:
    title_matrix = torch.as_tensor(
        sentence_model.encode(list(title_to_row)), dtype=torch.float
    )
else:
    title_matrix = torch.empty((0, embedding_dim), dtype=torch.float)

edge_features_list = []
for rows in tqdm(edge_title_rows):
    if rows:
        # Repeated titles appear repeatedly in `rows`, so they keep their weight in the mean.
        averaged_embedding = title_matrix[rows].mean(dim=0)
    else:
        # An edge without titles gets a zero vector instead of crashing torch.stack.
        averaged_embedding = torch.zeros(embedding_dim, dtype=torch.float)
    edge_features_list.append(averaged_embedding)

if edge_features_list:
    edge_features = torch.stack(edge_features_list)
else:
    edge_features = torch.empty((0, embedding_dim), dtype=torch.float)


  0%|          | 0/43246 [00:00<?, ?it/s]


AttributeError: 'numpy.ndarray' object has no attribute 'cuda'

In [10]:
# Edge tensors: edge_indices is the 2 x E index tensor, edge_features holds one
# row per edge = mean embedding of the titles stored on that edge. Everything is
# placed on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_dim = sentence_model.get_sentence_embedding_dimension()

edge_list = list(G.edges())
node_to_idx = {node: i for i, node in enumerate(G.nodes())}
mapped_edges = [[node_to_idx[u], node_to_idx[v]] for u, v in edge_list]
if mapped_edges:
    edge_indices = torch.tensor(mapped_edges, dtype=torch.long).t().contiguous().to(device)
else:
    # torch.tensor([]) would be 1-D; keep the (2, 0) shape for an edgeless graph.
    edge_indices = torch.empty((2, 0), dtype=torch.long, device=device)

# The same title sits on every co-author pair of a paper, so assign each
# distinct title a row number and encode it only once. Dicts keep insertion
# order, so list(title_to_row) lines up with the assigned row numbers.
title_to_row = {}
edge_title_rows = []
for u, v in edge_list:
    titles = G[u][v]['title']
    edge_title_rows.append(
        [title_to_row.setdefault(title, len(title_to_row)) for title in titles]
    )

if title_to_row:
    batched_title_embeddings = sentence_model.encode(
        list(title_to_row), convert_to_tensor=True
    ).to(device)
else:
    batched_title_embeddings = torch.empty((0, embedding_dim), dtype=torch.float, device=device)

edge_features_list = []
for rows in edge_title_rows:
    if rows:
        # Repeated titles appear repeatedly in `rows`, so they keep their weight in the mean.
        averaged_embedding = batched_title_embeddings[rows].mean(dim=0)
    else:
        # An edge without titles gets a zero vector rather than a NaN mean.
        averaged_embedding = torch.zeros(embedding_dim, dtype=torch.float, device=device)
    edge_features_list.append(averaged_embedding)

if edge_features_list:
    edge_features = torch.stack(edge_features_list)
else:
    edge_features = torch.empty((0, embedding_dim), dtype=torch.float, device=device)

In [11]:
# Bundle the node features, edge index and edge features into one graph object.
graph_data = Data(x=node_features, edge_index=edge_indices, edge_attr=edge_features)

# The dataset is just this single graph, served one graph per batch in fixed order.
dataset = [graph_data]
dataloader = DataLoader(
    dataset,
    shuffle=False,
    batch_size=1,
)


In [12]:
# Sanity-check the first batch yielded by `dataloader`: print its type and
# summary, then the shape and device of each tensor, then node/edge counts.

for batch in dataloader:
    print(f"Type of batch object: {type(batch)}")
    print(f"Batch content: {batch}")

    # (section header, tensor) pairs, printed in this order.
    sections = (
        ("\nNode features (x):", batch.x),
        ("\nEdge indices (edge_index):", batch.edge_index),
        ("\nEdge features (edge_attr):", batch.edge_attr),
    )
    for header, tensor in sections:
        print(header)
        print(f"  Shape: {tensor.shape}")
        print(f"  Device: {tensor.device}")

    # Counts reported by the batch object itself.
    print(f"\nNumber of nodes: {batch.num_nodes}")
    print(f"Number of edges: {batch.num_edges}")

    # Only the first batch is inspected.
    break

Type of batch object: <class 'abc.DataBatch'>
Batch content: DataBatch(x=[2070, 385], edge_index=[2, 43246], edge_attr=[43246, 384], batch=[2070], ptr=[2])

Node features (x):
  Shape: torch.Size([2070, 385])
  Device: cuda:0

Edge indices (edge_index):
  Shape: torch.Size([2, 43246])
  Device: cuda:0

Edge features (edge_attr):
  Shape: torch.Size([43246, 384])
  Device: cuda:0

Number of nodes: 2070
Number of edges: 43246
