In [3]:
# IMPORTS AND DIRECTORY INITIALIZATION
import numpy as np
import torch
from pathlib import Path
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm
from docx import Document
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from torch.nn import TripletMarginLoss
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
import warnings
# NOTE(review): this silences every warning category for the whole kernel session,
# including deprecation notices from torch/transformers — consider narrowing it.
warnings.filterwarnings("ignore")

# Project root is resolved relative to the kernel's working directory (two levels up),
# so these paths depend on where the notebook server was launched from.
BASE_DIR = Path.cwd().parent.parent
# Root folder of the "oc_mini" data used by the cells below.
DATA_DIR = BASE_DIR.joinpath("oc_mini")

In [4]:
def compute_embeddings(texts, model, tokenizer, device, batch_size=32):
    """Compute one embedding vector per input text.

    Texts are processed in mini-batches so memory use is bounded by ``batch_size``
    rather than by the total number of texts. Each batch is tokenized (padded to
    the longest sequence in that batch, truncated to 512 tokens), moved to
    ``device``, and passed through ``model`` with gradients disabled. The
    embedding of a text is the hidden state at sequence position 0 of
    ``outputs.last_hidden_state`` (the [CLS] position for BERT-style tokenizers).

    Args:
        texts: A single string or any iterable of strings (list, tuple,
            generator, pandas Series, ...). ``None`` is treated as empty.
        model: Callable that accepts the tokenizer output as keyword arguments
            and returns an object exposing a ``last_hidden_state`` tensor.
        tokenizer: Callable that tokenizes a list of strings and returns an
            object with a ``.to(device)`` method.
        device: Device the tokenized inputs are moved to; it should match the
            device the model lives on.
        batch_size: Maximum number of texts per forward pass. Must be >= 1.

    Returns:
        A numpy array with one row per input text, in input order. When there
        are no texts, an empty ``np.array([])`` is returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    if texts is None:
        return np.array([])
    # A bare string would otherwise be sliced character by character below.
    if isinstance(texts, str):
        texts = [texts]
    else:
        # Materialize so truthiness and slicing are well defined for any iterable
        # (e.g. `not series` raises for a pandas Series).
        texts = list(texts)

    if not texts:
        return np.array([])

    # Dropout must be inactive for deterministic embeddings; remember the previous
    # mode so a model that is being fine-tuned is handed back unchanged.
    was_training = bool(getattr(model, "training", False))
    if was_training:
        model.eval()

    batch_embeddings = []
    try:
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                batch = texts[start:start + batch_size]
                inputs = tokenizer(
                    batch,
                    padding=True,
                    truncation=True,
                    max_length=512,
                    return_tensors="pt"
                ).to(device)
                outputs = model(**inputs)
                # Move each batch off the device right away to free accelerator memory.
                batch_embeddings.append(
                    outputs.last_hidden_state[:, 0, :].cpu().numpy()
                )
    finally:
        if was_training:
            model.train()

    return np.concatenate(batch_embeddings, axis=0)

In [6]:
import json

# CLUSTER LOADING

# Input locations, both resolved under DATA_DIR.
cluster_path = DATA_DIR / "clustering" / "hierarchical" / "oc_mini_paris.json"
metadata_path = DATA_DIR / "metadata" / "oc_mini_node_metadata.csv"

# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying on the
# platform's default locale encoding, which differs between machines.
with open(cluster_path, 'r', encoding='utf-8') as f:
    cluster_data = json.load(f)

# Sanity Check: prints the number of top-level entries in the parsed JSON.
print(f"Loaded cluster data: {len(cluster_data)}")

Loaded cluster data: 4


In [7]:
# Load the SciBERT tokenizer and encoder.
# Use the GPU when torch can see one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the weights first, then move the encoder onto the selected device.
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

print(f"SciBERT model loaded: {model_name}")

# Smoke test: embed a single sentence and inspect the resulting vector.
sample_sentence = "The relationship between quantum mechanics and general relativity remains one of the most important unsolved problems in theoretical physics."

embedding = compute_embeddings(
    texts=[sample_sentence],
    model=model,
    tokenizer=tokenizer,
    device=device,
)

print(f"\nSample sentence: {sample_sentence}")
print(f"Embedding shape: {embedding.shape}")
print(f"Embedding (first 10 dimensions): {embedding[0][:10]}")

Using device: cuda


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

SciBERT model loaded: allenai/scibert_scivocab_uncased

Sample sentence: The relationship between quantum mechanics and general relativity remains one of the most important unsolved problems in theoretical physics.
Embedding shape: (1, 768)
Embedding (first 10 dimensions): [-0.38251862 -0.10934478  0.50328714  0.07526603  0.40628052 -0.21742031
  0.8403158  -0.9267022   0.02716607  0.9420572 ]


In [8]:
# Read the node metadata table from the CSV at `metadata_path`.
# NOTE(review): the rendered preview shows an `Unnamed: 0` column, which looks like a
# saved row index — consider `index_col=0` once downstream usage has been checked.
metadata_df = pd.read_csv(filepath_or_buffer=metadata_path)

# Preview the first five rows.
metadata_df.head(n=5)

Unnamed: 0,id,doi,title,abstract
0,128,10.1101/2021.05.10.443415,Improved protein contact prediction using dime...,AbstractDeep residual learning has shown great...
1,163,10.1101/2021.05.07.443114,Following the Trail of One Million Genomes: Fo...,AbstractSevere acute respiratory syndrome coro...
2,200,10.1101/2021.05.11.443555,Mechanism of molnupiravir-induced SARS-CoV-2 m...,Molnupiravir is an orally available antiviral ...
3,941,10.3390/ijms20020449,Bactericidal and Cytotoxic Properties of Silve...,Silver nanoparticles (AgNPs) can be synthesize...
4,1141,10.3390/ijms20040865,Silver Nanoparticles: Synthesis and Applicatio...,"Over the past few decades, metal nanoparticles..."
