In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load dataset
df = pd.read_csv("../xtract-api/DataSet/arxiv_processed.csv")  

# Combine title and abstract for embedding
df["text"] = (df["title"].fillna('') + ". " + df["abstract"].fillna('')).str.strip()

print("Total papers:", len(df))
df.head()


Total papers: 194065


Unnamed: 0,id,title,abstract,authors,category_code,update_date,clean_title,clean_abstract,category,text
0,acc-phys/9607001,An Investigation of Stochastic Cooling in the ...,This report provides a description of unbunc...,O. Meincke,acc-phys physics.acc-ph,2008-02-03,an investigation of stochastic cooling in the ...,this report provides a description of unbunche...,"Accelerator Physics, Physics – Accelerator Phy...",An Investigation of Stochastic Cooling in the ...
1,acc-phys/9601001,Particle Motion in the Stable Region Near an E...,This paper studies the particle motion when ...,G. Parzen (Brookhaven National Laboratory),acc-phys physics.acc-ph,2008-02-03,particle motion in the stable region near an e...,this paper studies the particle motion when th...,"Accelerator Physics, Physics – Accelerator Phy...",Particle Motion in the Stable Region Near an E...
2,acc-phys/9602001,Muon Colliders,Muon Colliders have unique technical and phy...,"R. B. Palmer(BNL), A. Sessler(LBNL), A. Skrins...",acc-phys physics.acc-ph,2012-08-29,muon colliders,muon colliders have unique technical and physi...,"Accelerator Physics, Physics – Accelerator Phy...",Muon Colliders. Muon Colliders have unique t...
3,adap-org/9306005,Prediction and Adaptation in an Evolving Chaot...,We describe the results of analytic calculat...,"Alfred H\""ubler and David Pines (Santa Fe Inst...",adap-org chao-dyn nlin.AO nlin.CD,2008-02-03,prediction and adaptation in an evolving chaot...,we describe the results of analytic calculatio...,"Adaptation, Noise, and Self-Organizing Systems...",Prediction and Adaptation in an Evolving Chaot...
4,chao-dyn/9407001,Pattern Dynamics of a Coupled Map Lattice for ...,The pattern dynamics of the one-way coupled ...,Frederick H. Willeboordse (University of Tokyo...,chao-dyn adap-org nlin.AO nlin.CD nlin.PS patt...,2015-06-24,pattern dynamics of a coupled map lattice for ...,the pattern dynamics of the one way coupled lo...,"Adaptation, Noise, and Self-Organizing Systems...",Pattern Dynamics of a Coupled Map Lattice for ...


In [3]:
# Load pretrained SPECTER2 model
model = SentenceTransformer("allenai/specter2_base")


No sentence-transformers model found with name allenai/specter2_base. Creating a new one with mean pooling.


In [4]:
# Convert text to list
texts = df["text"].tolist()

# Compute embeddings (batching automatically handled)
print("Encoding texts using SPECTER2...")
embeddings = model.encode(
    texts,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True  # important for cosine similarity
)

print("Embeddings shape:", embeddings.shape)


Encoding texts using SPECTER2...


Batches: 100%|██████████| 6065/6065 [3:01:29<00:00,  1.80s/it]     


Embeddings shape: (194065, 768)


In [5]:
d = embeddings.shape[1]  # embedding dimension (768)
index = faiss.IndexFlatIP(d)  # Inner product = cosine similarity
index.add(embeddings)         # Add all paper embeddings

print("Total papers indexed:", index.ntotal)

Total papers indexed: 194065


In [6]:
def recommend_similar_papers(query_title, query_abstract, top_k=5):
    query_text = f"{query_title}. {query_abstract}"
    query_vec = model.encode([query_text], normalize_embeddings=True)
    
    # Search in FAISS index
    D, I = index.search(query_vec, top_k)
    
    results = df.iloc[I[0]][["id", "title", "authors", "category_code", "update_date"]].copy()
    results["similarity"] = D[0]
    return results


In [7]:
query_title = "Graph Neural Networks for Statistical Modeling"
query_abstract = "We explore graph-based methods for learning dependencies among statistical entities..."

recommendations = recommend_similar_papers(query_title, query_abstract, top_k=5)
print(recommendations)


               id                                              title  \
39519  2104.07396  Node Co-occurrence based Graph Neural Networks...   
64914  2507.19527  Research on the application of graph data stru...   
38828  2507.10772  Applying Text Embedding Models for Efficient A...   
38711  1911.02562  Gextext: Disease Network Extraction from Biome...   
67481  2402.17906  Representation learning in multiplex graphs: W...   

                                                 authors category_code  \
39519  Dai Quoc Nguyen and Vinh Tong and Dinh Phung a...   cs.CL cs.LG   
64914                           Yihan Wang, Jianing Zhao         cs.LG   
38828                                  Michal Podstawski   cs.CL cs.IR   
38711                                      Robert O'Shea   cs.DL cs.CL   
67481                   Piotr Bielak, Tomasz Kajdanowicz   cs.LG cs.SI   

      update_date  similarity  
39519  2021-12-28    0.909727  
64914  2025-07-29    0.907358  
38828  2025-08-21    0.905

In [8]:
# Save
faiss.write_index(index, "papers_index.faiss")
np.save("embeddings.npy", embeddings)
df.to_csv("papers_with_embeddings.csv", index=False)

In [None]:
# # Later: Load
# index = faiss.read_index("papers_index.faiss")
# embeddings = np.load("embeddings.npy")
# df = pd.read_csv("papers_with_embeddings.csv")