In [1]:
download_dir = './arxiv_downloads'

In [2]:
import os
files = os.listdir(download_dir)
files[:3]

['2503.20488v1.Adaptive_Local_Clustering_over_Attributed_Graphs.pdf',
 '2503.20491v1.VPO__Aligning_Text_to_Video_Generation_Models_with_Prompt_Optimization.pdf',
 '2503.20492v1.Towards_Efficient_and_General_Purpose_Few_Shot_Misclassification_Detection_for_Vision_Language_Models.pdf']

In [3]:
from pypdf import PdfReader
reader = PdfReader(os.path.join(download_dir, files[0]))
len(reader.pages), reader.pages[0].extract_text()[:1000]

(21,
 'Adaptive Local Clustering over Attributed Graphs\nTechnical Report\nHaoran Zheng\nHong Kong Baptist University\nHong Kong SAR, China\ncshrzheng@comp.hkbu.edu.hk\nRenchi Yang\nHong Kong Baptist University\nHong Kong SAR, China\nrenchi@hkbu.edu.hk\nJianliang Xu\nHong Kong Baptist University\nHong Kong SAR, China\nxujl@hkbu.edu.hk\nAbstract—Given a graph G and a seed node vs, the objective\nof local graph clustering (LGC) is to identify a subgraph Cs ∈ G\n(a.k.a. local cluster) surrounding vs in time roughly linear with\nthe size of Cs. This approach yields personalized clusters without\nneeding to access the entire graph, which makes it highly suitable\nfor numerous applications involving large graphs. However, most\nexisting solutions merely rely on the topological connectivity\nbetween nodes in G, rendering them vulnerable to missing or\nnoisy links that are commonly present in real-world graphs.\nTo address this issue, this paper resorts to leveraging the\ncomplementary nature 

In [6]:
from rolling.embedding import Corpus
import tqdm
corpora = []
for file in tqdm.tqdm(files[:3]):
    reader = PdfReader(os.path.join(download_dir, file))
    text = ''
    for page in reader.pages:
        text += page.extract_text()
    corpora.append(Corpus(file, text, n=256))

100%|██████████| 3/3 [00:22<00:00,  7.52s/it]


In [7]:
import random
def random_color():
    return (random.random(), random.random(), random.random())  # RGB tuple

In [8]:
from sklearn.decomposition import PCA
import numpy as np

# Collect all embeddings and corresponding PDF labels
all_embeddings = np.vstack([emb for corpus in corpora for emb in corpus.embeddings])
labels = [corpus.title for corpus in corpora for _ in corpus.embeddings]

# Reduce to 3D using PCA
reducer = PCA(n_components=3)
low_dim_embeddings = reducer.fit_transform(all_embeddings)

color_map = {title: random_color() for title in labels}

In [9]:
import matplotlib
matplotlib.use("TkAgg") 

In [13]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # For 3D plotting
import mplcursors

# Your 3D plotting code
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111, projection='3d')

start = 0
scatter_plots = []
for corpus in corpora:
    num_points = len(corpus.embeddings)
    subset = low_dim_embeddings[start:start+num_points]
    
    # Scatter plot for each PDF in 3D
    scatter = ax.scatter(subset[:, 0], subset[:, 1], subset[:, 2], 
                         color=color_map[corpus.title], label=corpus.title, alpha=0.6)
    
    # Add the scatter plot and corresponding text parts for hover
    scatter_plots.append((scatter, corpus.parts))  # Store the scatter and the corresponding text parts
    
    start += num_points

# Add the legend and labels
ax.legend(loc="upper center", bbox_to_anchor=(0.5, -0.1))
ax.set_title("3D Embeddings Visualization")
ax.set_xlabel("UMAP Dim 1")
ax.set_ylabel("UMAP Dim 2")
ax.set_zlabel("UMAP Dim 3")

# Use mplcursors to display text on hover
cursor = mplcursors.cursor(hover=True)

# Define the callback to show and hide annotations
@cursor.connect("add")
def on_add(sel):
    # Get the index of the hovered point
    index = sel.index
    # Find the corresponding part of the text
    part_text = corpus.parts[index]
    # Set the annotation text to the text part
    sel.annotation.set_text(part_text)

@cursor.connect("remove")
def on_remove(sel):
    # Hide the annotation when the mouse moves off
    sel.annotation.set_text("")

plt.show()


Traceback (most recent call last):
  File "d:\rolling_embedding\.venv\Lib\site-packages\matplotlib\cbook.py", line 361, in process
    func(*args, **kwargs)
  File "d:\rolling_embedding\.venv\Lib\site-packages\mplcursors\_mplcursors.py", line 580, in _on_hover_motion_notify
    self._on_select_event(event)
  File "d:\rolling_embedding\.venv\Lib\site-packages\mplcursors\_mplcursors.py", line 628, in _on_select_event
    self.add_selection(pi)
  File "d:\rolling_embedding\.venv\Lib\site-packages\mplcursors\_mplcursors.py", line 411, in add_selection
    cb(sel)
  File "C:\Users\Tobias\AppData\Local\Temp\ipykernel_18664\177993679.py", line 40, in on_add
    part_text = corpus.parts[index]
                ~~~~~~~~~~~~^^^^^^^
IndexError: list index out of range
Traceback (most recent call last):
  File "d:\rolling_embedding\.venv\Lib\site-packages\matplotlib\cbook.py", line 361, in process
    func(*args, **kwargs)
  File "d:\rolling_embedding\.venv\Lib\site-packages\mplcursors\_mplcursors.

In [23]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Prepare flat lists
all_embeddings = []
all_texts = []
all_titles = []
paper_indices = []  # map each embedding to its corpus index

for paper_idx, corpus in enumerate(corpora):
    all_embeddings.extend(corpus.embeddings)
    all_texts.extend(corpus.parts)
    all_titles.extend([corpus.title] * len(corpus.embeddings))
    paper_indices.extend([paper_idx] * len(corpus.embeddings))

all_embeddings = np.vstack(all_embeddings)

# Fit NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=20, algorithm='auto').fit(all_embeddings)
distances, indices = nbrs.kneighbors(all_embeddings)

# For each paper, find 3 closest inter-paper matches
seen = set()
for paper_idx, corpus in enumerate(corpora):
    print("#"*80)
    print(f"Paper: {corpus.title}")
    matches = []

    for i, title in enumerate(all_titles):
        if paper_indices[i] != paper_idx:
            continue  # only check embeddings from this paper

        for dist, j in zip(distances[i], indices[i]):
            if paper_indices[j] == paper_idx:
                continue  # skip same paper
            pair_key = (i, j)
            if pair_key in seen:
                continue
            seen.add(pair_key)
            matches.append((dist, i, j))
            break  # only take the closest match for this embedding

        if len(matches) >= 3:
            break

    for dist, i, j in sorted(matches, key=lambda x: x[0]):
        print("-"*50)
        print(f"{i}: {dist:.4f}")
        print(f"  - A: {all_texts[i].replace('\n', ' ')}")
        print(f"  - B: {all_texts[j].replace('\n', ' ')}")
        print(f"  - P1: {all_titles[i]}")
        print(f"  - P2: {all_titles[j]}")
    print()

    if paper_idx >= 3:
        break

################################################################################
Paper: 2503.20488v1.Adaptive_Local_Clustering_over_Attributed_Graphs.pdf
--------------------------------------------------
58: 13.1734
  - A:  0.4 0.5 ⋯ ⋯ ⋯ 0.1 0.1 0.2 𝑣1 𝑣2 𝑣3 𝑣4 ⋮ 𝑣𝑛 seed 𝝅′(𝒗𝒔,𝒗𝒊) 0.3 0.4 0.4 0.4 0.3 0.1 0.3 ⋯ 0.1 0.5 0.6 0.1 0.4 ⋯ 0.1 0.4 0.3 0.1 0.5 ⋯ 0.2 𝑣1 𝑣2 𝑣4 𝑣4 𝑣5 𝑣1 𝑣2 𝑣1 𝑣4 𝑣2 ⋮ 𝑣5 𝑣𝑛 ∙ 𝑣3𝑣3 0.48 0.45 0.11 0.45 0 TNAM 𝒁 𝝍 TNAM 𝒁 𝝓′ 𝝆′ 0 1.0 0 0 0 0 0 𝟏(𝑠) Fig. 2: B
  - B: 50.76 76.37 72.16 82.26 Ours 74.38 280.36 132.07 77.38 77.64 77.67 72.39 350.57 148.32 76.21 73.46 82.66 74 75 76 77 78 1 2 4 8 16 72 74 76 78 80 1 2 4 8 16 CoOp LoCoOp Ours 47 49 51 53 55 1 2 4 8 16 AUROC ↑FPR95 ↓ Accuracy ↑ 70 74 78 82 86 1 2 4 8 16 CoOp
  - P1: 2503.20488v1.Adaptive_Local_Clustering_over_Attributed_Graphs.pdf
  - P2: 2503.20492v1.Towards_Efficient_and_General_Purpose_Few_Shot_Misclassification_Detection_for_Vision_Language_Models.pdf
--------------------------------------------------
50:

In [15]:
neighbors_text

['Adaptive Local Clustering over Attributed Graphs\nTechnical Report\nHaoran Zheng\nHong Kong Baptist University\nHong Kong SAR, China\ncshrzheng@comp.hkbu.edu.hk\nRenchi Yang\nHong Kong Baptist University\nHong Kong SAR, China\nrenchi@hkbu.edu.hk\nJianliang Xu\nHong K',
 'ang, “Local graph clustering\nwith noisy labels,” in The Twelfth International Conference on Learning\nRepresentations, 2023.\n[73] X. Huang, L. V . Lakshmanan, and J. Xu, Community Search over Big\nGraphs. Morgan & Claypool Publishers, 2019.\n[74] ——, “Communi',
 'and\nexploring data graphs locally,” Journal of Machine Learning Research ,\nvol. 13, no. 77, pp. 2339–2365, 2012.\n[14] D. A. Spielman and S.-H. Teng, “A local clustering algorithm for mas-\nsive graphs and its application to nearly linear time graph partitio',
 'ning for\nnetworks,” Proceedings of the 22nd ACM SIGKDD International Con-\nference on Knowledge Discovery and Data Mining , 2016.\n[60] R. Yang, J. Shi, X. Xiao, Y . Yang, J. Liu, S. S. Bhowmick