In [41]:
preprocessed_data = './arxiv_downloads_processed'

In [42]:
import os
files = os.listdir(preprocessed_data)
files[:3], len(files)

(['2503.20488v1.Adaptive_Local_Clustering_over_Attributed_Graphs.pkl',
  '2503.20491v1.VPO__Aligning_Text_to_Video_Generation_Models_with_Prompt_Optimization.pkl',
  '2503.20492v1.Towards_Efficient_and_General_Purpose_Few_Shot_Misclassification_Detection_for_Vision_Language_Models.pkl'],
 100)

In [43]:
from rolling.paper import Paper, load_paper, print_paper

load_n = 100

papers:list[Paper] = []
for i in range(load_n):
    papers.append(load_paper(os.path.join(preprocessed_data, files[i])))

In [4]:
print_paper(papers[0])

Paper: 2503.20488v1.Adaptive_Local_Clustering_over_Attributed_Graphs.pdf
> [ 1.09270895  1.30160093 -2.66840649] Adaptive Local Clustering over Attributed Graphs Technical Report Haoran Zheng Hong Kong Baptist University Hong Kong SAR, China cshrzheng@comp.hkbu.edu.hk Renchi Yang Hong Kong Baptist University Hong Kong SAR, China renchi@hkbu.edu.hk Jianliang Xu Hong Kong Baptist University Hong Kong SAR, China xujl@hkbu.edu.hk Abstract—Given a graph G and a seed node vs, the objective of local graph clustering (LGC) is to identify a subgraph Cs ∈ G (a.k.a. local cluster) surrounding vs in time roughly linear with the size of Cs. This approach yields personalized clusters without needing to access the entire graph, which makes it highly suitable for numerous applications involving large graphs. However, most existing solutions merely rely on the topological connectivity between nodes in G, rendering them vulnerable to missing or noisy links that are commonly present in real-world graphs.

In [44]:
import random
def random_color():
    return (random.random(), random.random(), random.random())  # RGB tuple

In [45]:
import numpy as np
all_embeddings = np.stack([emb for paper in papers for emb in paper.embeddings])
len(all_embeddings)

64706

In [36]:
from sklearn.decomposition import PCA
import numpy as np

# Collect all embeddings and corresponding PDF labels
all_embeddings = np.stack([emb for paper in papers for emb in paper.embeddings])
labels = [paper.title for paper in papers for _ in paper.embeddings]

# Reduce to 3D using PCA
reducer = PCA(n_components=3)
low_dim_embeddings = reducer.fit_transform(all_embeddings)

color_map = {title: random_color() for title in labels}

In [23]:
import matplotlib
matplotlib.use("TkAgg") 

In [37]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # For 3D plotting
import mplcursors

# Your 3D plotting code
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111, projection='3d')

start = 0
scatter_plots = []
for paper in papers:
    num_points = len(paper.embeddings)
    subset = low_dim_embeddings[start:start+num_points]
    
    # Scatter plot for each PDF in 3D
    scatter = ax.scatter(subset[:, 0], subset[:, 1], subset[:, 2], 
                         color=color_map[paper.title], label=paper.title, alpha=0.6)
    
    # Add the scatter plot and corresponding text parts for hover
    scatter_plots.append((scatter, paper.texts))  # Store the scatter and the corresponding text parts
    
    start += num_points

# Add the legend and labels
ax.legend(loc="upper center", bbox_to_anchor=(0.5, -0.1))
ax.set_title("3D Embeddings Visualization")
ax.set_xlabel("UMAP Dim 1")
ax.set_ylabel("UMAP Dim 2")
ax.set_zlabel("UMAP Dim 3")

# Use mplcursors to display text on hover
cursor = mplcursors.cursor(hover=True)

# Define the callback to show and hide annotations
@cursor.connect("add")
def on_add(sel):
    # Get the index of the hovered point
    index = sel.index
    # Find the corresponding part of the text
    part_text = paper.texts[index]
    # Set the annotation text to the text part
    sel.annotation.set_text(part_text)

@cursor.connect("remove")
def on_remove(sel):
    # Hide the annotation when the mouse moves off
    sel.annotation.set_text("")

plt.show()


In [48]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Prepare flat lists
all_embeddings = []
all_texts = []
all_titles = []
paper_indices = []  # map each embedding to its corpus index

for paper_idx, paper in enumerate(papers):
    all_embeddings.extend(paper.embeddings)
    all_texts.extend(paper.texts)
    all_titles.extend([paper.title] * len(paper.embeddings))
    paper_indices.extend([paper_idx] * len(paper.embeddings))

all_embeddings = np.vstack(all_embeddings)

# Fit NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=10, algorithm='auto').fit(all_embeddings)
distances, indices = nbrs.kneighbors(all_embeddings)

In [87]:

# Store matched pairs
matches = []

# Loop through each embedding and find best inter-paper matches
for i, title in enumerate(all_titles):
    for dist, j in zip(distances[i], indices[i]):
        if paper_indices[i] != paper_indices[j]:  # Ensure it's not the same paper
            matches.append({
                'distance': dist,
                'embedding_i': i,
                'embedding_j': j,
                'paper_i': all_titles[i],
                'paper_j': all_titles[j],
                'text_i': all_texts[i],
                'text_j': all_texts[j]
            })

# Sort matches by distance (ascending)
matches = sorted(matches, key=lambda x: x['distance'])

# Print the top matches
print_count = 0
for match in matches:
    if print_count >= 100:
        break

    if match['distance'] < 10 or len(match['text_i']) > 1000 or len(match['text_i']) < 100:
        continue

    print_count += 1
    print(f"Match (Distance: {match['distance']:.5f}):")
    print(f"Paper 1:     {match['paper_i']}")
    print(f"Paper 2:     {match['paper_j']}")
    print(f"Embedding 1: {all_embeddings[match['embedding_i']][:5]}...")
    print(f"Embedding 2: {all_embeddings[match['embedding_j']][:5]}...")
    print(f"Text 1:      {match['text_i']}")  # Limit text to first 100 chars
    print(f"Text 2:      {match['text_j']}")  # Limit text to first 100 chars
    print("-" * 80)

Match (Distance: 10.00047):
Paper 1:     2503.20698v1.MMMORRF__Multimodal_Multilingual_Modularized_Reciprocal_Rank_Fusion.pdf
Paper 2:     2503.20630v1._β__GNN__A_Robust_Ensemble_Approach_Against_Graph_Structure_Perturbation.pdf
Embedding 1: [-0.31322026  0.32337621 -2.7252872  -0.08644752  0.83922893]...
Embedding 2: [ 0.25969315  0.25360304 -2.9244976  -0.30857641  1.14607215]...
Text 1:      arXiv:1611.09268 [cs.CL] https://arxiv.org/abs/1611.09268 [4] Meng Cao, Haoran Tang, Jinfa Huang, Peng Jin, Can Zhang, Ruyang Liu, Long Chen, Xiaodan Liang, Li Yuan, and Ge Li.
Text 2:      arXiv:1609.02907 http://arxiv.org/abs/1609.02907 [13] Y. Li, W. Jin, H. Xu, and J. Tang.
--------------------------------------------------------------------------------
Match (Distance: 10.00090):
Paper 1:     2503.20776v1.Feature4X__Bridging_Any_Monocular_Video_to_4D_Agentic_AI_with_Versatile_Gaussian_Feature_Fields.pdf
Paper 2:     2503.20784v1.FB_4D__Spatial_Temporal_Coherent_Dynamic_3D_Content_Generation