In [1]:
from rolling.pdf import list_pdfs
# Collect the PDF paths to process; the recorded output below shows 1000 paths
# located under ./arxiv_downloads.
files = list_pdfs()
# Display the first three paths together with the total count as the cell's output.
files[:3], len(files)

(['./arxiv_downloads\\0812.0743v2.A_Novel_Clustering_Algorithm_Based_on_Quantum_Games.pdf',
  './arxiv_downloads\\1103.4487v1.Handwritten_Digit_Recognition_with_a_Committee_of_Deep_Neural_Nets_on_GPUs.pdf',
  './arxiv_downloads\\1106.4509v1.Machine_Learning_Markets.pdf'],
 1000)

In [2]:
import os

# Destination directory for processed papers; process_paper joins its output
# file names onto this path. Creating it is a no-op when it already exists.
output_dir = './arxiv_downloads_processed'
os.makedirs(output_dir, exist_ok=True)

In [3]:
from rolling.embedding import GTEEmbeddingModel
# Single embedding model instance for the whole notebook; sync_encode calls
# model.encode on this object.
model = GTEEmbeddingModel()

In [4]:
import tqdm
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

from rolling.pdf import read_pdf
from rolling.paper import create_paper, save_paper

# Lock held by sync_encode around every model.encode call, so the pool's worker
# threads use the model one at a time (presumably because the model is not safe
# to call concurrently -- confirm).
lock = threading.Lock()

def sync_encode(args):
    """Run ``model.encode(args)`` while holding the module-level ``lock``.

    Only one thread at a time can be inside the encode call; the others block
    until the lock is released.

    Args:
        args: Passed through unchanged to ``model.encode``.

    Returns:
        Whatever ``model.encode`` returns for ``args``.
    """
    with lock:
        encoded = model.encode(args)
    return encoded

def process_paper(pdf_path):
    """Turn one PDF into a saved paper file inside ``output_dir``.

    The output file is named after the PDF with its extension swapped for
    ``.pkl``. A PDF whose output file already exists is skipped, so the cell
    can be re-run without redoing finished work.

    Args:
        pdf_path: Path of the source PDF.

    Returns:
        The output file path when a paper was created and saved, or ``False``
        when the PDF was skipped (output already present) or processing raised
        an exception.
    """
    # Swap only the trailing extension: str.replace('.pdf', '.pkl') would also
    # rewrite any '.pdf' that happens to occur earlier in the file name.
    stem, _ = os.path.splitext(os.path.basename(pdf_path))
    file_name = os.path.join(output_dir, stem + '.pkl')

    if os.path.exists(file_name):
        return False

    try:
        text = read_pdf(pdf_path)
        # The source path doubles as the paper's title.
        title = pdf_path
        # Embeddings go through sync_encode so calls into the shared model are
        # serialized across worker threads.
        paper = create_paper(title, text, embedding_function=sync_encode)
        save_paper(paper, file_name)
    except Exception as e:
        # Best effort: report the failure and let the rest of the batch continue.
        print(f'Error processing {file_name}: {e}')
        return False

    return file_name


# Fan the PDFs out over three worker threads and advance the progress bar once
# per finished task; the return values of process_paper are not inspected here.
with ThreadPoolExecutor(max_workers=3) as executor:
    futures = [executor.submit(process_paper, pdf_path) for pdf_path in files]

    with tqdm.tqdm(total=len(files)) as progress:
        for future in as_completed(futures):
            progress.update(1)

print('ok')

 64%|██████▎   | 636/1000 [04:21<09:43,  1.60s/it]Ignoring wrong pointing object 6 0 (offset 0)
 64%|██████▎   | 637/1000 [04:22<08:18,  1.37s/it]Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)
Ignoring wrong pointing object 30 0 (offset 0)
Ignoring wrong pointing object 37 0 (offset 0)
Ignoring wrong pointing object 39 0 (offset 0)
Ignoring wrong pointing object 41 0 (offset 0)
Ignoring wrong pointing object 43 0 (offset 0)
Ignoring wrong pointing object 52 0 (offset 0)
Ignoring wrong pointing object 64 0 (offset 0)
Ignoring wrong pointing object 83 0 (offset 0)
 72%|███████▏  | 721/1000 [06:04<12:56,  2.78s/it]Ignoring wrong pointing object 6 0 (offset 0)
Ignorin

Error processing ./arxiv_downloads_processed\2401.10510v3.When_Large_Language_Models_Meet_Evolutionary_Algorithms__Potential_Enhancements_and_Challenges.pkl: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]


100%|██████████| 1000/1000 [12:09<00:00,  1.37it/s]

ok





In [5]:
# Compare directories
import os

# Every input PDF is expected to have a matching .pkl in output_dir; print a
# warning for each one that is missing (e.g. because processing raised).
for file in files:
    # Swap only the trailing extension; str.replace('.pdf', '.pkl') would also
    # rewrite a '.pdf' occurring earlier in the name.
    stem, _ = os.path.splitext(os.path.basename(file))
    target_file = stem + '.pkl'
    # Use the notebook's output_dir instead of repeating the literal path, so
    # this check follows the directory the papers were actually written to.
    target_path = os.path.join(output_dir, target_file)
    if not os.path.exists(target_path):
        print(f'WARNING: file {target_file} not found')

