In [3]:
import os
import re
import json
from tqdm import tqdm
from pypdf import PdfReader

In [2]:
# Folder holding the PDFs to process; the cells below list it with os.listdir()
# and rename files inside it, so set it to a real path before running them.
directory = "" # your directory here.

Rename each paper downloaded from arXiv to its arXiv ID, read from the first page of the PDF.

In [None]:
# Rename every PDF in `directory` to "<arXiv id>.pdf", using the first arXiv
# identifier found in the text of the PDF's first page.
#
# The pattern accepts both 4- and 5-digit sequence numbers (YYMM.NNNN and
# YYMM.NNNNN). An optional version suffix ("v2", "v10", ...) is matched but
# kept out of the capture group, so the new name never carries a version.
# The digit look-arounds stop the pattern from matching inside a longer number.
arxiv_id_pattern = re.compile(r'(?<!\d)(\d{4}\.\d{4,5})(?:v\d+)?(?!\d)')

for filename in os.listdir(directory):
    if not filename.endswith(".pdf"):
        continue
    filepath = os.path.join(directory, filename)

    # Only read while the file is open. The rename happens after the handle is
    # closed, because renaming a file that is still open fails on Windows.
    with open(filepath, 'rb') as pdf_file:
        pdf_reader = PdfReader(pdf_file)
        if len(pdf_reader.pages) > 0:
            text = pdf_reader.pages[0].extract_text() or ""
        else:
            # A PDF without pages has no first page to search.
            text = ""

    match = arxiv_id_pattern.search(text)
    if not match:
        print(f"No arXiv ID found for: {filename}")
        continue

    new_filename = f"{match.group(1)}.pdf"
    if new_filename == filename:
        print(f"Already named: {filename}")
        continue

    new_filepath = os.path.join(directory, new_filename)
    # os.rename can silently replace an existing file; never clobber a paper
    # that already carries this identifier.
    if os.path.exists(new_filepath):
        print(f"Skipped (target exists): {filename} -> {new_filename}")
        continue

    os.rename(filepath, new_filepath)
    print(f"Renamed: {filename} -> {new_filename}")

Fetch each paper's abstract from the arXiv API and save the results to a JSONL file.

In [4]:
import xml.etree.ElementTree as ET

import requests

# For every "<id>.pdf" in `directory`, ask the arXiv API for that id and write
# one JSON object per line ({"id": ..., "abstract": ...}) to the JSONL file.
# NOTE(review): arXiv's API terms ask clients to throttle their requests;
# consider a delay between calls or batching ids — confirm against current terms.
jsonl_filename = "~/arxiv_paper_abs.jsonl"
jsonl_filepath = os.path.expanduser(jsonl_filename)

arxiv_api_url = "http://export.arxiv.org/api/query"
pdf_suffix = ".pdf"

with open(jsonl_filepath, 'w', encoding='utf-8') as jsonl_file:
    for filename in tqdm(os.listdir(directory)):
        if not filename.endswith(pdf_suffix):
            continue
        # Strip only the trailing extension; str.replace would also remove a
        # ".pdf" that happens to occur in the middle of the name.
        arxiv_id = filename[:-len(pdf_suffix)]

        try:
            # `params` URL-encodes the id; the timeout keeps one stalled
            # connection from hanging the whole run.
            response = requests.get(
                arxiv_api_url, params={"id_list": arxiv_id}, timeout=30
            )
        except requests.RequestException as exc:
            tqdm.write(f"Request failed for {arxiv_id}: {exc}")
            continue

        if response.status_code != 200:
            tqdm.write(f"HTTP {response.status_code} for {arxiv_id}")
            continue

        # Parse the feed as XML so entities such as &amp; and &lt; come back
        # as the real characters instead of being saved in escaped form.
        try:
            feed = ET.fromstring(response.content)
        except ET.ParseError as exc:
            tqdm.write(f"Unparseable response for {arxiv_id}: {exc}")
            continue

        # First <summary> element in document order, whatever its namespace.
        summary = next(
            (node for node in feed.iter()
             if isinstance(node.tag, str)
             and node.tag.rsplit('}', 1)[-1] == 'summary'),
            None,
        )
        if summary is None:
            tqdm.write(f"No abstract found for {arxiv_id}")
            continue

        abstract = (summary.text or "").strip().replace("\n", " ")
        data = {"id": arxiv_id, "abstract": abstract}
        jsonl_file.write(json.dumps(data) + '\n')

100%|██████████| 497/497 [04:25<00:00,  1.87it/s]


Find the PDFs whose first page contains a DOI link and rename them to that DOI.

In [None]:
# Rename every PDF in `directory` whose first page contains a doi.org link to
# "<DOI with slashes replaced by dots>.pdf".
#
# The DOI itself is captured in group 1, so the scheme and host are dropped
# for both http:// and https:// links.
doi_pattern = re.compile(r'https?://doi\.org/(10\.[0-9]+/[^\s]+)')

for filename in tqdm(os.listdir(directory)):
    if not filename.endswith(".pdf"):
        continue
    filepath = os.path.join(directory, filename)

    # Only read while the file is open. The rename happens after the handle is
    # closed, because renaming a file that is still open fails on Windows.
    with open(filepath, 'rb') as pdf_file:
        pdf_reader = PdfReader(pdf_file)
        if len(pdf_reader.pages) > 0:
            text = pdf_reader.pages[0].extract_text() or ""
        else:
            # A PDF without pages has no first page to search.
            text = ""

    match = doi_pattern.search(text)
    if not match:
        tqdm.write(f"No DOI link found for: {filename}")
        continue

    # [^\s]+ also swallows sentence punctuation that directly follows the link
    # in running text; drop it so it does not end up in the file name.
    doi_identifier = match.group(1).rstrip('.,;:').replace('/', '.')
    new_filename = f"{doi_identifier}.pdf"
    if new_filename == filename:
        tqdm.write(f"Already named: {filename}")
        continue

    new_filepath = os.path.join(directory, new_filename)
    # os.rename can silently replace an existing file; never clobber a paper
    # that already carries this identifier.
    if os.path.exists(new_filepath):
        tqdm.write(f"Skipped (target exists): {filename} -> {new_filename}")
        continue

    os.rename(filepath, new_filepath)
    tqdm.write(f"Renamed: {filename} -> {new_filename}")