# Processing and transforming raw data

In [1]:
import os
import re
import sys
import json
import dotenv
import pandas as pd
from tqdm import tqdm

dotenv.load_dotenv()

True

## Extract ArXiv abstracts from local paper directory

In [None]:
import os
import re
import json
from tqdm import tqdm
from pypdf import PdfReader

In [None]:
arxiv_paper_directory = "" # your directory here

during clean up, i had the need to rename paper files with their arxiv id. to do this, i parse the first page and filter for arxiv id format

In [None]:
for filename in os.listdir(arxiv_paper_directory):
    if filename.endswith(".pdf"):
        filepath = os.path.join(arxiv_paper_directory, filename)
        with open(filepath, 'rb') as pdf_file:
            pdf_reader = PdfReader(pdf_file)
            first_page = pdf_reader.pages[0]
            text = first_page.extract_text()

            # Search for the arXiv ID pattern on the first page
            match = re.search(r'(\d{4}\.\d{5}v?\d?)', text)
            if match:
                arxiv_id = match.group(1)
                # Remove the "v" and digit from the arXiv ID
                arxiv_id = re.sub(r'v\d', '', arxiv_id)
                new_filename = f"{arxiv_id}.pdf"
                new_filepath = os.path.join(arxiv_paper_directory, new_filename)
                os.rename(filepath, new_filepath)
                print(f"Renamed: {filename} -> {new_filename}")
            else:
                print(f"No arXiv ID found for: {filename}")

fetch abstracts using ArXiv api and save to jsonl

In [None]:
import requests

output_filename = "~/arxiv_paper_abs.jsonl"
jsonl_filepath = os.path.expanduser(output_filename)

with open(jsonl_filepath, "w") as jsonl_file:
    for filename in tqdm(os.listdir(arxiv_paper_directory)):
        if filename.endswith(".pdf"):
            arxiv_id = filename.replace(".pdf", "")
            url = f"http://export.arxiv.org/api/query?id_list={arxiv_id}"
            response = requests.get(url)
            if response.status_code == 200:
                feed = response.content.decode("utf-8")
                start_index = feed.find("<summary>")
                end_index = feed.find("</summary>")
                if start_index != -1 and end_index != -1:
                    abstract = (
                        feed[start_index + 9 : end_index].strip().replace("\n", " ")
                    )
                    data = {"id": arxiv_id, "abstract": abstract}
                    jsonl_file.write(json.dumps(data) + "\n")

## Extract doi ids from local paper directory

This assumes that the file names contain valid doi ids.

In [None]:
doi_paper_directory = "" # put the directory path to the doi documents here

find and rename those with a doi id

In [None]:
for filename in tqdm(os.listdir(doi_paper_directory)):
    if filename.endswith(".pdf"):
        filepath = os.path.join(doi_paper_directory, filename)
        with open(filepath, 'rb') as pdf_file:
            pdf_reader = PdfReader(pdf_file)
            first_page = pdf_reader.pages[0]
            text = first_page.extract_text() if first_page else ""

            # Regex pattern for matching a DOI link
            doi_pattern = r'https?://doi\.org/10\.[0-9]+/[^\s]+'
            match = re.search(doi_pattern, text)
            if match:
                doi_link = match.group(0)
                # Extract the DOI identifier and replace slashes with dots, 
                # we do this so the saving path is not messed up
                doi_identifier = doi_link.replace('https://doi.org/', '').replace('/', '.')
                new_filename = f"{doi_identifier}.pdf"
                new_filepath = os.path.join(doi_paper_directory, new_filename)
                os.rename(filepath, new_filepath)
                print(f"Renamed: {filename} -> {new_filename}")
            else:
                print(f"No DOI link found for: {filename}")

extract the doi ids from filename again. this seems redundant, it was there because i first had to rename the files for my own sanity.

In [2]:
doi_ids = []
doi_pattern = re.compile(r"^\d{2}\.\d{4,}\..*?(?:doi:)?\.pdf$")

for filename in os.listdir(doi_paper_directory):
    if doi_pattern.match(filename):
        doi_id = filename[:-4]
        first_period_idx = doi_id.find(".")
        doi_id = doi_id[: first_period_idx + 1] + doi_id[
            first_period_idx + 1 :
        ].replace(".", "/", 1)
        doi_ids.append(doi_id)

In [3]:
print("Collected DOI IDs:")
print(len(doi_ids))
print(doi_ids)

Collected DOI IDs:
59
['10.1016/j.cell.2023.12.034', '10.1101/2024.01.02.573943', '10.1016/j.acha.2021.12.009', '10.1016/j.cell.2023.12.035', '10.1016/j.cell.2024.01.026', '10.1038/s41467-021-26529-9', '10.1016/j.cell.2023.12.037', '10.1093/gbe.evad084', '10.1038/s41467-024-46631-y', '10.1093/molbev.msx095', '10.1016/j.cell.2023.12.026', '10.1016/j.cell.2023.12.032', '10.1101/2023.04.30.538439', '10.1038/s41586-019-1923-7', '10.1101/2024.03.21.585615', '10.1038/s41586-023-06291-2', '10.1038/s41467-024-46715-9', '10.1101/2021.02.12.430858', '10.1038/s41586-019-1724-z', '10.1016/j.cell.2024.01.036', '10.1038/s41467-023-38539-w', '10.1126/science.abo7201', '10.1038/s41467-021-25756-4', '10.1038/s41564-023-01584-8', '10.7554/eLife.50524.001', '10.1126/science.aay8015', '10.1038/s42004-024-01098-2', '10.1101/2024.02.06.579080', '10.1016/j.bpj.2017.10.028', '10.1038/s41588-023-01649-8', '10.1101/2024.03.07.584001', '10.1145/3600006.3613165', '10.1016/j.cell.2023.04.032', '10.1038/s41593-023-

we can use crossref.org api to obtain metadata about each doi document. typically, we can look for the abstract info in the response's `message.abstract` field.

In [4]:
import requests
import os
import json
from bs4 import BeautifulSoup

elsevier_api_key = os.getenv("ELSEVIER_API_KEY")

save_dir = "/Users/yxz/tp/datasets/papers/abstracts"
os.makedirs(save_dir, exist_ok=True)

doi_without_abstract_list = []

with open(os.path.join(save_dir, "doi_abstracts.jsonl"), "w") as outfile:
    for doi in tqdm(doi_ids):
        url = f"https://api.crossref.org/works/{doi}"
        response = requests.get(url)
        abstract = None

        if response.status_code == 200:
            data = response.json()
            if "elsevier" in data["message"]["publisher"].lower():
                headers = {
                    "X-ELS-APIKey": elsevier_api_key,
                    "Accept": "application/json",
                }
                url = f"https://api.elsevier.com/content/article/doi/{doi}"
                response = requests.get(url, headers=headers)
                if response.status_code == 200:
                    data = response.json()
                    abstract = (
                        data.get("full-text-retrieval-response", {})
                        .get("coredata", {})
                        .get("dc:description")
                    )
            elif "abstract" in data["message"] and not abstract:
                abstract = data["message"]["abstract"]
        if abstract:
            # Elsevier returns a XML formatted abstract that contains both a title and paragraphs.
            # We only want the paragraphs, so we use BeautifulSoup to parse the XML and extract the paragraphs.
            soup = BeautifulSoup(abstract, "html.parser")
            paragraphs = soup.find_all("jats:p")
            # Join the text of all paragraphs
            abstract_text = " ".join(p.text for p in paragraphs)
            abstract_text = abstract_text.replace("\n", " ").strip()
            json.dump(
                {"doi_id": doi, "abstract": abstract_text}, outfile, ensure_ascii=False
            )
            outfile.write("\n")
        else:
            doi_without_abstract_list.append(doi)

print(f"DOI IDs without abstract: {doi_without_abstract_list}")

100%|██████████| 59/59 [00:38<00:00,  1.54it/s]

DOI IDs without abstract: ['10.1093/gbe.evad084', '10.1093/molbev.msx095', '10.1038/s41586-019-1923-7', '10.1038/s41586-019-1724-z', '10.1038/s41564-023-01584-8', '10.7554/eLife.50524.001', '10.1145/3600006.3613165', '10.1038/s41593-023-01304-9', '10.2307/2334029']





manually examine the doi ids that are not found in the crossref api and save the response to a json file

In [5]:
from pprint import pprint

id_to_fix = [
    "10.1093/gbe/evad084",
    "10.1093/molbev/msx095",
    "10.1038/s41586-019-1923-7",
    "10.1038/s41586-019-1724-z",
    "10.1038/s41564-023-01584-8",
    "10.7554/eLife.50524.001",
    "10.1145/3600006.3613165",
    "10.1038/s41593-023-01304-9",
    "10.2307/2334029",
]

for doi_test in id_to_fix:
    url_test = f"https://api.crossref.org/works/{doi_test}"
    response_test = requests.get(url_test)

    if response_test.status_code == 200:
        data_test = response_test.json()
        with open(f"response_{doi_test.replace('/', '.')}.json", "w") as f:
            json.dump(data_test, f, ensure_ascii=False, indent=4)
    else:
        print("Failed to fetch data for DOI:", doi_test)

i also check the already created `doi_abstracts.jsonl` file to see if there are empty abstracts (there are many)

In [7]:
test_2 = "10.1016/j.cell.2023.12.034"

url_test_2 = f"https://api.crossref.org/works/{test_2}"
response_test_2 = requests.get(url_test_2)

if response_test_2.status_code == 200:
        data_test_2 = response_test_2.json()
        with open(f"response2_{test_2.replace('/', '.')}.json", "w") as f:
            json.dump(data_test_2, f, ensure_ascii=False, indent=4)



In the end, i manually searched and added the abstracts for the 9 papers that the api failed, and there are about 17 more that the api fetched empty abstracts.