# Generating the Metadata and JSON

In the previous notebook we worked through converting our ground-truth PDFs into two formats: a TEI XML file and a Markdown file. The TEI XML provides the structure and important metadata such as titles, authors and references, while the Markdown provides the body text.

The goal of this notebook is to combine both of those files into a single JSON file.

# A Beautiful Soup

To turn the raw TEI XML that GROBID produced into clean, program-friendly metadata we’ll lean on Beautiful Soup, a lightweight Python library that makes scraped or machine-generated markup feel like a native data structure. Under the hood, Beautiful Soup parses HTML or XML into a tree of Python objects: tags become nodes, attributes turn into dictionary-like fields, and text sits exactly where you expect it. That means you can “walk” an XML document with intuitive commands such as `find("title")` or CSS-style selectors like `header.select("analytic > author > persName")`, rather than juggling fragile, low-level string operations or the more verbose standard-library `xml.etree`.

For our pipeline this is crucial: TEI files are verbose and deeply nested, but Beautiful Soup lets us pluck out just the bits we care about (titles, author names, abstracts, reference lists) and then immediately cast them to JSON for downstream chunking and LLM ingestion.

In short, it turns messy scholarly XML into tidy Python dicts with only a few lines of code, saving both compute time and human patience.

In [None]:
import json, re, glob, bs4, textwrap, hashlib
from pathlib import Path

import os, time, requests, shutil, json, re, glob, textwrap, hashlib
import xml.etree.ElementTree as ET          # still handy elsewhere
import bs4                                  # BeautifulSoup for quick XML grabs
from pathlib import Path
from google.colab import drive, files

# ────────────────────────────────────────────────────────────────
# 0.  Mount Drive & define folders
# ────────────────────────────────────────────────────────────────
# Mount Google Drive at /content/drive so the project folders are reachable.
drive.mount('/content/drive', force_remount=True)

# base_directory already ends with "/", so every sub-folder name is appended
# WITHOUT a leading slash (the former "/grobid_xml" produced ".../AI//grobid_xml").
base_directory = "/content/drive/MyDrive/Colab_notebooks/AI/"
PDF_DIR = base_directory + "arxiv_pdfs"        # input PDFs
OUT_XML = base_directory + "grobid_xml"        # GROBID TEI files
OUT_MD  = base_directory + "arxiv_markdowns2"  # Nougat markdowns

Mounted at /content/drive


In [None]:

def title_author_abstract(soup: bs4.BeautifulSoup) -> dict:
    """Return the paper's title, author list and abstract.

    Args:
        soup: Parsed TEI document.

    Returns:
        ``{"title": str, "authors": list[str], "abstract": str}``.
        A part that is absent from the document comes back as ``""``
        (``[]`` for authors) instead of raising.
    """
    # First <title> element of the document.  Guarded: a TEI file without any
    # <title> used to crash here with AttributeError on None.
    title_tag = soup.find("title")
    title = title_tag.get_text(" ").strip() if title_tag is not None else ""

    # Author names are the <persName> elements directly under
    # <analytic>/<author> inside the TEI header.
    header = soup.find("teiHeader")
    authors = []
    if header is not None:
        authors = [
            # split()/join collapses the newlines and indentation that
            # get_text(" ") picks up between the individual name parts.
            " ".join(pers.get_text(" ").split())
            for pers in header.select("analytic > author > persName")
        ]

    abstract_tag = soup.abstract
    abstract = abstract_tag.get_text(" ").strip() if abstract_tag is not None else ""

    return {"title": title, "authors": authors, "abstract": abstract}

def _clean_name(pers: bs4.Tag) -> str:
    """Assemble a display name from the name parts of a <persName> element.

    Every <forename> and <surname> found inside ``pers`` contributes its
    text, in the order the elements occur in the markup.  The pieces are
    joined with single spaces, whitespace runs are collapsed and the ends
    are trimmed.
    """
    name_tags = pers.find_all(["forename", "surname"])
    joined = " ".join(tag.get_text(" ") for tag in name_tags)
    return re.sub(r"\s+", " ", joined).strip()




In [None]:

def reference_list(soup: bs4.BeautifulSoup) -> list:
    """
    Parse the cited works' <biblStruct> elements into plain dicts.

    Returns a list of {id, title, authors, year, venue, doi}, one entry per
    <biblStruct>, in document order.  Missing string fields are "", authors
    is [] when none are found, and id is None when the element carries no
    id attribute.

    Handles the common GROBID patterns:
      · title may live under <analytic> or <monogr>
      · authors may live in either place (or both)
      · some records have DOI / ArXiv / volume info, others don't

    A <biblStruct> located inside <teiHeader> describes the paper itself,
    not a cited work, so it is skipped; previously it showed up as a bogus
    first "reference".
    """

    refs = []
    for bib in soup.find_all("biblStruct"):

        # 0‒ skip the paper's own bibliographic record in the header
        if bib.find_parent("teiHeader") is not None:
            continue

        # 1‒ id (xml:id or id attribute)
        r_id = bib.get("xml:id") or bib.get("id")

        # 2‒ main title  ─────────────────────────────────────────
        title_tag = None
        if bib.analytic:
            # prefer <title type="main"> if present
            title_tag = bib.analytic.find("title", {"type": "main"}) \
                        or bib.analytic.find("title")
        if not title_tag and bib.monogr:
            title_tag = bib.monogr.find("title", {"type": "main"}) \
                        or bib.monogr.find("title")

        r_title = title_tag.get_text(" ").strip() if title_tag else ""

        # 3‒ authors  ───────────────────────────────────────────
        # search <author><persName> in analytic first, then monogr
        author_nodes = []
        if bib.analytic:
            author_nodes = bib.analytic.find_all("author")
        if not author_nodes and bib.monogr:
            author_nodes = bib.monogr.find_all("author")

        r_authors = []
        for a in author_nodes:
            p = a.find("persName")
            if p:                       # authors without a <persName> are dropped
                r_authors.append(_clean_name(p))

        # 4‒ year  ──────────────────────────────────────────────
        date_tag = bib.find("date", {"type": "published"}) or bib.find("date")
        year = ""
        if date_tag:
            if date_tag.has_attr("when"):
                year = date_tag["when"][:4]          # "2005-07" → "2005"
            else:
                # no machine-readable date: take the first 4-digit run of the text
                year_match = re.search(r"\d{4}", date_tag.get_text() or "")
                year = year_match.group(0) if year_match else ""

        # 5‒ venue (monograph / journal / booktitle)  ───────────
        venue_tag = None
        if bib.monogr:
            # prefer a <title level="m">, else first monogr <title>
            venue_tag = bib.monogr.find("title", {"level": "m"}) \
                        or bib.monogr.find("title")
        r_venue = venue_tag.get_text(" ").strip() if venue_tag else ""

        # 6‒ DOI / other identifier  ────────────────────────────
        # case-insensitive match on the type attribute ("DOI", "doi", …)
        doi_tag = bib.find("idno", {"type": lambda v: v and v.lower() == "doi"})
        doi     = doi_tag.get_text().strip() if doi_tag else ""

        refs.append({
            "id"     : r_id,
            "title"  : r_title,
            "authors": r_authors,
            "year"   : year,
            "venue"  : r_venue,
            "doi"    : doi
        })

    return refs

In [None]:
import re, unicodedata
import bs4

# ------------------------------------------------------------------
# 1.  Normalise a heading so the TEI and markdown keys line up
# ------------------------------------------------------------------
def _norm(title: str) -> str:
    """Lower-case, strip accents & punctuation, collapse whitespace."""
    txt = unicodedata.normalize("NFKD", title)
    txt = "".join(ch for ch in txt if ch.isalnum() or ch.isspace())
    return re.sub(r"\s+", " ", txt).strip().lower()

# ------------------------------------------------------------------
# 2.  Slice the Nougat markdown into {norm_title: markdown_chunk}
# ------------------------------------------------------------------
def md_sections(md_text: str) -> dict[str, str]:
    """
    Split markdown into {normalised_heading: text_up_to_the_next_H1}.

    Only lines of the form ``# Heading`` (a single '#' followed by
    whitespace) open a new section.  Lines that appear before the first such
    heading are discarded, each section body is stripped, and a later
    section whose normalised heading equals an earlier one replaces it.
    """
    heading_re = re.compile(r"^#\s+(.+?)\s*$")
    result: dict[str, str] = {}
    open_title = None
    pending: list[str] = []

    for raw_line in md_text.splitlines():
        match = heading_re.match(raw_line)
        if match is None:
            # ordinary line: belongs to whatever section is currently open
            pending.append(raw_line)
            continue
        # a new heading closes the section collected so far
        if open_title:
            result[_norm(open_title)] = "\n".join(pending).strip()
        open_title, pending = match.group(1), []

    # the final section has no following heading to close it
    if open_title:
        result[_norm(open_title)] = "\n".join(pending).strip()
    return result

# ------------------------------------------------------------------
# 3.  Combine GROBID outline with Nougat prose
# ------------------------------------------------------------------
def combined_sections(tei_soup: bs4.BeautifulSoup, md_text: str):
    """
    Pair every top-level TEI body section with its Nougat markdown text.

    Returns a list of {'title': original_title, 'text': section_text}, one
    per direct <div> child of <body>, in document order; [] when the TEI has
    no <body>.  A section without a <head> gets the title "Untitled".

    The text is the markdown chunk whose normalised heading equals the
    normalised TEI heading.  When no chunk matches, or the matching chunk is
    empty, the whitespace-collapsed plain text of the <div> is used instead.

    NOTE: the <head> elements of the processed sections are removed from
    ``tei_soup`` in place.
    """
    md_map = md_sections(md_text)
    body = tei_soup.find("body")
    if body is None:
        return []

    out = []
    for div in body.find_all("div", recursive=False):
        head = div.find("head")
        raw_title = head.get_text(" ") if head is not None else "Untitled"
        key = _norm(raw_title)

        # remove headings so get_text() doesn't duplicate titles
        for h in div.select("head"):
            h.decompose()
        fallback_plain = " ".join(div.get_text(" ").split())

        # prefer Nougat; fall back to TEI when the chunk is missing OR empty
        # (an empty markdown chunk would otherwise hide usable TEI text)
        text = md_map.get(key) or fallback_plain
        out.append({"title": raw_title, "text": text})
    return out


In [None]:
# make an output folder once
JSON_DIR = "/content/drive/MyDrive/arxiv_json"
# parents=True creates any missing intermediate folder instead of raising
# FileNotFoundError; exist_ok=True keeps re-running this cell harmless.
Path(JSON_DIR).mkdir(parents=True, exist_ok=True)


def pipeline_and_writeout(tei_path: Path, json_dir: Path = Path(JSON_DIR)) -> None:
    """End-to-end: parse one TEI, gather fields, dump single JSON file.

    Args:
        tei_path: GROBID TEI file, e.g. ``<paper-id>.grobid.tei.xml``.
        json_dir: Folder that receives ``<paper-id>.json``.

    The matching Nougat markdown is looked up as ``OUT_MD/<paper-id>.mmd``.
    When it is missing a warning is printed and an empty markdown string is
    used, so every section text comes from the TEI body instead.
    """
    # Paper id = file name minus ".xml" and the ".tei" / ".grobid" infixes.
    # It names both the markdown to look up and the JSON to write.
    # (Previously `pid` was never defined, so the function raised NameError.)
    pid = tei_path.stem.replace('.tei', '').replace('.grobid', '')

    # Read as UTF-8 explicitly, matching the encoding used for the output.
    soup = bs4.BeautifulSoup(tei_path.read_text(encoding="utf-8"), "xml")

    header_info = title_author_abstract(soup)
    refs        = reference_list(soup)
    # Nougat markdown
    md_file = Path(OUT_MD) / f"{pid}.mmd"
    if md_file.exists():
        md_text = md_file.read_text(encoding="utf-8")
    else:
        print("⚠️ Nougat missing for", pid)
        md_text = ""
    # Called last: combined_sections strips <head> elements from the soup.
    sections = combined_sections(soup, md_text)

    data = {
        **header_info,
        "references": refs,
        "sections"  : sections,
    }

    # Honour the json_dir argument; the global JSON_DIR is a plain str and
    # cannot be combined with "/" (that raised TypeError before).
    out_path = Path(json_dir) / f"{pid}.json"
    with out_path.open("w", encoding="utf-8") as fh:
        json.dump(data, fh, ensure_ascii=False, indent=2)
    print(f"✅ wrote {out_path}")

# ────────────────────────────────────────────────────────────────
# 2.  Run the pipeline on every TEI file
# ────────────────────────────────────────────────────────────────
# Walk the GROBID output folder in name order and convert every .xml file;
# anything else (e.g. .txt files) is skipped.
for filename in sorted(os.listdir(OUT_XML)):
    if not filename.endswith(".xml"):
        continue
    full_path = Path(OUT_XML) / filename
    pipeline_and_writeout(full_path)

print("🎉  All papers processed!")

🎉  All papers processed!


In [None]:
# Print the full path of every regular file in the GROBID output folder.
# (A stray "x" before "for" made this cell a SyntaxError.)
for filename in sorted(os.listdir(OUT_XML)):      # alphabetic order
    full_path = os.path.join(OUT_XML, filename)   # prepend the folder path
    if os.path.isfile(full_path):                 # skip sub-folders if any
        print(full_path)                          # or print(filename) for names only

/content/drive/MyDrive/grobid_xml/1911.09661v1.grobid.tei.xml
/content/drive/MyDrive/grobid_xml/2102.02503v1.grobid.tei.xml
/content/drive/MyDrive/grobid_xml/2112.02969v1.grobid.tei.xml
/content/drive/MyDrive/grobid_xml/2201.11903v6.grobid.tei.xml
/content/drive/MyDrive/grobid_xml/2207.08982v1.grobid.tei.xml
/content/drive/MyDrive/grobid_xml/2210.10723v2.grobid.tei.xml
/content/drive/MyDrive/grobid_xml/2210.11630v1.grobid.tei.xml
/content/drive/MyDrive/grobid_xml/2211.02069v2.grobid.tei.xml
/content/drive/MyDrive/grobid_xml/2211.04715v1.grobid.tei.xml
/content/drive/MyDrive/grobid_xml/2212.09196v3.grobid.tei.xml
/content/drive/MyDrive/grobid_xml/2212.09271v2.grobid.tei.xml
/content/drive/MyDrive/grobid_xml/2212.09420v2.grobid.tei.xml
/content/drive/MyDrive/grobid_xml/2301.13820v1.grobid.tei.xml
/content/drive/MyDrive/grobid_xml/2302.00093v3.grobid.tei.xml
/content/drive/MyDrive/grobid_xml/2302.03491v1.grobid.tei.xml
/content/drive/MyDrive/grobid_xml/2302.11957v1.grobid.tei.xml
/content