# GrobidClient Interactive Playground

This notebook demonstrates how to use `grobid-client-python` to process a PDF and extract its fulltext XML output.

In [5]:
# Import Required Libraries
import sys
from grobid_client.grobid_client import GrobidClient

In [9]:
# Initialize GrobidClient
client = GrobidClient(config_path="/home/jakub/Projects/paper_testing/config.json", grobid_server="http://localhost:8070")  # use Docker container URL

GROBID server is up and running


In [13]:
# Process PDF to Extract Fulltext
pdf_path = "/home/jakub/Projects/paper_testing/documents_to_test/locust"  # Change this to your PDF file
output_dir = "output"  # Output directory for results

result = client.process("processFulltextDocument", pdf_path, n=20, consolidate_citations=False, tei_coordinates=True, consolidate_header=False, include_raw_affiliations=False, include_raw_citations=False, verbose=True, force=True, output=output_dir)

modus_oa.pdf
max_oa.pdf
min_oa.pdf
3 files to process in current batch
Adding /home/jakub/Projects/paper_testing/documents_to_test/locust/modus_oa.pdf to the queue.
Adding /home/jakub/Projects/paper_testing/documents_to_test/locust/max_oa.pdf to the queue.
Adding /home/jakub/Projects/paper_testing/documents_to_test/locust/min_oa.pdf to the queue.


In [3]:
# Go through all the papers using PyMuPDF and flag papers that have "Corrigendum" as a keyword
import fitz
import os
from pathlib import Path

def check_for_corrigendum(pdf_path):
    """
    Check if a PDF contains the word "Corrigendum" in its text.
    
    Args:
        pdf_path (str): Path to the PDF file
        
    Returns:
        bool: True if "Corrigendum" is found, False otherwise
    """
    try:
        doc = fitz.open(pdf_path)
        for page in doc:
            text = page.get_text()
            if "Corrigendum" in text:
                doc.close()
                return True
        doc.close()
        return False
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return False

def scan_directory_for_corrigendum(directory_path, recursive=True):
    """
    Scan a directory for PDF files and check each for "Corrigendum".
    
    Args:
        directory_path (str): Path to the directory containing PDFs
        recursive (bool): Whether to search subdirectories recursively
        
    Returns:
        tuple: (corrigendum_papers, all_papers) - lists of file paths
    """
    directory = Path(directory_path)
    corrigendum_papers = []
    all_papers = []
    
    # Get PDF files
    if recursive:
        pdf_files = directory.rglob("*.pdf")
    else:
        pdf_files = directory.glob("*.pdf")
    
    for pdf_file in pdf_files:
        pdf_path = str(pdf_file)
        all_papers.append(pdf_path)
        
        print(f"Checking: {pdf_file.name}")
        
        if check_for_corrigendum(pdf_path):
            corrigendum_papers.append(pdf_path)
            print(f"  ✓ CORRIGENDUM FOUND in {pdf_file.name}")
    
    return corrigendum_papers, all_papers

# Set your directory path here
pdf_directory = '/home/jakub/Projects/paper_testing/documents_to_test/psych_science_pdf_oa'

print(f"\nScanning directory: {pdf_directory}")
print("=" * 50)

corrigendum_papers, all_papers = scan_directory_for_corrigendum(pdf_directory)

print("\n" + "=" * 50)
print("SCAN COMPLETE")
print("=" * 50)

print(f"\nTotal papers processed: {len(all_papers)}")
print(f"Papers with 'Corrigendum': {len(corrigendum_papers)}")

if corrigendum_papers:
    print("\nPapers flagged with 'Corrigendum':")
    for i, paper in enumerate(corrigendum_papers, 1):
        print(f"{i}. {Path(paper).name}")
        print(f"   Full path: {paper}")
else:
    print("\nNo papers with 'Corrigendum' found.")



Scanning directory: /home/jakub/Projects/paper_testing/documents_to_test/psych_science_pdf_oa
Checking: 09567976231194590.pdf
Checking: 0956797619881134.pdf
Checking: 09567976231180588.pdf
Checking: 09567976231164553.pdf
Checking: 09567976241242105.pdf
Checking: 0956797619866627.pdf
Checking: 09567976221147258.pdf
Checking: 0956797617693326.pdf
Checking: 09567976221145316.pdf
Checking: 0956797617702699.pdf
Checking: 09567976211001317.pdf
Checking: 09567976231194221.pdf
Checking: 09567976231196145.pdf
Checking: 09567976231218640.pdf
Checking: 09567976211005767.pdf
Checking: 0956797621997350.pdf
Checking: 09567976221140341.pdf
Checking: 09567976221150616.pdf
Checking: 0956797620985832.pdf
Checking: 09567976231173900.pdf
Checking: 09567976211043428.pdf
Checking: 09567976231170878.pdf
Checking: 0956797617716922.pdf
Checking: 0956797614560771.pdf
Checking: 0956797618773095.pdf
Checking: 0956797621995197.pdf
Checking: 0956797615583071.pdf
Checking: 0956797620916782.pdf
Checking: 09567976221

In [3]:
# Find the PDF(s) with the mode number of pages in a directory
from collections import Counter

def get_pdf_page_counts(directory_path, recursive=True):
    directory = Path(directory_path)
    if recursive:
        pdf_files = directory.rglob("*.pdf")
    else:
        pdf_files = directory.glob("*.pdf")
    page_counts = []
    pdf_info = []
    for pdf_file in pdf_files:
        try:
            doc = fitz.open(str(pdf_file))
            num_pages = doc.page_count
            page_counts.append(num_pages)
            pdf_info.append((pdf_file.name, num_pages, str(pdf_file)))
            doc.close()
        except Exception as e:
            print(f"Error reading {pdf_file}: {e}")
    return page_counts, pdf_info

# Scan the directory
page_counts, pdf_info = get_pdf_page_counts(pdf_directory)

# Find the mode page count
if page_counts:
    mode_count = Counter(page_counts).most_common(1)[0][0]
    print(f"Mode page count: {mode_count}")
    print("PDF(s) with this page count:")
    for name, num_pages, full_path in pdf_info:
        if num_pages == mode_count:
            print(f"- {name} ({num_pages} pages) | {full_path}")
            break
    min_count = min(page_counts)
    print(f"Minimum page count: {min_count}")
    print("PDF(s) with this page count:")
    for name, num_pages, full_path in pdf_info:
        if num_pages == min_count:
            print(f"- {name} ({num_pages} pages) | {full_path}")
            break
    max_count = max(page_counts)
    print(f"Maximum page count: {max_count}")
    print("PDF(s) with this page count:")
    for name, num_pages, full_path in pdf_info:
        if num_pages == max_count:
            print(f"- {name} ({num_pages} pages) | {full_path}")
            break
        
else:
    print("No PDFs found or could not read page counts.")

Mode page count: 13
PDF(s) with this page count:
- 09567976241242105.pdf (13 pages) | /home/jakub/Projects/paper_testing/documents_to_test/psych_science_pdf_oa/09567976241242105.pdf
Minimum page count: 1
PDF(s) with this page count:
- 09567976231217508.pdf (1 pages) | /home/jakub/Projects/paper_testing/documents_to_test/psych_science_pdf_oa/09567976231217508.pdf
Maximum page count: 27
PDF(s) with this page count:
- 09567976221116892.pdf (27 pages) | /home/jakub/Projects/paper_testing/documents_to_test/psych_science_pdf_oa/09567976221116892.pdf
