In [13]:
!pip install PyPDF2 pymupdf pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer_six-20260107-py3-none-any.whl (6.6 MB)
     ---------------------------------------- 0.0/6.6 MB ? eta -:--:--
      --------------------------------------- 0.1/6.6 MB 2.4 MB/s eta 0:00:03
     -- ------------------------------------- 0.5/6.6 MB 5.1 MB/s eta 0:00:02
     --------- ------------------------------ 1.6/6.6 MB 11.1 MB/s eta 0:00:01
     -------------------- ------------------- 3.3/6.6 MB 19.5 MB/s eta 0:00:01
     ------------------------ --------------- 4.1/6.6 MB 17.5 MB/s eta 0:00:01
     ---------------------------------------  6.6/6.6 MB 23.4 MB/s eta 0:00:01
     ---------------------------------------- 6.6/6.6 MB 21.1 MB/s eta 0:00:00
Collecting cryptography>=36.0.0
  Downloading cryptography-46.0.4-cp38-abi3-win_amd64.whl (3.5 MB)
     ---------------------------------------- 0.0/3.5 MB ? eta -:--:--
     ----------------- ---------------------- 1.5/3.5 MB 94.1 MB/s eta 0:00:01
     ----------------- --------------------


[notice] A new release of pip is available: 23.0.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
from pathlib import Path
from PyPDF2 import PdfReader, PdfWriter
from pdfminer.high_level import extract_text
import pymupdf
import string
import time

In [2]:

path = Path("data/")
files = [file.name for file in path.iterdir() if file.suffix == ".pdf"]

In [3]:
print(files)

['Changing Data Sources in the Age of Machine Learning for Official Statistics.pdf', 'Physics-Inspired Interpretability Of Machine Learning Models.pdf']


## Using PyPDF2

In [4]:
reader = PdfReader(f"data/{files[0]}")
print(len(reader.pages))

8


In [5]:
def extract_text_from_pdf_PyPDF2(file_path):
    doc = pymupdf.open(file_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

In [6]:
start_time = time.time()
text_PyPDF2 = extract_text_from_pdf_PyPDF2(f"data/{files[0]}")
finish_time = time.time()

In [7]:
print(f"time PyPDF2: {finish_time - start_time} seconds")
print("len text PyPDF2:", len(text_PyPDF2))

time PyPDF2: 0.13853788375854492 seconds
len text PyPDF2: 37624


## Using pymupdf

In [8]:
def extract_text_from_pdf_pymupdf(file_path):
    doc = pymupdf.open(file_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

In [9]:
start_time = time.time()
text_pymupdf = extract_text_from_pdf_pymupdf(f"data/{files[0]}")
finish_time = time.time()

In [10]:
print(f"time pymupdf: {finish_time - start_time} seconds")
print("len text pymupdf:", len(text_pymupdf))

time pymupdf: 0.06223297119140625 seconds
len text pymupdf: 37624


## Using pdfminer

In [11]:
def extract_text_from_pdf_pdfminer(file_path):
    text = extract_text(file_path)
    return text

In [12]:
start_time = time.time()
text_pdfminer = extract_text_from_pdf_pdfminer(f"data/{files[0]}")
finish_time = time.time()


In [13]:
print(f"time pdfminer: {finish_time - start_time} seconds")
print("len text pdfminer:", len(text_pdfminer))

time pdfminer: 2.3055579662323 seconds
len text pdfminer: 37807


## Compare alphabet ratio

In [14]:
def alphabetic_ratio(text):
    if not text:
        return 0.0

    alpha_count = sum(c.isalpha() for c in text)
    total_count = len(text)

    return alpha_count / total_count


In [15]:
alphabetic_ratio_PyPDF2 = alphabetic_ratio(text_PyPDF2)

In [16]:
alphabetic_ratio_pymupdf = alphabetic_ratio(text_pymupdf)

In [17]:
alphabetic_ratio_pdfminer = alphabetic_ratio(text_pdfminer)

In [18]:
print(f"Alphabetic ratio PyPDF2: {alphabetic_ratio_PyPDF2:.4f}")
print(f"Alphabetic ratio pymupdf: {alphabetic_ratio_pymupdf:.4f}")
print(f"Alphabetic ratio pdfminer: {alphabetic_ratio_pdfminer:.4f}")

Alphabetic ratio PyPDF2: 0.8109
Alphabetic ratio pymupdf: 0.8109
Alphabetic ratio pdfminer: 0.8069


## Clean text

In [19]:
def clean_text(text):
    #Remove punctuation and newlines
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator).replace('\n', ' ')


In [20]:
cleaned_text_PyPDF2 = clean_text(text_PyPDF2)
cleaned_text_pymupdf = clean_text(text_pymupdf)
cleaned_text_pdfminer = clean_text(text_pdfminer)

In [21]:
print(f"len cleaned text PyPDF2: {len(cleaned_text_PyPDF2)}")
print(f"len cleaned text pymupdf: {len(cleaned_text_pymupdf)}")
print(f"len cleaned text pdfminer: {len(cleaned_text_pdfminer)}")

len cleaned text PyPDF2: 36611
len cleaned text pymupdf: 36611
len cleaned text pdfminer: 36794


In [22]:
print(f"alphabetic ratio cleaned PyPDF2: {alphabetic_ratio(cleaned_text_PyPDF2):.4f}")
print(f"alphabetic ratio cleaned pymupdf: {alphabetic_ratio(cleaned_text_pymupdf):.4f}")
print(f"alphabetic ratio cleaned pdfminer: {alphabetic_ratio(cleaned_text_pdfminer):.4f}")

alphabetic ratio cleaned PyPDF2: 0.8333
alphabetic ratio cleaned pymupdf: 0.8333
alphabetic ratio cleaned pdfminer: 0.8292


## Save best result 

In [24]:
with open("data/output_PyPDF2.txt", "w", encoding="utf8") as f:
    f.write(cleaned_text_PyPDF2)