In [1]:
%pip install PyMuPDF --quiet

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import fitz

In [3]:
pdf_path = "raw_data\document_to_anonymize.pdf"

In [4]:
def extract_text_from_pdf(path: str) -> str:
    """
    Extracts text content from all pages of a PDF file while preserving paragraph structure through blocks.

    Parameters:
        path (str): The file path to the PDF document.

    Returns:
        str: The extracted text from the entire PDF with paragraphs.
    """
    text = ""
    with fitz.open(path) as doc:
        for page in doc:
            blocks = page.get_text("blocks")
            blocks = sorted(blocks, key=lambda b: b[1])
            for block in blocks:
                text += block[4] + "\n\n"
    return text.strip()

In [5]:
import unicodedata
import re

def remove_all_special_characters(text: str) -> str:
    """
    Normalizes and cleans a text string by removing accents, punctuation, and special characters.

    Steps:
        1. Converts accented characters to their ASCII equivalents.
        2. Removes all characters except letters, numbers, and spaces.
        3. Collapses multiple spaces into a single space except for newlines.
        4. Strips leading and trailing whitespace.

    Parameters:
        text (str): The input string to be cleaned.

    Returns:
        str: The cleaned and normalized string.
    """
    text = unicodedata.normalize("NFD", text)
    text = text.encode("ascii", "ignore").decode("utf-8")

    text = re.sub(r"[^\w\s\n]", "", text)
    text = re.sub(r"[ \t]+", " ", text)

    return text.strip()

In [6]:
raw_text = extract_text_from_pdf(pdf_path)
raw_text_cleaned = remove_all_special_characters(raw_text)

FileNotFoundError: no such file: 'raw_data\document_to_anonymize.pdf'

In [None]:
print("\n--- Preview of Extracted Text ---\n")
print(raw_text_cleaned)


--- Preview of Extracted Text ---

Relatorio de Admissao Centro Medico Lisboa 


Data 15 de abril de 2025 
Referencia ADM20250415089 


Informacoes do Paciente 


Nome Maria Conceicao Oliveira Santos 


Data de Nascimento 12031978 


Sexo Feminino 


NIF 097865413 


Cartao de Cidadao 123456789ZX0 
Morada Rua das Flores 123 Apt 45 Sacavem Lisboa 
Telefone 351 912 345 678 
Email mariasantosemailpessoalpt 
Numero da Seguranca Social 11223344556 


Historico Medico 


A paciente Maria Santos mulher caucasiana de 47 anos compareceu a consulta relatando dores 
abdominais intensas Tem historico de hipertensao e diabetes tipo 2 diagnosticada ha 5 anos E 
HIV positivo desde 2018 atualmente com carga viral indetectavel gracas ao tratamento com 
antirretrovirais 


A paciente relatou que sua familia tem historico de cancro da mama mae falecida aos 52 anos 
e doenca cardiaca pai e avo paterno Exames geneticos realizados em 2022 indicaram 
predisposicao ao cancro de mama mutacao BRCA1 positiva 



In [None]:
txt_path = pdf_path.replace(".pdf", ".txt")
with open(txt_path, "w", encoding="utf-8") as f:
    f.write(raw_text_cleaned)

print(f"✅ Text extracted and saved to: {txt_path}")

✅ Text extracted and saved to: raw_data\document_to_anonymize.txt
