In [2]:
!pip install PyPDF2
import os
import re
import pandas as pd
from PyPDF2 import PdfReader

# Input directories: PDF reports and the per-breast PNG images
pdf_dir = "/content/drive/MyDrive/imgCeonc/train/reports"
img_dir = "/content/drive/MyDrive/imgCeonc/train/images"

# Accumulates the output rows: one dict per (report, breast side) pair
data = []

# Helper functions
def extract_birads(text, side):
    """Return the category digit reported after a ``MAMA <side>`` heading.

    The search is case-insensitive. Starting at ``MAMA <side>``, it skips
    lazily (across line breaks) to the next ``Categoria`` followed by
    whitespace and captures the single digit that comes after it.

    Args:
        text: Report text to search.
        side: Text expected after ``MAMA `` in the heading. It is inserted
            into the regular expression as-is, without escaping.

    Returns:
        The captured digit as an ``int``, or ``None`` if nothing matches.
    """
    category_re = re.compile(
        rf"MAMA {side}.*?Categoria\s+(\d)",
        re.DOTALL | re.IGNORECASE,
    )
    found = category_re.search(text)
    if found is None:
        return None
    return int(found.group(1))

def extract_feature(pattern, text):
    """Return the first capture group of ``pattern`` found in ``text``.

    The search is case-insensitive and uses no other flags, so ``.`` does
    not match a newline unless the pattern itself enables that.

    Args:
        pattern: Regular expression containing at least one capture group.
        text: Text to search.

    Returns:
        Group 1 of the first match with surrounding whitespace stripped,
        or ``None`` if the pattern does not match.
    """
    found = re.search(pattern, text, flags=re.IGNORECASE)
    if not found:
        return None
    return found.group(1).strip()

# Process every PDF report in pdf_dir and append one row to `data` for each
# breast side that has a matching image in img_dir.
# Sorting makes the row order reproducible (os.listdir order is arbitrary).
for pdf_file in sorted(os.listdir(pdf_dir)):
    # splitext removes only the final extension; str.replace(".pdf", "")
    # would also delete ".pdf" occurring in the middle of a file name.
    base_name, extension = os.path.splitext(pdf_file)
    if extension.lower() != ".pdf":
        continue

    pdf_path = os.path.join(pdf_dir, pdf_file)

    try:
        reader = PdfReader(pdf_path)
        # Run text extraction once per page and drop pages that yield no text.
        page_texts = (page.extract_text() for page in reader.pages)
        text = "\n".join(page_text for page_text in page_texts if page_text)

        # Image suffix "_1" pairs with the right breast, "_2" with the left.
        for side_num, side_label in (("1", "DIREITA"), ("2", "ESQUERDA")):
            image_filename = f"{base_name}_{side_num}.png"
            image_path = os.path.join(img_dir, image_filename)

            # Only sides that have an image on disk produce a row.
            if not os.path.exists(image_path):
                continue

            birads = extract_birads(text, side_label)
            # "Pele" is expected on the line right after the heading.
            pele = extract_feature(rf"MAMA {side_label}\nPele\s+([^\n]+)", text)
            # extract_feature searches without DOTALL, so the lazy gap needs
            # the inline (?s) flag to skip the lines between the heading and
            # the field; without it these two patterns could only match when
            # the field sat on the line immediately after the heading.
            # NOTE(review): as in extract_birads, the gap is not bounded by
            # the next "MAMA ..." heading, so a field missing from one side
            # can be picked up from later text — confirm against the reports.
            mama = extract_feature(
                rf"(?s)MAMA {side_label}.*?\nMama\s+([^\n]+)", text
            )
            linfonodos = extract_feature(
                rf"(?s)MAMA {side_label}.*?\nLinfonodos\s+axilares\s+([^\n]+)",
                text,
            )
            # NOTE(review): this search is not tied to side_label, so both
            # sides of a report receive the first "Nódulo:" line found.
            achado = extract_feature(r"Nódulo: (.+)", text)

            data.append({
                "patient_id": base_name,
                "image_file": image_filename,
                "laterality": "R" if side_label == "DIREITA" else "L",
                "birads": birads,
                "pele": pele,
                "mama": mama,
                "linfonodos": linfonodos,
                "achado": achado,
            })

    except Exception as e:
        # Best effort: report the failing file and continue with the rest.
        print(f"Erro ao processar {pdf_file}: {e}")

# Export the collected rows to a CSV file, omitting the DataFrame index
df = pd.DataFrame(data)
df.to_csv("mamografia_estruturado.csv", index=False)
print("CSV gerado com sucesso!")


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
CSV gerado com sucesso!
