In [1]:
!pip install PyPDF2 fpdf transformers spacy presidio-analyzer presidio-anonymizer
!python -m spacy download fr_core_news_lg
!pip install reportlab


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting presidio-analyzer
  Downloading presidio_analyzer-2.2.355-py3-none-any.whl.metadata (2.9 kB)
Collecting presidio-anonymizer
  Downloading presidio_anonymizer-2.2.355-py3-none-any.whl.metadata (8.2 kB)
Collecting phonenumbers<9.0.0,>=8.12 (from presidio-analyzer)
  Downloading phonenumbers-8.13.51-py2.py3-none-any.whl.metadata (10 kB)
Collecting tldextract (from presidio-analyzer)
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting azure-core (from presidio-anonymizer)
  Downloading azure_core-1.32.0-py3-none-any.whl.metadata (39 kB)
Collecting pycryptodome>=3.10.1 (from presidio-anonymizer)
  Downloading pycryptodome-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting requests-file>=1.4 (from tldextract->presidio-analyzer)
  

In [11]:
import os
from PyPDF2 import PdfReader
from fpdf import FPDF
from transformers import pipeline
from presidio_analyzer import AnalyzerEngine, RecognizerResult, EntityRecognizer
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider

# Presidio recognizer backed by a Hugging Face Transformers NER model
class TransformerRecognizer(EntityRecognizer):
    """Presidio ``EntityRecognizer`` that delegates detection to a token-classification pipeline.

    Args:
        model_id: Model identifier handed to ``transformers.pipeline``.
        mapping_labels: Dict mapping the model's ``entity_group`` labels to the
            Presidio entity names that should be reported. Labels missing from
            the mapping are ignored.
        aggregation_strategy: Forwarded unchanged to the pipeline.
        supported_language: Language code the recognizer is registered for
            (defaults to ``"fr"``, the value that used to be hard-coded).
    """

    def __init__(self, model_id, mapping_labels, aggregation_strategy="simple", supported_language="fr"):
        super().__init__(
            # dict.fromkeys de-duplicates while preserving order, in case several
            # model labels map to the same Presidio entity.
            supported_entities=list(dict.fromkeys(mapping_labels.values())),
            supported_language=supported_language,
        )
        self.pipeline = pipeline(
            "token-classification",
            model=model_id,
            aggregation_strategy=aggregation_strategy,
            ignore_labels=["O"]
        )
        self.label2presidio = mapping_labels

    def load(self):
        """Nothing to load lazily: the pipeline is built eagerly in ``__init__``."""
        # NOTE(review): EntityRecognizer declares load() as a hook for subclasses;
        # an explicit no-op keeps the subclass complete.
        pass

    def analyze(self, text, entities=None, nlp_artifacts=None):
        """Run the NER pipeline on ``text`` and convert its predictions.

        Args:
            text: Text to analyse.
            entities: Optional collection of Presidio entity names to keep;
                ``None`` keeps every mapped entity.
            nlp_artifacts: Accepted for interface compatibility; not used.

        Returns:
            A list of ``RecognizerResult`` (empty when ``text`` is empty).
        """
        if not text:
            return []

        results = []
        for prediction in self.pipeline(text):
            presidio_entity = self.label2presidio.get(prediction["entity_group"])
            if presidio_entity is None:
                continue  # label not mapped to a Presidio entity
            if entities is not None and presidio_entity not in entities:
                continue  # caller did not ask for this entity type
            results.append(
                RecognizerResult(
                    entity_type=presidio_entity,
                    start=prediction["start"],
                    end=prediction["end"],
                    # Cast to a built-in float so results stay JSON-serialisable
                    # (the pipeline typically reports a NumPy scalar here).
                    score=float(prediction["score"]),
                )
            )
        return results

# NER pipeline and Presidio configuration

# Model label -> Presidio entity name.
mapping_labels = {
    "PER": "PERSON",
    "LOC": "LOCATION",
    "ORG": "ORGANIZATION",
    "MISC": "MISC",
}
transformers_recognizer = TransformerRecognizer(
    "Jean-Baptiste/camembert-ner",
    mapping_labels,
)

# spaCy-based NLP engine for French, handed to the Presidio analyzer.
configuration = {
    "nlp_engine_name": "spacy",
    "models": [
        {"lang_code": "fr", "model_name": "fr_core_news_lg"},
    ],
}
provider = NlpEngineProvider(nlp_configuration=configuration)
nlp_engine = provider.create_engine()

# The analyzer gets the transformer recognizer added to its registry.
analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["fr"])
analyzer.registry.add_recognizer(transformers_recognizer)

anonymizer = AnonymizerEngine()

# Extract the text of a PDF file
def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``.

    Page texts are joined with a newline so the last word of one page does not
    fuse with the first word of the next. A page that yields no text (``None``
    or an empty string) contributes an empty line instead of raising.

    Args:
        file_path: Path (or file-like object) accepted by ``PdfReader``.

    Returns:
        The concatenated page texts as a single string.
    """
    reader = PdfReader(file_path)
    # `or ""` guards against a page whose extraction result is None.
    return "\n".join(page.extract_text() or "" for page in reader.pages)

# Anonymize a piece of text
def anonymize_text(text):
    """Detect entities in ``text`` and replace them using the module-level engines.

    The module-level ``analyzer`` is queried for French (``"fr"``) with no
    entity filter, and its results are passed to the module-level ``anonymizer``.

    Args:
        text: The text to anonymize.

    Returns:
        A ``(anonymized_text, items)`` tuple taken from the anonymizer result's
        ``text`` and ``items`` attributes.
    """
    detections = analyzer.analyze(language="fr", entities=None, text=text)
    outcome = anonymizer.anonymize(analyzer_results=detections, text=text)
    return (outcome.text, outcome.items)

# Highlight the anonymized spans for terminal display
def colorize_text(original_text, entities):
    """Wrap each entity span of ``original_text`` in ANSI bright-yellow codes.

    Spans are processed from the highest ``start`` to the lowest, so inserting
    the escape codes never shifts the offsets of spans still to be handled.

    Args:
        original_text: Text whose spans should be highlighted.
        entities: Iterable of objects exposing integer ``start`` and ``end``
            offsets into ``original_text``.

    Returns:
        The text with ``\\033[93m`` / ``\\033[0m`` around every span.
    """
    highlighted = original_text
    right_to_left = sorted(entities, key=lambda span: span.start, reverse=True)
    for span in right_to_left:
        before = highlighted[:span.start]
        marked = highlighted[span.start:span.end]
        after = highlighted[span.end:]
        highlighted = "".join((before, "\033[93m", marked, "\033[0m", after))
    return highlighted

# Fonction pour créer un PDF anonymisé coloré
from reportlab.pdfgen import canvas

def save_to_pdf(text, output_path):
    """Write ``text`` to a PDF at ``output_path``, one drawn string per input line.

    Lines are drawn in 12 pt Helvetica from 40 pt below the top of an A4-height
    page, 15 pt apart. Once the cursor drops below the 40 pt bottom margin a new
    page is started. Lines are drawn as-is: there is no wrapping.

    Args:
        text: Text to render; split on ``"\\n"``.
        output_path: Destination passed to ``canvas.Canvas``.
    """
    _page_width, page_height = 595.27, 841.89  # A4 size in points
    margin = 40
    line_step = 15
    top_of_page = page_height - margin

    pdf = canvas.Canvas(output_path)
    pdf.setFont("Helvetica", 12)

    cursor_y = top_of_page
    for row in text.split("\n"):
        pdf.drawString(margin, cursor_y, row)
        cursor_y -= line_step
        if cursor_y >= margin:
            continue
        # Ran past the bottom margin: start a fresh page and set the font again.
        pdf.showPage()
        pdf.setFont("Helvetica", 12)
        cursor_y = top_of_page

    pdf.save()


# Batch-process every PDF in a folder
def process_pdfs(input_folder, output_folder):
    """Anonymize each PDF found directly inside ``input_folder``.

    For every regular file whose name ends in ``.pdf`` (case-insensitive), the
    text is extracted, anonymized, echoed with terminal highlighting, and saved
    as ``anonymized_<name>`` in ``output_folder``. Files are handled in sorted
    name order so runs are reproducible.

    Args:
        input_folder: Directory to scan (not recursive).
        output_folder: Directory receiving the anonymized PDFs; created if it
            does not exist.

    Raises:
        FileNotFoundError: If ``input_folder`` does not exist (from ``os.listdir``).
    """
    os.makedirs(output_folder, exist_ok=True)

    for file_name in sorted(os.listdir(input_folder)):
        input_path = os.path.join(input_folder, file_name)
        # Accept ".PDF" as well as ".pdf", and ignore directories.
        if not file_name.lower().endswith(".pdf") or not os.path.isfile(input_path):
            continue
        output_path = os.path.join(output_folder, f"anonymized_{file_name}")

        print(f"Processing {file_name}...")
        text = extract_text_from_pdf(input_path)
        anonymized_text, entities = anonymize_text(text)
        # The ANSI-coloured version is only meaningful in a terminal/notebook.
        print(colorize_text(anonymized_text, entities))
        # Save the plain anonymized text: ANSI escape sequences are not
        # printable PDF content and would show up as garbage in the document.
        save_to_pdf(anonymized_text, output_path)
        print(f"Saved anonymized file to {output_path}")

# Input and output locations, relative to the current working directory
input_folder, output_folder = "input_pdfs", "output_pdfs"

# Make sure the destination exists before anything is written to it
os.makedirs(output_folder, exist_ok=True)

# Run the batch
process_pdfs(input_folder, output_folder)





Processing test-pdf_1.pdf...




The[93m<MISC>[0m ([93m<MISC>[0m):
A[93m<MISC>[0m for[93m<MISC>[0m[93m<LOCATION>[0m[93m<PERSON>[0m[93m<ORGANIZATION>[0m[93m<LOCATION>[0m,[93m<LOCATION>[0m
[93m<EMAIL_ADDRESS>[0m[93m<PERSON>[0m[93m<ORGANIZATION>[0m[93m<LOCATION>[0m,[93m<LOCATION>[0m
[93m<EMAIL_ADDRESS>[0m[93m<PERSON>[0m[93m<ORGANIZATION>[0m,[93m<LOCATION>[0m
[93m<EMAIL_ADDRESS>[0m[93m<PERSON>[0m[93m<ORGANIZATION>[0m,[93m<LOCATION>[0m
[93m<EMAIL_ADDRESS>[0m[93m<PERSON>[0m[93m<ORGANIZATION>[0m,[93m<ORGANIZATION>[0m[93m<MISC>[0m[93m<ORGANIZATION>[0m,[93m<LOCATION>[0m
[93m<EMAIL_ADDRESS>[0m[93m<PERSON>[0m[93m<ORGANIZATION>[0m[93m<LOCATION>[0m,[93m<ORGANIZATION>[0m[93m<MISC>[0m[93m<ORGANIZATION>[0m,[93m<LOCATION>[0m
[93m<EMAIL_ADDRESS>[0m
The two ﬁrst authors contributed equally to this work.[93m<MISC>[0m:[93m<PERSON>[0m. Submission received: 21 January 2022; revised version received: 4 [93m<PERSON>[0m 2022;
accepted for publication: 5 August 