In [4]:
import re
import os
import logging
import pytesseract 
from pdf2image import convert_from_path
from transformers import pipeline
from fpdf import FPDF

In [None]:
# --- Configuration -----------------------------------------------------------
# Location of the Tesseract executable.  Precedence:
#   1. the TESSERACT_CMD environment variable, when set;
#   2. the conventional Windows install path, but only if that file exists;
#   3. otherwise pytesseract's own default is left untouched
#      (presumably the `tesseract` binary on PATH - confirm for your install).
# Assigning the Windows path unconditionally would break OCR on any machine
# where Tesseract lives somewhere else.
_tesseract_cmd_override = os.environ.get("TESSERACT_CMD")
_windows_tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
if _tesseract_cmd_override:
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd_override
elif os.path.isfile(_windows_tesseract_cmd):
    pytesseract.pytesseract.tesseract_cmd = _windows_tesseract_cmd

# Keywords that are matched case-insensitively as whole words by
# find_sensitive_words().
sensitive_words = ["SSN", "credit card", "address", "phone number", "email", "passport", "bank account"]

# Named-entity recogniser used by identify_sensitive_data_with_ner().
# Building the pipeline loads the model, so this cell can be slow to run.
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", aggregation_strategy="simple")

# INFO level so that extraction errors logged by the helpers are visible.
logging.basicConfig(level=logging.INFO)

In [6]:
def extract_text_from_pdf(pdf_path):
    """Run OCR over every page of a PDF and return the recognised text.

    The PDF is converted to page images with ``convert_from_path`` and each
    image is passed to ``pytesseract.image_to_string``.  The text of every
    page is followed by a newline.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page text.  Errors are logged instead of raised, so
        the result contains whatever was recognised before the failure (an
        empty string if nothing was).
    """
    page_texts = []
    try:
        for page_image in convert_from_path(pdf_path):
            page_texts.append(pytesseract.image_to_string(page_image) + "\n")
    except Exception as e:
        # Best effort: report the problem and fall through with partial text.
        logging.error(f"Error extracting text from PDF: {e}")
    return "".join(page_texts)

In [7]:
def chunk_text(text, chunk_size=512):
    """Split text into chunks of whole sentences, each at most ``chunk_size`` long.

    Sentences are delimited by ``'. '``.  Consecutive sentences are packed
    into the same chunk while the stripped chunk stays within ``chunk_size``
    characters.  The text itself is never altered: the ``'. '`` delimiter is
    restored only where it was present, so no extra period is appended to the
    final sentence.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length in characters.  A single sentence
            longer than this is not split; it becomes a chunk of its own.

    Returns:
        A list of non-empty, whitespace-stripped chunks.  Empty or
        whitespace-only input yields an empty list.
    """
    pieces = text.split('. ')
    last_index = len(pieces) - 1
    chunks = []
    current_chunk = ""

    for index, piece in enumerate(pieces):
        # Every piece except the last was followed by the delimiter.
        sentence = piece + ". " if index < last_index else piece

        # Only flush when there is something to flush; otherwise an oversized
        # first sentence would produce a leading empty chunk.
        if current_chunk and len((current_chunk + sentence).rstrip()) > chunk_size:
            stripped = current_chunk.strip()
            if stripped:
                chunks.append(stripped)
            current_chunk = sentence
        else:
            current_chunk += sentence

    stripped = current_chunk.strip()
    if stripped:
        chunks.append(stripped)

    return chunks

In [8]:
def find_sensitive_words(text):
    """Return every occurrence of a configured sensitive keyword in ``text``.

    Keywords come from the module-level ``sensitive_words`` list and are
    matched case-insensitively as whole words.

    Args:
        text: The text to scan.

    Returns:
        A list of the matched substrings as they appear in ``text``, in
        order of occurrence (duplicates included).
    """
    alternatives = "|".join(re.escape(term) for term in sensitive_words)
    keyword_regex = re.compile(r"\b(" + alternatives + r")\b", re.IGNORECASE)
    return keyword_regex.findall(text)

In [9]:
def obfuscate_text(text, sensitive_words):
    """Replace every occurrence of the given terms in ``text`` with ``REDACTED``.

    Matching is case-insensitive and limited to whole terms: a term is not
    replaced when it is directly preceded or followed by a word character.

    All terms are substituted in a single pass, longest first, so that a
    short term (``"John"``) cannot break up a longer one (``"John Smith"``)
    and leave part of it visible.

    Args:
        text: The text to redact.
        sensitive_words: Iterable of terms to hide.  Empty or whitespace-only
            entries are ignored; surrounding whitespace is stripped.

    Returns:
        The redacted text (``text`` unchanged when there are no usable terms).
    """
    # An empty term would otherwise match at every word boundary.
    terms = {word.strip() for word in sensitive_words if word and word.strip()}
    if not terms:
        return text

    # Longest first: regex alternation picks the first alternative that matches.
    ordered_terms = sorted(terms, key=lambda term: (-len(term), term))

    # Look-arounds instead of \b so that terms starting or ending with
    # punctuation (e.g. "U.S.") are matched as well.
    pattern = r'(?<!\w)(?:' + '|'.join(map(re.escape, ordered_terms)) + r')(?!\w)'
    return re.sub(pattern, 'REDACTED', text, flags=re.IGNORECASE)

In [10]:
def identify_sensitive_data_with_ner(text):
    """Collect the entity strings that the NER pipeline finds in ``text``.

    The module-level ``ner_pipeline`` is run on the text and every result
    whose ``entity_group`` is ``PER``, ``ORG``, ``LOC`` or ``MISC`` is kept.

    Args:
        text: The text to analyse.

    Returns:
        A list with the ``word`` value of each kept entity, in pipeline order.
    """
    flagged_groups = ('PER', 'ORG', 'LOC', 'MISC')
    return [
        hit['word']
        for hit in ner_pipeline(text)
        if hit['entity_group'] in flagged_groups
    ]

In [11]:
def create_pdf_from_text(output_pdf_path, obfuscated_text):
    """Write ``obfuscated_text`` to a new single-font PDF file.

    The text is split on newlines and each line is emitted as its own
    multi-line cell (width 190, line height 10) in 12 pt Arial.

    Args:
        output_pdf_path: Destination path of the PDF.
        obfuscated_text: The text to render.
    """
    cell_width = 190
    document = FPDF()
    document.add_page()
    document.set_font("Arial", size=12)
    for raw_line in obfuscated_text.split('\n'):
        # Characters outside Latin-1 are replaced with "?" before rendering.
        latin1_line = raw_line.encode('latin-1', 'replace').decode('latin-1')
        document.multi_cell(cell_width, 10, latin1_line)
    document.output(output_pdf_path)

In [12]:
def process_pdf_for_redaction(pdf_path, output_pdf_path):
    """OCR a PDF, redact sensitive terms and write the result to a new PDF.

    The extracted text is processed chunk by chunk.  For each chunk both the
    keyword matcher and the NER model are consulted and the union of their
    hits is redacted.  The redacted chunks are joined with newlines and
    rendered to ``output_pdf_path``.

    Args:
        pdf_path: Path of the PDF to read.
        output_pdf_path: Path of the redacted PDF to create.
    """
    text = extract_text_from_pdf(pdf_path)
    obfuscated_chunks = []
    for chunk in chunk_text(text):
        # The two detectors are complementary: keywords catch labels such as
        # "SSN", NER catches names and places.  Running NER only when no
        # keyword matched would leave entities visible in any chunk that
        # happens to contain a keyword.
        keyword_hits = list(find_sensitive_words(chunk))
        entity_hits = list(identify_sensitive_data_with_ner(chunk))
        obfuscated_chunks.append(obfuscate_text(chunk, keyword_hits + entity_hits))
    obfuscated_text = "\n".join(obfuscated_chunks)
    create_pdf_from_text(output_pdf_path, obfuscated_text)

In [None]:

if __name__ == "__main__":
    # Source and destination are relative to the current working directory.
    input_pdf_path, output_pdf_path = "input.pdf", "output_redacted.pdf"
    process_pdf_for_redaction(pdf_path=input_pdf_path, output_pdf_path=output_pdf_path)