In [1]:
from transformers import DonutProcessor, VisionEncoderDecoderModel

# The processor and the encoder-decoder weights are loaded from the same
# Hugging Face checkpoint, so the identifier is spelled out only once.
_donut_checkpoint = "sourinkarmakar/kyc_v1-donut-demo"
processor = DonutProcessor.from_pretrained(_donut_checkpoint)
model = VisionEncoderDecoderModel.from_pretrained(_donut_checkpoint)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [2]:
import cv2
import pytesseract
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider

# -----------------------
# 1. Setup Presidio NLP with en_core_web_md
# -----------------------
# Description of the NLP backend: spaCy with a single English pipeline.
configuration = dict(
    nlp_engine_name="spacy",
    models=[dict(lang_code="en", model_name="en_core_web_md")],
)

# Build the engine from that description, then hand it to the analyzer that
# the detection helpers in later cells use.
provider = NlpEngineProvider(nlp_configuration=configuration)
nlp_engine = provider.create_engine()
analyzer = AnalyzerEngine(
    nlp_engine=nlp_engine,
    supported_languages=["en"],
)

In [14]:
import cv2
import pytesseract
from transformers import DonutProcessor, VisionEncoderDecoderModel
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from PIL import Image
import json
import re
import shutil  # cell-local import: only needed to locate the Tesseract binary

# Prefer a Tesseract executable found on PATH so the notebook is not tied to
# one machine; fall back to the previously hard-coded Windows install location
# when nothing is found on PATH.
pytesseract.pytesseract.tesseract_cmd = (
    shutil.which("tesseract") or r'C:\Program Files\Tesseract-OCR\tesseract.exe'
)
# Step 1: Load Donut
processor = DonutProcessor.from_pretrained("sourinkarmakar/kyc_v1-donut-demo")
model = VisionEncoderDecoderModel.from_pretrained("sourinkarmakar/kyc_v1-donut-demo")

def donut_extract(img_path):
    """Run the Donut model on one image file and return the decoded text.

    Uses the module-level `processor` and `model`.

    Args:
        img_path: Path of the image file to read.

    Returns:
        The first generated sequence, decoded by the processor's tokenizer
        with special tokens skipped.
    """
    # Open inside a context manager so the file handle is released as soon as
    # the RGB copy exists instead of lingering until garbage collection.
    with Image.open(img_path) as source_img:
        pil_img = source_img.convert("RGB")
    pixel_values = processor(pil_img, return_tensors="pt").pixel_values

    # The prompt is tokenised on its own (no extra special tokens) and used as
    # the decoder's initial input ids.
    # NOTE(review): "<s_cord-v2>" is the prompt name used by CORD receipt
    # checkpoints - confirm it is the one this KYC checkpoint expects.
    task_prompt = "<s_cord-v2>"
    decoder_input_ids = processor.tokenizer(
        task_prompt, add_special_tokens=False, return_tensors="pt"
    ).input_ids

    # Generation is capped at 512 tokens.
    outputs = model.generate(pixel_values, decoder_input_ids=decoder_input_ids, max_length=512)
    result = processor.tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result

# Step 2: OCR with bounding boxes
def run_ocr(img_path):
    """Read an image with OpenCV and run word-level Tesseract OCR on it.

    Args:
        img_path: Path of the image file.

    Returns:
        A tuple `(img, data)`: the image as returned by `cv2.imread` and the
        `pytesseract.image_to_data` result in `Output.DICT` form.

    Raises:
        FileNotFoundError: If OpenCV could not read the image.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with a clear message rather than deep inside pytesseract.
        raise FileNotFoundError(f"Could not read image: {img_path}")
    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT, lang="eng")
    return img, data

# Step 3: Run Presidio
def presidio_detect(text):
    """Analyze `text` for English PII with the module-level `analyzer`.

    The findings are echoed to stdout and then returned unchanged.
    """
    findings = analyzer.analyze(language="en", text=text)
    print("presidio_detect", findings)
    return findings

# Step 4: Redact in image
# def redact_image(img, ocr_data, pii_entities):
#     for entity in pii_entities:
#         pii_text = entity.entity_text
#         for i, word in enumerate(ocr_data["text"]):
#             if word.strip() and re.search(re.escape(pii_text), word, re.IGNORECASE):
#                 x, y, w, h = (
#                     ocr_data["left"][i],
#                     ocr_data["top"][i],
#                     ocr_data["width"][i],
#                     ocr_data["height"][i],
#                 )
#                 cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 0), -1)  # black box
#     return img

# Full pipeline
# def redact_doc(img_path, output_path="redacted_donut_presidio.jpg"):
#     # OCR
#     img, ocr_data = run_ocr(img_path)

#     # Donut extraction (structured text)
#     donut_result = donut_extract(img_path)
#     print("donut_result",donut_result)

#     # Presidio detects PII in Donut output
#     results = presidio_detect(donut_result)
#     print("results",results)

#     # Redact in image
#     redacted_img = redact_image(img, ocr_data, results)

#     cv2.imwrite(output_path, redacted_img)
#     return output_path, donut_result

# Example usage
# redacted_path, donut_json = redact_doc("final_lakshmi_aadhar.jpg")
# print("✅ Redacted image saved at:", redacted_path)
# print("Donut Extract:", donut_json)

def parse_donut_output(raw_text):
    """Turn `<s_key>value</s_key>` tag pairs in `raw_text` into a dict.

    Values are stripped of surrounding whitespace; pairs whose value is blank
    after stripping are left out. A repeated key keeps its last value.
    """
    tag_pattern = re.compile(r"<s_(.*?)>(.*?)</s_\1>")
    parsed = {}
    for match in tag_pattern.finditer(raw_text):
        value = match.group(2).strip()
        if value:
            parsed[match.group(1)] = value
    return parsed
    
# def redact_image(img, ocr_data, pii_entities, original_text):
#     for entity in pii_entities:
#         # Get the actual substring from the original text
#         pii_text = original_text[entity.start:entity.end]

#         for i, word in enumerate(ocr_data["text"]):
#             if word.strip() and re.fullmatch(re.escape(pii_text.strip()), word.strip(), re.IGNORECASE):
#                 x, y, w, h = (
#                     ocr_data["left"][i],
#                     ocr_data["top"][i],
#                     ocr_data["width"][i],
#                     ocr_data["height"][i],
#                 )
#                 cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 0), -1)  # blackout
#     return img

# def redact_image(img, ocr_data, pii_entities):
#     for entity in pii_entities:
#         pii_text = entity.entity_text.strip().lower()

#         for i, word in enumerate(ocr_data["text"]):
#             word_clean = word.strip().lower()
#             if word_clean and pii_text in word_clean:
#                 x, y, w, h = (
#                     ocr_data["left"][i],
#                     ocr_data["top"][i],
#                     ocr_data["width"][i],
#                     ocr_data["height"][i],
#                 )
#                 cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 0), -1)  # blackout
#     return img

# def redact_image(img, ocr_data, pii_entities, original_text):
#     for entity in pii_entities:
#         pii_text = original_text[entity.start:entity.end].strip().lower()

#         for i, word in enumerate(ocr_data["text"]):
#             word_clean = word.strip().lower()
#             if word_clean and pii_text in word_clean:
#                 x, y, w, h = (
#                     ocr_data["left"][i],
#                     ocr_data["top"][i],
#                     ocr_data["width"][i],
#                     ocr_data["height"][i],
#                 )
#                 cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 0), -1)  # blackout
#     return img

def redact_image(img, ocr_data, pii_entities, original_text):
    """Black out the OCR words that belong to detected PII spans.

    A word is redacted when it equals one whitespace-separated token of a PII
    span, or when it contains a complete PII span. Comparison is
    case-insensitive.

    Args:
        img: Image drawn on in place via `cv2.rectangle`.
        ocr_data: Mapping with parallel "text", "left", "top", "width" and
            "height" sequences, one entry per OCR word.
        pii_entities: Iterable of objects exposing `start` / `end` offsets
            into `original_text`.
        original_text: The text the offsets refer to.

    Returns:
        The same `img` object with filled black rectangles over the matches.
    """
    pii_spans = []      # complete span texts, lower-cased
    pii_tokens = set()  # individual words of those spans
    for entity in pii_entities:
        # Extract the actual PII substring from the Donut text
        span = original_text[entity.start:entity.end].strip().lower()
        if not span:
            # An empty string is a substring of every word; letting it through
            # would black out the whole image.
            continue
        pii_spans.append(span)
        pii_tokens.update(span.split())

    for i, word in enumerate(ocr_data["text"]):
        word_clean = word.strip().lower()
        if not word_clean:
            continue
        # Whole-token equality instead of `word in span`: a raw substring test
        # also hits fragments (a lone "a" or "0") that merely occur in a span.
        if word_clean in pii_tokens or any(span in word_clean for span in pii_spans):
            x, y, w, h = (
                ocr_data["left"][i],
                ocr_data["top"][i],
                ocr_data["width"][i],
                ocr_data["height"][i],
            )
            cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 0), -1)  # blackout
    return img

def redact_image_by_fields(img, ocr_data, pii_fields):
    """Black out every OCR word that matches one of the given field values.

    A word is redacted when it equals one whitespace-separated token of a
    field value (e.g. one part of a multi-word name) or when it contains a
    complete field value (e.g. "No:AWEPK4793N" for the value "AWEPK4793N").
    Comparison is case-insensitive.

    Args:
        img: Image drawn on in place via `cv2.rectangle`.
        ocr_data: Mapping with parallel "text", "left", "top", "width" and
            "height" sequences, one entry per OCR word.
        pii_fields: Mapping of field name to value; non-string and blank
            values are ignored.

    Returns:
        The same `img` object with filled black rectangles over the matches.
    """
    # Flatten all PII values into a list. Blank values are dropped because an
    # empty string is a substring of every word and would black out the page.
    stripped = (v.strip().lower() for v in pii_fields.values() if isinstance(v, str))
    pii_values = [value for value in stripped if value]
    # Whole tokens only: testing `word in value` as a raw substring also hits
    # fragments (a lone "a" or "0") that merely occur inside some value.
    pii_tokens = {token for value in pii_values for token in value.split()}

    for i, word in enumerate(ocr_data["text"]):
        word_clean = word.strip().lower()
        if not word_clean:
            continue
        if word_clean in pii_tokens or any(value in word_clean for value in pii_values):
            x, y, w, h = (
                ocr_data["left"][i],
                ocr_data["top"][i],
                ocr_data["width"][i],
                ocr_data["height"][i],
            )
            cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 0), -1)  # blackout
    return img

def redact_doc(img_path, output_path="pan_sreeram_pan_redacted_by_ocr2.jpg"):
    """Redact one document image and write the result to `output_path`.

    Runs OCR and Donut extraction on the same file, blacks out the OCR words
    that match the extracted fields, and saves the image.

    Args:
        img_path: Path of the image to redact.
        output_path: Where the redacted image is written.

    Returns:
        A tuple `(output_path, clean_data)` where `clean_data` is the dict of
        fields parsed from the Donut output.

    Raises:
        OSError: If OpenCV reports that the image could not be written.
    """
    # OCR
    img, ocr_data = run_ocr(img_path)

    # Donut extraction
    donut_result = donut_extract(img_path)
    clean_data = parse_donut_output(donut_result)

    # Redact using OCR match
    redacted_img = redact_image_by_fields(img, ocr_data, clean_data)

    # cv2.imwrite returns False when nothing was written; without this check
    # the success message below would be printed for a file that does not exist.
    if not cv2.imwrite(output_path, redacted_img):
        raise OSError(f"Could not write redacted image to: {output_path}")
    print("✅ Redacted image saved at:", output_path)
    return output_path, clean_data

# def redact_doc(img_path, output_path="red_pan_presidio2.jpg"):
#     # OCR
#     img, ocr_data = run_ocr(img_path)
#     print("ocr_data",ocr_data);

#     # Donut extraction (structured text)
#     donut_result = donut_extract(img_path)
#     clean_data = parse_donut_output(donut_result)
#     print("clean_data", clean_data)

#     # Presidio detects PII in Donut output
#     results = presidio_detect(donut_result)
#     print("results", results)

#     # Redact in image (pass donut_result as reference text)
#     redacted_img = redact_image(img, ocr_data, results,donut_result)

#     cv2.imwrite(output_path, redacted_img)
#     return output_path, donut_result
# redacted_path, donut_json = redact_doc("pan.jpg")
# Redact one sample card; redact_doc itself reports where the image was saved.
(redacted_path, donut_json) = redact_doc(img_path="pan_sreeram.jpg")
print("Donut Extract:", donut_json)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Redacted image saved at: pan_sreeram_pan_redacted_by_ocr2.jpg
Donut Extract: {'docType': 'Pan', 'name': 'SREE RAMA MURTHY KATTAMURI', 'fatherName': 'SANYASI SETTY KATTAMURI', 'dob': '18 06 1977', 'docId': 'AWEPK4793N', 'side': 'Front', 'orientation': '0', 'isColoured': 'Coloured'}


In [4]:
import cv2
import random
import string
from faker import Faker

faker = Faker()

def generate_fake_pan_data():
    """Return a dict of randomly generated PAN-card fields.

    Names come from the module-level `faker`; the document id is five
    uppercase letters, four digits and one more uppercase letter drawn with
    `random`. The remaining fields are fixed strings.
    """
    # The draws happen in the same order as the dict fields so that a seeded
    # run produces the same record.
    holder_name = faker.name().upper()
    father_name = faker.name().upper()
    birth_date = faker.date_of_birth(minimum_age=18, maximum_age=60)
    id_letters = random.choices(string.ascii_uppercase, k=5)
    id_digits = random.choices(string.digits, k=4)
    id_last_letter = random.choice(string.ascii_uppercase)

    return {
        "docType": "Pan",
        "name": holder_name,
        "fatherName": father_name,
        "dob": birth_date.strftime("%d/%m/%Y"),
        "docId": "".join(id_letters + id_digits + [id_last_letter]),
        "side": "Front",
        "orientation": "0",
        "isColoured": "Coloured",
    }

def generate_fake_pan_image(template_path, output_path="synthetic_pan12.jpg"):
    """Draw one set of fake PAN fields onto a template image and save it.

    Args:
        template_path: Path of the blank template image.
        output_path: Where the rendered image is written.

    Returns:
        A tuple `(output_path, fake_data)` with the dict of generated fields.

    Raises:
        FileNotFoundError: If OpenCV could not read the template.
        OSError: If OpenCV reports that the output could not be written.
    """
    fake_data = generate_fake_pan_data()
    img = cv2.imread(template_path)
    if img is None:
        # cv2.imread returns None on failure; stop here with a clear message
        # instead of failing inside cv2.putText.
        raise FileNotFoundError(f"Could not read template image: {template_path}")

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 1.0
    color = (0, 0, 0)
    thickness = 3

    # Overlay text at approximate PAN card positions
    text_origins = (
        ("name", (80, 420)),
        ("fatherName", (80, 500)),
        ("dob", (80, 600)),
        ("docId", (300, 300)),
    )
    for field, origin in text_origins:
        cv2.putText(img, fake_data[field], origin, font, font_scale, color, thickness)

    # cv2.imwrite returns False when nothing was written; do not report
    # success for a file that does not exist.
    if not cv2.imwrite(output_path, img):
        raise OSError(f"Could not write synthetic image to: {output_path}")
    print("✅ Synthetic PAN image saved at:", output_path)
    return output_path, fake_data

# Render a single synthetic card from the blank template and show its fields.
template_path = "blank_pan_template - Copy.jpg"
(synthetic_path, fake_data) = generate_fake_pan_image(template_path=template_path)
print("Fake PAN Data:", fake_data)

✅ Synthetic PAN image saved at: synthetic_pan12.jpg
Fake PAN Data: {'docType': 'Pan', 'name': 'JAMES ROBINSON', 'fatherName': 'JILLIAN ROBERTS', 'dob': '21/09/1993', 'docId': 'RGGZO6841N', 'side': 'Front', 'orientation': '0', 'isColoured': 'Coloured'}


In [16]:
!pip install faker

Collecting faker
  Downloading faker-37.8.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.8.0-py3-none-any.whl (2.0 MB)
   ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
   ---------------- ----------------------- 0.8/2.0 MB 5.9 MB/s eta 0:00:01
   -------------------------------- ------- 1.6/2.0 MB 3.9 MB/s eta 0:00:01
   ---------------------------------------- 2.0/2.0 MB 4.3 MB/s  0:00:00
Installing collected packages: faker
Successfully installed faker-37.8.0


In [5]:
import cv2
import random
import string
import os
from faker import Faker

faker = Faker()

def generate_fake_pan_data():
    """Build one record of random PAN-card fields.

    Uses the module-level `faker` for names and date of birth, and `random`
    for the ten-character document id (five uppercase letters, four digits,
    one uppercase letter). `docType`, `side`, `orientation` and `isColoured`
    are constant.
    """
    record = {"docType": "Pan"}
    # Fields are filled in draw order, which keeps seeded output unchanged.
    record["name"] = faker.name().upper()
    record["fatherName"] = faker.name().upper()

    born = faker.date_of_birth(minimum_age=18, maximum_age=60)
    record["dob"] = born.strftime("%d/%m/%Y")

    id_chars = random.choices(string.ascii_uppercase, k=5)
    id_chars += random.choices(string.digits, k=4)
    id_chars.append(random.choice(string.ascii_uppercase))
    record["docId"] = "".join(id_chars)

    record.update(side="Front", orientation="0", isColoured="Coloured")
    return record

def generate_fake_pan_image(template_path, output_path):
    """Draw one set of fake PAN fields onto a template image and save it.

    Args:
        template_path: Path of the blank template image.
        output_path: Where the rendered image is written.

    Returns:
        The dict of generated fields that was drawn onto the image.

    Raises:
        FileNotFoundError: If OpenCV could not read the template.
        OSError: If OpenCV reports that the output could not be written.
    """
    fake_data = generate_fake_pan_data()
    img = cv2.imread(template_path)
    if img is None:
        # cv2.imread returns None on failure; stop here with a clear message
        # instead of failing inside cv2.putText.
        raise FileNotFoundError(f"Could not read template image: {template_path}")

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 1.0
    color = (0, 0, 0)
    thickness = 3

    # Overlay text at approximate PAN card positions
    cv2.putText(img, fake_data["name"], (80, 420), font, font_scale, color, thickness)
    cv2.putText(img, fake_data["fatherName"], (80, 500), font, font_scale, color, thickness)
    cv2.putText(img, fake_data["dob"], (80, 600), font, font_scale, color, thickness)
    cv2.putText(img, fake_data["docId"], (300, 300), font, font_scale, color, thickness)

    # cv2.imwrite returns False when nothing was written; surface that so a
    # bulk run cannot silently report files that were never created.
    if not cv2.imwrite(output_path, img):
        raise OSError(f"Could not write synthetic image to: {output_path}")
    return fake_data

def generate_bulk_pan_images(template_path, output_dir="synthetic_pan_samples", count=100):
    """Render `count` synthetic PAN images into `output_dir`.

    Files are named `synthetic_pan_001.jpg`, `synthetic_pan_002.jpg`, ... and
    the directory is created when missing.

    Returns:
        A list with the value `generate_fake_pan_image` returned for each
        image, in generation order.
    """
    os.makedirs(output_dir, exist_ok=True)

    filenames = [f"synthetic_pan_{index:03d}.jpg" for index in range(1, count + 1)]
    all_data = []
    for filename in filenames:
        target = os.path.join(output_dir, filename)
        all_data.append(generate_fake_pan_image(template_path, target))
        print(f"✅ Saved: {filename}")

    print(f"\n🎉 Successfully generated {count} synthetic PAN card images in '{output_dir}' folder.")
    return all_data

# Generate the default-sized batch of synthetic cards from the blank template.
template_path = "blank_pan_template - Copy.jpg"
pan_data_list = generate_bulk_pan_images(template_path=template_path)


✅ Saved: synthetic_pan_001.jpg
✅ Saved: synthetic_pan_002.jpg
✅ Saved: synthetic_pan_003.jpg
✅ Saved: synthetic_pan_004.jpg
✅ Saved: synthetic_pan_005.jpg
✅ Saved: synthetic_pan_006.jpg
✅ Saved: synthetic_pan_007.jpg
✅ Saved: synthetic_pan_008.jpg
✅ Saved: synthetic_pan_009.jpg
✅ Saved: synthetic_pan_010.jpg
✅ Saved: synthetic_pan_011.jpg
✅ Saved: synthetic_pan_012.jpg
✅ Saved: synthetic_pan_013.jpg
✅ Saved: synthetic_pan_014.jpg
✅ Saved: synthetic_pan_015.jpg
✅ Saved: synthetic_pan_016.jpg
✅ Saved: synthetic_pan_017.jpg
✅ Saved: synthetic_pan_018.jpg
✅ Saved: synthetic_pan_019.jpg
✅ Saved: synthetic_pan_020.jpg
✅ Saved: synthetic_pan_021.jpg
✅ Saved: synthetic_pan_022.jpg
✅ Saved: synthetic_pan_023.jpg
✅ Saved: synthetic_pan_024.jpg
✅ Saved: synthetic_pan_025.jpg
✅ Saved: synthetic_pan_026.jpg
✅ Saved: synthetic_pan_027.jpg
✅ Saved: synthetic_pan_028.jpg
✅ Saved: synthetic_pan_029.jpg
✅ Saved: synthetic_pan_030.jpg
✅ Saved: synthetic_pan_031.jpg
✅ Saved: synthetic_pan_032.jpg
✅ Saved:

In [None]:
import os


import cv2
import pytesseract
from transformers import DonutProcessor, VisionEncoderDecoderModel
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from PIL import Image
import json
import re
import shutil  # cell-local import: only needed to locate the Tesseract binary

# Use the Tesseract executable from PATH when there is one, so the cell is not
# tied to a single machine; otherwise keep the previously hard-coded Windows
# install location.
pytesseract.pytesseract.tesseract_cmd = (
    shutil.which("tesseract") or r'C:\Program Files\Tesseract-OCR\tesseract.exe'
)
# Step 1: Load Donut
processor = DonutProcessor.from_pretrained("sourinkarmakar/kyc_v1-donut-demo")
model = VisionEncoderDecoderModel.from_pretrained("sourinkarmakar/kyc_v1-donut-demo")

def donut_extract(img_path):
    """Run the Donut model on one image file and return the decoded text.

    Uses the module-level `processor` and `model`.

    Args:
        img_path: Path of the image file to read.

    Returns:
        The first generated sequence, decoded by the processor's tokenizer
        with special tokens skipped.
    """
    # The context manager closes the underlying file once the RGB copy has
    # been made, rather than leaving the handle open until garbage collection.
    with Image.open(img_path) as source_img:
        pil_img = source_img.convert("RGB")
    pixel_values = processor(pil_img, return_tensors="pt").pixel_values

    # The prompt is tokenised by itself (no extra special tokens) and passed
    # as the decoder's initial input ids.
    # NOTE(review): "<s_cord-v2>" is the prompt name used by CORD receipt
    # checkpoints - confirm it is the one this KYC checkpoint expects.
    task_prompt = "<s_cord-v2>"
    decoder_input_ids = processor.tokenizer(
        task_prompt, add_special_tokens=False, return_tensors="pt"
    ).input_ids

    # Generation is capped at 512 tokens.
    outputs = model.generate(pixel_values, decoder_input_ids=decoder_input_ids, max_length=512)
    result = processor.tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result

# Step 2: OCR with bounding boxes
def run_ocr(img_path):
    """Load an image with OpenCV and run word-level Tesseract OCR on it.

    Args:
        img_path: Path of the image file.

    Returns:
        A tuple `(img, data)`: the image as returned by `cv2.imread` and the
        `pytesseract.image_to_data` result in `Output.DICT` form.

    Raises:
        FileNotFoundError: If OpenCV could not read the image.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals an unreadable or missing file by returning None;
        # raise here so the cause is obvious to the caller.
        raise FileNotFoundError(f"Could not read image: {img_path}")
    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT, lang="eng")
    return img, data

def parse_donut_output(raw_text):
    """Collect `<s_key>value</s_key>` pairs from `raw_text` into a dict.

    Each value is stripped of surrounding whitespace and pairs that end up
    blank are omitted. When a key repeats, the last value wins.
    """
    stripped_pairs = (
        (key, value.strip())
        for key, value in re.findall(r"<s_(.*?)>(.*?)</s_\1>", raw_text)
    )
    return {key: value for key, value in stripped_pairs if value}

def redact_image_by_fields(img, ocr_data, pii_fields):
    """Black out every OCR word that matches one of the given field values.

    A word is redacted when it equals one whitespace-separated token of a
    field value (e.g. one part of a multi-word name) or when it contains a
    complete field value (e.g. "No:AWEPK4793N" for the value "AWEPK4793N").
    Comparison is case-insensitive.

    Args:
        img: Image drawn on in place via `cv2.rectangle`.
        ocr_data: Mapping with parallel "text", "left", "top", "width" and
            "height" sequences, one entry per OCR word.
        pii_fields: Mapping of field name to value; non-string and blank
            values are ignored.

    Returns:
        The same `img` object with filled black rectangles over the matches.
    """
    # Flatten all PII values into a list. Blank values are dropped because an
    # empty string is a substring of every word and would black out the page.
    stripped = (v.strip().lower() for v in pii_fields.values() if isinstance(v, str))
    pii_values = [value for value in stripped if value]
    # Whole tokens only: testing `word in value` as a raw substring also hits
    # fragments (a lone "a" or "0") that merely occur inside some value.
    pii_tokens = {token for value in pii_values for token in value.split()}

    for i, word in enumerate(ocr_data["text"]):
        word_clean = word.strip().lower()
        if not word_clean:
            continue
        if word_clean in pii_tokens or any(value in word_clean for value in pii_values):
            x, y, w, h = (
                ocr_data["left"][i],
                ocr_data["top"][i],
                ocr_data["width"][i],
                ocr_data["height"][i],
            )
            cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 0), -1)  # blackout
    return img
def redact_doc(img_path, output_path="pan_sreeram_pan_redacted_by_ocr2.jpg"):
    """Redact one document image and write the result to `output_path`.

    Runs OCR and Donut extraction on the same file, blacks out the OCR words
    that match the extracted fields, and saves the image.

    Args:
        img_path: Path of the image to redact.
        output_path: Where the redacted image is written.

    Returns:
        A tuple `(output_path, clean_data)` where `clean_data` is the dict of
        fields parsed from the Donut output.

    Raises:
        OSError: If OpenCV reports that the image could not be written.
    """
    # OCR
    img, ocr_data = run_ocr(img_path)

    # Donut extraction
    donut_result = donut_extract(img_path)
    clean_data = parse_donut_output(donut_result)

    # Redact using OCR match
    redacted_img = redact_image_by_fields(img, ocr_data, clean_data)

    # cv2.imwrite returns False when nothing was written; raising keeps a
    # batch run from counting an image that was never saved as redacted.
    if not cv2.imwrite(output_path, redacted_img):
        raise OSError(f"Could not write redacted image to: {output_path}")
    print("✅ Redacted image saved at:", output_path)
    return output_path, clean_data

def batch_redact_pan_images(input_dir="synthetic_pan_samples", output_dir="redacted_pan_samples"):
    """Redact every `.jpg` file in `input_dir`, writing results to `output_dir`.

    Files are processed in sorted name order. A file that fails is reported on
    stdout and skipped; the remaining files are still processed.

    Args:
        input_dir: Directory scanned (non-recursively) for `.jpg` files; the
            extension check is case-insensitive.
        output_dir: Directory for the redacted copies, created when missing.
            Each output is named `redacted_<original filename>`.

    Returns:
        A list of dicts with keys "filename", "redacted_path" and
        "extracted_data", one per successfully redacted file.
    """
    os.makedirs(output_dir, exist_ok=True)
    redacted_data = []

    # os.listdir yields entries in arbitrary order; sorting makes the log and
    # the returned list repeatable between runs.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.lower().endswith(".jpg"):
            continue
        input_path = os.path.join(input_dir, filename)
        output_path = os.path.join(output_dir, f"redacted_{filename}")

        try:
            redacted_img_path, extracted_data = redact_doc(input_path, output_path)
        except Exception as e:
            # Deliberate best effort: one bad image must not stop the batch.
            print(f"❌ Failed to redact {filename}: {e}")
        else:
            redacted_data.append({
                "filename": filename,
                "redacted_path": redacted_img_path,
                "extracted_data": extracted_data
            })
            print(f"✅ Redacted: {filename}")

    print(f"\n🎉 Completed redaction for {len(redacted_data)} PAN images.")
    return redacted_data

# Redact every synthetic sample, spelling out the default folders for clarity.
redacted_results = batch_redact_pan_images(
    input_dir="synthetic_pan_samples",
    output_dir="redacted_pan_samples",
)


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


✅ Redacted image saved at: redacted_pan_samples\redacted_synthetic_pan_001.jpg
✅ Redacted: synthetic_pan_001.jpg
✅ Redacted image saved at: redacted_pan_samples\redacted_synthetic_pan_002.jpg
✅ Redacted: synthetic_pan_002.jpg
✅ Redacted image saved at: redacted_pan_samples\redacted_synthetic_pan_003.jpg
✅ Redacted: synthetic_pan_003.jpg
✅ Redacted image saved at: redacted_pan_samples\redacted_synthetic_pan_004.jpg
✅ Redacted: synthetic_pan_004.jpg
✅ Redacted image saved at: redacted_pan_samples\redacted_synthetic_pan_005.jpg
✅ Redacted: synthetic_pan_005.jpg
✅ Redacted image saved at: redacted_pan_samples\redacted_synthetic_pan_006.jpg
✅ Redacted: synthetic_pan_006.jpg
✅ Redacted image saved at: redacted_pan_samples\redacted_synthetic_pan_007.jpg
✅ Redacted: synthetic_pan_007.jpg
✅ Redacted image saved at: redacted_pan_samples\redacted_synthetic_pan_008.jpg
✅ Redacted: synthetic_pan_008.jpg
✅ Redacted image saved at: redacted_pan_samples\redacted_synthetic_pan_009.jpg
✅ Redacted: synth