In [6]:

import cv2
import random
import string
import os
import pytesseract
from faker import Faker

faker = Faker()

def generate_fake_pan_data():
    """Build one record of fabricated PAN-card fields.

    Returns:
        dict: keys ``docType``, ``name``, ``fatherName``, ``dob``, ``docId``,
        ``side``, ``orientation`` and ``isColoured`` (all string values).
        ``name``/``fatherName`` are upper-cased Faker names, ``dob`` is
        formatted ``DD/MM/YYYY`` for an age between 18 and 60, and ``docId``
        is five uppercase letters, four digits and one uppercase letter.
    """
    holder_name = faker.name().upper()
    father_name = faker.name().upper()
    birth_date = faker.date_of_birth(minimum_age=18, maximum_age=60)

    # Characters are drawn in the same order they appear in the identifier:
    # letters first, then digits, then the trailing letter.
    id_chars = [
        *random.choices(string.ascii_uppercase, k=5),
        *random.choices(string.digits, k=4),
        random.choice(string.ascii_uppercase),
    ]

    record = dict(
        docType="Pan",
        name=holder_name,
        fatherName=father_name,
        dob=birth_date.strftime("%d/%m/%Y"),
        docId="".join(id_chars),
        side="Front",
        orientation="0",
        isColoured="Coloured",
    )
    return record

def generate_fake_pan_image(template_path, output_path, fake_data):
    """Draw the fake PAN fields onto a template image and save the result.

    Args:
        template_path: path of the blank template image to load.
        output_path: path the rendered image is written to.
        fake_data: mapping providing the ``name``, ``fatherName``, ``dob``
            and ``docId`` strings to draw.

    Raises:
        FileNotFoundError: if the template cannot be loaded.
        OSError: if the rendered image cannot be written.
        KeyError: if ``fake_data`` lacks one of the drawn fields.
    """
    img = cv2.imread(template_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising;
        # without this check the failure shows up later as an opaque
        # OpenCV error inside putText.
        raise FileNotFoundError(
            f"Template image is missing or unreadable: {template_path!r}"
        )

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 1.0
    color = (0, 0, 0)
    thickness = 2

    # Approximate pixel origins of each field on the PAN template.
    field_origins = (
        ("name", (80, 420)),
        ("fatherName", (80, 500)),
        ("dob", (80, 600)),
        ("docId", (300, 300)),
    )
    for field, origin in field_origins:
        cv2.putText(img, fake_data[field], origin, font, font_scale, color, thickness)

    # cv2.imwrite reports failure through its boolean result; surface it so
    # callers do not report a file as saved when nothing was written.
    if not cv2.imwrite(output_path, img):
        raise OSError(f"Failed to write image: {output_path!r}")

def generate_bulk_pan_images(template_path, output_dir="synthetic_pan_samples_latest", count=100):
    """Generate ``count`` synthetic PAN images and return their field records.

    Files are named ``synthetic_pan_<index>.jpg`` with a zero-padded index.
    The padding is at least three digits and grows with ``count`` so that a
    lexicographic sort of the file names always matches generation order
    (with a fixed three-digit pad, ``synthetic_pan_1000.jpg`` would sort
    before ``synthetic_pan_101.jpg``).

    Args:
        template_path: template image passed to ``generate_fake_pan_image``.
        output_dir: directory to write into; created if missing.
        count: number of images to generate.

    Returns:
        list: the generated field dicts, in the same order as the files.
    """
    os.makedirs(output_dir, exist_ok=True)
    synthetic_data_list = []

    # Width of the largest index, never below the historical three digits.
    index_width = max(3, len(str(max(count - 1, 0))))

    for i in range(count):
        fake_data = generate_fake_pan_data()
        filename = f"synthetic_pan_{i:0{index_width}d}.jpg"
        output_path = os.path.join(output_dir, filename)
        generate_fake_pan_image(template_path, output_path, fake_data)
        synthetic_data_list.append(fake_data)
        print(f"✅ Saved: {filename}")

    print(f"\n🎉 Successfully generated {count} synthetic PAN card images.")
    return synthetic_data_list

# Generate and store synthetic data
# NOTE: this runs when the cell executes. The template path is relative, so
# the file must be reachable from the kernel's working directory. Images are
# written to generate_bulk_pan_images' default output directory
# ("synthetic_pan_samples_latest") using its default count of 100.
template_path = "blank_pan_template - Copy.jpg"
# One field dict per generated image, in generation order; reused below as
# the known values to redact.
synthetic_data_list = generate_bulk_pan_images(template_path)
























def run_ocr(img_path):
    """Load an image and run Tesseract word-level OCR on it.

    Args:
        img_path: path of the image to read.

    Returns:
        tuple: ``(img, data)`` where ``img`` is the image loaded by OpenCV and
        ``data`` is the ``pytesseract.image_to_data`` result in dict form.

    Raises:
        FileNotFoundError: if the image cannot be loaded.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread returns None on failure rather than raising; fail here
        # with a clear message instead of inside pytesseract.
        raise FileNotFoundError(f"Image is missing or unreadable: {img_path!r}")
    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT, lang="eng")
    return img, data

def redact_image_by_fields(img, ocr_data, pii_fields):
    """Black out every OCR word that matches one of the known field values.

    A word matches when, after stripping and lower-casing, it contains a
    field value or is itself contained in one (so each word of a multi-word
    name is covered). Only string values of ``pii_fields`` are considered.

    Args:
        img: image to draw on; it is modified in place.
        ocr_data: dict with parallel ``text``, ``left``, ``top``, ``width``
            and ``height`` lists, as produced by ``run_ocr``.
        pii_fields: mapping whose string values should be redacted.

    Returns:
        The same ``img`` object, with matching word boxes filled black.
    """
    # Normalise the string values and drop blank ones: an empty string is a
    # substring of every word, so keeping it would black out all OCR text.
    normalised = (v.strip().lower() for v in pii_fields.values() if isinstance(v, str))
    pii_values = [value for value in normalised if value]
    if not pii_values:
        return img

    for i, word in enumerate(ocr_data["text"]):
        word_clean = word.strip().lower()
        if word_clean and any(pii in word_clean or word_clean in pii for pii in pii_values):
            x, y, w, h = (
                ocr_data["left"][i],
                ocr_data["top"][i],
                ocr_data["width"][i],
                ocr_data["height"][i],
            )
            # Thickness -1 draws a filled rectangle.
            cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 0), -1)  # blackout

    return img



def redact_doc_with_known_fields(img_path, output_path, known_fields):
    """OCR one document image, black out the known field values, and save it.

    Args:
        img_path: image to read and OCR.
        output_path: where the redacted image is written.
        known_fields: mapping of field values passed to
            ``redact_image_by_fields``.

    Returns:
        ``output_path``, unchanged.
    """
    source_img, word_boxes = run_ocr(img_path)
    cv2.imwrite(output_path, redact_image_by_fields(source_img, word_boxes, known_fields))
    print("✅ Redacted image saved at:", output_path)
    return output_path
    

# def batch_redact_pan_images_with_known_data(synthetic_data_list,input_dir="synthetic_pan_samples_latest_1",
#                                             output_dir="redacted_pan_samples_1"
#                                             ):
#     os.makedirs(output_dir, exist_ok=True)
#     redacted_data = []

#     for i, filename in enumerate(sorted(os.listdir(input_dir))):
#         if filename.lower().endswith(".jpg"):
#             input_path = os.path.join(input_dir, filename)
#             output_path = os.path.join(output_dir, f"redacted_{filename}")

#             try:
#                 known_fields = synthetic_data_list[i]  # match index to filename
#                 redact_doc_with_known_fields(input_path, output_path, known_fields)
#                 redacted_data.append({
#                     "filename": filename,
#                     "redacted_path": output_path,
#                     "redacted_fields": known_fields
#                 })
#                 print(f"✅ Redacted: {filename}")
#             except Exception as e:
#                 print(f"❌ Failed to redact {filename}: {e}")

#     print(f"\n🎉 Completed redaction for {len(redacted_data)} PAN images.")
#     return redacted_data

# After generating synthetic images

# for i in range(5):
#     #data = generate_fake_pan_data()
#     # Save image using this data
#     output_path = f"synthetic_pan_samples_latest/synthetic_pan_{i:03d}.jpg"
#     generate_fake_pan_image(template_path, output_path, data)
#     synthetic_data_list.append(data)

import json

def batch_redact_pan_images_with_known_data(synthetic_data_list,
                                            input_dir="synthetic_pan_samples_latest",
                                            output_dir="redacted_pan_samples"):
    """Redact every ``.jpg`` in ``input_dir`` using the matching known record.

    The n-th ``.jpg`` file (sorted by name, extension matched
    case-insensitively) is paired with ``synthetic_data_list[n]``. For each
    image a redacted copy ``redacted_<name>`` and a sibling ``.json`` file
    holding the record are written to ``output_dir``. A failure on one file is
    printed and the batch continues.

    Args:
        synthetic_data_list: field dicts, ordered like the sorted image names.
        input_dir: directory containing the images to redact.
        output_dir: directory for redacted images and JSON; created if missing.

    Returns:
        list: one dict per successfully processed image with keys
        ``filename``, ``redacted_path``, ``json_path`` and ``redacted_fields``.
    """
    os.makedirs(output_dir, exist_ok=True)
    redacted_data = []

    # Filter BEFORE enumerating: indexing over every directory entry would let
    # any non-image file shift the pairing, so images would be redacted with
    # another record's values.
    jpg_filenames = sorted(
        name for name in os.listdir(input_dir) if name.lower().endswith(".jpg")
    )

    for i, filename in enumerate(jpg_filenames):
        input_path = os.path.join(input_dir, filename)
        redacted_filename = f"redacted_{filename}"
        output_path = os.path.join(output_dir, redacted_filename)
        # splitext handles any extension casing; a literal ".jpg" replace would
        # leave "X.JPG" unchanged and the JSON would overwrite the image.
        json_path = os.path.join(output_dir, os.path.splitext(redacted_filename)[0] + ".json")

        try:
            known_fields = synthetic_data_list[i]  # match index to filename
            redact_doc_with_known_fields(input_path, output_path, known_fields)

            # Save metadata as JSON
            with open(json_path, "w") as f:
                json.dump(known_fields, f, indent=4)

            redacted_data.append({
                "filename": filename,
                "redacted_path": output_path,
                "json_path": json_path,
                "redacted_fields": known_fields
            })
            print(f"✅ Redacted: {filename} and saved JSON: {json_path}")
        except Exception as e:
            # Best-effort batch: report the failure and move on to the next file.
            print(f"❌ Failed to redact {filename}: {e}")

    print(f"\n🎉 Completed redaction for {len(redacted_data)} PAN images.")
    return redacted_data


# Then redact using known fields
# NOTE: runs when the cell executes, with the function's default input and
# output directories. The returned list is the cell's last expression, so the
# notebook displays every record (including the field values) as output.
batch_redact_pan_images_with_known_data(synthetic_data_list=synthetic_data_list)


✅ Saved: synthetic_pan_000.jpg
✅ Saved: synthetic_pan_001.jpg
✅ Saved: synthetic_pan_002.jpg
✅ Saved: synthetic_pan_003.jpg
✅ Saved: synthetic_pan_004.jpg
✅ Saved: synthetic_pan_005.jpg
✅ Saved: synthetic_pan_006.jpg
✅ Saved: synthetic_pan_007.jpg
✅ Saved: synthetic_pan_008.jpg
✅ Saved: synthetic_pan_009.jpg
✅ Saved: synthetic_pan_010.jpg
✅ Saved: synthetic_pan_011.jpg
✅ Saved: synthetic_pan_012.jpg
✅ Saved: synthetic_pan_013.jpg
✅ Saved: synthetic_pan_014.jpg
✅ Saved: synthetic_pan_015.jpg
✅ Saved: synthetic_pan_016.jpg
✅ Saved: synthetic_pan_017.jpg
✅ Saved: synthetic_pan_018.jpg
✅ Saved: synthetic_pan_019.jpg
✅ Saved: synthetic_pan_020.jpg
✅ Saved: synthetic_pan_021.jpg
✅ Saved: synthetic_pan_022.jpg
✅ Saved: synthetic_pan_023.jpg
✅ Saved: synthetic_pan_024.jpg
✅ Saved: synthetic_pan_025.jpg
✅ Saved: synthetic_pan_026.jpg
✅ Saved: synthetic_pan_027.jpg
✅ Saved: synthetic_pan_028.jpg
✅ Saved: synthetic_pan_029.jpg
✅ Saved: synthetic_pan_030.jpg
✅ Saved: synthetic_pan_031.jpg
✅ Saved:

[{'filename': 'synthetic_pan_000.jpg',
  'redacted_path': 'redacted_pan_samples/redacted_synthetic_pan_000.jpg',
  'json_path': 'redacted_pan_samples/redacted_synthetic_pan_000.json',
  'redacted_fields': {'docType': 'Pan',
   'name': 'DANIEL OLSEN',
   'fatherName': 'ANDREW WRIGHT',
   'dob': '06/11/1998',
   'docId': 'UMFCY8161C',
   'side': 'Front',
   'orientation': '0',
   'isColoured': 'Coloured'}},
 {'filename': 'synthetic_pan_001.jpg',
  'redacted_path': 'redacted_pan_samples/redacted_synthetic_pan_001.jpg',
  'json_path': 'redacted_pan_samples/redacted_synthetic_pan_001.json',
  'redacted_fields': {'docType': 'Pan',
   'name': 'TAMARA MURPHY',
   'fatherName': 'LORI SHERMAN',
   'dob': '21/02/2004',
   'docId': 'LSWKO3066Z',
   'side': 'Front',
   'orientation': '0',
   'isColoured': 'Coloured'}},
 {'filename': 'synthetic_pan_002.jpg',
  'redacted_path': 'redacted_pan_samples/redacted_synthetic_pan_002.jpg',
  'json_path': 'redacted_pan_samples/redacted_synthetic_pan_002.json'

In [5]:
import json

def batch_redact_pan_images_with_known_data(synthetic_data_list,
                                            input_dir="synthetic_pan_samples_latest_1",
                                            output_dir="redacted_pan_samples_1"):
    """Redact every ``.jpg`` in ``input_dir`` using the matching known record.

    The n-th ``.jpg`` file (sorted by name, extension matched
    case-insensitively) is paired with ``synthetic_data_list[n]``. For each
    image a redacted copy ``redacted_<name>`` and a sibling ``.json`` file
    holding the record are written to ``output_dir``. A failure on one file is
    printed and the batch continues.

    NOTE(review): this cell relies on ``os`` and
    ``redact_doc_with_known_fields`` being defined by another cell, and a
    function with this same name is defined elsewhere in the notebook with
    different default directories; whichever cell ran last wins.

    Args:
        synthetic_data_list: field dicts, ordered like the sorted image names.
        input_dir: directory containing the images to redact.
        output_dir: directory for redacted images and JSON; created if missing.

    Returns:
        list: one dict per successfully processed image with keys
        ``filename``, ``redacted_path``, ``json_path`` and ``redacted_fields``.
    """
    os.makedirs(output_dir, exist_ok=True)
    redacted_data = []

    # Filter BEFORE enumerating: indexing over every directory entry would let
    # any non-image file shift the pairing, so images would be redacted with
    # another record's values.
    jpg_filenames = sorted(
        name for name in os.listdir(input_dir) if name.lower().endswith(".jpg")
    )

    for i, filename in enumerate(jpg_filenames):
        input_path = os.path.join(input_dir, filename)
        redacted_filename = f"redacted_{filename}"
        output_path = os.path.join(output_dir, redacted_filename)
        # splitext handles any extension casing; a literal ".jpg" replace would
        # leave "X.JPG" unchanged and the JSON would overwrite the image.
        json_path = os.path.join(output_dir, os.path.splitext(redacted_filename)[0] + ".json")

        try:
            known_fields = synthetic_data_list[i]  # match index to filename
            redact_doc_with_known_fields(input_path, output_path, known_fields)

            # Save metadata as JSON
            with open(json_path, "w") as f:
                json.dump(known_fields, f, indent=4)

            redacted_data.append({
                "filename": filename,
                "redacted_path": output_path,
                "json_path": json_path,
                "redacted_fields": known_fields
            })
            print(f"✅ Redacted: {filename} and saved JSON: {json_path}")
        except Exception as e:
            # Best-effort batch: report the failure and move on to the next file.
            print(f"❌ Failed to redact {filename}: {e}")

    print(f"\n🎉 Completed redaction for {len(redacted_data)} PAN images.")
    return redacted_data
