# Main

In [1]:
from PIL import Image, ImageDraw, ImageFont
import random
from faker import Faker
import os
from datetime import datetime
import albumentations as A
import numpy as np
import cv2
import json
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm
INFO:albumentations.check_version:A new version of Albumentations is available: 2.0.3 (you have 1.4.10). Upgrade using: pip install --upgrade albumentations


In [2]:


def save_png_as_pdf(png_path, form_type="form", output_dir="synthetic_samples"):
    """Convert a PNG image into a single-page PDF with a timestamped file name.

    Args:
        png_path: Path of the PNG image to convert.
        form_type: Prefix used for the generated PDF file name.
        output_dir: Directory that receives the PDF; created when missing.

    Returns:
        The path of the PDF file that was written.
    """
    os.makedirs(output_dir, exist_ok=True)

    # "%f" yields microseconds; dropping the last three digits leaves milliseconds.
    timestamp = datetime.now().strftime("%Y_%m_%d_%H%M%S_%f")[:-3]
    output_file_path = os.path.join(output_dir, f"{form_type}_{timestamp}.pdf")

    # Two calls within the same millisecond would produce the same name and
    # silently overwrite the earlier sample, so disambiguate with a counter.
    duplicate_index = 1
    while os.path.exists(output_file_path):
        output_file_path = os.path.join(
            output_dir, f"{form_type}_{timestamp}_{duplicate_index}.pdf"
        )
        duplicate_index += 1

    with Image.open(png_path) as img:
        # Every non-RGB mode is normalised to RGB before the PDF is written.
        pdf_page = img if img.mode == "RGB" else img.convert("RGB")
        pdf_page.save(output_file_path, "PDF")

    print(f"PDF saved at: {output_file_path}")
    return output_file_path

# Shared Faker instance used by the value generators defined below
fake = Faker()

# NOTE(review): presumably a sentinel marking an intentionally empty field;
# it is not referenced in the visible part of this notebook - confirm before relying on it.
empty_str = '[+EMPTY+]'

# Helper functions for generating random values
def random_string(length):
    """Return a random company-like name such as ``"QWJXZ, LLC"``.

    Args:
        length: Number of random uppercase ASCII letters in the leading token.

    Returns:
        ``length`` uppercase letters, followed by a separator (``" "`` or
        ``", "``) and one of a fixed set of company suffixes.
    """
    # Honour ``length`` instead of a fixed letter count.
    letters = ''.join(random.choices("ABCDEFGHIJKLMNOPQRSTUVWXYZ", k=length))
    separator = random.choice([' ', ', '])
    suffix = random.choice(['Inc', 'LLC', 'Company', 'Incorporated', 'Global'])
    return f"{letters}{separator}{suffix}"

def random_date():
    """Return a random date string in one of three layouts.

    The year is drawn from 1900-2023, the month from 1-12 and the day from
    1-28. The result is formatted as ``M/D/YYYY``, ``YYYY-MM-DD``
    (zero-padded) or ``YYYY-M-D`` (unpadded), chosen at random.
    """
    yyyy = random.randint(1900, 2023)
    mm = random.randint(1, 12)
    dd = random.randint(1, 28)  # capped at 28 so the day exists in every month
    layouts = (
        "{m}/{d}/{y}",
        "{y}-{m:02d}-{d:02d}",
        "{y}-{m}-{d}",
    )
    return random.choice(layouts).format(y=yyyy, m=mm, d=dd)

def random_number(digits):
    """Return a random integer that has exactly ``digits`` decimal digits.

    The smallest possible value is ``10**(digits-1)``, so the result never
    has a leading zero.
    """
    lowest = 10 ** (digits - 1)
    highest = 10 ** digits - 1
    return random.randint(lowest, highest)

def random_money():
    """Return a random money-like string, right-aligned in an 11-character field.

    Six candidates are built on every call and one is picked uniformly:
    two whole amounts with thousands separators, two whole amounts without
    separators, one amount with two decimals, and one empty string (a blank
    field). The duplicated candidates weight the choice toward whole amounts.
    """
    grouped_first = format(random.randint(0, 1000000), ",.0f")
    plain_first = format(random.randint(0, 1000000), ".0f")
    grouped_second = format(random.randint(0, 1000000), ",.0f")
    plain_second = format(random.randint(0, 1000000), ".0f")
    with_cents = format(random.randint(0, 100000000) * 0.01, ".2f")

    candidates = [
        grouped_first,
        plain_first,
        grouped_second,
        plain_second,
        with_cents,
        "",
    ]
    picked = random.choice(candidates)

    # Left-pad with spaces to a fixed width of 11 characters.
    return picked.rjust(11, ' ')

def json_field_value_mapper(file_path):
    """Load field definitions from a JSON file and attach a random value to each.

    The file must contain a list of dicts. Each dict's ``"id"`` selects a
    value generator, tried in this order:

    1. the first key of the generator table that is a substring of the id,
    2. if the id contains ``#``, a numeric string with one digit per ``#``,
    3. otherwise a random money string.

    Args:
        file_path: Path of the JSON file holding the field definitions.

    Returns:
        The loaded list, with a ``"value"`` entry set on every field dict.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Matching is by substring and the first hit wins, so more specific keys
    # must stay ahead of the generic ones they contain (e.g. "industry_code"
    # before "industry", "city_state" before "city", "ein_1040_sch_c" before "ein").
    name_to_value_map = {
        "business_name": fake.company,
        "industry_code": lambda: random.choice(['', fake.numerify("######")]),
        "industry": fake.job,
        # This entry used to repeat the "industry" key, which silently replaced
        # fake.job above; an e-mail generator belongs under "email".
        "email": fake.email,
        "full_name": fake.name,
        "first_name": fake.first_name,
        "last_name": fake.last_name,
        "date": random_date,
        "full_address": fake.address,
        "street_address": lambda: fake.address().splitlines()[0],
        "city_state": lambda: fake.address().splitlines()[1],
        "city": fake.city,
        "state_abbrv": fake.state_abbr,
        "zip": fake.zipcode,
        "money_field": random_money,
        "ein_1040_sch_c": lambda: random.choice(['', fake.numerify("# # # # # # # # #")]),
        "ein": lambda: fake.numerify("##-#######"),
        "ssn": lambda: fake.numerify("###-##-####"),
        "text_field": lambda: "",
        "num_field2": lambda: fake.numerify("##"),
        "num_field3": lambda: fake.numerify("###"),
        "num_field4": lambda: fake.numerify("####"),
        "a28_3x": lambda: random.choice(['X      ', '   X  ', '     X', '']),
        "a28_1x": lambda: random.choice(['X', '']),
        "a28_loss_payee": lambda: random.choice(['X   LOSS PAYEE', '']),
        # Three filled candidates versus one blank: the field is empty about 1 time in 4.
        "a28_str_addr": lambda: random.choice([
            fake.address().splitlines()[0],
            fake.address().splitlines()[0],
            fake.address().splitlines()[0],
            '']),
        "a28_cty_st": lambda: random.choice([
            fake.address().splitlines()[1],
            fake.address().splitlines()[1],
            fake.address().splitlines()[1],
            ''])
    }

    for field in data:
        # A missing or null id is treated as an empty name (falls through to the default).
        field_name = field.get("id") or ""

        # Loose matching: first table key that occurs inside the field id.
        matching_key = next((key for key in name_to_value_map if key in field_name), None)

        if matching_key:
            field["value"] = name_to_value_map[matching_key]()
        elif '#' in field_name:
            # Ids such as "###2a" request as many random digits as they have '#'.
            num_dig = field_name.count('#')
            field["value"] = fake.numerify('#' * num_dig)
        else:
            # Unmapped fields default to a money amount.
            field["value"] = random_money()

    return data



# 1040 Sch C

In [9]:
# Generate synthetic filled-in samples of the "1040 Schedule C" form.
form_type = '1040_sch_c'
# os.path.join keeps the relative paths valid on every platform.
image_path = os.path.join("clf_images", "1040_sch_c", "base", "1040_sch_c.png")
coords_path = os.path.join("clf_images", "1040_sch_c", "base", "coords.json")
output_image_path = "filled_image.png"


def get_augmentation_pipeline(height, width):
    """Build the mild, scan-like augmentation pipeline for one form image.

    Args:
        height: Height in pixels of the image that will be augmented.
        width: Width in pixels of the image that will be augmented.

    Returns:
        An ``A.Compose`` pipeline; every transform carries its own probability ``p``.
    """
    # When the crop fires it keeps 98% of each dimension.
    crop_height = int(height * 0.98)
    crop_width = int(width * 0.98)

    return A.Compose([
        A.Rotate(limit=1, p=0.8),  # slight skew
        A.GaussianBlur(blur_limit=1, p=0.3),  # occasional blur
        A.RandomCrop(height=crop_height, width=crop_width, p=0.5),  # trims a thin margin
        A.GaussNoise(var_limit=(0.0, 1), p=0.5),  # faint sensor-like noise
        A.RandomBrightnessContrast(brightness_limit=0.025, contrast_limit=0.025, p=0.5),
    ])


for i in tqdm(range(1)):
    # copy() loads the pixels, so the source file can be closed right away.
    with Image.open(image_path) as base_img:
        img = base_img.copy()

    # Per-sample font selection; currently a single face and size.
    # NOTE: the font path is Windows-specific.
    font_path = random.choice([r"C:\Windows\Fonts\cour.ttf"])
    font = ImageFont.truetype(font_path, random.randint(14, 14))

    img_width, img_height = img.size
    draw = ImageDraw.Draw(img)
    augmentation_pipeline = get_augmentation_pipeline(img_height, img_width)

    # Field definitions (normalized positions) with freshly generated values.
    fields = json_field_value_mapper(coords_path)

    for field in fields:
        # Positional jitter is disabled, so every field (EIN or not) is drawn
        # exactly at its normalized coordinates scaled to pixels.
        x = int(field["coords"][0] * img_width)
        y = int(field["coords"][1] * img_height)
        draw.text((x, y), field["value"], fill="black", font=font)

    augmented = augmentation_pipeline(image=np.array(img))
    aug_img = augmented['image']

    # The array is in PIL channel order (RGB). cv2.imwrite would interpret it
    # as BGR and swap red and blue, so the file is written through PIL instead.
    Image.fromarray(aug_img).save(output_image_path)

    print(f"Filled image saved to: {output_image_path}")
    # PDF export is intentionally disabled for this form:
    # save_png_as_pdf(output_image_path, form_type)


100%|██████████| 1/1 [00:00<00:00, 29.36it/s]

Filled image saved to: filled_image.png





# 1120S P1

In [None]:
# Generate synthetic filled-in samples of Form 1120-S, page 1.
form_type = '1120S_p1'
# os.path.join keeps the relative paths valid on every platform.
image_path = os.path.join("clf_images", "1120S_p1", "base", "1120S_p1.png")
coords_path = os.path.join("clf_images", "1120S_p1", "base", "coords.json")
output_image_path = "filled_image.png"


def get_augmentation_pipeline(height, width):
    """Build the mild, scan-like augmentation pipeline for one form image.

    Args:
        height: Height in pixels of the image that will be augmented.
        width: Width in pixels of the image that will be augmented.

    Returns:
        An ``A.Compose`` pipeline; every transform carries its own probability ``p``.
    """
    # When the crop fires it keeps 98% of each dimension.
    crop_height = int(height * 0.98)
    crop_width = int(width * 0.98)

    return A.Compose([
        A.Rotate(limit=0.5, p=0.8),  # slight skew
        A.GaussianBlur(blur_limit=1, p=0.3),  # occasional blur
        A.RandomCrop(height=crop_height, width=crop_width, p=0.5),  # trims a thin margin
        A.GaussNoise(var_limit=(0.0, 3), p=0.5),  # faint sensor-like noise
        A.RandomBrightnessContrast(brightness_limit=0.025, contrast_limit=0.025, p=0.5),
    ])


for i in range(1):
    # copy() loads the pixels, so the source file can be closed right away.
    with Image.open(image_path) as base_img:
        img = base_img.copy()

    # Per-sample font selection; currently a single face and size.
    # NOTE: the font path is Windows-specific.
    font_path = random.choice([r"C:\Windows\Fonts\cour.ttf"])
    font = ImageFont.truetype(font_path, random.randint(14, 14))

    img_width, img_height = img.size
    draw = ImageDraw.Draw(img)
    augmentation_pipeline = get_augmentation_pipeline(img_height, img_width)

    # Field definitions (normalized positions) with freshly generated values.
    fields = json_field_value_mapper(coords_path)

    for field in fields:
        # Positional jitter is disabled, so every field is drawn exactly at
        # its normalized coordinates scaled to pixels.
        x = int(field["coords"][0] * img_width)
        y = int(field["coords"][1] * img_height)
        draw.text((x, y), field["value"], fill="black", font=font)

    augmented = augmentation_pipeline(image=np.array(img))
    aug_img = augmented['image']

    # The array is in PIL channel order (RGB). cv2.imwrite would interpret it
    # as BGR and swap red and blue, so the file is written through PIL instead.
    Image.fromarray(aug_img).save(output_image_path)

    print(f"Filled image saved to: {output_image_path}")
    save_png_as_pdf(output_image_path, form_type)


Filled image saved to: filled_image.png


# 1120S_bal_sheet

In [126]:
# Generate synthetic filled-in samples of the Form 1120-S balance sheet page.
form_type = '1120S_bal_sheet'
# os.path.join keeps the relative paths valid on every platform.
image_path = os.path.join("clf_images", "1120S_bal_sheet", "base", "1120S_p4.png")
coords_path = os.path.join("clf_images", "1120S_bal_sheet", "base", "coords.json")
output_image_path = "filled_image.png"


def get_augmentation_pipeline(height, width):
    """Build the mild, scan-like augmentation pipeline for one form image.

    Args:
        height: Height in pixels of the image that will be augmented.
        width: Width in pixels of the image that will be augmented.

    Returns:
        An ``A.Compose`` pipeline; every transform carries its own probability ``p``.
    """
    # When the crop fires it keeps 98% of each dimension.
    crop_height = int(height * 0.98)
    crop_width = int(width * 0.98)

    return A.Compose([
        A.Rotate(limit=0.5, p=0.8),  # slight skew
        A.GaussianBlur(blur_limit=1, p=0.3),  # occasional blur
        A.RandomCrop(height=crop_height, width=crop_width, p=0.5),  # trims a thin margin
        A.GaussNoise(var_limit=(0.0, 3), p=0.5),  # faint sensor-like noise
        A.RandomBrightnessContrast(brightness_limit=0.025, contrast_limit=0.025, p=0.5),
    ])


for i in range(100):
    # copy() loads the pixels, so the source file can be closed right away.
    with Image.open(image_path) as base_img:
        img = base_img.copy()

    # Per-sample font selection; currently a single face and size.
    # NOTE: the font path is Windows-specific.
    font_path = random.choice([r"C:\Windows\Fonts\cour.ttf"])
    font = ImageFont.truetype(font_path, random.randint(14, 14))

    img_width, img_height = img.size
    draw = ImageDraw.Draw(img)
    augmentation_pipeline = get_augmentation_pipeline(img_height, img_width)

    # Field definitions (normalized positions) with freshly generated values.
    fields = json_field_value_mapper(coords_path)

    for field in fields:
        # Positional jitter is disabled, so every field is drawn exactly at
        # its normalized coordinates scaled to pixels.
        x = int(field["coords"][0] * img_width)
        y = int(field["coords"][1] * img_height)
        draw.text((x, y), field["value"], fill="black", font=font)

    augmented = augmentation_pipeline(image=np.array(img))
    aug_img = augmented['image']

    # The array is in PIL channel order (RGB). cv2.imwrite would interpret it
    # as BGR and swap red and blue, so the file is written through PIL instead.
    Image.fromarray(aug_img).save(output_image_path)

    print(f"Filled image saved to: {output_image_path}")

    save_png_as_pdf(output_image_path, form_type)

Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120S_bal_sheet_2024_12_27_085138_480.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120S_bal_sheet_2024_12_27_085138_516.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120S_bal_sheet_2024_12_27_085138_552.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120S_bal_sheet_2024_12_27_085138_649.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120S_bal_sheet_2024_12_27_085138_742.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120S_bal_sheet_2024_12_27_085138_773.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120S_bal_sheet_2024_12_27_085138_805.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120S_bal_sheet_2024_12_27_085138_836.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120S_bal_sheet_

# 1065 P1

In [127]:
# Generate synthetic filled-in samples of Form 1065, page 1.
form_type = '1065_p1'
# os.path.join keeps the relative paths valid on every platform.
image_path = os.path.join("clf_images", "1065_p1", "base", "1065_p1_2024.png")
coords_path = os.path.join("clf_images", "1065_p1", "base", "coords.json")
output_image_path = "filled_image.png"


def get_augmentation_pipeline(height, width):
    """Build the mild, scan-like augmentation pipeline for one form image.

    Args:
        height: Height in pixels of the image that will be augmented.
        width: Width in pixels of the image that will be augmented.

    Returns:
        An ``A.Compose`` pipeline; every transform carries its own probability ``p``.
    """
    # When the crop fires it keeps 98% of each dimension.
    crop_height = int(height * 0.98)
    crop_width = int(width * 0.98)

    return A.Compose([
        A.Rotate(limit=1, p=0.8),  # slight skew
        A.GaussianBlur(blur_limit=1, p=0.3),  # occasional blur
        A.RandomCrop(height=crop_height, width=crop_width, p=0.5),  # trims a thin margin
        A.GaussNoise(var_limit=(0.0, 3), p=0.5),  # faint sensor-like noise
        A.RandomBrightnessContrast(brightness_limit=0.025, contrast_limit=0.025, p=0.5),
    ])


for i in range(100):
    # copy() loads the pixels, so the source file can be closed right away.
    with Image.open(image_path) as base_img:
        img = base_img.copy()

    # Per-sample font selection; currently a single face and size.
    # NOTE: the font path is Windows-specific.
    font_path = random.choice([r"C:\Windows\Fonts\cour.ttf"])
    font = ImageFont.truetype(font_path, random.randint(14, 14))

    img_width, img_height = img.size
    draw = ImageDraw.Draw(img)
    augmentation_pipeline = get_augmentation_pipeline(img_height, img_width)

    # Field definitions (normalized positions) with freshly generated values.
    fields = json_field_value_mapper(coords_path)

    for field in fields:
        # Positional jitter is disabled, so every field is drawn exactly at
        # its normalized coordinates scaled to pixels.
        x = int(field["coords"][0] * img_width)
        y = int(field["coords"][1] * img_height)
        draw.text((x, y), field["value"], fill="black", font=font)

    augmented = augmentation_pipeline(image=np.array(img))
    aug_img = augmented['image']

    # The array is in PIL channel order (RGB). cv2.imwrite would interpret it
    # as BGR and swap red and blue, so the file is written through PIL instead.
    Image.fromarray(aug_img).save(output_image_path)

    print(f"Filled image saved to: {output_image_path}")
    save_png_as_pdf(output_image_path, form_type)

Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_p1_2024_12_27_085145_137.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_p1_2024_12_27_085145_244.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_p1_2024_12_27_085145_292.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_p1_2024_12_27_085145_344.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_p1_2024_12_27_085145_456.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_p1_2024_12_27_085145_503.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_p1_2024_12_27_085145_534.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_p1_2024_12_27_085145_649.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_p1_2024_12_27_085145_698.pdf
Filled image saved to: filled_image.png
PDF sa

# 1065_bal_sheet

In [128]:
# Generate synthetic filled-in samples of the Form 1065 balance sheet page.
form_type = '1065_bal_sheet'
# os.path.join keeps the relative paths valid on every platform.
image_path = os.path.join("clf_images", "1065_bal_sheet", "base", "1065_p6_2024.png")
coords_path = os.path.join("clf_images", "1065_bal_sheet", "base", "coords.json")
output_image_path = "filled_image.png"


def get_augmentation_pipeline(height, width):
    """Build the mild, scan-like augmentation pipeline for one form image.

    Args:
        height: Height in pixels of the image that will be augmented.
        width: Width in pixels of the image that will be augmented.

    Returns:
        An ``A.Compose`` pipeline; every transform carries its own probability ``p``.
    """
    # When the crop fires it keeps 98% of each dimension.
    crop_height = int(height * 0.98)
    crop_width = int(width * 0.98)

    return A.Compose([
        A.Rotate(limit=0.5, p=0.8),  # slight skew
        A.GaussianBlur(blur_limit=1, p=0.3),  # occasional blur
        A.RandomCrop(height=crop_height, width=crop_width, p=0.5),  # trims a thin margin
        A.GaussNoise(var_limit=(0.0, 3), p=0.5),  # faint sensor-like noise
        A.RandomBrightnessContrast(brightness_limit=0.025, contrast_limit=0.025, p=0.5),
    ])


for i in range(100):
    # copy() loads the pixels, so the source file can be closed right away.
    with Image.open(image_path) as base_img:
        img = base_img.copy()

    # Per-sample font selection; currently a single face and size.
    # NOTE: the font path is Windows-specific.
    font_path = random.choice([r"C:\Windows\Fonts\cour.ttf"])
    font = ImageFont.truetype(font_path, random.randint(14, 14))

    img_width, img_height = img.size
    draw = ImageDraw.Draw(img)
    augmentation_pipeline = get_augmentation_pipeline(img_height, img_width)

    # Field definitions (normalized positions) with freshly generated values.
    fields = json_field_value_mapper(coords_path)

    for field in fields:
        # Positional jitter is disabled, so every field is drawn exactly at
        # its normalized coordinates scaled to pixels.
        x = int(field["coords"][0] * img_width)
        y = int(field["coords"][1] * img_height)
        draw.text((x, y), field["value"], fill="black", font=font)

    augmented = augmentation_pipeline(image=np.array(img))
    aug_img = augmented['image']

    # The array is in PIL channel order (RGB). cv2.imwrite would interpret it
    # as BGR and swap red and blue, so the file is written through PIL instead.
    Image.fromarray(aug_img).save(output_image_path)

    print(f"Filled image saved to: {output_image_path}")
    save_png_as_pdf(output_image_path, form_type)



Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_bal_sheet_2024_12_27_085152_333.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_bal_sheet_2024_12_27_085152_462.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_bal_sheet_2024_12_27_085152_590.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_bal_sheet_2024_12_27_085152_637.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_bal_sheet_2024_12_27_085152_667.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_bal_sheet_2024_12_27_085152_725.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_bal_sheet_2024_12_27_085152_772.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_bal_sheet_2024_12_27_085152_819.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_bal_sheet_2024_12_2

# 1120 P1

In [129]:
# Generate synthetic filled-in samples of Form 1120, page 1.
form_type = '1120_p1'
# os.path.join keeps the relative paths valid on every platform.
image_path = os.path.join("clf_images", "1120_p1", "base", "1120_p1.png")
coords_path = os.path.join("clf_images", "1120_p1", "base", "coords.json")
output_image_path = "filled_image.png"


def get_augmentation_pipeline(height, width):
    """Build the mild, scan-like augmentation pipeline for one form image.

    Args:
        height: Height in pixels of the image that will be augmented.
        width: Width in pixels of the image that will be augmented.

    Returns:
        An ``A.Compose`` pipeline; every transform carries its own probability ``p``.
    """
    # When the crop fires it keeps 98% of each dimension.
    crop_height = int(height * 0.98)
    crop_width = int(width * 0.98)

    return A.Compose([
        A.Rotate(limit=0.5, p=0.5),  # slight skew
        A.GaussianBlur(blur_limit=1, p=0.3),  # occasional blur
        A.RandomCrop(height=crop_height, width=crop_width, p=0.5),  # trims a thin margin
        A.GaussNoise(var_limit=(0.0, 3), p=0.5),  # faint sensor-like noise
        A.RandomBrightnessContrast(brightness_limit=0.025, contrast_limit=0.025, p=0.5),
    ])


for i in range(100):
    # copy() loads the pixels, so the source file can be closed right away.
    with Image.open(image_path) as base_img:
        img = base_img.copy()

    # Per-sample font selection; currently a single face and size.
    # NOTE: the font path is Windows-specific.
    font_path = random.choice([r"C:\Windows\Fonts\cour.ttf"])
    font = ImageFont.truetype(font_path, random.randint(14, 14))

    img_width, img_height = img.size
    draw = ImageDraw.Draw(img)
    augmentation_pipeline = get_augmentation_pipeline(img_height, img_width)

    # Field definitions (normalized positions) with freshly generated values.
    fields = json_field_value_mapper(coords_path)

    for field in fields:
        # Positional jitter is disabled, so every field is drawn exactly at
        # its normalized coordinates scaled to pixels.
        x = int(field["coords"][0] * img_width)
        y = int(field["coords"][1] * img_height)
        draw.text((x, y), field["value"], fill="black", font=font)

    augmented = augmentation_pipeline(image=np.array(img))
    aug_img = augmented['image']

    # The array is in PIL channel order (RGB). cv2.imwrite would interpret it
    # as BGR and swap red and blue, so the file is written through PIL instead.
    Image.fromarray(aug_img).save(output_image_path)

    print(f"Filled image saved to: {output_image_path}")
    save_png_as_pdf(output_image_path, form_type)

Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120_p1_2024_12_27_085200_466.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120_p1_2024_12_27_085200_508.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120_p1_2024_12_27_085200_630.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120_p1_2024_12_27_085200_739.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120_p1_2024_12_27_085200_781.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120_p1_2024_12_27_085200_884.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120_p1_2024_12_27_085200_927.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120_p1_2024_12_27_085200_969.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120_p1_2024_12_27_085201_072.pdf
Filled image saved to: filled_image.png
PDF sa

# 1120_bal_sheet

In [130]:
# Generate synthetic filled-in samples of the Form 1120 balance sheet page.
form_type = '1120_bal_sheet'
# os.path.join keeps the relative paths valid on every platform.
image_path = os.path.join("clf_images", "1120_bal_sheet", "base", "1120_bal_sheet.png")
coords_path = os.path.join("clf_images", "1120_bal_sheet", "base", "coords.json")
output_image_path = "filled_image.png"


def get_augmentation_pipeline(height, width):
    """Build the mild, scan-like augmentation pipeline for one form image.

    Args:
        height: Height in pixels of the image that will be augmented.
        width: Width in pixels of the image that will be augmented.

    Returns:
        An ``A.Compose`` pipeline; every transform carries its own probability ``p``.
    """
    # When the crop fires it keeps 98% of each dimension.
    crop_height = int(height * 0.98)
    crop_width = int(width * 0.98)

    return A.Compose([
        A.Rotate(limit=0.5, p=0.5),  # slight skew
        A.GaussianBlur(blur_limit=1, p=0.3),  # occasional blur
        A.RandomCrop(height=crop_height, width=crop_width, p=0.5),  # trims a thin margin
        A.GaussNoise(var_limit=(0.0, 3), p=0.5),  # faint sensor-like noise
        A.RandomBrightnessContrast(brightness_limit=0.025, contrast_limit=0.025, p=0.5),
    ])


for i in range(100):
    # copy() loads the pixels, so the source file can be closed right away.
    with Image.open(image_path) as base_img:
        img = base_img.copy()

    # Per-sample font selection; currently a single face and size.
    # NOTE: the font path is Windows-specific.
    font_path = random.choice([r"C:\Windows\Fonts\cour.ttf"])
    font = ImageFont.truetype(font_path, random.randint(14, 14))

    img_width, img_height = img.size
    draw = ImageDraw.Draw(img)
    augmentation_pipeline = get_augmentation_pipeline(img_height, img_width)

    # Field definitions (normalized positions) with freshly generated values.
    fields = json_field_value_mapper(coords_path)

    for field in fields:
        # Positional jitter is disabled, so every field is drawn exactly at
        # its normalized coordinates scaled to pixels.
        x = int(field["coords"][0] * img_width)
        y = int(field["coords"][1] * img_height)
        draw.text((x, y), field["value"], fill="black", font=font)

    augmented = augmentation_pipeline(image=np.array(img))
    aug_img = augmented['image']

    # The array is in PIL channel order (RGB). cv2.imwrite would interpret it
    # as BGR and swap red and blue, so the file is written through PIL instead.
    Image.fromarray(aug_img).save(output_image_path)

    print(f"Filled image saved to: {output_image_path}")
    save_png_as_pdf(output_image_path, form_type)

Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120_bal_sheet_2024_12_27_085208_641.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120_bal_sheet_2024_12_27_085208_690.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120_bal_sheet_2024_12_27_085208_811.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120_bal_sheet_2024_12_27_085208_854.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120_bal_sheet_2024_12_27_085208_902.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120_bal_sheet_2024_12_27_085208_951.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120_bal_sheet_2024_12_27_085209_067.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120_bal_sheet_2024_12_27_085209_124.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120_bal_sheet_2024_12_2

# 1040 P1

In [191]:
def get_augmentation_pipeline(height, width):
    """Build the light, scan-style augmentation pipeline for one form image.

    Args:
        height: Height in pixels of the image that will be augmented.
        width: Width in pixels of the image that will be augmented.

    Returns:
        An ``A.Compose`` pipeline; each transform fires independently with
        its own probability ``p``.
    """
    # The optional crop keeps 98% of each dimension, so cropped samples come
    # out slightly smaller than the base image.
    crop_height = int(height * 0.98)
    crop_width = int(width * 0.98)

    return A.Compose([
        A.Rotate(limit=0.5, p=0.3),  # skew limited to half a degree
        A.GaussianBlur(blur_limit=1, p=0.3),  # occasional mild blur
        A.RandomCrop(height=crop_height, width=crop_width, p=0.5),  # trim ~2% at a random offset
        A.GaussNoise(var_limit=(0.0, 3), p=0.5),  # faint pixel noise
        A.RandomBrightnessContrast(brightness_limit=0.025, contrast_limit=0.025, p=0.5),  # small exposure drift
    ])


# Field ids that are drawn or skipped together, decided once per sample.
# NOTE(review): presumably the second filer's details — confirm against coords.json.
second_filer_ids = (
    'first_name2',
    'last_name2',
    '###2a',
    '##2b',
    '####2c',
)

# Positional jitter is currently disabled. Set these to small fractions of the
# page size (e.g. a few thousandths) to shift every field by that amount.
x_noise = 0
y_noise = 0

font_size = 14

for i in range(100):
    # Reload the blank form every iteration because drawing mutates the image
    image_path = r"clf_images\1040_p1\base\1040_p1.png"
    output_image_path = "filled_image.png"
    form_type = '1040_p1'
    img = Image.open(image_path)

    font_path = random.choice([r"C:\Windows\Fonts\cour.ttf"])
    font = ImageFont.truetype(font_path, font_size)

    img_width, img_height = img.size
    draw = ImageDraw.Draw(img)
    augmentation_pipeline = get_augmentation_pipeline(img_height, img_width)

    # NOTE(review): p2_info is not read in this cell; kept in case a helper
    # defined elsewhere reads it as a global — confirm before removing.
    p2_info = random.choice([True,False])

    # Field records carry normalized (0-1) coordinates plus the value to draw
    fields = json_field_value_mapper(r'clf_images\1040_p1\base\coords.json')

    show_gtr2 = random.choice([True,False])
    for field in fields:
        if field['id'] in second_filer_ids and not show_gtr2:
            # Leave this group blank for the current sample
            continue

        # Convert normalized coordinates to absolute pixel coordinates
        x = int((field["coords"][0] + x_noise) * img_width)
        y = int((field["coords"][1] + y_noise) * img_height)
        draw.text((x, y), field["value"], fill="black", font=font)

    augmented = augmentation_pipeline(image=np.array(img))
    aug_img = augmented['image']

    # The array came from PIL (RGB order), so write it back through PIL.
    # cv2.imwrite would treat it as BGR and swap red/blue, and it reports
    # failure only through an easily ignored return value.
    Image.fromarray(aug_img).save(output_image_path)

    print(f"Filled image saved to: {output_image_path}")

    save_png_as_pdf(output_image_path, form_type)

Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1040_p1_2024_12_27_085516_945.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1040_p1_2024_12_27_085517_055.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1040_p1_2024_12_27_085517_091.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1040_p1_2024_12_27_085517_193.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1040_p1_2024_12_27_085517_301.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1040_p1_2024_12_27_085517_408.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1040_p1_2024_12_27_085517_449.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1040_p1_2024_12_27_085517_546.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1040_p1_2024_12_27_085517_643.pdf
Filled image saved to: filled_image.png
PDF sa

# 1065 K1

In [132]:
def get_augmentation_pipeline(height, width):
    """Build the light, scan-style augmentation pipeline for one form image.

    Args:
        height: Height in pixels of the image that will be augmented.
        width: Width in pixels of the image that will be augmented.

    Returns:
        An ``A.Compose`` pipeline; each transform fires independently with
        its own probability ``p``.
    """
    # The optional crop keeps 98% of each dimension, so cropped samples come
    # out slightly smaller than the base image.
    crop_height = int(height * 0.98)
    crop_width = int(width * 0.98)

    return A.Compose([
        A.Rotate(limit=1, p=0.5),  # skew limited to one degree
        A.GaussianBlur(blur_limit=1, p=0.3),  # occasional mild blur
        A.RandomCrop(height=crop_height, width=crop_width, p=0.5),  # trim ~2% at a random offset
        A.GaussNoise(var_limit=(0.0, 3), p=0.5),  # faint pixel noise
        A.RandomBrightnessContrast(brightness_limit=0.025, contrast_limit=0.025, p=0.5),  # small exposure drift
    ])


# Positional jitter is currently disabled. Set these to small fractions of the
# page size (e.g. a few thousandths) to shift every field by that amount.
x_noise = 0
y_noise = 0

font_size = 14

for i in range(100):
    # Reload the blank form every iteration because drawing mutates the image
    image_path = r"clf_images\1065_k1\base\1065_k1_2023.png"
    output_image_path = "filled_image.png"
    form_type = '1065_k1'
    img = Image.open(image_path)

    font_path = random.choice([r"C:\Windows\Fonts\cour.ttf"])
    font = ImageFont.truetype(font_path, font_size)

    img_width, img_height = img.size
    draw = ImageDraw.Draw(img)
    augmentation_pipeline = get_augmentation_pipeline(img_height, img_width)

    # NOTE(review): p2_info is not read in this cell; kept in case a helper
    # defined elsewhere reads it as a global — confirm before removing.
    p2_info = random.choice([True,False])

    # Field records carry normalized (0-1) coordinates plus the value to draw
    fields = json_field_value_mapper(r'clf_images\1065_k1\base\coords.json')

    # Every field is placed the same way while jitter is disabled, so there is
    # no per-id special case here.
    for field in fields:
        # Convert normalized coordinates to absolute pixel coordinates
        x = int((field["coords"][0] + x_noise) * img_width)
        y = int((field["coords"][1] + y_noise) * img_height)
        draw.text((x, y), field["value"], fill="black", font=font)

    augmented = augmentation_pipeline(image=np.array(img))
    aug_img = augmented['image']

    # The array came from PIL (RGB order), so write it back through PIL.
    # cv2.imwrite would treat it as BGR and swap red/blue, and it reports
    # failure only through an easily ignored return value.
    Image.fromarray(aug_img).save(output_image_path)

    print(f"Filled image saved to: {output_image_path}")
    save_png_as_pdf(output_image_path, form_type)


Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_k1_2024_12_27_085224_476.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_k1_2024_12_27_085224_538.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_k1_2024_12_27_085224_580.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_k1_2024_12_27_085224_701.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_k1_2024_12_27_085224_816.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_k1_2024_12_27_085224_853.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_k1_2024_12_27_085224_896.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_k1_2024_12_27_085224_940.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1065_k1_2024_12_27_085224_981.pdf
Filled image saved to: filled_image.png
PDF sa

# 1120S K1

In [133]:
def get_augmentation_pipeline(height, width):
    """Build the light, scan-style augmentation pipeline for one form image.

    Args:
        height: Height in pixels of the image that will be augmented.
        width: Width in pixels of the image that will be augmented.

    Returns:
        An ``A.Compose`` pipeline; each transform fires independently with
        its own probability ``p``.
    """
    # The optional crop keeps 98% of each dimension, so cropped samples come
    # out slightly smaller than the base image.
    crop_height = int(height * 0.98)
    crop_width = int(width * 0.98)

    return A.Compose([
        A.Rotate(limit=0.5, p=0.5),  # skew limited to half a degree
        A.GaussianBlur(blur_limit=1, p=0.3),  # occasional mild blur
        A.RandomCrop(height=crop_height, width=crop_width, p=0.5),  # trim ~2% at a random offset
        A.GaussNoise(var_limit=(0.0, 3), p=0.5),  # faint pixel noise
        A.RandomBrightnessContrast(brightness_limit=0.025, contrast_limit=0.025, p=0.5),  # small exposure drift
    ])


# Positional jitter is currently disabled. Set these to small fractions of the
# page size (e.g. a few thousandths) to shift every field by that amount.
x_noise = 0
y_noise = 0

font_size = 14

for i in range(100):
    # Reload the blank form every iteration because drawing mutates the image
    image_path = r"clf_images\1120S_k1\base\1120S_k1.png"
    output_image_path = "filled_image.png"
    form_type = '1120S_k1'
    img = Image.open(image_path)

    font_path = random.choice([r"C:\Windows\Fonts\cour.ttf"])
    font = ImageFont.truetype(font_path, font_size)

    img_width, img_height = img.size
    draw = ImageDraw.Draw(img)
    augmentation_pipeline = get_augmentation_pipeline(img_height, img_width)

    # NOTE(review): p2_info is not read in this cell; kept in case a helper
    # defined elsewhere reads it as a global — confirm before removing.
    p2_info = random.choice([True,False])

    # Field records carry normalized (0-1) coordinates plus the value to draw.
    # NOTE(review): this path spells the folder "1120s_k1" while image_path uses
    # "1120S_k1"; that only resolves on a case-insensitive filesystem — confirm
    # the real folder name and align the two.
    fields = json_field_value_mapper(r'clf_images\1120s_k1\base\coords.json')

    # Every field is placed the same way while jitter is disabled, so there is
    # no per-field special case here.
    for field in fields:
        # Convert normalized coordinates to absolute pixel coordinates
        x = int((field["coords"][0] + x_noise) * img_width)
        y = int((field["coords"][1] + y_noise) * img_height)
        draw.text((x, y), field["value"], fill="black", font=font)

    augmented = augmentation_pipeline(image=np.array(img))
    aug_img = augmented['image']

    # The array came from PIL (RGB order), so write it back through PIL.
    # cv2.imwrite would treat it as BGR and swap red/blue, and it reports
    # failure only through an easily ignored return value.
    Image.fromarray(aug_img).save(output_image_path)

    print(f"Filled image saved to: {output_image_path}")
    save_png_as_pdf(output_image_path, form_type)


Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120S_k1_2024_12_27_085232_572.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120S_k1_2024_12_27_085232_626.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120S_k1_2024_12_27_085232_743.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120S_k1_2024_12_27_085232_786.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120S_k1_2024_12_27_085232_825.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120S_k1_2024_12_27_085232_864.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120S_k1_2024_12_27_085232_974.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120S_k1_2024_12_27_085233_076.pdf
Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\1120S_k1_2024_12_27_085233_190.pdf
Filled image saved to: filled_image.p

# ACORD 25

In [41]:
# Generate one filled and augmented ACORD 25 sample, then export it as a PDF.
image_path = r"clf_images\acord_25\base\acord_25_blank.png"
output_image_path = "filled_image.png"
form_type = 'acord_25'
img = Image.open(image_path)

# One of two typefaces is picked per sample; the size is fixed
font_size = 14
font_path = random.choice([r"C:\Windows\Fonts\cour.ttf", r"C:\Windows\Fonts\arial.ttf"])
font = ImageFont.truetype(font_path, font_size)

img_width, img_height = img.size
draw = ImageDraw.Draw(img)


def get_augmentation_pipeline(height, width):
    """Build the light, scan-style augmentation pipeline for one form image.

    Args:
        height: Height in pixels of the image that will be augmented.
        width: Width in pixels of the image that will be augmented.

    Returns:
        An ``A.Compose`` pipeline; each transform fires independently with
        its own probability ``p``.
    """
    # The optional crop keeps 98% of each dimension, so a cropped sample comes
    # out slightly smaller than the base image.
    crop_height = int(height * 0.98)
    crop_width = int(width * 0.98)

    return A.Compose([
        A.Rotate(limit=0.5, p=0.5),  # skew limited to half a degree
        A.RandomCrop(height=crop_height, width=crop_width, p=0.5),  # trim ~2% at a random offset
        A.GaussNoise(var_limit=(0.0, 3), p=0.5),  # faint pixel noise
        A.RandomBrightnessContrast(brightness_limit=0.025, contrast_limit=0.025, p=0.5),  # small exposure drift
    ])


augmentation_pipeline = get_augmentation_pipeline(img_height, img_width)

# NOTE(review): p2_info is not read in this cell; kept in case a helper defined
# elsewhere reads it as a global — confirm before removing.
p2_info = random.choice([True,False])

# Field records carry normalized (0-1) coordinates plus the value to draw
fields = json_field_value_mapper(r'clf_images\acord_25\base\coords.json')

# Positional jitter is currently disabled. Defining the offsets before the loop
# means no field can hit them unset or inherit values from another cell.
x_noise = 0
y_noise = 0

for field in fields:
    # Convert normalized coordinates to absolute pixel coordinates
    x = int((field["coords"][0] + x_noise) * img_width)
    y = int((field["coords"][1] + y_noise) * img_height)
    draw.text((x, y), field["value"], fill="black", font=font)

augmented = augmentation_pipeline(image=np.array(img))
aug_img = augmented['image']

# The array came from PIL (RGB order), so write it back through PIL.
# cv2.imwrite would treat it as BGR and swap red/blue, and it reports failure
# only through an easily ignored return value.
Image.fromarray(aug_img).save(output_image_path)

print(f"Filled image saved to: {output_image_path}")
save_png_as_pdf(output_image_path, form_type)


Filled image saved to: filled_image.png


# ACORD 28

In [7]:
# Generate one filled and augmented ACORD 28 sample, then export it as a PDF.
image_path = r"clf_images\acord_28\base\acord_28_blank.png"
output_image_path = "filled_image.png"
form_type = 'acord_28'
img = Image.open(image_path)

font_size = 14
font_path = random.choice([r"C:\Windows\Fonts\cour.ttf"])
font = ImageFont.truetype(font_path, font_size)

img_width, img_height = img.size
draw = ImageDraw.Draw(img)


def get_augmentation_pipeline(height, width):
    """Build the light, scan-style augmentation pipeline for one form image.

    Args:
        height: Height in pixels of the image that will be augmented.
        width: Width in pixels of the image that will be augmented.

    Returns:
        An ``A.Compose`` pipeline; each transform fires independently with
        its own probability ``p``.
    """
    # The optional crop keeps 98% of each dimension, so a cropped sample comes
    # out slightly smaller than the base image.
    crop_height = int(height * 0.98)
    crop_width = int(width * 0.98)

    return A.Compose([
        A.Rotate(limit=1, p=0.5),  # skew limited to one degree
        A.GaussianBlur(blur_limit=1, p=0.25),  # occasional mild blur
        A.RandomCrop(height=crop_height, width=crop_width, p=0.25),  # trim ~2% at a random offset
        A.GaussNoise(var_limit=(0.0, 3), p=0.25),  # faint pixel noise
        A.RandomBrightnessContrast(brightness_limit=0.025, contrast_limit=0.025, p=0.5),  # small exposure drift
    ])


augmentation_pipeline = get_augmentation_pipeline(img_height, img_width)

# NOTE(review): p2_info is not read in this cell; kept in case a helper defined
# elsewhere reads it as a global — confirm before removing.
p2_info = random.choice([True,False])

# Field records carry normalized (0-1) coordinates plus the value to draw
fields = json_field_value_mapper(r'clf_images\acord_28\base\coords.json')

# Positional jitter is currently disabled. Defining the offsets before the loop
# means no field can hit them unset or inherit values from another cell.
x_noise = 0
y_noise = 0

for field in fields:
    # Convert normalized coordinates to absolute pixel coordinates
    x = int((field["coords"][0] + x_noise) * img_width)
    y = int((field["coords"][1] + y_noise) * img_height)
    draw.text((x, y), field["value"], fill="black", font=font)

augmented = augmentation_pipeline(image=np.array(img))
aug_img = augmented['image']

# The array came from PIL (RGB order), so write it back through PIL.
# cv2.imwrite would treat it as BGR and swap red/blue, and it reports failure
# only through an easily ignored return value.
Image.fromarray(aug_img).save(output_image_path)

print(f"Filled image saved to: {output_image_path}")
save_png_as_pdf(output_image_path, form_type)


Filled image saved to: filled_image.png
PDF saved at: synthetic_samples\acord_28_2025_02_09_102521_285.pdf


In [1]:
# NOTE(review): bare undefined name — presumably a deliberate "stop here" marker
# that raises NameError so Run All halts before the scratch cells below.
# Confirm the intent, and remove this cell once those scratch cells are cleaned up.
XXX

NameError: name 'XXX' is not defined

# ACORD 25

In [None]:
# Generate one filled and augmented ACORD 25 sample as a PNG only; this cell
# does not perform the PDF export itself.
image_path = r"clf_images\acord_25\base\acord_25_blank.png"
output_image_path = "filled_image.png"
form_type = 'acord_25'
img = Image.open(image_path)

# One of two typefaces is picked per sample; the size is fixed
font_size = 14
font_path = random.choice([r"C:\Windows\Fonts\cour.ttf", r"C:\Windows\Fonts\arial.ttf"])
font = ImageFont.truetype(font_path, font_size)

img_width, img_height = img.size
draw = ImageDraw.Draw(img)


def get_augmentation_pipeline(height, width):
    """Build the light, scan-style augmentation pipeline for one form image.

    Args:
        height: Height in pixels of the image that will be augmented.
        width: Width in pixels of the image that will be augmented.

    Returns:
        An ``A.Compose`` pipeline; each transform fires independently with
        its own probability ``p``.
    """
    # The optional crop keeps 98% of each dimension, so a cropped sample comes
    # out slightly smaller than the base image.
    crop_height = int(height * 0.98)
    crop_width = int(width * 0.98)

    return A.Compose([
        A.Rotate(limit=0.5, p=0.5),  # skew limited to half a degree
        A.RandomCrop(height=crop_height, width=crop_width, p=0.5),  # trim ~2% at a random offset
        A.GaussNoise(var_limit=(0.0, 3), p=0.5),  # faint pixel noise
        A.RandomBrightnessContrast(brightness_limit=0.025, contrast_limit=0.025, p=0.5),  # small exposure drift
    ])


augmentation_pipeline = get_augmentation_pipeline(img_height, img_width)

# NOTE(review): p2_info is not read in this cell; kept in case a helper defined
# elsewhere reads it as a global — confirm before removing.
p2_info = random.choice([True,False])

# Field records carry normalized (0-1) coordinates plus the value to draw
fields = json_field_value_mapper(r'clf_images\acord_25\base\coords.json')

# Positional jitter is currently disabled. Defining the offsets before the loop
# means no field can hit them unset or inherit values from another cell.
x_noise = 0
y_noise = 0

for field in fields:
    # Convert normalized coordinates to absolute pixel coordinates
    x = int((field["coords"][0] + x_noise) * img_width)
    y = int((field["coords"][1] + y_noise) * img_height)
    draw.text((x, y), field["value"], fill="black", font=font)

augmented = augmentation_pipeline(image=np.array(img))
aug_img = augmented['image']

# The array came from PIL (RGB order), so write it back through PIL.
# cv2.imwrite would treat it as BGR and swap red/blue, and it reports failure
# only through an easily ignored return value.
Image.fromarray(aug_img).save(output_image_path)

print(f"Filled image saved to: {output_image_path}")


Filled image saved to: filled_image.png


In [None]:

# Convert the PNG at output_image_path into a timestamped PDF under
# synthetic_samples/. Depends on output_image_path and form_type still holding
# the values set by the generation cell above, so run it right after that cell.
save_png_as_pdf(output_image_path, form_type)

In [None]:
# Scratch probe: display one value from Faker's job provider as the cell output.
fake.job()

In [None]:
# Scratch probe (repeat): display another value from Faker's job provider.
fake.job()

In [None]:
from pdfrw import PdfReader, PdfWriter, PageMerge, PdfName
import fitz  # PyMuPDF

# List every fillable form field (widget) of a PDF, page by page.
input_path = r"test_pages\f1040.pdf"
# NOTE(review): output_path is not used in this cell; presumably reserved for a
# later fill step — confirm or remove.
output_path = "filled_f1040.pdf"

print(f"Analyzing fillable fields in: {input_path}\n")

# The context manager closes the document even if the scan is interrupted
with fitz.open(input_path) as doc:
    for page_number in range(len(doc)):
        page = doc[page_number]
        try:
            # page.widgets() yields lazily and is always truthy, so collect it
            # into a list to be able to detect pages without form fields
            widgets = list(page.widgets())
            if widgets:
                print(f"Page {page_number + 1}:")
                for widget in widgets:
                    field_name = widget.field_name
                    # field_value is the widget attribute holding the current value
                    field_value = widget.field_value
                    print(f"  Field Name: {field_name}, Value: {field_value}")
            else:
                print(f"Page {page_number + 1}: No fillable fields found.")
        except Exception as e:
            # Best effort: report the failing page and keep scanning the rest
            print(f"Error analyzing page {page_number + 1}: {e}")

