In [25]:
import os
import json
import random
import numpy as np
from PIL import Image, ImageDraw, ImageFont, ImageFilter
from tqdm import tqdm

# ======================
# CONFIGURATION
# ======================
# Root folder; each subset gets its own sub-folder underneath it.
OUTPUT_DIR = "dataset_final"
# Canvas size handed to Image.new, i.e. (width, height) in pixels.
IMAGE_SIZE = (200, 64)
# How many distinct base words get_unique_words is asked for.
NUM_UNIQUE_WORDS = 100
# How many rendered variants are produced per word in every subset.
VERSIONS_PER_WORD = 10

# Create <OUTPUT_DIR>/<subset>/images up front; exist_ok=True makes re-running this cell safe.
for subset in ["easy", "hard", "bonus"]:
    os.makedirs(os.path.join(OUTPUT_DIR, subset, "images"), exist_ok=True)

# ======================
# FONT & WORD SETUP
# ======================
def load_fonts():
    """Return the candidate font files that exist on this machine.

    Only the macOS "Supplemental" copies of Arial, Courier New and Times
    New Roman are probed; the result keeps that order.

    Returns:
        list: Existing font paths, or ``[None]`` when none of the candidates
        is present, so callers always have at least one entry to choose from.
    """
    candidates = (
        "/System/Library/Fonts/Supplemental/Arial.ttf",
        "/System/Library/Fonts/Supplemental/Courier New.ttf",
        "/System/Library/Fonts/Supplemental/Times New Roman.ttf",
        # Linux / Windows alternatives, currently disabled:
        # "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        # "C:\\Windows\\Fonts\\Arial.ttf",
        # "C:\\Windows\\Fonts\\Verdana.ttf"
    )
    found = list(filter(os.path.exists, candidates))
    if not found:
        return [None]
    return found

# Font paths resolved once when the cell runs; [None] means no listed font was found,
# in which case render_sample falls back to Pillow's default font.
FONTS = load_fonts()

def get_unique_words(n):
    """Return ``n`` unique lowercase base words for the dataset.

    When the optional ``faker`` package is importable, words of 4-8
    characters are sampled from it until ``n`` distinct ones are collected.
    Sampling is capped so that a vocabulary with fewer than ``n`` suitable
    words can no longer spin forever; any shortfall is filled with
    synthetic ``<seed><index>`` words.  Without ``faker`` the whole list is
    synthetic, exactly as before.

    Args:
        n: Number of words wanted.  ``n <= 0`` yields an empty list.

    Returns:
        list[str]: ``n`` distinct words.  Sampled words come first, in
        arbitrary (set) order, followed by any synthetic top-up words.
    """
    seeds = ["ability", "neural", "vision", "captcha", "model", "logic", "small"]
    words = set()

    # faker is an optional dependency, hence the function-local import.
    try:
        from faker import Faker
    except ImportError:
        Faker = None

    if Faker is not None:
        fake = Faker()
        # Bounded sampling: generous for the normal case, but guaranteed to stop.
        for _ in range(max(1000, 200 * n)):
            if len(words) >= n:
                break
            w = fake.word().lower()
            if 4 <= len(w) <= 8:
                words.add(w)

    result = list(words)
    # The running index keeps the synthetic words unique among themselves; the
    # trailing digit presumably also keeps them apart from sampled words.
    result.extend(f"{random.choice(seeds)}{i}" for i in range(len(result), n))
    return result

# Drawn once when the cell runs; generate_dataset iterates this same list for every subset.
UNIQUE_WORDS = get_unique_words(NUM_UNIQUE_WORDS)

def apply_erasure(draw, bg_color):
    """Wipe out slices of the text by painting background-coloured strokes.

    Between one and three strokes are drawn across the full canvas width.
    Each starts at a random row in 15..45, is 1-3 px thick, and ends up to
    2 px above or below its starting row so it is not perfectly level.

    Args:
        draw: Drawing object exposing ``line(xy, fill=..., width=...)``.
        bg_color: Colour used for the strokes (the image background).
    """
    stroke_count = random.randint(1, 3)
    for _ in range(stroke_count):
        start_y = random.randint(15, 45)
        stroke_width = random.randint(1, 3)
        # Slight tilt so the cut looks less mechanical.
        end_y = start_y + random.randint(-2, 2)
        draw.line(
            [(0, start_y), (IMAGE_SIZE[0], end_y)],
            fill=bg_color,
            width=stroke_width,
        )

def add_color_grain(img, intensity=0.22):
    """Overwrite a random subset of pixels with uniform random values.

    Args:
        img: Image convertible with ``np.array`` (a PIL image in this
            notebook).  It is copied, never modified.
        intensity: Per-pixel probability of being replaced by noise; the
            selection mask is drawn over the first two array axes, so all
            channels of a chosen pixel are replaced together.

    Returns:
        A new image built with ``Image.fromarray`` from the noisy uint8 array.
    """
    # np.array copies, so the caller's image is left untouched.
    pixels = np.array(img, dtype=np.uint8)
    # randint's upper bound is exclusive: 256 is required for 255 to be reachable.
    noise = np.random.randint(0, 256, pixels.shape, dtype=np.uint8)
    mask = np.random.rand(*pixels.shape[:2]) < intensity
    pixels[mask] = noise[mask]
    return Image.fromarray(pixels)

def draw_hollow_text(draw, x, y, text, font, color):
    """Draw ``text`` as an outline: a ``color`` rim around a white interior.

    The string is stamped in ``color`` at the eight positions one pixel
    around ``(x, y)`` and then once more in white at ``(x, y)`` itself, so
    only the one-pixel border keeps the outline colour.

    Args:
        draw: Drawing object exposing ``text(xy, text, font=..., fill=...)``.
        x, y: Position of the final (white) text.
        text: String to render.
        font: Font object forwarded to ``draw.text``.
        color: Outline colour.
    """
    neighbours = [
        (dx, dy)
        for dx in (-1, 0, 1)
        for dy in (-1, 0, 1)
        if (dx, dy) != (0, 0)
    ]
    for dx, dy in neighbours:
        draw.text((x + dx, y + dy), text, font=font, fill=color)

    white = (255, 255, 255)
    draw.text((x, y), text, font=font, fill=white)

def render_sample(text, is_hollow, bg_color, mode="hard", font_path=None):
    """Render one CAPTCHA image for ``text``.

    Args:
        text: String drawn on the image.
        is_hollow: Draw outlined text instead of solid text (ignored when
            ``mode == "easy"``).
        bg_color: Background colour; also used for erasure strokes and for
            the corners exposed by rotation.
        mode: ``"easy"`` gives fixed-size black text with no distortion.
            Any other value randomises font size (32-38) and text colour,
            applies erasure strokes with probability 0.7, then rotates by up
            to 4 degrees either way, adds colour grain and a light blur.
        font_path: TrueType font file.  ``None`` or an unloadable file
            falls back to Pillow's default font (``font_size`` is not
            applied in that case).

    Returns:
        The rendered image, ``IMAGE_SIZE`` pixels in RGB mode.
    """
    img = Image.new("RGB", IMAGE_SIZE, bg_color)
    draw = ImageDraw.Draw(img)

    font_size = 36 if mode == "easy" else random.randint(32, 38)
    try:
        font = ImageFont.truetype(font_path, font_size) if font_path else ImageFont.load_default()
    except (OSError, ImportError):
        # Unreadable font file or missing FreeType support: keep the best-effort
        # fallback, but no longer swallow KeyboardInterrupt/SystemExit.
        font = ImageFont.load_default()

    # textbbox reports where the ink lands when drawing at (0, 0); its left/top
    # are usually non-zero, so they must be subtracted to centre the glyphs
    # themselves rather than the drawing origin.
    left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
    x = (IMAGE_SIZE[0] - (right - left)) // 2 - left
    y = (IMAGE_SIZE[1] - (bottom - top)) // 2 - top

    text_color = (0, 0, 0) if mode == "easy" else random.choice([(10, 20, 100), (80, 20, 120), (20, 20, 20)])

    # Render text
    if is_hollow and mode != "easy":
        draw_hollow_text(draw, x, y, text, font, text_color)
    else:
        draw.text((x, y), text, fill=text_color, font=font)

    # Erasure (hard and bonus only): 70% of samples lose thin slices of text.
    if mode != "easy" and random.random() < 0.7:
        apply_erasure(draw, bg_color)

    # Post-processing (hard and bonus only)
    if mode != "easy":
        img = img.rotate(random.uniform(-4, 4), resample=Image.BICUBIC, expand=0, fillcolor=bg_color)
        img = add_color_grain(img)
        img = img.filter(ImageFilter.GaussianBlur(radius=0.15))

    return img

# ======================
# GENERATION LOOP
# ======================

def generate_dataset():
    """Render every subset ("easy", "hard", "bonus") and write its labels.

    For each word in ``UNIQUE_WORDS`` and each of ``VERSIONS_PER_WORD``
    variants, an image is saved as ``<word>_v<k>.png`` under
    ``<OUTPUT_DIR>/<subset>/images``.  After a subset is finished, its
    ``labels.json`` maps each file name to the rendered string ("captcha"),
    the base word ("word") and the font file name ("font").

    Subset rules:
        * easy  - capitalised word, white background, solid text.
        * hard  - random per-letter case, pale blue background.
        * bonus - random per-letter case; red background with the text
          reversed, or green background with the text as is (50/50).

    The image folders must already exist; this function does not create them.
    """
    for set_type in ["easy", "hard", "bonus"]:
        labels = {}
        is_easy = set_type == "easy"
        images_dir = os.path.join(OUTPUT_DIR, set_type, "images")

        for word in tqdm(UNIQUE_WORDS, desc=f"Generating {set_type}"):
            for version in range(VERSIONS_PER_WORD):
                # Text casing
                if is_easy:
                    captcha_text = word.capitalize()
                else:
                    letters = []
                    for c in word:
                        letters.append(c.upper() if random.random() < 0.5 else c.lower())
                    captcha_text = "".join(letters)

                # Font
                current_font_path = random.choice(FONTS)
                if current_font_path:
                    font_name = os.path.basename(current_font_path)
                else:
                    font_name = "default"

                # Background (and text reversal for red bonus samples)
                if set_type == "bonus":
                    is_red = random.random() < 0.5
                    if is_red:
                        current_bg = (210, 100, 100)
                        captcha_text = captcha_text[::-1]
                    else:
                        current_bg = (100, 210, 100)
                elif set_type == "hard":
                    current_bg = (245, 248, 255)
                else:
                    current_bg = (255, 255, 255)

                # Short-circuit keeps the easy subset from consuming a random draw.
                is_hollow = (not is_easy) and random.random() < 0.5
                img = render_sample(
                    captcha_text,
                    is_hollow,
                    current_bg,
                    mode=set_type,
                    font_path=current_font_path,
                )

                fname = f"{word}_v{version}.png"
                img.save(os.path.join(images_dir, fname))
                labels[fname] = {"captcha": captcha_text, "word": word, "font": font_name}

        labels_path = os.path.join(OUTPUT_DIR, set_type, "labels.json")
        with open(labels_path, "w") as f:
            json.dump(labels, f, indent=2)

# Entry point: build all three subsets, then print a completion message.
if __name__ == "__main__":
    generate_dataset()
    print("\n✅ Final Dataset generation complete with Erasure effects.")

Generating easy: 100%|██████████| 100/100 [00:00<00:00, 145.74it/s]
Generating hard: 100%|██████████| 100/100 [00:02<00:00, 34.63it/s]
Generating bonus: 100%|██████████| 100/100 [00:02<00:00, 36.44it/s]


✅ Final Dataset generation complete with Erasure effects.





In [None]:
# import os
# import json
# import random
# import numpy as np
# from PIL import Image, ImageDraw, ImageFont, ImageFilter
# from tqdm import tqdm

# # ======================
# # CONFIGURATION
# # ======================
# OUTPUT_DIR = "dataset_final"
# IMAGE_SIZE = (200, 64)
# NUM_UNIQUE_WORDS = 100
# VERSIONS_PER_WORD = 10

# # Create folder structure for all three sets
# for subset in ["easy", "hard", "bonus"]:
#     os.makedirs(os.path.join(OUTPUT_DIR, subset, "images"), exist_ok=True)

# # ======================
# # WORD & FONT SETUP
# # ======================
# def get_unique_words(n):
#     """Generates 100 unique words."""
#     try:
#         from faker import Faker
#         fake = Faker()
#         words = set()
#         while len(words) < n:
#             w = fake.word().lower()
#             if 4 <= len(w) <= 8:
#                 words.add(w)
#         return list(words)
#     except ImportError:
#         # Robust fallback list if faker is not installed
#         seeds = ["ability", "neural", "vision", "captcha", "model", "logic", "small", "today", "python", "image"]
#         return [f"{random.choice(seeds)}{i}" for i in range(n)]

# UNIQUE_WORDS = get_unique_words(NUM_UNIQUE_WORDS)

# def load_fonts():
#     """Finds system fonts."""
#     paths = [
#         "/System/Library/Fonts/Supplemental/Arial.ttf", # macOS
#         "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", # Linux
#         "C:\\Windows\\Fonts\\Arial.ttf", # Windows
#         "/Library/Fonts/Arial.ttf"
#     ]
#     fonts = [p for p in paths if os.path.exists(p)]
#     return fonts if fonts else [None]

# FONTS = load_fonts()

# # ======================
# # RENDERING UTILS
# # ======================

# def add_color_grain(img, intensity=0.22):
#     """Adds the high-frequency RGB static noise."""
#     arr = np.array(img).astype(np.float32)
#     noise = np.random.randint(0, 255, arr.shape, dtype='uint8')
#     mask = np.random.rand(*arr.shape[:2]) < intensity
#     arr[mask] = noise[mask]
#     return Image.fromarray(arr.astype(np.uint8))

# def draw_hollow_text(draw, x, y, text, font, color):
#     """Renders the hollow/outlined text style."""
#     outline_color = color
#     fill_color = (255, 255, 255)
#     for ox in [-1, 0, 1]:
#         for oy in [-1, 0, 1]:
#             if ox != 0 or oy != 0:
#                 draw.text((x + ox, y + oy), text, font=font, fill=outline_color)
#     draw.text((x, y), text, font=font, fill=fill_color)

# def render_sample(text, is_hollow, bg_color, mode="hard", font_path=None):
#     img = Image.new("RGB", IMAGE_SIZE, bg_color)
#     draw = ImageDraw.Draw(img)
    
#     font_size = 36 if mode == "easy" else random.randint(32, 38)
#     try:
#         font = ImageFont.truetype(font_path, font_size) if font_path else ImageFont.load_default()
#     except:
#         font = ImageFont.load_default()

#     # Center text
#     bbox = draw.textbbox((0, 0), text, font=font)
#     w, h = bbox[2] - bbox[0], bbox[3] - bbox[1]
#     x, y = (IMAGE_SIZE[0] - w) // 2, (IMAGE_SIZE[1] - h) // 2

#     text_color = (0, 0, 0) if mode == "easy" else random.choice([(10, 20, 100), (80, 20, 120), (20, 20, 20)])
    
#     if is_hollow and mode != "easy":
#         draw_hollow_text(draw, x, y, text, font, text_color)
#     else:
#         draw.text((x, y), text, fill=text_color, font=font)

#     # Apply noise and rotation ONLY for Hard and Bonus
#     if mode != "easy":
#         img = img.rotate(random.uniform(-3, 3), resample=Image.BICUBIC, expand=0, fillcolor=bg_color)
#         img = add_color_grain(img)
#         img = img.filter(ImageFilter.GaussianBlur(radius=0.15))
    
#     return img

# # ======================
# # GENERATION TASK
# # ======================

# def generate_dataset():
#     for set_type in ["easy", "hard", "bonus"]:
#         labels = {}
#         for word in tqdm(UNIQUE_WORDS, desc=f"Generating {set_type}"):
#             for v in range(VERSIONS_PER_WORD):
#                 # 1. Capitalization Logic
#                 if set_type == "easy":
#                     captcha_text = word.capitalize() # Fixed: Always Capitalized
#                 else:
#                     # Fluctuating capitalization for Hard/Bonus
#                     captcha_text = "".join(c.upper() if random.random() < 0.5 else c.lower() for c in word)
                
#                 # 2. Background & Font Logic
#                 current_bg = (255, 255, 255) # Easy: Plain white background
#                 current_font = FONTS[0]      # Easy: Fixed font
                
#                 if set_type == "hard":
#                     current_bg = (245, 248, 255)
#                     current_font = random.choice(FONTS)
                
#                 if set_type == "bonus":
#                     is_red = random.random() < 0.5
#                     current_bg = (210, 100, 100) if is_red else (100, 210, 100)
#                     current_font = random.choice(FONTS)
#                     # If red, the image shows the REVERSED text; the "captcha" label stores that reversed text and "word" keeps the original
#                     if is_red:
#                         captcha_text = captcha_text[::-1]

#                 # 3. Render
#                 is_hollow = random.random() < 0.5 if set_type != "easy" else False
#                 img = render_sample(captcha_text, is_hollow, current_bg, mode=set_type, font_path=current_font)
                
#                 # 4. Save
#                 fname = f"{word}_v{v}.png"
#                 img.save(os.path.join(OUTPUT_DIR, set_type, "images", fname))
                
#                 # 5. Labeling logic: {captcha: rendered_string, word: original_string}
#                 labels[fname] = {
#                     "captcha": captcha_text,
#                     "word": word
#                 }

#         with open(os.path.join(OUTPUT_DIR, set_type, "labels.json"), "w") as f:
#             json.dump(labels, f, indent=2)

# if __name__ == "__main__":
#     generate_dataset()
#     print("\n✅ Full Dataset generation (Easy, Hard, Bonus) complete.")

Generating easy: 100%|██████████| 100/100 [00:00<00:00, 123.54it/s]
Generating hard: 100%|██████████| 100/100 [00:02<00:00, 34.28it/s]
Generating bonus: 100%|██████████| 100/100 [00:02<00:00, 35.16it/s]


✅ Full Dataset generation (Easy, Hard, Bonus) complete.





In [None]:
# import os
# import json
# import random
# import numpy as np
# from PIL import Image, ImageDraw, ImageFont, ImageFilter
# from tqdm import tqdm

# # ======================
# # CONFIGURATION
# # ======================
# OUTPUT_DIR = "dataset_final"
# IMAGE_SIZE = (200, 64)
# NUM_UNIQUE_WORDS = 100
# VERSIONS_PER_WORD = 10

# for subset in ["easy", "hard", "bonus"]:
#     os.makedirs(os.path.join(OUTPUT_DIR, subset, "images"), exist_ok=True)

# # ======================
# # FONT & WORD SETUP
# # ======================
# def load_fonts():
#     """Loads a variety of system fonts to ensure diversity."""
#     paths = [
#         "/System/Library/Fonts/Supplemental/Arial.ttf",
#         "/System/Library/Fonts/Supplemental/Courier New.ttf",
#         "/System/Library/Fonts/Supplemental/Times New Roman.ttf",
#         "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
#         "/usr/share/fonts/truetype/liberation/LiberationSerif-Regular.ttf",
#         "C:\\Windows\\Fonts\\Arial.ttf",
#         "C:\\Windows\\Fonts\\Verdana.ttf"
#     ]
#     available_fonts = [p for p in paths if os.path.exists(p)]
#     return available_fonts if available_fonts else [None]

# FONTS = load_fonts()

# def get_unique_words(n):
#     """Generates unique base words."""
#     try:
#         from faker import Faker
#         fake = Faker()
#         words = set()
#         while len(words) < n:
#             w = fake.word().lower()
#             if 4 <= len(w) <= 8:
#                 words.add(w)
#         return list(words)
#     except ImportError:
#         seeds = ["ability", "neural", "vision", "captcha", "model", "logic", "small", "today", "python", "image"]
#         return [f"{random.choice(seeds)}{i}" for i in range(n)]

# UNIQUE_WORDS = get_unique_words(NUM_UNIQUE_WORDS)

# # ======================
# # RENDERING UTILS
# # ======================

# def randomize_capitalization(word):
#     """Fluctuates capitalization across individual letters."""
#     return "".join(c.upper() if random.random() < 0.5 else c.lower() for c in word)

# def add_color_grain(img, intensity=0.22):
#     """Adds the RGB static noise seen in reference images."""
#     arr = np.array(img).astype(np.float32)
#     noise = np.random.randint(0, 255, arr.shape, dtype='uint8')
#     mask = np.random.rand(*arr.shape[:2]) < intensity
#     arr[mask] = noise[mask]
#     return Image.fromarray(arr.astype(np.uint8))

# def draw_hollow_text(draw, x, y, text, font, color):
#     """Renders the hollow/outlined text style."""
#     for ox in [-1, 0, 1]:
#         for oy in [-1, 0, 1]:
#             if ox != 0 or oy != 0:
#                 draw.text((x + ox, y + oy), text, font=font, fill=color)
#     draw.text((x, y), text, font=font, fill=(255, 255, 255))

# def render_sample(text, is_hollow, bg_color, mode="hard", font_path=None):
#     img = Image.new("RGB", IMAGE_SIZE, bg_color)
#     draw = ImageDraw.Draw(img)
    
#     font_size = 36 if mode == "easy" else random.randint(30, 38)
#     try:
#         font = ImageFont.truetype(font_path, font_size) if font_path else ImageFont.load_default()
#     except:
#         font = ImageFont.load_default()

#     bbox = draw.textbbox((0, 0), text, font=font)
#     x = (IMAGE_SIZE[0] - (bbox[2] - bbox[0])) // 2
#     y = (IMAGE_SIZE[1] - (bbox[3] - bbox[1])) // 2

#     text_color = (0, 0, 0) if mode == "easy" else random.choice([(10, 20, 100), (80, 20, 120), (20, 20, 20)])
    
#     if is_hollow and mode != "easy":
#         draw_hollow_text(draw, x, y, text, font, text_color)
#     else:
#         draw.text((x, y), text, fill=text_color, font=font)

#     if mode != "easy":
#         img = img.rotate(random.uniform(-4, 4), resample=Image.BICUBIC, expand=0, fillcolor=bg_color)
#         img = add_color_grain(img)
#         img = img.filter(ImageFilter.GaussianBlur(radius=0.15))
    
#     return img

# # ======================
# # GENERATION LOOP
# # ======================

# def generate_dataset():
#     for set_type in ["easy", "hard", "bonus"]:
#         labels = {}
#         for word in tqdm(UNIQUE_WORDS, desc=f"Generating {set_type}"):
#             for v in range(VERSIONS_PER_WORD):
#                 # 1. Capitalization Logic
#                 captcha_text = randomize_capitalization(word) if set_type != "easy" else word.capitalize()
                
#                 # 2. Font Selection
#                 current_font_path = random.choice(FONTS) if set_type != "easy" else FONTS[0]
#                 # Extract font name from path for the JSON
#                 font_name = os.path.basename(current_font_path) if current_font_path else "default"
                
#                 # 3. Background Logic
#                 current_bg = (255, 255, 255)
#                 if set_type == "hard":
#                     current_bg = (245, 248, 255)
#                 elif set_type == "bonus":
#                     is_red = random.random() < 0.5
#                     current_bg = (210, 100, 100) if is_red else (100, 210, 100)
#                     if is_red:
#                         captcha_text = captcha_text[::-1] 

#                 # 4. Render
#                 is_hollow = random.random() < 0.5 if set_type != "easy" else False
#                 img = render_sample(captcha_text, is_hollow, current_bg, mode=set_type, font_path=current_font_path)
                
#                 # 5. Save & Label
#                 fname = f"{word}_v{v}.png"
#                 img.save(os.path.join(OUTPUT_DIR, set_type, "images", fname))
                
#                 # Added font information to the label
#                 labels[fname] = {
#                     "captcha": captcha_text,
#                     "word": word,
#                     "font": font_name
#                 }

#         with open(os.path.join(OUTPUT_DIR, set_type, "labels.json"), "w") as f:
#             json.dump(labels, f, indent=2)

# if __name__ == "__main__":
#     generate_dataset()
#     print("\n✅ Dataset generation complete. JSON now includes 'font' metadata.")

Generating easy: 100%|██████████| 100/100 [00:00<00:00, 118.47it/s]
Generating hard: 100%|██████████| 100/100 [00:03<00:00, 32.97it/s]
Generating bonus: 100%|██████████| 100/100 [00:03<00:00, 32.91it/s]


✅ Dataset generation complete. JSON now includes 'font' metadata.



