In [2]:
import os
import cv2
import numpy as np
import random
import zipfile
import multiprocessing
from functools import partial
from tqdm import tqdm

In [7]:
# Folder whose top-level .jpg/.jpeg/.png files are used as source images.
input_path = "./../data/food"     # CHANGE this to your folder of food images
# Root of the generated dataset; class sub-folders "0" and "1" are created inside it.
output_root = "marker_detection_dataset"
# Archive written by create_marker_detection_dataset() after the images are generated.
zip_output = "marker_detection_dataset.zip"

def generate_marker(square_size=20, rows=6, cols=7, border=5):
    """Build a colour-grid marker image surrounded by a black frame.

    The grid holds ``rows`` x ``cols`` filled squares, each ``square_size``
    pixels on a side. Square colours cycle through a fixed ten-entry palette
    in row-major order. A constant black border ``border`` pixels wide is then
    added on all four sides.

    Returns:
        A uint8 array with three channels, ``rows*square_size + 2*border``
        pixels high and ``cols*square_size + 2*border`` pixels wide.
    """
    palette = [
        (255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (0, 255, 255),
        (255, 0, 255), (128, 128, 128), (0, 0, 0), (0, 128, 255), (128, 0, 128),
    ]
    canvas = np.full((rows * square_size, cols * square_size, 3), 255, dtype=np.uint8)

    # Walk the cells in row-major order with a single running index, which
    # doubles as the position in the colour cycle.
    for cell in range(rows * cols):
        row, col = divmod(cell, cols)
        top_left = (col * square_size, row * square_size)
        bottom_right = (top_left[0] + square_size, top_left[1] + square_size)
        cv2.rectangle(canvas, top_left, bottom_right, palette[cell % len(palette)], -1)

    return cv2.copyMakeBorder(
        canvas, border, border, border, border, cv2.BORDER_CONSTANT, value=(0, 0, 0)
    )

def adjust_brightness_contrast(img, brightness=0, contrast=1.0):
    """Apply a linear brightness/contrast transform to an image.

    Every sample ``p`` becomes ``p * contrast + brightness``, rounded to the
    nearest integer and saturated to the uint8 range [0, 255].

    Args:
        img: Image array (any shape) with numeric samples.
        brightness: Offset added after scaling; may be negative.
        contrast: Multiplicative gain applied to every sample.

    Returns:
        A uint8 array with the same shape as ``img``.
    """
    scaled = np.asarray(img, dtype=np.float32) * contrast + brightness
    # Clip rather than take the absolute value: with a negative brightness,
    # dark pixels must saturate at 0, not fold back to positive values
    # (which is what cv2.convertScaleAbs does).
    return np.clip(np.rint(scaled), 0, 255).astype(np.uint8)

def adjust_white_balance(img, blue_gain=1.0, red_gain=1.0, green_gain=1.0):
    """Scale the three colour planes of an image by independent gains.

    The image is separated with ``cv2.split`` and the planes are treated, in
    order, as blue, green and red. Each plane is multiplied by its gain,
    clipped to [0, 255], cast to uint8, and the planes are merged back in the
    same order.
    """
    def _rescale(plane, gain):
        # Multiply, saturate to the 8-bit range, then cast back to uint8.
        return np.clip(plane * gain, 0, 255).astype(np.uint8)

    blue, green, red = cv2.split(img)
    return cv2.merge((
        _rescale(blue, blue_gain),
        _rescale(green, green_gain),
        _rescale(red, red_gain),
    ))

def apply_shadow(img, intensity=0.5):
    """Darken an image with a gradient that grows toward the bottom-right.

    The pixel at row ``i`` and column ``j`` is multiplied by
    ``1 - intensity * (i / h) * (j / w)``, so the top row and left column are
    untouched and the darkening is strongest in the bottom-right corner.

    Args:
        img: Image array of shape ``(h, w)`` or ``(h, w, channels)``.
        intensity: Strength of the darkening; 0 leaves the image unchanged.

    Returns:
        A uint8 array with the same shape as ``img``.
    """
    h, w = img.shape[:2]
    # Build the whole mask with broadcasting instead of a per-pixel Python
    # loop; the arithmetic order matches the scalar formula exactly.
    row_frac = np.arange(h) / h
    col_frac = np.arange(w) / w
    mask = (1 - (intensity * row_frac)[:, np.newaxis] * col_frac[np.newaxis, :])
    mask = mask.astype(np.float32)
    if img.ndim == 3:
        # Add a channel axis only for colour images, so that 2-D grayscale
        # input keeps its shape instead of broadcasting to three dimensions.
        mask = mask[:, :, np.newaxis]
    shadowed = img.astype(np.float32) * mask
    return np.clip(shadowed, 0, 255).astype(np.uint8)

def apply_filters(image):
    """Render one variant of ``image`` per lighting/colour preset.

    Each preset pairs a label with one of the adjustment helpers and the
    keyword arguments to call it with. Every preset is applied to its own copy
    of the input, and the result is clipped to [0, 255] and cast to uint8.

    Returns:
        A list of ``(preset_name, filtered_image)`` tuples, in preset order.
    """
    presets = (
        ("Bright_Sunlight", adjust_brightness_contrast, {"brightness": 50, "contrast": 1.2}),
        ("Overcast", adjust_brightness_contrast, {"brightness": -20, "contrast": 0.8}),
        ("Warm_Indoor", adjust_white_balance, {"blue_gain": 0.9, "red_gain": 1.2}),
        ("Cool_Indoor", adjust_white_balance, {"blue_gain": 1.2, "red_gain": 0.9}),
        ("Golden_Hour", adjust_white_balance, {"blue_gain": 0.8, "red_gain": 1.3}),
        ("High_Contrast_Noon", adjust_brightness_contrast, {"brightness": 30, "contrast": 1.5}),
        ("Soft_Morning", adjust_brightness_contrast, {"brightness": 20, "contrast": 0.9}),
    )

    return [
        (label, np.clip(transform(image.copy(), **settings), 0, 255).astype(np.uint8))
        for label, transform, settings in presets
    ]

def overlay_marker(img):
    """Paste a freshly generated marker onto a copy of ``img`` at a random spot.

    The marker is scaled to a width of 10-15 % of the image width (chosen
    uniformly at random), capped so it is never taller than the image, and
    placed at a random position. A 10-pixel margin from every edge is kept
    whenever the image is large enough; on smaller images the margin shrinks
    so that the marker still lies entirely inside the image.

    Args:
        img: Image array whose first two dimensions are height and width.
            The marker has three channels, so a 3-channel image is expected.

    Returns:
        A copy of ``img`` with the marker pasted in; ``img`` is not modified.

    Raises:
        ValueError: If the image is too narrow for a marker at least one
            pixel wide.
    """
    marker = generate_marker()
    h_img, w_img = img.shape[:2]
    target_w = int(w_img * random.uniform(0.10, 0.15))
    if target_w < 1:
        raise ValueError(
            f"Image is too small ({w_img}x{h_img}) to overlay a marker"
        )

    # Width-based scale, capped by the height so that very wide, short images
    # do not end up with a marker taller than the image itself.
    scale = min(target_w / marker.shape[1], h_img / marker.shape[0])
    marker = cv2.resize(marker, (0, 0), fx=scale, fy=scale)
    mh, mw = marker.shape[:2]

    # Choose a random position. The margin is reduced only when the free
    # space cannot accommodate it, which keeps the randint range valid.
    margin = 10
    margin_x = min(margin, (w_img - mw) // 2)
    margin_y = min(margin, (h_img - mh) // 2)
    x = random.randint(margin_x, w_img - mw - margin_x)
    y = random.randint(margin_y, h_img - mh - margin_y)

    out = img.copy()
    out[y:y+mh, x:x+mw] = marker
    return out

def zip_folder(folder_path, output_filename):
    """Compress a folder, including its own top-level name, into a zip file.

    Every file below ``folder_path`` is stored under a path relative to the
    folder's parent directory, so the archive unpacks into a single directory
    named after the folder.

    Args:
        folder_path: Directory to compress; a trailing path separator is
            tolerated.
        output_filename: Path of the zip archive to create (overwritten if it
            already exists).
    """
    print(f"Creating zip file: {output_filename}")
    # Normalise first: os.path.dirname("dir/") would return "dir" itself and
    # silently drop the top-level folder from every archive name.
    folder_abs = os.path.abspath(folder_path)
    base_dir = os.path.dirname(folder_abs)
    archive_abs = os.path.normcase(os.path.abspath(output_filename))
    with zipfile.ZipFile(output_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(folder_abs):
            for file in files:
                file_path = os.path.join(root, file)
                # Never add the archive to itself when it is being written
                # somewhere inside the folder that is being compressed.
                if os.path.normcase(file_path) == archive_abs:
                    continue
                arcname = os.path.relpath(file_path, base_dir)
                zipf.write(file_path, arcname)
    print(f"✅ Dataset compressed to: {output_filename}")

def process_image(filename, input_folder, output_root):
    """Generate all dataset variants for one source image.

    Intended to be run in a worker process. For a readable .jpg/.jpeg/.png
    file this writes:

    * class "0": one filtered copy of the original per filter preset;
    * class "1": for each of 5 random marker placements, one filtered copy
      per filter preset.

    Output names are derived from the file name without its extension, so two
    inputs that differ only by extension write to the same output files.

    Args:
        filename: Name of the image file inside ``input_folder``.
        input_folder: Directory containing the source image.
        output_root: Dataset root; its "0" and "1" sub-folders must exist.

    Returns:
        Number of images successfully written (0 if the file is not a
        supported image or cannot be read).
    """
    class0_dir = os.path.join(output_root, "0")
    class1_dir = os.path.join(output_root, "1")

    # Skip non-image files
    if not filename.lower().endswith(('.jpg', '.jpeg', '.png')):
        return 0

    img = cv2.imread(os.path.join(input_folder, filename))
    if img is None:
        # cv2.imread signals an unreadable or corrupt file by returning None.
        return 0

    image_count = 0
    base_name = os.path.splitext(filename)[0]

    # Create Class 0: Original image with filters
    for filter_name, filtered_img in apply_filters(img):
        out_path = os.path.join(class0_dir, f"{base_name}_{filter_name}.jpg")
        # cv2.imwrite reports failure through its return value; count only
        # the files that were actually written.
        if cv2.imwrite(out_path, filtered_img):
            image_count += 1

    # Create Class 1: Image with marker and filters
    for marker_idx in range(5):  # Create 5 random marker positions per image
        marked_img = overlay_marker(img)
        for filter_name, filtered_img in apply_filters(marked_img):
            out_name = f"{base_name}_marker{marker_idx}_{filter_name}.jpg"
            out_path = os.path.join(class1_dir, out_name)
            if cv2.imwrite(out_path, filtered_img):
                image_count += 1

    return image_count

def create_marker_detection_dataset(input_folder, output_root, num_processes=None, zip_path=None):
    """Create a dataset for marker detection with parallelization.

    Creates ``output_root`` with class sub-folders "0" (no marker) and "1"
    (with marker), processes every .jpg/.jpeg/.png file found directly in
    ``input_folder`` in a process pool, prints a summary and finally
    compresses the dataset folder into a zip archive. If no images are found,
    a message is printed and nothing is processed or zipped.

    Args:
        input_folder: Path to folder containing original images
        output_root: Path to output dataset folder
        num_processes: Number of processes to use (defaults to CPU count)
        zip_path: Path of the zip archive to write (defaults to the
            module-level ``zip_output`` setting)

    Returns:
        None.
    """
    # Create directory structure
    os.makedirs(output_root, exist_ok=True)
    class0_dir = os.path.join(output_root, "0")
    class1_dir = os.path.join(output_root, "1")
    os.makedirs(class0_dir, exist_ok=True)
    os.makedirs(class1_dir, exist_ok=True)

    # Get list of image files
    image_files = [f for f in os.listdir(input_folder)
                   if f.lower().endswith(('.jpg', '.jpeg', '.png'))]

    if not image_files:
        print("❌ No images found in the input folder. Check the path.")
        return

    print(f"Found {len(image_files)} images to process")

    # Determine number of processes to use
    if num_processes is None:
        num_processes = multiprocessing.cpu_count()
    print(f"Using {num_processes} processes for parallel processing")

    # Process images in parallel; partial() pins the folder arguments so the
    # pool only has to ship one file name per task.
    with multiprocessing.Pool(processes=num_processes) as pool:
        process_func = partial(process_image, input_folder=input_folder, output_root=output_root)
        results = list(tqdm(pool.imap(process_func, image_files),
                            total=len(image_files),
                            desc="Processing images"))

    # Total reported by the workers for this run
    total_images = sum(results)

    # Per-class counts come from the directory contents, so they also include
    # any files that were already present before this run.
    class0_count = len(os.listdir(class0_dir))
    class1_count = len(os.listdir(class1_dir))

    print(f"\n✅ Dataset created with {total_images} images:")
    print(f"   - Class 0 (no marker): {class0_count} images")
    print(f"   - Class 1 (with marker): {class1_count} images")
    print(f"   - Output location: {output_root}")

    # Create zip file; fall back to the module-level default only when the
    # caller did not choose an archive path.
    archive_path = zip_output if zip_path is None else zip_path
    zip_folder(output_root, archive_path)

if __name__ == "__main__":
    # Keep roughly a quarter of the cores free so the machine stays
    # responsive, but always run with at least one worker.
    cpu_count = multiprocessing.cpu_count()
    recommended_processes = max(int(0.75 * cpu_count), 1)

    print(f"Starting dataset creation with {recommended_processes} processes")
    create_marker_detection_dataset(input_path, output_root, recommended_processes)

Starting dataset creation with 6 processes
Found 500 images to process
Using 6 processes for parallel processing


Processing images: 100%|██████████| 500/500 [00:20<00:00, 24.89it/s]



✅ Dataset created with 21000 images:
   - Class 0 (no marker): 3500 images
   - Class 1 (with marker): 17500 images
   - Output location: marker_detection_dataset
Creating zip file: marker_detection_dataset.zip
✅ Dataset compressed to: marker_detection_dataset.zip
