In [2]:
import os
from tqdm import tqdm
from wand.image import Image
from wand.color import Color
from wand.display import display

def deskew_image(inpath, outpath):
    """Deskew the image at ``inpath`` and write the result to ``outpath``.

    Args:
        inpath: Path of the image file to read.
        outpath: Path the deskewed image is saved to.
    """
    with Image(filename=inpath) as picture:
        # The deskew threshold is expressed as 40% of the image's quantum range.
        threshold = 0.4 * picture.quantum_range
        picture.deskew(threshold)
        picture.save(filename=outpath)

def find_images(folder_path, extensions=('.jpg', '.png', '.jpeg', '.gif', '.bmp')):
    """Recursively collect the paths of image files below ``folder_path``.

    A file counts as an image when its name ends with one of ``extensions``;
    the comparison ignores case on both the file name and the extensions.

    Args:
        folder_path: Directory to walk (sub-directories included).
        extensions: Iterable of file-name suffixes to accept. Any iterable of
            strings works, including a one-shot generator.

    Returns:
        A list of matching file paths (``root`` joined with the file name),
        in the order ``os.walk`` yields them.
    """
    # Normalise once: lower-casing here makes '.JPG' behave like '.jpg', and
    # materialising a tuple keeps a generator from being exhausted after the
    # first file is checked.
    suffixes = tuple(ext.lower() for ext in extensions)

    image_paths = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            # str.endswith accepts a tuple and returns False for an empty one.
            if file.lower().endswith(suffixes):
                image_paths.append(os.path.join(root, file))
    return image_paths



In [3]:
def list_all_files(folder_path):
    """Return the path of every file found under ``folder_path``, recursively.

    Args:
        folder_path: Directory to walk.

    Returns:
        A list with one entry per file, each built by joining the directory
        being visited with the file name.
    """
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(folder_path)
        for name in names
    ]

In [4]:
# Sanity check: number of files recognised as images vs. number of files in
# total under the source folder. The cell displays the pair as its output.
# NOTE(review): absolute, machine-specific path — consider a configurable
# data directory.
folder_path = "/Users/xieewenz/Downloads/Images 2"
len( find_images(folder_path)), len(list_all_files(folder_path))

(2859, 2862)

In [None]:
# Step 1: deskew every source image into deskewed2/, naming the outputs by
# their position in the sorted list of source paths.
# NOTE(review): absolute, machine-specific path — consider a configurable
# data directory.
folder_path = "/Users/xieewenz/Downloads/Images 2"

# Create the output folder up front so the run does not depend on it having
# been made by hand.
os.makedirs("deskewed2", exist_ok=True)

# Sort so the index -> source-file mapping is the same on every run; the
# later pipeline cells sort their inputs as well.
image_list = sorted(find_images(folder_path))
for i, im in enumerate(tqdm(image_list)):
    deskew_image(im, f"deskewed2/{i:05d}.jpg")

In [None]:

def trim_image(original_img, original_width, original_height, color, fuzz,
               max_width_cut_pct=10, max_height_cut_pct=15):
    """Trim a border of ``color`` from a copy of ``original_img``.

    The trim is rejected ("reverted") when it removes too much of the image,
    measured against ``original_width`` / ``original_height``.

    Args:
        original_img: Image to trim; it is cloned and never modified.
        original_width: Reference width used to compute the width cut.
        original_height: Reference height used to compute the height cut.
        color: Colour passed to both ``border`` and ``trim``.
        fuzz: Fuzz value forwarded to ``trim``.
        max_width_cut_pct: Revert when more than this percentage of the
            reference width was removed (default 10).
        max_height_cut_pct: Revert when more than this percentage of the
            reference height was removed (default 15).

    Returns:
        ``(new_img, reverted_trim)``. ``new_img`` is a fresh clone — of the
        trimmed image when the trim was accepted, of ``original_img`` when it
        was reverted. The caller is responsible for closing it.
    """
    with original_img.clone() as trimmed_img:
        # Add a 3-pixel frame of the target colour on every side before trimming.
        trimmed_img.border(color, 3, 3)
        trimmed_img.trim(color=color, fuzz=fuzz, percent_background=0.0, reset_coords=True)

        width_cut_percentage = (original_width - trimmed_img.width) / original_width * 100
        height_cut_percentage = (original_height - trimmed_img.height) / original_height * 100

        reverted_trim = (
            width_cut_percentage > max_width_cut_pct
            or height_cut_percentage > max_height_cut_pct
        )

        # The with-block closes trimmed_img on exit (no explicit close needed),
        # so the value handed back must be an independent clone.
        source = original_img if reverted_trim else trimmed_img
        new_img = source.clone()
    return new_img, reverted_trim


def _trim_with_fallback(img, original_width, original_height, color_name, inpath):
    """Trim ``color_name`` edges from ``img``: large fuzz first, then small fuzz.

    Returns a new image that the caller must close. When both attempts are
    reverted by ``trim_image`` the result is an untrimmed clone of ``img``.
    """
    color = Color(color_name)
    trimmed, reverted_trim = trim_image(
        img, original_width, original_height,
        color=color, fuzz=0.30 * img.quantum_range,
    )
    if reverted_trim:
        print(f"reverted big fuzz {color_name} trim: {inpath}")
        # Release the rejected result before replacing it with the retry.
        trimmed.close()
        trimmed, reverted_small_trim = trim_image(
            img, original_width, original_height, color=color, fuzz=30,
        )
        if reverted_small_trim:
            print(f"reverted small fuzz {color_name} trim: {inpath}")
    return trimmed


def edgecut(inpath, outpath):
    """Trim white, then black, edges from the image at ``inpath`` and save it.

    Each colour is first trimmed with a fuzz of 30% of the quantum range; if
    that removes too much (as judged by ``trim_image``) the trim is retried
    with a fuzz of 30, and if that is rejected too the image passes through
    untrimmed for that colour. Both passes measure the amount removed against
    the dimensions of the image as loaded, so the limits are cumulative.

    Args:
        inpath: Path of the image to read.
        outpath: Path the result is saved to.
    """
    with Image(filename=inpath) as img:
        original_width, original_height = img.width, img.height

        # The intermediate images are owned here; the with-blocks close them
        # even if a later step raises.
        with _trim_with_fallback(img, original_width, original_height,
                                 "white", inpath) as white_trimmed:
            with _trim_with_fallback(white_trimmed, original_width, original_height,
                                     "black", inpath) as black_white_trimmed:
                black_white_trimmed.save(filename=outpath)
    

# Step 2: trim white/black edges from every deskewed image into edgecut4/,
# numbering the outputs by their position in the sorted input list.
folder_path = "deskewed2"

# Create the output folder up front so the run does not depend on it having
# been made by hand.
os.makedirs("edgecut4", exist_ok=True)

image_list = find_images(folder_path)
for i, im in enumerate(tqdm(sorted(image_list))):
    edgecut(im, f"edgecut4/{i:05d}.jpg")



In [None]:
from wand.image import Image

def match_and_crop_image(image_path, reference_path, output_dir, crop_size=(70,70), similarity_threshold=0.05, dissimilarity_threshold=0.618, bad_match_dir="bad_match"):
    """Locate a reference patch in the top-left corner of an image and crop to it.

    The top-left ``crop_size`` region of the image is searched for the
    reference image. Depending on the reported difference the image is either
    copied unchanged to ``bad_match_dir`` or cropped so that it starts at the
    matched position and saved to ``output_dir``. The output keeps the input's
    base file name.

    Args:
        image_path: Path of the image to process.
        reference_path: Path of the reference image to search for. It is
            re-read from disk on every call.
        output_dir: Directory for successfully matched, cropped images.
        crop_size: ``(width, height)`` of the top-left search region.
        similarity_threshold: Passed to ``similarity`` as its threshold; a
            difference at or below it is reported as a "First match".
        dissimilarity_threshold: A difference above this is treated as no
            match at all.
        bad_match_dir: Directory for images that could not be matched.
    """
    file_name = os.path.basename(image_path)

    with Image(filename=image_path) as orig_img:
        # Search only a clone of the top-left corner; the clone and the
        # reference are released as soon as the match result is known.
        with orig_img.clone() as img:
            img.crop(0, 0, width=crop_size[0], height=crop_size[1])
            with Image(filename=reference_path) as reference:
                location, diff = img.similarity(reference, similarity_threshold)

        if diff > dissimilarity_threshold:
            print('Images too dissimilar to match')
            os.makedirs(bad_match_dir, exist_ok=True)
            orig_img.save(filename=f"{bad_match_dir}/{file_name}")
            return

        # Both remaining cases crop identically; only the log message differs.
        if diff <= similarity_threshold:
            print('First match @ {left}x{top}'.format(**location))
        else:
            print('Best match @ {left}x{top}'.format(**location))

        # Crop from the matched top-left corner. The requested size is the full
        # image size, i.e. larger than what remains to the right/bottom;
        # presumably the crop is limited to the image bounds — TODO confirm.
        orig_img.crop(location['left'], location['top'], width=orig_img.width, height=orig_img.height)
        os.makedirs(output_dir, exist_ok=True)
        orig_img.save(filename=f"{output_dir}/{file_name}")

# Step 3: align every edge-cut image against the reference patch form.jpg.
image_dir = "edgecut4"
output_dir = "matched"
reference_path = "form.jpg"
image_list = find_images(image_dir)
# The loop index was never used, so iterate over the paths directly.
for image_path in tqdm(sorted(image_list)):
    match_and_crop_image(image_path, reference_path, output_dir)


In [None]:
from tqdm import tqdm

def resize_images(image_dir, output_dir):
    """Resize every image under ``image_dir`` to one common size.

    The target size is the largest width and the largest height found among
    the images (taken independently), so images are stretched to
    ``(max_width, max_height)`` without regard to their aspect ratio.

    Args:
        image_dir: Directory searched recursively for images.
        output_dir: Directory the resized images are written to (created if
            missing). Each output keeps its input's base file name.
    """
    # This function uses PIL, but the notebook never imports it and the bare
    # name ``Image`` is already bound to Wand's class, so import it here.
    import PIL.Image

    # Sort once and reuse the list for both passes.
    image_list = sorted(find_images(image_dir))

    # Pass 1: find the largest width and height.
    max_width = 0
    max_height = 0
    for image_name in tqdm(image_list):
        with PIL.Image.open(image_name) as img:
            width, height = img.size
            max_width = max(max_width, width)
            max_height = max(max_height, height)

    # Pass 2: resize everything to that size.
    os.makedirs(output_dir, exist_ok=True)
    for image_name in tqdm(image_list):
        with PIL.Image.open(image_name) as img:
            new_img = img.resize((max_width, max_height))
            new_img.save(f"{output_dir}/{os.path.basename(image_name)}")

# Step 4: bring all matched images to one common size, written to resized_images/.
resize_images("matched", "resized_images")





In [3]:
import random
import shutil

# Copy a random sample of up to 100 matched images into sampleimgs/ for
# manual inspection.
source_dir = "matched"
target_dir = "sampleimgs"

# os.listdir also returns sub-directories, which shutil.copy cannot copy, so
# keep regular files only. Sorting makes the seeded sample reproducible.
file_names = sorted(
    name for name in os.listdir(source_dir)
    if os.path.isfile(os.path.join(source_dir, name))
)

os.makedirs(target_dir, exist_ok=True)

# random.sample raises ValueError when asked for more items than exist.
sample_size = min(100, len(file_names))
# A fixed seed so a re-run picks the same files.
rng = random.Random(0)
for file_name in rng.sample(file_names, sample_size):
    shutil.copy(os.path.join(source_dir, file_name), target_dir)
