In [None]:
import os
import subprocess
from pathlib import Path

END_WITH_LOCAL = 'bubble-detection'

# Keep the cargo bin directory at the front of PATH, but only add it when it is
# not already there: this cell may be re-run many times within one kernel
# session and must not keep stacking duplicate entries onto PATH.
CARGO_BIN_DIR = '/root/.cargo/bin'
_current_path = os.environ.get('PATH', '')
if _current_path.split(os.pathsep)[0] != CARGO_BIN_DIR:
    os.environ['PATH'] = (
        f"{CARGO_BIN_DIR}{os.pathsep}{_current_path}" if _current_path else CARGO_BIN_DIR
    )

BASE_DIR = os.getcwd()
print(f"BASE_DIR: {BASE_DIR}")

# Simple validation: the data paths used by this notebook are built relative to
# BASE_DIR, so fail fast when the kernel was started from an unexpected folder.
if not BASE_DIR.endswith(('/content', END_WITH_LOCAL)):
    raise ValueError(f"Expected to be in .../{END_WITH_LOCAL} or .../content directory, but got: {BASE_DIR}")

In [None]:
import os
from pathlib import Path

# Folder containing the Manga109 page images, expressed relative to BASE_DIR.
# NOTE(review): presumably holds one sub-directory per volume — confirm against the dataset layout.
Manga109_dir = os.path.join(BASE_DIR,'../../data/Manga109/Manga109/Manga109_released_2023_12_07/Manga109_released_2023_12_07/images')


In [None]:
# Names of every sub-directory of the image root, in the order the filesystem yields them
folders = [entry.name for entry in filter(Path.is_dir, Path(Manga109_dir).iterdir())]
print(folders)

In [None]:
# Same listing as above, but ordered by (case-sensitive) name
folders = list(entry.name for entry in Path(Manga109_dir).iterdir() if entry.is_dir())
folders.sort()
print(folders)

In [None]:
# Sampling limits: how many volumes are taken (by name order) and how many
# pages are taken from the start of each volume.
NUM_VOLUMES = 35
NUM_IMAGES_PER_VOLUME = 21
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

# First NUM_VOLUMES volume folders in case-sensitive name order.
# NOTE: the variable name `first_30_volumes` is historical and kept so other
# cells keep working; the actual count is NUM_VOLUMES.
first_30_volumes = sorted(
    (f for f in Path(Manga109_dir).iterdir() if f.is_dir()),
    key=lambda x: x.name,
)[:NUM_VOLUMES]

# For each volume, keep the first NUM_IMAGES_PER_VOLUME .jpg pages in ascending name order
original_image_path = []
for volume in first_30_volumes:
    images = sorted(
        (f for f in volume.iterdir() if f.is_file() and f.suffix.lower() == '.jpg'),
        key=lambda x: x.name,
    )
    original_image_path.extend(str(img_path) for img_path in images[:NUM_IMAGES_PER_VOLUME])

print(len(original_image_path))

human_annotate_dir = os.path.join(BASE_DIR,'../../data/Human_Annotate_300/train')

# Only scan the immediate directory (no subdirectories). os.listdir returns
# entries in arbitrary order, so sort them to keep the candidate order (and
# therefore which file is matched first) reproducible between runs.
all_img_paths = []
for file in sorted(os.listdir(human_annotate_dir)):
    file_path = os.path.join(human_annotate_dir, file)
    if os.path.isfile(file_path) and file.lower().endswith(IMAGE_EXTENSIONS):
        all_img_paths.append(file_path)

print(len(all_img_paths))

# Create one destination folder per sampled volume inside the annotation directory
for volume in first_30_volumes:
    os.makedirs(os.path.join(human_annotate_dir, volume.name), exist_ok=True)

In [None]:
import matplotlib.pyplot as plt
import cv2
import numpy as np
import time
import hashlib
from concurrent.futures import ThreadPoolExecutor
import os

# Option 1: Fast hash-based comparison (fastest)
def compare_images_hash(path1, path2):
    """Report whether two files are byte-identical, judged by their MD5 digests.

    Args:
        path1: Path of the first file.
        path2: Path of the second file.

    Returns:
        True when both files can be read and their MD5 digests are equal;
        False otherwise, including when either file is missing or unreadable.
    """
    try:
        # Files of different length cannot be byte-identical, so skip hashing.
        if os.path.getsize(path1) != os.path.getsize(path2):
            return False
        return _file_md5(path1) == _file_md5(path2)
    except Exception:
        # Best-effort: any failure counts as "no match". `Exception` (not a
        # bare except) so that KeyboardInterrupt can still stop a long scan.
        return False


def _file_md5(path, chunk_size=1 << 20):
    """Return the hex MD5 digest of the file at `path`, read in fixed-size chunks."""
    digest = hashlib.md5()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

# Option 2: Fast perceptual hash comparison
def compare_images_phash(path1, path2, threshold=5):
    """Report whether two images have near-identical perceptual hashes.

    Args:
        path1: Path of the first image.
        path2: Path of the second image.
        threshold: Largest allowed difference between the two perceptual
            hashes (the value of `hash1 - hash2`) for the images to count as
            a match.

    Returns:
        True when both images load and their hash difference is at most
        `threshold`; False otherwise, including when an image cannot be read.

    Raises:
        ImportError: If `imagehash` or Pillow is not installed.
    """
    # Imported here because only this function needs these packages, but kept
    # outside the try block: a missing dependency must surface as an error
    # instead of making every comparison silently return False.
    import imagehash
    from PIL import Image

    try:
        # Context managers make sure both file handles are released promptly.
        with Image.open(path1) as img1, Image.open(path2) as img2:
            difference = imagehash.phash(img1) - imagehash.phash(img2)
        return bool(difference <= threshold)
    except Exception:
        # Best-effort: unreadable or corrupt images count as "no match".
        return False

# Option 3: Correlation of downscaled grayscale images with OpenCV (runs on the CPU; displays each match)
def compare_images_fast(path1, path2, threshold=0.85, show=True):
    """Compare two images by correlating small grayscale versions of them.

    Both images are resized to 256x256, converted to grayscale and scored with
    `cv2.matchTemplate(..., cv2.TM_CCOEFF_NORMED)`; the maximum of the result
    is used as the similarity score.

    Args:
        path1: Path of the first image (shown as the "original").
        path2: Path of the second image (shown as the "human annotate" one).
        threshold: Minimum similarity score for the images to count as a match.
        show: When True (the default), display both images and the score with
            matplotlib for every match. Pass False for a silent comparison.

    Returns:
        True when the similarity score is at least `threshold`; False
        otherwise, including when either image cannot be read.
    """
    # os.fspath lets callers pass pathlib.Path objects as well as strings.
    img1 = cv2.imread(os.fspath(path1))
    img2 = cv2.imread(os.fspath(path2))

    # cv2.imread signals an unreadable file by returning None.
    if img1 is None or img2 is None:
        return False

    # Shrink both images to a common small size to keep the comparison cheap
    target_size = (256, 256)
    gray1 = cv2.cvtColor(cv2.resize(img1, target_size), cv2.COLOR_BGR2GRAY)
    gray2 = cv2.cvtColor(cv2.resize(img2, target_size), cv2.COLOR_BGR2GRAY)

    result = cv2.matchTemplate(gray1, gray2, cv2.TM_CCOEFF_NORMED)
    similarity_score = np.max(result)

    if similarity_score >= threshold:
        if show:
            _show_match(img1, img2, similarity_score)
        return True

    return False


def _show_match(img1, img2, similarity_score):
    """Display two matched BGR images side by side together with their score."""
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    axes[0].imshow(cv2.cvtColor(img1, cv2.COLOR_BGR2RGB))
    axes[0].set_title('Image Original to compare')
    axes[1].imshow(cv2.cvtColor(img2, cv2.COLOR_BGR2RGB))
    axes[1].set_title('Image in Human Annotate')
    axes[2].text(0.5, 0.5, f'Similarity Score: {similarity_score:.4f}',
                 horizontalalignment='center', verticalalignment='center',
                 transform=axes[2].transAxes, fontsize=14)
    axes[2].axis('off')
    plt.show()
    # NOTE(review): short pause kept from the original code, presumably to let
    # the frontend render the figure before it is closed — confirm it is needed.
    time.sleep(0.2)
    plt.close(fig)

# Option 4: Multi-threaded batch comparison
def compare_images_batch(original_paths, human_paths, comparison_func=compare_images_fast):
    """Compare every original image against every candidate using a thread pool.

    Args:
        original_paths: Paths of the reference images.
        human_paths: Paths of the candidate images to test against each reference.
        comparison_func: Callable taking `(original_path, human_path)` and
            returning a truthy value when the two images match.

    Returns:
        A list of `(original_path, human_path)` tuples for every pair the
        comparison function accepted, ordered by original first and candidate
        second.

    NOTE(review): the default comparison function draws matplotlib figures for
    matches; whether that is safe from worker threads should be confirmed.
    """
    def _evaluate(pair):
        original, candidate = pair
        verdict = comparison_func(original, candidate)
        return verdict, original, candidate

    # Full cross product of (original, candidate) pairs
    pairs = []
    for original in original_paths:
        for candidate in human_paths:
            pairs.append((original, candidate))

    # One worker per CPU; map() keeps the outcomes in the same order as `pairs`
    with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
        outcomes = list(pool.map(_evaluate, pairs))

    # Keep only the pairs that were judged a match
    return [(original, candidate) for verdict, original, candidate in outcomes if verdict]

# Choose your comparison function by rebinding `compare_images`:
# For exact (byte-identical) matches: compare_images = compare_images_hash
# For visually similar images (perceptual hash): compare_images = compare_images_phash
# For flexible similarity with an adjustable threshold: compare_images = compare_images_fast

# `compare_images` is the name the matching loop calls for each pair of paths.
compare_images = compare_images_fast  # Default choice

In [None]:
import shutil
import os
from tqdm import tqdm

count_found = 0
original_image_path_len = len(original_image_path)
print(f"Total original images to find: {original_image_path_len}")
all_img_paths_len = len(all_img_paths)
print(f"Total images in human annotate directory: {all_img_paths_len}")

# WARNING: this loop is destructive. Every matched annotated image (and its
# .xml file, when present) is moved out of `human_annotate_dir` into the folder
# of the matching volume and dropped from `all_img_paths`, so re-running the
# cell will not find those files again.
for img_path in tqdm(original_image_path, desc="Processing original images"):
    copy_img_path = os.path.join(human_annotate_dir, Path(img_path).parent.name)
    copy_img_name = Path(img_path).name

    # First not-yet-claimed annotated image that matches this original, if any
    img_human = next(
        (candidate for candidate in all_img_paths if compare_images(img_path, candidate)),
        None,
    )
    if img_human is None:
        continue

    os.makedirs(copy_img_path, exist_ok=True)

    # Move the image under the original page's file name.
    # NOTE(review): the destination keeps the original's name and extension
    # even when the annotated file uses a different extension — confirm intended.
    shutil.move(img_human, os.path.join(copy_img_path, copy_img_name))
    # Drop it right away so the in-memory list stays consistent with the disk.
    all_img_paths.remove(img_human)

    # Move the annotation XML alongside the image if it exists
    xml_human = os.path.splitext(img_human)[0] + ".xml"
    if os.path.exists(xml_human):
        shutil.move(xml_human, os.path.join(copy_img_path, os.path.splitext(copy_img_name)[0] + ".xml"))

    count_found += 1

In [None]:
# Number of original pages for which a matching annotated image was found and relocated
print(count_found)