# Dataset Classification with Tesseract

This notebook processes the images in the `dataset` folder, classifies them using Tesseract OCR, renames the files based on their classification, and provides summary metrics.

**IMPORTANT:** Before running, you must have Google's Tesseract OCR engine installed on your system and accessible in your PATH. You can find installation instructions here: [https://github.com/tesseract-ocr/tesseract](https://github.com/tesseract-ocr/tesseract)

In [None]:
import os
import cv2
import pytesseract
from collections import Counter
import re

# If tesseract is not in your PATH, you can uncomment the following line
# and provide the path to your tesseract.exe
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [None]:
DATASET_DIR = "dataset"
classification_metrics = Counter()
file_rename_counter = Counter()
unclassified_count = 0

# List all files in the dataset directory
try:
    image_files = [f for f in os.listdir(DATASET_DIR) if f.endswith('.png')]
except FileNotFoundError:
    print(f"Error: The directory '{DATASET_DIR}' was not found.")
    image_files = []

print(f"Found {len(image_files)} images to process.")

for filename in image_files:
    old_filepath = os.path.join(DATASET_DIR, filename)
    
    # Load the image in grayscale
    image = cv2.imread(old_filepath, cv2.IMREAD_GRAYSCALE)
    
    # Configure Tesseract
    # --psm 7: Treat the image as a single text line.
    # -c tessedit_char_whitelist: Restrict output to these characters.
    config = "--psm 7 -c tessedit_char_whitelist=0123456789%"
    
    # Perform OCR
    text = pytesseract.image_to_string(image, config=config).strip()
    
    # --- Text Cleanup and Validation ---
    # Find the most likely percentage value
    match = re.search(r'(\d+)', text)
    if match:
        clean_text = match.group(1) + "%"
    else:
        clean_text = ""

    # --- File Renaming ---
    if clean_text:
        # Update the classification metrics
        classification_metrics[clean_text] += 1
        
        # Get the instance count for the new filename
        file_rename_counter[clean_text] += 1
        instance_count = file_rename_counter[clean_text]
        
        # Sanitize '%' for the filename, as it can cause issues on some systems
        safe_label = clean_text.replace('%', 'pct')
        new_filename = f"{safe_label}_{instance_count}.png"
        new_filepath = os.path.join(DATASET_DIR, new_filename)
        
        try:
            # Rename the file
            os.rename(old_filepath, new_filepath)
            print(f"Renamed '{filename}' to '{new_filename}'")
        except OSError as e:
            print(f"Error renaming file {filename}: {e}")
            
    else:
        print(f"Could not classify '{filename}', skipping rename.")
        unclassified_count += 1

print("\n--- Processing Complete ---")

In [None]:
# --- Display Classification Metrics ---

print("Tesseract Classification Metrics:")
print("="*30)

if classification_metrics:
    total_classified = sum(classification_metrics.values())
    print(f"Total Images Classified: {total_classified}")
    print(f"Total Images Unclassified: {unclassified_count}")
    print(f"Number of Unique Classes: {len(classification_metrics)}")
    print("\n--- Classification Counts ---")
    
    # Sort by most common
    for label, count in classification_metrics.most_common():
        print(f"- {label}: {count} times")
else:
    print("No images were successfully classified.")


# Standardize Dataset Naming

The following cell processes the images in the `dataset` folder, standardizes their filenames based on the numeric prefix, and copies them to a new `dataset_cleaned` directory. This is useful for creating a clean, consistently named dataset for model training.

In [None]:
import os
import time
import random

def rename_files_to_number(source_dir, number):
    """
    Renames all files in source_dir to the format '{number}_{uniqueid}{ext}'.
    The unique id is based on the current time and a random integer to avoid collisions.
    """
    files = [f for f in os.listdir(source_dir) if os.path.isfile(os.path.join(source_dir, f))]
    for f in files:
        ext = os.path.splitext(f)[1]
        unique_id = f"{int(time.time() * 1e6)}{random.randint(1000, 9999)}"
        new_name = f"{number}_{unique_id}{ext}"
        src = os.path.join(source_dir, f)
        dst = os.path.join(source_dir, new_name)
        os.rename(src, dst)
        print(f"Renamed '{f}' -> '{new_name}'")

# Example usage:
rename_files_to_number("workdir", 55)

In [None]:
import os
import re
import shutil
from collections import Counter

# Define directories
SOURCE_DIR = "dataset"
DEST_DIR = "dataset_cleaned"

# Create the destination directory if it doesn't exist
os.makedirs(DEST_DIR, exist_ok=True)
print(f"Created directory: {DEST_DIR}")

# Counter for new filenames
rename_counter = Counter()
files_processed = 0
files_skipped = 0

# List all files in the source directory
try:
    image_files = [f for f in os.listdir(SOURCE_DIR) if f.endswith('.png')]
except FileNotFoundError:
    print(f"Error: The source directory '{SOURCE_DIR}' was not found.")
    image_files = []

print(f"Found {len(image_files)} images to standardize.")

# Process each file
for filename in image_files:
    # Extract the numeric label from the start of the filename
    match = re.match(r'^(\d+)', filename)
    
    if match:
        label = match.group(1)
        
        # Remove leading zeros (convert to int and back to string)
        label_int = int(label)
        label_clean = str(label_int)
        
        # Increment the counter for this clean label
        rename_counter[label_clean] += 1
        instance_count = rename_counter[label_clean]
        
        # Create the new standardized filename with no leading zeros
        new_filename = f"{label_clean}pct_{instance_count}.png"
        
        # Define full paths
        old_filepath = os.path.join(SOURCE_DIR, filename)
        new_filepath = os.path.join(DEST_DIR, new_filename)
        
        # Copy the file to the new directory with the new name
        shutil.copy(old_filepath, new_filepath)
        files_processed += 1
    else:
        print(f"Could not extract label from '{filename}', skipping.")
        files_skipped += 1

print("\n--- Standardization Complete ---")
print(f"Successfully processed and copied {files_processed} files.")
print(f"Skipped {files_skipped} files.")

In [None]:
import os
import shutil
import re
from collections import Counter

# --- Dataset Merging Configuration ---
# Change these folder names as needed
FOLDER1 = "dataset_cleaned"  # First folder to merge
FOLDER2 = "dataset_cleaned2"  # Second folder to merge - change this to your second folder
OUTPUT_FOLDER = "dataset_merged"

# Create the output directory
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
print(f"Created output directory: {OUTPUT_FOLDER}")

# Counter to track instances of each label across both folders
global_counter = Counter()
total_files_merged = 0

def merge_folder(source_folder, folder_name):
    """
    Merge files from a source folder into the output folder.
    
    Args:
        source_folder: Path to the source folder
        folder_name: Name of the folder (for logging purposes)
    """
    global global_counter, total_files_merged
    
    if not os.path.exists(source_folder):
        print(f"Warning: {source_folder} does not exist, skipping.")
        return
    
    files_in_folder = [f for f in os.listdir(source_folder) if f.endswith('.png')]
    print(f"\nProcessing {len(files_in_folder)} files from {folder_name}...")
    
    for filename in files_in_folder:
        # Extract the numeric label from the filename
        match = re.match(r'^(\d+)', filename)
        
        if match:
            label = match.group(1)
            
            # Remove leading zeros (convert to int and back to string)
            label_int = int(label)
            label_clean = str(label_int)
            
            # Increment the global counter for this clean label
            global_counter[label_clean] += 1
            instance_count = global_counter[label_clean]
            
            # Create the new standardized filename with no leading zeros
            new_filename = f"{label_clean}pct_{instance_count}.png"
            
            # Define full paths
            old_filepath = os.path.join(source_folder, filename)
            new_filepath = os.path.join(OUTPUT_FOLDER, new_filename)
            
            # Copy the file with the new name
            shutil.copy(old_filepath, new_filepath)
            total_files_merged += 1
            
            if total_files_merged % 100 == 0:  # Progress update every 100 files
                print(f"Merged {total_files_merged} files...")
        else:
            print(f"Could not extract label from '{filename}', skipping.")

# Merge both folders
print("="*50)
print("MERGING DATASET FOLDERS")
print("="*50)

merge_folder(FOLDER1, "Folder 1 (dataset_cleaned)")
merge_folder(FOLDER2, "Folder 2")

print(f"\n--- Merging Complete ---")
print(f"Total files merged: {total_files_merged}")
print(f"Output directory: {OUTPUT_FOLDER}")
print(f"Number of unique labels: {len(global_counter)}")

# Display label distribution
print("\n--- Label Distribution ---")
for label, count in sorted(global_counter.items(), key=lambda x: int(x[0])):
    print(f"Label {label}: {count} images")

In [None]:
import os
import shutil
import re
from collections import defaultdict

# Source and destination directories
SRC_DIR = OUTPUT_FOLDER  # 'dataset_merged'
DEST_DIR = "dataset_ends"
os.makedirs(DEST_DIR, exist_ok=True)

# Find all png files and group by label
label_to_files = defaultdict(list)
for fname in os.listdir(SRC_DIR):
    if fname.endswith('.png'):
        match = re.match(r'^(\d+)pct_(\d+)\.png$', fname)
        if match:
            label = match.group(1)
            counter = int(match.group(2))
            label_to_files[label].append((counter, fname))

# For each label, find min and max counter and copy those files
files_copied = 0
for label, files in label_to_files.items():
    if not files:
        continue
    # Sort by counter
    files_sorted = sorted(files, key=lambda x: x[0])
    min_counter, min_file = files_sorted[0]
    max_counter, max_file = files_sorted[-1]
    # Copy min
    src_min = os.path.join(SRC_DIR, min_file)
    dst_min = os.path.join(DEST_DIR, min_file)
    shutil.copy(src_min, dst_min)
    files_copied += 1
    # If min and max are different, copy max
    if min_file != max_file:
        src_max = os.path.join(SRC_DIR, max_file)
        dst_max = os.path.join(DEST_DIR, max_file)
        shutil.copy(src_max, dst_max)
        files_copied += 1

print(f"Copied {files_copied} files to {DEST_DIR}")

# CNN Training with PyTorch

This section defines and trains a Convolutional Neural Network (CNN) on the cleaned dataset.
The process includes:
1.  A custom `Dataset` class to load images and parse labels from filenames.
2.  Splitting the data into training and validation sets.
3.  Defining the CNN architecture.
4.  A full training and validation loop.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from PIL import Image
import os
import re
import numpy as np
import shutil

# Import our optimized CNN architecture
import sys
sys.path.append('src')
from models import get_model, count_parameters, benchmark_model, get_default_model_type, get_model_info

🔧 Model Configuration: Default model type set to 'optimized'


In [2]:
# --- Compare CNN Architectures ---
print("CNN Architecture Comparison")
print("="*50)

# Show current default
current_default = get_default_model_type()
print(f"Current default model type: '{current_default}'")
print(f"To change: Edit DEFAULT_MODEL_TYPE in src/models/percentage_cnn.py")

# Compare different model architectures
architectures = ["optimized", "lightweight", "simple"]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for arch in architectures:
    model_test = get_model(arch, num_classes=100)
    params = count_parameters(model_test)
    info = get_model_info(arch)
    
    marker = "👉" if arch == current_default else "  "
    print(f"\n{marker} {arch.upper()} CNN:")
    print(f"     Description: {info['description']}")
    print(f"     Parameters: {params:,}")
    print(f"     Expected speed: {info['speed']}")
    
    if device.type == 'cuda':
        try:
            inference_time = benchmark_model(model_test, device=device, num_runs=50)
            print(f"     Actual inference time: {inference_time:.2f}ms")
            
            if inference_time <= 5.0:
                print(f"     ✅ Within target (<5ms)")
            else:
                print(f"     ⚠️  Exceeds target (>5ms)")
        except Exception as e:
            print(f"     Benchmark failed: {e}")
    else:
        print(f"     Benchmark skipped (CPU mode)")

print(f"\nTo change default model:")
print(f"  1. Edit DEFAULT_MODEL_TYPE in src/models/percentage_cnn.py")
print(f"  2. Options: 'optimized', 'lightweight', 'simple'")
print(f"  3. Restart your notebook/script to apply changes")

CNN Architecture Comparison
Current default model type: 'optimized'
To change: Edit DEFAULT_MODEL_TYPE in src/models/percentage_cnn.py

👉 OPTIMIZED CNN:
     Description: Best balance of accuracy and speed
     Parameters: 688,392
     Expected speed: Fast (~2-4ms)
     Actual inference time: 0.99ms
     ✅ Within target (<5ms)

   LIGHTWEIGHT CNN:
     Description: Maximum speed for real-time applications
     Parameters: 167,508
     Expected speed: Very fast (~1-2ms)
     Actual inference time: 0.58ms
     ✅ Within target (<5ms)

   SIMPLE CNN:
     Description: Original architecture for compatibility
     Parameters: 4,250,916
     Expected speed: Moderate (~3-5ms)
     Actual inference time: 0.36ms
     ✅ Within target (<5ms)

To change default model:
  1. Edit DEFAULT_MODEL_TYPE in src/models/percentage_cnn.py
  2. Options: 'optimized', 'lightweight', 'simple'
  3. Restart your notebook/script to apply changes
     Actual inference time: 0.99ms
     ✅ Within target (<5ms)

   LIGH

In [3]:


# --- 1. Custom Dataset Definition ---
class PercentageDataset(Dataset):
    """Custom dataset for loading percentage images."""
    def __init__(self, img_dir, transform=None):
        self.img_dir = img_dir
        self.transform = transform
        self.image_files = [f for f in os.listdir(img_dir) if f.endswith('.png')]
        
    def __len__(self):
        return len(self.image_files)
        
    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.image_files[idx])
        
        # Load image
        image = Image.open(img_path).convert("L") # Convert to grayscale
        
        # Extract label from filename
        match = re.match(r'^(\d+)', self.image_files[idx])
        if match:
            label = int(match.group(1))
        else:
            label = -1 # Should not happen with standardized names
            
        if self.transform:
            image = self.transform(image)
            
        return image, label

# --- 2. Data Preparation ---
# Define transforms to resize images and convert them to tensors
# All images will be resized to 64x64 pixels
data_transforms = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)) # Normalize for grayscale
])

# Instantiate the dataset
full_dataset = PercentageDataset(img_dir='dataset_final', transform=data_transforms)

# Split into training and validation sets (80% train, 20% validation)
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")

# --- 3. CNN Model Definition ---
# Use the configured default CNN architecture
model_type = get_default_model_type()  # Gets the model type set in src/models/percentage_cnn.py
model = get_model(num_classes=100).to(device)  # Uses default model type

print(f"Using {model_type} CNN architecture (configured in src/models/percentage_cnn.py)")
print(f"Model parameters: {count_parameters(model):,}")

# Show model information
model_info = get_model_info()
print(f"Model info: {model_info['description']}")
print(f"Use case: {model_info['use_case']}")
print(f"Expected speed: {model_info['speed']}")

# Benchmark the model if on GPU
if device.type == 'cuda':
    inference_time = benchmark_model(model, device=device, num_runs=50)
    print(f"Actual inference time: {inference_time:.2f}ms")
    if inference_time > 5.0:
        print("⚠️  Warning: Inference time > 5ms. Consider changing DEFAULT_MODEL_TYPE to 'lightweight' in src/models/percentage_cnn.py")
    else:
        print("✅ Inference time within target (<5ms)")

print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")
# --- 4. Training Setup ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)  # Added weight decay for regularization
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)  # Learning rate scheduler
num_epochs = 15  # Increased epochs for better convergence with new architecture

# --- 5. Training and Validation Loop ---
# Create directory for failed validation files
FAILED_DIR = "dataset_failed"
os.makedirs(FAILED_DIR, exist_ok=True)

# Track failed files across all epochs
all_failed_files = set()

for epoch in range(num_epochs):
    # Training
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    train_loss = running_loss / len(train_loader)

    # Validation with detailed failure tracking
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    failed_files_this_epoch = []
    
    with torch.no_grad():
        # Process validation in batches for efficiency, but track individual files
        for batch_idx, (images, labels) in enumerate(val_loader):
            images, labels = images.to(device), labels.to(device)
            
            # Make predictions for the batch
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            
            _, predicted = torch.max(outputs.data, 1)
            
            # Process each item in the batch to track individual files
            for i in range(len(images)):
                # Calculate the original dataset index for this validation item
                val_idx = batch_idx * val_loader.batch_size + i
                if val_idx >= len(val_dataset):  # Handle last batch edge case
                    break
                    
                original_idx = val_dataset.indices[val_idx]
                filename = full_dataset.image_files[original_idx]
                
                true_label = labels[i].item()
                pred_label = predicted[i].item()
                confidence = torch.softmax(outputs[i].unsqueeze(0), 1)[0][pred_label].item()
                
                total += 1
                
                if pred_label == true_label:
                    correct += 1
                else:
                    # Track failed prediction with detailed info
                    failed_info = {
                        'filename': filename,
                        'true_label': true_label,
                        'predicted_label': pred_label,
                        'confidence': confidence,
                        'val_idx': val_idx,
                        'original_idx': original_idx
                    }
                    failed_files_this_epoch.append(failed_info)
                    all_failed_files.add(filename)
            
    val_loss /= len(val_dataset)
    val_accuracy = 100 * correct / total
    
    print(f"Epoch [{epoch+1}/{num_epochs}], "
          f"Train Loss: {train_loss:.4f}, "
          f"Val Loss: {val_loss:.4f}, "
          f"Val Accuracy: {val_accuracy:.2f}%")
    
    # Step the learning rate scheduler
    scheduler.step()
    current_lr = scheduler.get_last_lr()[0]
    print(f"  Learning rate: {current_lr:.6f}")
    
    # Show failed files for this epoch (limit to first 5 for readability)
    if failed_files_this_epoch:
        print(f"  Failed predictions this epoch: {len(failed_files_this_epoch)}")
        for i, fail_info in enumerate(failed_files_this_epoch[:5]):
            print(f"    {fail_info['filename']}: true={fail_info['true_label']}, "
                  f"pred={fail_info['predicted_label']}, conf={fail_info['confidence']:.3f}")
        if len(failed_files_this_epoch) > 5:
            print(f"    ... and {len(failed_files_this_epoch) - 5} more")
        
        # Debug: Show first failed file's indices to verify correctness
        if len(failed_files_this_epoch) > 0:
            first_fail = failed_files_this_epoch[0]
            print(f"    Debug - First fail: val_idx={first_fail['val_idx']}, "
                  f"orig_idx={first_fail['original_idx']}")
    print()

print("\n--- Training Complete ---")

# --- Copy Failed Files to Analysis Folder ---
print(f"\n--- Copying Failed Validation Files ---")
print(f"Total unique files that failed validation: {len(all_failed_files)}")

failed_copied = 0
failed_not_found = 0
for filename in sorted(all_failed_files):  # Sort for consistent processing
    src_path = os.path.join(full_dataset.img_dir, filename)
    dst_path = os.path.join(FAILED_DIR, filename)
    
    if os.path.exists(src_path):
        try:
            shutil.copy(src_path, dst_path)
            failed_copied += 1
            # Verify the copy worked
            if not os.path.exists(dst_path):
                print(f"Warning: Copy verification failed for {filename}")
        except Exception as e:
            print(f"Error copying {filename}: {e}")
    else:
        print(f"Warning: Source file not found: {src_path}")
        failed_not_found += 1

print(f"Successfully copied {failed_copied} failed validation files to '{FAILED_DIR}'")
if failed_not_found > 0:
    print(f"Warning: {failed_not_found} files were not found in source directory")

# Verify the failed directory contents
copied_files = [f for f in os.listdir(FAILED_DIR) if f.endswith('.png')]
print(f"Verification: '{FAILED_DIR}' now contains {len(copied_files)} PNG files")

# --- 6. Save the Model ---
MODEL_SAVE_PATH = f"percentage_cnn_{model_type}.pth"
torch.save(model.state_dict(), MODEL_SAVE_PATH)
print(f"Model saved to {MODEL_SAVE_PATH}")

# Also save model info for easy loading
model_info = {
    'model_type': model_type,
    'num_classes': 100,
    'input_size': (64, 64),
    'parameters': count_parameters(model)
}

import json
with open(f"percentage_cnn_{model_type}_info.json", 'w') as f:
    json.dump(model_info, f, indent=2)
print(f"Model info saved to percentage_cnn_{model_type}_info.json")

Training set size: 17096
Validation set size: 4274
📋 Using default model type: 'optimized'
Using optimized CNN architecture (configured in src/models/percentage_cnn.py)
Model parameters: 688,392
Model info: Best balance of accuracy and speed
Use case: Recommended for most applications
Expected speed: Fast (~2-4ms)
Actual inference time: 1.23ms
✅ Inference time within target (<5ms)
Training set size: 17096
Validation set size: 4274
Using device: cuda
Actual inference time: 1.23ms
✅ Inference time within target (<5ms)
Training set size: 17096
Validation set size: 4274
Using device: cuda
Epoch [1/15], Train Loss: 3.5228, Val Loss: 0.0103, Val Accuracy: 90.99%
  Learning rate: 0.001000
  Failed predictions this epoch: 385
    55pct_20_diag1.png: true=55, pred=53, conf=0.479
    58pct_21_trans1.png: true=58, pred=88, conf=0.564
    45pct_13_scale1.png: true=45, pred=64, conf=0.285
    52pct_44_trans1.png: true=52, pred=25, conf=0.255
    45pct_29_trans1.png: true=45, pred=54, conf=0.509
   

# Model Evaluation on Original Dataset

This section evaluates the trained model's performance on the original `dataset_merged` dataset to see how well it generalizes to the base data without amplification.

In [4]:
# --- Evaluation on Original Dataset (dataset_merged) ---
print("="*60)
print("EVALUATING MODEL ON ORIGINAL DATASET (dataset_merged)")
print("="*60)

# Load the original dataset for evaluation
eval_dataset = PercentageDataset(img_dir='dataset_merged', transform=data_transforms)
eval_loader = DataLoader(eval_dataset, batch_size=32, shuffle=False)

print(f"Original dataset size: {len(eval_dataset)} images")

# Evaluation metrics
model.eval()
eval_correct = 0
eval_total = 0
eval_loss = 0.0
predictions_by_label = {}
confusion_data = []

# Track performance per percentage
percentage_stats = {}

with torch.no_grad():
    for batch_idx, (images, labels) in enumerate(eval_loader):
        images, labels = images.to(device), labels.to(device)
        
        outputs = model(images)
        loss = criterion(outputs, labels)
        eval_loss += loss.item()
        
        _, predicted = torch.max(outputs.data, 1)
        confidences = torch.softmax(outputs, 1)
        
        # Process each item in the batch
        for i in range(len(images)):
            true_label = labels[i].item()
            pred_label = predicted[i].item()
            confidence = confidences[i][pred_label].item()
            
            eval_total += 1
            
            # Initialize percentage stats if needed
            if true_label not in percentage_stats:
                percentage_stats[true_label] = {'correct': 0, 'total': 0, 'confidences': []}
            
            percentage_stats[true_label]['total'] += 1
            percentage_stats[true_label]['confidences'].append(confidence)
            
            if pred_label == true_label:
                eval_correct += 1
                percentage_stats[true_label]['correct'] += 1
            else:
                # Store confusion data
                img_idx = batch_idx * eval_loader.batch_size + i
                filename = eval_dataset.image_files[img_idx]
                confusion_data.append({
                    'filename': filename,
                    'true': true_label,
                    'pred': pred_label,
                    'confidence': confidence
                })

eval_loss /= len(eval_loader)
eval_accuracy = 100 * eval_correct / eval_total

print(f"\n--- Overall Performance ---")
print(f"Total Images: {eval_total}")
print(f"Correct Predictions: {eval_correct}")
print(f"Overall Accuracy: {eval_accuracy:.2f}%")
print(f"Average Loss: {eval_loss:.4f}")
print(f"Error Rate: {100 - eval_accuracy:.2f}%")

# Show per-percentage performance
print(f"\n--- Per-Percentage Performance ---")
print("Label | Accuracy | Count | Avg Confidence")
print("-" * 45)

sorted_percentages = sorted(percentage_stats.keys())
for pct in sorted_percentages:
    stats = percentage_stats[pct]
    accuracy = (stats['correct'] / stats['total']) * 100 if stats['total'] > 0 else 0
    avg_conf = sum(stats['confidences']) / len(stats['confidences']) if stats['confidences'] else 0
    print(f"{pct:3d}%  | {accuracy:6.1f}%  | {stats['total']:4d}  | {avg_conf:.3f}")

# Show worst performing percentages
print(f"\n--- Worst Performing Percentages (Top 10) ---")
worst_performers = []
for pct, stats in percentage_stats.items():
    if stats['total'] >= 5:  # Only consider percentages with at least 5 samples
        accuracy = (stats['correct'] / stats['total']) * 100
        worst_performers.append((pct, accuracy, stats['total']))

worst_performers.sort(key=lambda x: x[1])  # Sort by accuracy
print("Label | Accuracy | Sample Count")
print("-" * 30)
for pct, acc, count in worst_performers[:10]:
    print(f"{pct:3d}%  | {acc:6.1f}%  | {count:4d}")

# Show confusion examples
if confusion_data:
    print(f"\n--- Confusion Examples (First 10) ---")
    print("True | Pred | Confidence | Filename")
    print("-" * 50)
    for conf_item in sorted(confusion_data, key=lambda x: x['confidence'], reverse=True)[:10]:
        print(f"{conf_item['true']:3d}% | {conf_item['pred']:3d}% | {conf_item['confidence']:8.3f} | {conf_item['filename']}")

print(f"\n--- Evaluation Summary ---")
print(f"Model performs at {eval_accuracy:.2f}% accuracy on the original dataset")
print(f"Total misclassifications: {len(confusion_data)}")
if confusion_data:
    high_conf_errors = [c for c in confusion_data if c['confidence'] > 0.8]
    print(f"High-confidence errors (>80%): {len(high_conf_errors)}")
    
print("\nEvaluation complete!")

EVALUATING MODEL ON ORIGINAL DATASET (dataset_merged)
Original dataset size: 6543 images

--- Overall Performance ---
Total Images: 6543
Correct Predictions: 6535
Overall Accuracy: 99.88%
Average Loss: 0.0073
Error Rate: 0.12%

--- Per-Percentage Performance ---
Label | Accuracy | Count | Avg Confidence
---------------------------------------------
  0%  |  100.0%  |  173  | 1.000
  1%  |  100.0%  |   81  | 1.000
  2%  |  100.0%  |   76  | 1.000
  3%  |  100.0%  |   71  | 1.000
  4%  |  100.0%  |   71  | 1.000
  5%  |   97.3%  |   75  | 1.000
  6%  |  100.0%  |   76  | 1.000
  7%  |  100.0%  |   86  | 1.000
  8%  |  100.0%  |   76  | 1.000
  9%  |  100.0%  |   72  | 1.000
 10%  |  100.0%  |   78  | 1.000
 11%  |  100.0%  |   73  | 1.000
 12%  |  100.0%  |   77  | 1.000
 13%  |  100.0%  |   69  | 1.000
 14%  |  100.0%  |   72  | 1.000
 15%  |  100.0%  |   64  | 1.000
 16%  |  100.0%  |   64  | 1.000
 17%  |  100.0%  |   61  | 1.000
 18%  |  100.0%  |   56  | 1.000
 19%  |  100.0%  |   6

# Create Bias Training Dataset

This section creates a new dataset called `dataset_final` that combines:
1. All original images from `dataset_merged`
2. Failed validation samples from `dataset_failed` (1x ratio)
3. Shift-transformed versions of failed samples (0.7x ratio)
4. Other transformations of failed samples (0.3x ratio - scaling, diagonal, rotation)

This biased dataset will help the model learn better on the samples it previously struggled with.

In [5]:
import os
import shutil
import cv2
import numpy as np
import re
from collections import Counter

# --- Configuration ---
AMPLIFIED_DATASET = "dataset_amplified"  # Source of amplified data
ORIGINAL_DATASET = "dataset_merged"      # Source for failed samples
FAILED_DATASET = "dataset_failed"        # Failed validation files
FINAL_DATASET = "dataset_final"          # Final combined dataset

# Create the final dataset directory
os.makedirs(FINAL_DATASET, exist_ok=True)
print(f"Created final dataset directory: {FINAL_DATASET}")

# --- Step 1: Copy all amplified images from dataset_amplified ---
print("\n--- Copying Amplified Dataset ---")
amplified_files = [f for f in os.listdir(AMPLIFIED_DATASET) if f.endswith('.png')]
global_counter = Counter()

# Initialize counter with existing amplified files
for filename in amplified_files:
    match = re.match(r'^(\d+)pct_(\d+)\.png$', filename)
    if match:
        label = match.group(1)
        counter = int(match.group(2))
        global_counter[label] = max(global_counter[label], counter)

# Copy amplified files
files_copied = 0
for filename in amplified_files:
    src_path = os.path.join(AMPLIFIED_DATASET, filename)
    dst_path = os.path.join(FINAL_DATASET, filename)
    shutil.copy(src_path, dst_path)
    files_copied += 1

print(f"Copied {files_copied} amplified files to {FINAL_DATASET}")

# --- Step 2: Process failed validation files from original dataset ---
print("\n--- Processing Failed Validation Files from Original Dataset ---")
failed_files = [f for f in os.listdir(FAILED_DATASET) if f.endswith('.png')]
print(f"Found {len(failed_files)} failed validation files")

# For each failed file, find the corresponding original file in dataset_merged
failed_originals = []
failed_originals_mapping = {}  # Map from original filename to failed filename for reference

for failed_filename in failed_files:
    # Extract the base filename by removing transformation suffixes
    # Pattern: {label}pct_{number}_{transform}.png -> {label}pct_{base_number}.png
    match = re.match(r'^(\d+)pct_(\d+)(?:_(?:trans|diag|scale)\d*)?\.png$', failed_filename)
    
    if match:
        label = match.group(1)
        base_number = match.group(2)
        
        # Try to find the original file in dataset_merged
        # Look for files with the same label and base number
        original_pattern = f"{label}pct_{base_number}.png"
        original_path = os.path.join(ORIGINAL_DATASET, original_pattern)
        
        if os.path.exists(original_path):
            if original_pattern not in failed_originals_mapping:
                failed_originals.append(original_pattern)
                failed_originals_mapping[original_pattern] = []
            failed_originals_mapping[original_pattern].append(failed_filename)
        else:
            # Try to find any file with the same label and a similar base number
            # Sometimes the numbering might be slightly different
            found_alternative = False
            for alt_file in os.listdir(ORIGINAL_DATASET):
                if alt_file.startswith(f"{label}pct_") and alt_file.endswith('.png'):
                    alt_match = re.match(rf'^{label}pct_(\d+)\.png$', alt_file)
                    if alt_match:
                        alt_number = int(alt_match.group(1))
                        base_num = int(base_number)
                        # Allow for some variance in numbering (within 5)
                        if abs(alt_number - base_num) <= 5:
                            if alt_file not in failed_originals_mapping:
                                failed_originals.append(alt_file)
                                failed_originals_mapping[alt_file] = []
                            failed_originals_mapping[alt_file].append(failed_filename)
                            found_alternative = True
                            break
            
            if not found_alternative:
                print(f"Warning: No original file found for failed sample {failed_filename}")
    else:
        print(f"Warning: Could not parse failed filename {failed_filename}")

# Remove duplicates while preserving order
failed_originals = list(dict.fromkeys(failed_originals))

# Define transformation functions
def apply_translation(image, max_shift=8):
    """Apply random translation/shift transformation"""
    h, w = image.shape[:2]
    dx = np.random.randint(-max_shift, max_shift + 1)
    dy = np.random.randint(-max_shift, max_shift + 1)
    
    M = np.float32([[1, 0, dx], [0, 1, dy]])
    transformed = cv2.warpAffine(image, M, (w, h), borderMode=cv2.BORDER_REFLECT)
    return transformed

def apply_scaling(image, scale_range=(0.85, 1.15)):
    """Apply random scaling transformation"""
    h, w = image.shape[:2]
    scale = np.random.uniform(*scale_range)
    
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, 0, scale)
    transformed = cv2.warpAffine(image, M, (w, h), borderMode=cv2.BORDER_REFLECT)
    return transformed

def apply_rotation(image, angle_range=(-15, 15)):
    """Apply random rotation transformation"""
    h, w = image.shape[:2]
    angle = np.random.uniform(*angle_range)
    
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    transformed = cv2.warpAffine(image, M, (w, h), borderMode=cv2.BORDER_REFLECT)
    return transformed

def apply_diagonal_shift(image, max_shift=6):
    """Apply diagonal transformation"""
    h, w = image.shape[:2]
    dx = np.random.randint(-max_shift, max_shift + 1)
    dy = np.random.randint(-max_shift, max_shift + 1)
    
    # Ensure diagonal movement (both dx and dy are non-zero)
    if dx == 0:
        dx = np.random.choice([-max_shift//2, max_shift//2])
    if dy == 0:
        dy = np.random.choice([-max_shift//2, max_shift//2])
    
    M = np.float32([[1, 0, dx], [0, 1, dy]])
    transformed = cv2.warpAffine(image, M, (w, h), borderMode=cv2.BORDER_REFLECT)
    return transformed

# Process each failed file from the original dataset
total_failed_added = 0
transform_stats = {'original': 0, 'translation': 0, 'other': 0}

print(f"Processing {len(failed_originals)} unique original files from failed validation samples...")
print(f"These correspond to {len(failed_files)} failed validation samples (including transforms)")

# Show mapping summary
print(f"\nFailed sample mapping summary:")
for original_file, failed_variants in list(failed_originals_mapping.items())[:5]:  # Show first 5
    print(f"  {original_file} -> {len(failed_variants)} failed variants: {', '.join(failed_variants[:3])}{'...' if len(failed_variants) > 3 else ''}")
if len(failed_originals_mapping) > 5:
    print(f"  ... and {len(failed_originals_mapping) - 5} more original files")

for filename in failed_originals:
    # Extract label from filename
    match = re.match(r'^(\d+)', filename)
    if not match:
        print(f"Warning: Could not extract label from {filename}, skipping")
        continue
    
    label = match.group(1)
    
    # Load the failed image from the ORIGINAL dataset (not the failed folder)
    src_path = os.path.join(ORIGINAL_DATASET, filename)
    image = cv2.imread(src_path, cv2.IMREAD_GRAYSCALE)
    
    if image is None:
        print(f"Warning: Could not load image {filename}, skipping")
        continue
    
    # 1. Add original failed file (1x ratio)
    global_counter[label] += 1
    new_filename = f"{label}pct_{global_counter[label]}.png"
    dst_path = os.path.join(FINAL_DATASET, new_filename)
    cv2.imwrite(dst_path, image)
    total_failed_added += 1
    transform_stats['original'] += 1
    
    # 2. Add translation-transformed versions (0.7x ratio)
    # Generate 1 translation for every failed image (0.7x ≈ 1x for simplicity)
    global_counter[label] += 1
    transformed_img = apply_translation(image)
    trans_filename = f"{label}pct_{global_counter[label]}.png"
    trans_path = os.path.join(FINAL_DATASET, trans_filename)
    cv2.imwrite(trans_path, transformed_img)
    total_failed_added += 1
    transform_stats['translation'] += 1
    
    # 3. Add other transformations (0.3x ratio)
    # Randomly choose between scaling, rotation, or diagonal shift
    # For every 3 failed images, add 1 other transform (roughly 0.3x)
    if np.random.random() < 0.3:
        transform_type = np.random.choice(['scale', 'rotation', 'diagonal'])
        
        if transform_type == 'scale':
            other_transformed = apply_scaling(image)
        elif transform_type == 'rotation':
            other_transformed = apply_rotation(image)
        else:  # diagonal
            other_transformed = apply_diagonal_shift(image)
        
        global_counter[label] += 1
        other_filename = f"{label}pct_{global_counter[label]}.png"
        other_path = os.path.join(FINAL_DATASET, other_filename)
        cv2.imwrite(other_path, other_transformed)
        total_failed_added += 1
        transform_stats['other'] += 1

print(f"\n--- Bias Dataset Creation Complete ---")
print(f"Total failed samples added: {total_failed_added}")
print(f"  - Original failed samples: {transform_stats['original']}")
print(f"  - Translation transforms: {transform_stats['translation']}")
print(f"  - Other transforms: {transform_stats['other']}")

# Final dataset statistics
final_files = [f for f in os.listdir(FINAL_DATASET) if f.endswith('.png')]
print(f"\nFinal dataset '{FINAL_DATASET}' contains {len(final_files)} total images")
print(f"Amplified dataset had {len(amplified_files)} images")
print(f"Added {len(final_files) - len(amplified_files)} bias training samples from failed originals")

# Show label distribution in final dataset
final_counter = Counter()
for filename in final_files:
    match = re.match(r'^(\d+)', filename)
    if match:
        label = int(match.group(1))
        final_counter[label] += 1

print(f"\nFinal dataset contains {len(final_counter)} unique labels")
print("Label distribution (top 10 most common):")
for label, count in final_counter.most_common(10):
    print(f"  {label}%: {count} images")

print(f"\n🎯 Dataset Composition:")
print(f"   • Amplified dataset: {len(amplified_files)} images")
print(f"   • Failed validation samples: {len(failed_files)} (including transforms)")
print(f"   • Unique original failed files: {len(failed_originals)} files")
print(f"   • Total bias samples added: {total_failed_added}")
print(f"   • Final combined dataset: {len(final_files)} images")
print(f"\n🎯 To use this final dataset for training, ensure the img_dir parameter above is set to:")
print(f"   full_dataset = PercentageDataset(img_dir='{FINAL_DATASET}', transform=data_transforms)")

Created final dataset directory: dataset_final

--- Copying Amplified Dataset ---
Copied 16360 amplified files to dataset_final

--- Processing Failed Validation Files from Original Dataset ---
Found 2517 failed validation files
Processing 2176 unique original files from failed validation samples...
These correspond to 2517 failed validation samples (including transforms)

Failed sample mapping summary:
  0pct_101.png -> 1 failed variants: 0pct_101_scale1.png
  0pct_112.png -> 1 failed variants: 0pct_112_trans1.png
  0pct_121.png -> 1 failed variants: 0pct_121_trans1.png
  0pct_122.png -> 1 failed variants: 0pct_122_trans1.png
  0pct_136.png -> 1 failed variants: 0pct_136_trans1.png
  ... and 2171 more original files
Copied 16360 amplified files to dataset_final

--- Processing Failed Validation Files from Original Dataset ---
Found 2517 failed validation files
Processing 2176 unique original files from failed validation samples...
These correspond to 2517 failed validation samples (in