# Dataset Classification with Tesseract

This notebook processes the images in the `dataset` folder, classifies them using Tesseract OCR, renames the files based on their classification, and provides summary metrics.

**IMPORTANT:** Before running, you must have Google's Tesseract OCR engine installed on your system and accessible in your PATH. You can find installation instructions here: [https://github.com/tesseract-ocr/tesseract](https://github.com/tesseract-ocr/tesseract)

In [None]:
import os
import cv2
import pytesseract
from collections import Counter
import re

# pytesseract looks for `tesseract` on your PATH by default. The path below is
# the default Windows install location; it is applied only when that file
# actually exists, so machines that already have Tesseract on PATH keep working.
# Set the TESSERACT_CMD environment variable to point at a different binary.
TESSERACT_CMD = os.environ.get("TESSERACT_CMD", r'C:\Program Files\Tesseract-OCR\tesseract.exe')
if os.path.isfile(TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD

In [None]:
DATASET_DIR = "dataset"
classification_metrics = Counter()
file_rename_counter = Counter()
unclassified_count = 0

# Tesseract configuration (constant, so it is built once rather than per image).
# --psm 7: Treat the image as a single text line.
# -c tessedit_char_whitelist: Restrict output to these characters.
TESSERACT_CONFIG = "--psm 7 -c tessedit_char_whitelist=0123456789%"


def extract_percentage_label(text):
    """Turn raw OCR text into a label such as ``"75%"``.

    The first run of digits in ``text`` is used. Leading zeros are dropped so
    that ``"05%"`` and ``"5%"`` are counted as the same class.

    Args:
        text: Text returned by Tesseract.

    Returns:
        The label with a trailing ``%``, or ``""`` when ``text`` has no digits.
    """
    match = re.search(r'(\d+)', text)
    if not match:
        return ""
    return f"{int(match.group(1))}%"


def next_available_path(directory, label, counter, current_path=None):
    """Pick the next ``<label>pct_<n>.png`` path that will not clobber a file.

    ``counter[label]`` is advanced until the candidate path is either free or
    is ``current_path`` itself (the file already carries that name).

    Args:
        directory: Folder the file will live in.
        label: Label such as ``"75%"``; also the key used in ``counter``.
        counter: ``Counter`` holding the last instance number used per label.
        current_path: Path of the file being renamed, if any.

    Returns:
        The chosen destination path.
    """
    # '%' is replaced because it can cause issues in filenames on some systems.
    safe_label = label.replace('%', 'pct')
    while True:
        counter[label] += 1
        candidate = os.path.join(directory, f"{safe_label}_{counter[label]}.png")
        if candidate == current_path or not os.path.exists(candidate):
            return candidate


# List all files in the dataset directory. Sorted so that instance numbers are
# assigned in the same order on every run (os.listdir order is arbitrary).
try:
    image_files = sorted(f for f in os.listdir(DATASET_DIR) if f.endswith('.png'))
except FileNotFoundError:
    print(f"Error: The directory '{DATASET_DIR}' was not found.")
    image_files = []

print(f"Found {len(image_files)} images to process.")

for filename in image_files:
    old_filepath = os.path.join(DATASET_DIR, filename)

    # Load the image in grayscale. cv2.imread returns None (it does not raise)
    # when the file cannot be decoded, so that case is handled explicitly.
    image = cv2.imread(old_filepath, cv2.IMREAD_GRAYSCALE)
    if image is None:
        print(f"Could not read '{filename}', skipping.")
        unclassified_count += 1
        continue

    # Perform OCR and reduce the result to a percentage label
    text = pytesseract.image_to_string(image, config=TESSERACT_CONFIG).strip()
    clean_text = extract_percentage_label(text)

    if not clean_text:
        print(f"Could not classify '{filename}', skipping rename.")
        unclassified_count += 1
        continue

    # Update the classification metrics
    classification_metrics[clean_text] += 1

    # Choose a destination that does not overwrite another image. A plain
    # os.rename would silently replace an existing file on POSIX systems.
    new_filepath = next_available_path(
        DATASET_DIR, clean_text, file_rename_counter, current_path=old_filepath
    )
    new_filename = os.path.basename(new_filepath)

    if new_filepath == old_filepath:
        print(f"'{filename}' already has its standardized name, leaving it in place.")
        continue

    try:
        # Rename the file
        os.rename(old_filepath, new_filepath)
        print(f"Renamed '{filename}' to '{new_filename}'")
    except OSError as e:
        print(f"Error renaming file {filename}: {e}")

print("\n--- Processing Complete ---")

In [None]:
# --- Display Classification Metrics ---
# Summarises the OCR pass: overall totals first, then one line per label,
# ordered from the most to the least frequent.

print("Tesseract Classification Metrics:")
print("=" * 30)

if not classification_metrics:
    print("No images were successfully classified.")
else:
    total_classified = sum(classification_metrics.values())
    unique_class_count = len(classification_metrics)

    print(f"Total Images Classified: {total_classified}")
    print(f"Total Images Unclassified: {unclassified_count}")
    print(f"Number of Unique Classes: {unique_class_count}")
    print("\n--- Classification Counts ---")

    # most_common() with no argument yields every label, highest count first
    ranked_labels = classification_metrics.most_common()
    for label, count in ranked_labels:
        print(f"- {label}: {count} times")


# Standardize Dataset Naming

The following cell processes the images in the `dataset` folder, standardizes their filenames based on the numeric prefix, and copies them to a new `dataset_cleaned` directory. This is useful for creating a clean, consistently named dataset for model training.

In [1]:
import os
import re
import shutil
from collections import Counter

# Define directories
SOURCE_DIR = "dataset"
DEST_DIR = "dataset_cleaned"


def standardize_dataset(source_dir, dest_dir):
    """Copy every labelled PNG from ``source_dir`` into ``dest_dir`` under a standard name.

    A file's label is the run of digits at the start of its name. Each copy is
    named ``<label>pct_<n>.png`` where ``n`` counts the files seen so far for
    that label. Files are visited in sorted order so the numbering is the same
    on every run (os.listdir order is arbitrary), and leading zeros are dropped
    so that ``05...`` and ``5...`` share one label.

    Args:
        source_dir: Folder containing the original ``.png`` files.
        dest_dir: Folder that receives the renamed copies; created if missing.

    Returns:
        A tuple ``(image_files, rename_counter, files_processed, files_skipped)``:
        the sorted list of candidate filenames, a ``Counter`` of copies per
        label, the number of files copied, and the number of files skipped
        because no numeric prefix was found.
    """
    # Create the destination directory if it doesn't exist
    os.makedirs(dest_dir, exist_ok=True)
    print(f"Created directory: {dest_dir}")

    # Counter for new filenames
    rename_counter = Counter()
    files_processed = 0
    files_skipped = 0

    # List all files in the source directory
    try:
        image_files = sorted(f for f in os.listdir(source_dir) if f.endswith('.png'))
    except FileNotFoundError:
        print(f"Error: The source directory '{source_dir}' was not found.")
        image_files = []

    print(f"Found {len(image_files)} images to standardize.")

    # Process each file
    for filename in image_files:
        # Extract the numeric label from the start of the filename
        match = re.match(r'^(\d+)', filename)
        if not match:
            print(f"Could not extract label from '{filename}', skipping.")
            files_skipped += 1
            continue

        # Normalise the label ("05" -> "5") before counting it
        label = str(int(match.group(1)))
        rename_counter[label] += 1
        new_filename = f"{label}pct_{rename_counter[label]}.png"

        # Copy the file to the new directory with the new name
        shutil.copy(
            os.path.join(source_dir, filename),
            os.path.join(dest_dir, new_filename),
        )
        files_processed += 1

    return image_files, rename_counter, files_processed, files_skipped


image_files, rename_counter, files_processed, files_skipped = standardize_dataset(
    SOURCE_DIR, DEST_DIR
)

print("\n--- Standardization Complete ---")
print(f"Successfully processed and copied {files_processed} files.")
print(f"Skipped {files_skipped} files.")

Created directory: dataset_cleaned
Found 419 images to standardize.

--- Standardization Complete ---
Successfully processed and copied 419 files.
Skipped 0 files.

--- Standardization Complete ---
Successfully processed and copied 419 files.
Skipped 0 files.


In [2]:
import os
import shutil
import re
from collections import Counter

# --- Dataset Merging Configuration ---
# Edit the folder names below to choose which datasets are combined.
OUTPUT_FOLDER = "dataset_merged"   # Destination for the combined dataset
FOLDER1 = "dataset_cleaned"        # First input folder
FOLDER2 = "dataset_cleaned2"       # Second input folder - point this at your second dataset

# Running tallies shared by every merge_folder() call:
# files copied so far, and instances seen per label across both folders.
total_files_merged = 0
global_counter = Counter()

# Make sure the destination exists before anything is copied into it
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
print(f"Created output directory: {OUTPUT_FOLDER}")

def merge_folder(source_folder, folder_name):
    """
    Copy the labelled PNG files of one folder into OUTPUT_FOLDER.

    Each file's label is the run of digits at the start of its name (leading
    zeros dropped). Copies are named ``<label>pct_<n>.png`` where ``n`` comes
    from the shared ``global_counter``, so numbering continues across calls.
    Files are visited in sorted order so the result is the same on every run
    (os.listdir order is arbitrary). ``total_files_merged`` is incremented for
    every file copied.

    Args:
        source_folder: Path to the source folder
        folder_name: Name of the folder (for logging purposes)

    Returns:
        None. A warning is printed and nothing is copied when
        ``source_folder`` is not a directory or is the output folder itself.
    """
    # Only the integer total is rebound here; global_counter is mutated in place.
    global total_files_merged

    if not os.path.isdir(source_folder):
        print(f"Warning: {source_folder} does not exist, skipping.")
        return

    # Merging the output folder into itself would copy files onto themselves
    # and duplicate earlier results.
    if os.path.abspath(source_folder) == os.path.abspath(OUTPUT_FOLDER):
        print(f"Warning: {source_folder} is the output folder, skipping.")
        return

    files_in_folder = sorted(f for f in os.listdir(source_folder) if f.endswith('.png'))
    print(f"\nProcessing {len(files_in_folder)} files from {folder_name}...")

    for filename in files_in_folder:
        # Extract the numeric label from the filename
        match = re.match(r'^(\d+)', filename)
        if not match:
            print(f"Could not extract label from '{filename}', skipping.")
            continue

        # Normalise the label ("05" -> "5") before counting it
        label = str(int(match.group(1)))

        # Increment the global counter for this label
        global_counter[label] += 1
        instance_count = global_counter[label]

        # Create the new standardized filename
        new_filename = f"{label}pct_{instance_count}.png"

        # Copy the file with the new name
        shutil.copy(
            os.path.join(source_folder, filename),
            os.path.join(OUTPUT_FOLDER, new_filename),
        )
        total_files_merged += 1

        if total_files_merged % 100 == 0:  # Progress update every 100 files
            print(f"Merged {total_files_merged} files...")

# Merge both folders, then report totals and the per-label distribution
banner = "=" * 50
print(banner)
print("MERGING DATASET FOLDERS")
print(banner)

folders_to_merge = (
    (FOLDER1, "Folder 1 (dataset_cleaned)"),
    (FOLDER2, "Folder 2"),
)
for folder_path, description in folders_to_merge:
    merge_folder(folder_path, description)

print(f"\n--- Merging Complete ---")
print(f"Total files merged: {total_files_merged}")
print(f"Output directory: {OUTPUT_FOLDER}")
print(f"Number of unique labels: {len(global_counter)}")

# Display label distribution, ordered by the numeric value of each label
print("\n--- Label Distribution ---")
for label in sorted(global_counter, key=int):
    count = global_counter[label]
    print(f"Label {label}: {count} images")

Created output directory: dataset_merged
MERGING DATASET FOLDERS

Processing 419 files from Folder 1 (dataset_cleaned)...
Merged 100 files...
Merged 200 files...
Merged 300 files...
Merged 200 files...
Merged 300 files...
Merged 400 files...

Processing 1497 files from Folder 2...
Merged 500 files...
Merged 400 files...

Processing 1497 files from Folder 2...
Merged 500 files...
Merged 600 files...
Merged 600 files...
Merged 700 files...
Merged 700 files...
Merged 800 files...
Merged 900 files...
Merged 800 files...
Merged 900 files...
Merged 1000 files...
Merged 1000 files...
Merged 1100 files...
Merged 1100 files...
Merged 1200 files...
Merged 1200 files...
Merged 1300 files...
Merged 1300 files...
Merged 1400 files...
Merged 1400 files...
Merged 1500 files...
Merged 1500 files...
Merged 1600 files...
Merged 1600 files...
Merged 1700 files...
Merged 1700 files...
Merged 1800 files...
Merged 1800 files...
Merged 1900 files...

--- Merging Complete ---
Total files merged: 1916
Output d

# CNN Training with PyTorch

This section defines and trains a Convolutional Neural Network (CNN) on the merged dataset (`dataset_merged`).
The process includes:
1.  A custom `Dataset` class to load images and parse labels from filenames.
2.  Splitting the data into training and validation sets.
3.  Defining the CNN architecture.
4.  A full training and validation loop.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from PIL import Image
import os
import re
import numpy as np

# --- 1. Custom Dataset Definition ---
class PercentageDataset(Dataset):
    """Custom dataset for loading percentage images.

    Every ``.png`` file in ``img_dir`` whose name starts with digits becomes
    one sample; the leading digits are the integer label. Files without a
    numeric prefix are skipped with a message instead of being given a
    placeholder label, because a negative label cannot be used as a class
    index by the loss function.

    Attributes:
        img_dir: Folder the images are read from.
        transform: Optional callable applied to each loaded image.
        image_files: Sorted filenames of the usable images.
        labels: Integer labels, parallel to ``image_files``.
    """
    def __init__(self, img_dir, transform=None):
        self.img_dir = img_dir
        self.transform = transform
        self.image_files = []
        self.labels = []

        # Sorted so that sample indices are stable across runs and machines
        # (os.listdir order is arbitrary).
        for filename in sorted(os.listdir(img_dir)):
            if not filename.endswith('.png'):
                continue
            match = re.match(r'^(\d+)', filename)
            if match is None:
                print(f"Could not extract label from '{filename}', skipping.")
                continue
            self.image_files.append(filename)
            self.labels.append(int(match.group(1)))

    def __len__(self):
        """Return the number of usable images."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``.

        The image is loaded as grayscale and passed through ``transform``
        when one was supplied.
        """
        img_path = os.path.join(self.img_dir, self.image_files[idx])

        # Load the image; the context manager closes the file once the
        # grayscale copy has been made.
        with Image.open(img_path) as img:
            image = img.convert("L")

        if self.transform:
            image = self.transform(image)

        return image, self.labels[idx]

# --- 2. Data Preparation ---
# Define transforms to resize images and convert them to tensors
# All images will be resized to 64x64 pixels
data_transforms = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)) # Normalize for grayscale
])

# Instantiate the dataset
full_dataset = PercentageDataset(img_dir='dataset_merged', transform=data_transforms)

# Fail early with a clear message instead of an obscure error further down
if len(full_dataset) == 0:
    raise RuntimeError("No .png images found in 'dataset_merged'; run the merge step first.")

# Split into training and validation sets (80% train, 20% validation).
# A seeded generator keeps the split identical from run to run, so validation
# numbers from different runs are comparable.
SPLIT_SEED = 42
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = random_split(
    full_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")

# --- 3. CNN Model Definition ---
class SimpleCNN(nn.Module):
    """Two-layer convolutional classifier for 1-channel 64x64 images.

    Layout: conv(1->16) -> ReLU -> 2x2 max-pool -> conv(16->32) -> ReLU ->
    2x2 max-pool -> linear(32*16*16 -> 512) -> ReLU -> linear(512 -> num_classes).

    Args:
        num_classes: Number of output logits.
    """
    def __init__(self, num_classes=100):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        # After two pooling layers, 64x64 -> 32x32 -> 16x16
        self.fc1 = nn.Linear(32 * 16 * 16, 512)
        self.fc2 = nn.Linear(512, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``.

        ``x`` must be ``(batch, 1, 64, 64)``. Any other spatial size raises a
        shape error in ``fc1``.
        """
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        # Flatten everything except the batch dimension. Reshaping with a
        # fixed feature count and an inferred batch size would silently merge
        # samples together when the input is not 64x64.
        x = torch.flatten(x, start_dim=1)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# --- 4. Training Setup ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Seed PyTorch so weight initialisation and batch shuffling repeat across runs
torch.manual_seed(42)

# Labels are the numbers parsed from the filenames and are used directly as
# class indices by CrossEntropyLoss, so the output layer needs at least
# (largest label + 1) units. With 100 outputs, an image labelled 100 would be
# out of range. The size never drops below the original 100.
label_matches = (re.match(r'^(\d+)', name) for name in full_dataset.image_files)
max_label = max((int(m.group(1)) for m in label_matches if m), default=-1)
num_classes = max(100, max_label + 1)
if num_classes > 101:
    print(f"Warning: found label {max_label}, which is above 100; check the dataset filenames.")
print(f"Number of output classes: {num_classes}")

model = SimpleCNN(num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10

# --- 5. Training and Validation Loop ---
# Each epoch makes one optimisation pass over train_loader followed by one
# gradient-free pass over val_loader. Losses are averaged per sample (the last
# batch may be smaller than the others), and the divisions are guarded so an
# empty loader reports 0 instead of raising ZeroDivisionError.
for epoch in range(num_epochs):
    # Training
    model.train()
    running_loss = 0.0
    train_samples = 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; scale it back to a batch sum
        batch_size = labels.size(0)
        running_loss += loss.item() * batch_size
        train_samples += batch_size

    train_loss = running_loss / max(train_samples, 1)

    # Validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)

            batch_size = labels.size(0)
            val_loss += loss.item() * batch_size
            # The predicted class is the index of the largest logit
            predicted = outputs.argmax(dim=1)
            total += batch_size
            correct += (predicted == labels).sum().item()

    val_loss /= max(total, 1)
    val_accuracy = 100 * correct / max(total, 1)

    print(f"Epoch [{epoch+1}/{num_epochs}], "
          f"Train Loss: {train_loss:.4f}, "
          f"Val Loss: {val_loss:.4f}, "
          f"Val Accuracy: {val_accuracy:.2f}%")

print("\n--- Training Complete ---")

# --- 6. Save the Model ---
# Only the learned parameters (the state dict) are written to disk, so the
# SimpleCNN class definition is needed again to load them.
MODEL_SAVE_PATH = "percentage_cnn.pth"
trained_weights = model.state_dict()
torch.save(trained_weights, MODEL_SAVE_PATH)
print(f"Model saved to {MODEL_SAVE_PATH}")