<a href="https://colab.research.google.com/github/Priyankaverma2024/Project-10-OCR-System-with-YOLOv3-for-Text-Detection3/blob/main/OCR_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment Setup (Google Colab)

In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install Tesseract OCR engine and the Python wrapper (pytesseract)
!sudo apt-get update
!sudo apt-get install tesseract-ocr
!pip install pytesseract
!pip install opencv-python-headless # For OpenCV

0% [Working]            Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,930 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:12 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,245 kB]
Get:13 https://r2u.stat.illinois.edu/ubuntu j

In [3]:
import warnings
warnings.warnings ="ignore"

# Data Preparation: Accessing Your Images

In [4]:
import os

# Define the path to your images folder on Google Drive
image_folder_path = '/content/drive/MyDrive/OCR_Project_dataset/' # Make sure this path is correct

# List all files in the image folder
try:
    image_files = [f for f in os.listdir(image_folder_path) if os.path.isfile(os.path.join(image_folder_path, f))]
    print(f"Found {len(image_files)} images in {image_folder_path}")
    # print("First few image files:", image_files[:5]) # Optional: print a few names to check
except FileNotFoundError:
    print(f"Error: The folder {image_folder_path} was not found. Please check the path.")
    image_files = []

Found 103 images in /content/drive/MyDrive/OCR_Project_dataset/


# Core OCR Workflow: Image Preprocessing and Text Extraction

In [5]:
import cv2
import pytesseract
import numpy as np
from PIL import Image # Pytesseract works well with PIL Images

def preprocess_image_for_ocr(image_path):
    """
    Loads an image, preprocesses it for OCR, and returns the preprocessed image.
    Preprocessing steps include:
    1. Read image
    2. Resize (optional, here commented out but shown as per document [cite: 10])
    3. Convert to grayscale [cite: 11]
    4. Apply Gaussian blur [cite: 11]
    5. Apply thresholding (Otsu's method) [cite: 12]
    6. Invert colors (black text on white background) [cite: 14]
    """
    try:
        img = cv2.imread(image_path)
        if img is None:
            print(f"Warning: Could not read image {image_path}. Skipping.")
            return None

        # 2. Resize (Optional - your document mentions blowing up small images 3x)
        # If your images are very small, resizing can help.
        # Example: img = cv2.resize(img, None, fx=3, fy=3, interpolation=cv2.INTER_CUBIC) [cite: 10]
        # For now, let's assume original size is okay or this needs tuning per image.

        # 3. Convert to grayscale
        gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # 4. Apply Gaussian blur to smooth the image
        # The kernel size (e.g., (5,5)) can be tuned. (1,1) or (3,3) for less blur if text is sharp.
        # Your document mentions a "small Gaussian blur".
        blurred_img = cv2.GaussianBlur(gray_img, (1, 1), 0) # Using a very small kernel [cite: 11]

        # 5. Apply thresholding
        # Otsu's method automatically determines the optimal threshold value. [cite: 12]
        # This creates a binary image (black and white).
        # The document mentions getting white text on a black background first.
        _, thresholded_img_white_text = cv2.threshold(blurred_img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        # 6. Invert colors: Tesseract often performs better with black text on a white background. [cite: 14]
        preprocessed_img = cv2.bitwise_not(thresholded_img_white_text)

        return preprocessed_img

    except Exception as e:
        print(f"Error during preprocessing image {image_path}: {e}")
        return None

def extract_text_from_image(preprocessed_img):
    """
    Extracts text from a preprocessed image using Pytesseract.
    """
    if preprocessed_img is None:
        return "Error: Preprocessed image is None."
    try:
        # Convert OpenCV image (NumPy array) to PIL Image
        pil_img = Image.fromarray(preprocessed_img)
        custom_config = r'--oem 3 --psm 6' # Example Pytesseract configuration
        text = pytesseract.image_to_string(pil_img, config=custom_config)
        return text
    except Exception as e:
        print(f"Error during text extraction: {e}")
        return f"Error extracting text: {e}"

# Example of processing one image:
if image_files:
    sample_image_path = os.path.join(image_folder_path, image_files[0])
    print(f"\nProcessing sample image: {sample_image_path}")

    preprocessed_image = preprocess_image_for_ocr(sample_image_path)

    if preprocessed_image is not None:
        # To display the image in Colab (optional)
        # from google.colab.patches import cv2_imshow
        # print("Preprocessed Image (for OCR):")
        # cv2_imshow(preprocessed_image)

        extracted_text = extract_text_from_image(preprocessed_image)
        print("\n--- Extracted Text (from sample image) ---")
        print(extracted_text)
        print("--- End of Extracted Text ---")
    else:
        print(f"Could not preprocess {sample_image_path}")

else:
    print("No image files found to process as a sample.")


Processing sample image: /content/drive/MyDrive/OCR_Project_dataset/thyrocare_0_122.jpg

--- Extracted Text (from sample image) ---
a i
hyrovere ; Ler = | st
-37/1,7TC MIDC,Turbhe, bs tetas tdi dl yrQ car e
eet ioe ne omen Think Thyroid. Think Thyrecare.
lo ER ee BS eee mae a Se rd
etree TP eee roam est ea dee Ls aarareali eae aneehcada
rend Pee Sent Stroy Pret ee
er. BY aes (7874357519), KALPANA MEDICOS AND CITY
Wepre rer me Smee Cr a

ey fre cry
TEST NAME TECHNOLOGY VALUE UNITS REFERENCE RANGE

See Met OL a fone a 7 veo too tel)

Drees en] row 5 eed

Der ess Ra ee fot) CLLA 2.14 Prt da  Ee

‘ A

Comments: SUGGESTING Bri tL ee

Rail fe

er

Beer aru en alt Pfeil aml ae

ee art ee oy LUMINESCENT IMMUNO ASSAY

se eB a LUMINESCENT IMMUNO ASSAY

--- End of Extracted Text ---


# Processing All Images and Saving Results

In [6]:
import pandas as pd
import time

results_list = []
output_csv_path = '/content/drive/MyDrive/ocr_project_results/extracted_thyrocare_data.csv' # Choose your output path
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True) # Create folder if it doesn't exist

if image_files:
    print(f"\nStarting batch processing of {len(image_files)} images...")
    for i, image_file_name in enumerate(image_files):
        start_time = time.time()
        print(f"Processing image {i+1}/{len(image_files)}: {image_file_name}...")
        current_image_path = os.path.join(image_folder_path, image_file_name)

        preprocessed_img = preprocess_image_for_ocr(current_image_path)

        if preprocessed_img is not None:
            text = extract_text_from_image(preprocessed_img)
            results_list.append({'image_filename': image_file_name, 'extracted_text': text})
            print(f"  Extracted text (first 100 chars): {text[:100].replace(chr(10), ' ')}...") # Show a snippet
        else:
            results_list.append({'image_filename': image_file_name, 'extracted_text': 'Error in preprocessing'})
            print(f"  Skipped due to preprocessing error.")

        end_time = time.time()
        print(f"  Time taken: {end_time - start_time:.2f} seconds.")


    # Create a Pandas DataFrame and save to CSV
    df_results = pd.DataFrame(results_list)
    try:
        df_results.to_csv(output_csv_path, index=False, encoding='utf-8')
        print(f"\nSuccessfully saved extracted data to: {output_csv_path}")
    except Exception as e:
        print(f"\nError saving CSV file: {e}")
        # Fallback: print to console if saving fails
        # print("\n--- All Extracted Data ---")
        # for item in results_list:
        #     print(f"Image: {item['image_filename']}\nText: {item['extracted_text']}\n---")

else:
    print("No image files found to process for batch operation.")


Starting batch processing of 103 images...
Processing image 1/103: thyrocare_0_122.jpg...
  Extracted text (first 100 chars): a i hyrovere ; Ler = | st -37/1,7TC MIDC,Turbhe, bs tetas tdi dl yrQ car e eet ioe ne omen Think Thy...
  Time taken: 4.89 seconds.
Processing image 2/103: thyrocare_0_511.jpg...
  Extracted text (first 100 chars): "ff CT Ue . By ak #3 CAP mui Ct ome ; aS A Note lei he Ces a SR ee ede nk Rem OR ee Lr ee eM al kl ....
  Time taken: 3.69 seconds.
Processing image 3/103: thyrocare_0_421.jpg...
  Extracted text (first 100 chars): } , a | i’ sw a, eT aE OO ae a ete * en me nr MY Corporate Office : Teyrocare Technologies Limited @...
  Time taken: 5.25 seconds.
Processing image 4/103: thyrocare_0_532.jpg...
  Extracted text (first 100 chars): ye eee agama Boe iebiccee th ic 5, Hokisinla Lasidinbl EMC, Torte aioe nee a REPORT = ¥ bea eda) Se ...
  Time taken: 3.49 seconds.
Processing image 5/103: thyrocare_0_36.jpg...
  Extracted text (first 100 chars): ik, Moat 700 O

In [7]:
import os
import yaml # PyYAML is usually pre-installed in Colab

# Ensure Google Drive is mounted
if not os.path.exists('/content/drive/MyDrive'):
    from google.colab import drive
    drive.mount('/content/drive')
    print("Google Drive mounted.")
else:
    print("Google Drive already mounted.")

# Define paths
base_project_folder = '/content/drive/MyDrive/OCR_Project_dataset/'
dataset_main_folder = os.path.join(base_project_folder, 'thyrocare_dataset/')
images_folder = os.path.join(dataset_main_folder, 'images/')
labels_folder = os.path.join(dataset_main_folder, 'labels/')
path_to_yaml = os.path.join(dataset_main_folder, 'dataset.yaml')

# Create the main dataset folder and the top-level images and labels folders
os.makedirs(dataset_main_folder, exist_ok=True)
os.makedirs(images_folder, exist_ok=True)
os.makedirs(labels_folder, exist_ok=True)

print(f"Ensured folder exists: {dataset_main_folder}")
print(f"Ensured folder exists: {images_folder}")
print(f"Ensured folder exists: {labels_folder}")

# Define the content of the dataset.yaml file
# Paths are absolute for clarity and robustness in Colab.
yaml_content = {
    'train': os.path.join(images_folder, 'train/'), # Path to training images folder
    'val': os.path.join(images_folder, 'val/'),     # Path to validation images folder
    # Note: YOLO usually expects label folders to mirror image folder structure.
    # Some implementations might infer label paths from image paths,
    # e.g., if images are in 'images/train', labels are expected in 'labels/train'.
    # The paths above define where the *images* are. The label paths are often implicit.

    'nc': 4,  # Number of classes
    'names': ['test name','technology','value','unit reference range']  # List of class names
}

# Write the YAML content to the file
with open(path_to_yaml, 'w') as f:
    yaml.dump(yaml_content, f, sort_keys=False, default_flow_style=None)

print(f"\n'{path_to_yaml}' created successfully with the following content:")
with open(path_to_yaml, 'r') as f:
    print(f.read())

print("\nMANUAL ACTIONS REQUIRED NEXT:")
print("1.  **Create `train` and `val` subdirectories:**")
print(f"    - Go to your Google Drive. Inside '{images_folder}', create two subfolders: `train` and `val`.")
print(f"    - Inside '{labels_folder}', create two subfolders: `train` and `val`.")
print("\n2.  **Populate these folders:**")
print("    - Distribute your ~100 images into the `images/train/` and `images/val/` folders (e.g., 80 for train, 20 for val).")
print("    - Start your annotation process for all these images.")
print("    - As you annotate each image, save its corresponding YOLO annotation `.txt` file in the `labels/train/` or `labels/val/` folder that matches where the image is.")
print("      (e.g., if `reportX.jpg` is in `images/train/`, then `reportX.txt` should be in `labels/train/`).")
print("\n3.  **Confirm class order for annotations:**")
print("      0: test Name, 1: technology, 2: value, 3: unit reference range")

Google Drive already mounted.
Ensured folder exists: /content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/
Ensured folder exists: /content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/images/
Ensured folder exists: /content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/labels/

'/content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/dataset.yaml' created successfully with the following content:
train: /content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/images/train/
val: /content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/images/val/
nc: 4
names: [test name, technology, value, unit reference range]


MANUAL ACTIONS REQUIRED NEXT:
1.  **Create `train` and `val` subdirectories:**
    - Go to your Google Drive. Inside '/content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/images/', create two subfolders: `train` and `val`.
    - Inside '/content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/labels/', create two subfolders: `train` 

# Model Training

Task 3.1: Train YOLO Model

In [8]:
! pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.134-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Downloading n

In [9]:
from ultralytics import YOLO  # Ultralytics YOLOv8 (or v5 - install if needed)
import os

# 1. Load a pre-trained model (YOLOv8 strongly recommended)
model = YOLO('yolov8n.pt')  # or 'yolov5n.pt' if you prefer v5

# 2. Define data.yaml path (already created)
data_yaml_path = '/content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/dataset.yaml'

# 3. Output directory for trained models
output_model_dir = '/content/drive/MyDrive/OCR_Project_dataset/models'
os.makedirs(output_model_dir, exist_ok=True)

# 4. Train the model
results = model.train(data=data_yaml_path, epochs=100, imgsz=640)

# 5. Save the trained model
trained_model_path = os.path.join(output_model_dir, 'thyrocare_yolov8n_trained.pt') # Or yolov5n
model.save(trained_model_path)

print(f"Trained model saved to: {trained_model_path}")

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolov8n.pt'...


100%|██████████| 6.25M/6.25M [00:00<00:00, 89.2MB/s]


Ultralytics 8.3.134 🚀 Python-3.11.12 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=/content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/dataset.yaml, degrees=0.0, deterministic=True, device=None, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=100, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolov8n.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=train, nbs=64, nms=False, opset=None, optimize=False, optimizer=auto, overlap_mask=True, patie

100%|██████████| 755k/755k [00:00<00:00, 21.9MB/s]

Overriding model.yaml nc=80 with nc=4

                   from  n    params  module                                       arguments                     
  0                  -1  1       464  ultralytics.nn.modules.conv.Conv             [3, 16, 3, 2]                 
  1                  -1  1      4672  ultralytics.nn.modules.conv.Conv             [16, 32, 3, 2]                
  2                  -1  1      7360  ultralytics.nn.modules.block.C2f             [32, 32, 1, True]             
  3                  -1  1     18560  ultralytics.nn.modules.conv.Conv             [32, 64, 3, 2]                
  4                  -1  2     49664  ultralytics.nn.modules.block.C2f             [64, 64, 2, True]             
  5                  -1  1     73984  ultralytics.nn.modules.conv.Conv             [64, 128, 3, 2]               
  6                  -1  2    197632  ultralytics.nn.modules.block.C2f             [128, 128, 2, True]           
  7                  -1  1    295424  ultralytics




Model summary: 129 layers, 3,011,628 parameters, 3,011,612 gradients, 8.2 GFLOPs

Transferred 319/355 items from pretrained weights
Freezing layer 'model.22.dfl.conv.weight'
[34m[1mAMP: [0mrunning Automatic Mixed Precision (AMP) checks...
Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n.pt to 'yolo11n.pt'...


100%|██████████| 5.35M/5.35M [00:00<00:00, 88.1MB/s]


[34m[1mAMP: [0mchecks passed ✅
[34m[1mtrain: [0mFast image access ✅ (ping: 0.5±0.2 ms, read: 0.2±0.0 MB/s, size: 51.9 KB)


[34m[1mtrain: [0mScanning /content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/labels/train... 80 images, 0 backgrounds, 80 corrupt: 100%|██████████| 80/80 [01:00<00:00,  1.33it/s]

[34m[1mtrain: [0m/content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/images/train/thyrocare_0_1001.jpg: ignoring corrupt image/label: Label class 18 exceeds dataset class count 4. Possible class labels are 0-3
[34m[1mtrain: [0m/content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/images/train/thyrocare_0_1174.jpg: ignoring corrupt image/label: Label class 18 exceeds dataset class count 4. Possible class labels are 0-3
[34m[1mtrain: [0m/content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/images/train/thyrocare_0_122.jpg: ignoring corrupt image/label: Label class 18 exceeds dataset class count 4. Possible class labels are 0-3
[34m[1mtrain: [0m/content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/images/train/thyrocare_0_123.jpg: ignoring corrupt image/label: Label class 18 exceeds dataset class count 4. Possible class labels are 0-3
[34m[1mtrain: [0m/content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/images/train/thyrocare_0_1248.jpg




RuntimeError: No valid images found in /content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/labels/train.cache. Images with incorrectly formatted labels are ignored. See https://docs.ultralytics.com/datasets for dataset formatting guidance.

# Task 3.2: Model Validation

In [10]:
# 1. Validate the model
metrics = model.val(data=data_yaml_path)  # Uses the validation set in data.yaml
print("Validation Metrics:", metrics.results_dict)

# 2. Visualize Predictions (optional) - Saves images with bounding boxes
output_predict_dir = '/content/drive/MyDrive/OCR_Project_dataset/results/validation_predictions'
os.makedirs(output_predict_dir, exist_ok=True)
model.predict(source='/content/drive/MyDrive/OCR_Project_dataset/images/val', save=True, name='val_preds', exist_ok=True)

#   #  The predicted images are saved in a 'runs/predict' directory.  Move them:
#   import shutil
#   source_predict_dir = 'runs/predict/val_preds' # Adjust if needed
#   if os.path.exists(source_predict_dir):
#       for file_name in os.listdir(source_predict_dir):
#           shutil.move(os.path.join(source_predict_dir, file_name), output_predict_dir)
#       shutil.rmtree(source_predict_dir) # Clean up
#   print(f"Validation predictions saved to: {output_predict_dir}")

Ultralytics 8.3.134 🚀 Python-3.11.12 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)
Model summary (fused): 72 layers, 3,006,428 parameters, 13,260 gradients, 8.1 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.5±0.2 ms, read: 0.1±0.0 MB/s, size: 50.1 KB)


[34m[1mval: [0mScanning /content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/labels/val... 20 images, 0 backgrounds, 20 corrupt: 100%|██████████| 20/20 [00:13<00:00,  1.46it/s]

[34m[1mval: [0m/content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/images/val/thyrocare_0_7805.jpg: ignoring corrupt image/label: Label class 18 exceeds dataset class count 4. Possible class labels are 0-3
[34m[1mval: [0m/content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/images/val/thyrocare_0_8214.jpg: ignoring corrupt image/label: Label class 18 exceeds dataset class count 4. Possible class labels are 0-3
[34m[1mval: [0m/content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/images/val/thyrocare_0_8231.jpg: ignoring corrupt image/label: Label class 18 exceeds dataset class count 4. Possible class labels are 0-3
[34m[1mval: [0m/content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/images/val/thyrocare_0_8251.jpg: ignoring corrupt image/label: Label class 18 exceeds dataset class count 4. Possible class labels are 0-3
[34m[1mval: [0m/content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/images/val/thyrocare_0_8265.jpg: ignoring corrupt




RuntimeError: No valid images found in /content/drive/MyDrive/OCR_Project_dataset/thyrocare_dataset/labels/val.cache. Images with incorrectly formatted labels are ignored. See https://docs.ultralytics.com/datasets for dataset formatting guidance.

# 4. Inference and Post-Processing
# Task: Detect text regions with YOLO, crop, OCR with Tesseract, and structure the data.

In [11]:
import cv2
from ultralytics import YOLO
import pytesseract
from PIL import Image
import pandas as pd
import os

# 1. Load the trained YOLO model
trained_model_path = '/content/drive/MyDrive/OCR_Project_dataset/models/thyrocare_yolov8n_trained.pt'  # Adjust path!
model = YOLO(trained_model_path)

def process_image(image_path):
    # 2. Run YOLO inference
    results = model.predict(image_path)
    boxes = results[0].boxes.xyxy.cpu().numpy().astype(int)
    class_ids = results[0].boxes.cls.cpu().numpy().astype(int)
    cropped_data = []

    # 3. Crop and OCR
    img = cv2.imread(image_path)
    for i, (x1, y1, x2, y2) in enumerate(boxes):
        crop = img[y1:y2, x1:x2]
        preprocessed_crop = preprocess_image_for_ocr(crop)  # Your preprocessing
        if preprocessed_crop is not None:
            text = pytesseract.image_to_string(Image.fromarray(preprocessed_crop)).strip()
            cropped_data.append({'class': class_ids[i], 'text': text, 'bbox': (x1, y1, x2, y2)})
    return cropped_data

# 4. Process all images and create structured output
all_extracted_data = []
for image_file in os.listdir('/content/drive/MyDrive/OCR_Project_dataset/images/val'):  # Or a test folder
    if image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
        image_path = os.path.join('/content/drive/MyDrive/OCR_Project_dataset/images/val', image_file)
        extracted_data = process_image(image_path)
        all_extracted_data.append({'image': image_file, 'data': extracted_data})

# 5. Structure the data (example - adapt to your specific report layout!)
structured_results = []
for item in all_extracted_data:
    image_name = item['image']
    data = item['data']
    #  Assume a simple table-like structure:  Find bounding boxes in a logical order
    test_name = next((d['text'] for d in data if d['class'] == 0), None)
    value = next((d['text'] for d in data if d['class'] == 1), None)
    unit = next((d['text'] for d in data if d['class'] == 2), None)
    ref_value = next((d['text'] for d in data if d['class'] == 3), None)
    structured_results.append({'image': image_name, 'Test Name': test_name, 'Value': value, 'Unit': unit, 'Reference Value': ref_value})

df_results = pd.DataFrame(structured_results)
output_csv_path = '/content/drive/MyDrive/OCR_Project_dataset/results/structured_ocr_output.csv'
df_results.to_csv(output_csv_path, index=False)
print(f"Structured data saved to: {output_csv_path}")

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/OCR_Project_dataset/models/thyrocare_yolov8n_trained.pt'

# 5. Evaluation and Optimization

# Task 5.1: Evaluation

In [12]:
! pip install jiwer

Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0


In [None]:
import pandas as pd
from jiwer import measure  # For CER/WER (install: pip install jiwer)

def evaluate_ocr(ground_truth_csv, ocr_output_csv):
    gt_df = pd.read_csv(ground_truth_csv)
    ocr_df = pd.read_csv(ocr_output_csv)

    merged_df = pd.merge(gt_df, ocr_df, on='image', suffixes=('_gt', '_ocr'), how='inner') # Adjust 'on' key

    cer_list = []
    wer_list = []

    for index, row in merged_df.iterrows():
        gt_text = str(row['Test Name_gt']) + " " + str(row['Value_gt']) + " " + str(row['Unit_gt']) + " " + str(row['Reference Value_gt'])  # Concatenate ground truth text
        ocr_text = str(row['Test Name_ocr']) + " " + str(row['Value_ocr']) + " " + str(row['Unit_ocr']) + " " + str(row['Reference Value_ocr']) # Concatenate OCR text

        # Calculate CER and WER
        cer = measure(gt_text, ocr_text).cer
        wer = measure(gt_text, ocr_text).wer
        cer_list.append(cer)
        wer_list.append(wer)

    avg_cer = sum(cer_list) / len(cer_list) if cer_list else 0
    avg_wer = sum(wer_list) / len(wer_list) if wer_list else 0

    print(f"Average CER: {avg_cer:.4f}")
    print(f"Average WER: {avg_wer:.4f}")

# Example Usage:
ground_truth_csv_path = '/content/drive/MyDrive/OCR_Project_dataset/ground_truth.csv'  # You'll need to create this!
ocr_output_csv_path = '/content/drive/MyDrive/OCR_Project_dataset/results/structured_ocr_output.csv'
evaluate_ocr(ground_truth_csv_path, ocr_output_csv)

# Task 5.2: Optimization (Conceptual - Code Fragments)

Data Augmentation (YOLO): YOLOv8/v5 have built-in augmentation.  Tweak these in model.train():

In [None]:
model.train(data=data_yaml_path, epochs=100, imgsz=640,
            flipud=0.5,  # Vertical flip
            lr0=0.01)   # Learning rate (example)

# Preprocessing Tuning: Modify your preprocess_image_for_ocr() function.  There's no single "best" – experiment!

In [None]:
def preprocess_image_for_ocr(image):  #  Now takes the image directly
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (3, 3), 0)  #  Bigger blur
    _, thresh = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) # Inverted thresholding
    #  ...  other processing
    return thresh

# Tesseract Configuration: Pass different configurations to pytesseract.image_to_string():

In [None]:
text = pytesseract.image_to_string(Image.fromarray(preprocessed_crop),
                            config='--psm 6 --oem 1')  # Different Page Segmentation Mode, OCR Engine Mode