# YOLOv8 + EasyOCR

## Import Dependencies

In [1]:
from ultralytics import YOLO
import torch
import cv2
import warnings
import pandas as pd
import os
from models.EasyOCR.utils import AttrDict, AttnLabelConverter, CTCLabelConverter
import yaml
from torchvision import transforms
warnings.filterwarnings("ignore")

Run 'pip install torchvision==0.19' to fix torchvision or 'pip install -U torch torchvision' to update both.
For a full compatibility table see https://github.com/pytorch/vision#installation


## Configuration

In [2]:
# --- Configuration for Detection ---
TARGET_CLASS_NAME = "license_plate" # The string name of the class we want to detect with YOLO
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
CONFIDENCE_THRESHOLD = 0.25 # YOLO default is often 0.25, adjust as needed

BOX_COLOR = (255, 0, 0) # Blue for YOLO car plates (to differentiate if needed)
TEXT_COLOR = (255, 255, 255) # White
TEXT_BG_COLOR = (0, 0, 0) # Black

OCR_BOX_COLOR = (0, 255, 0) # Green for OCR text bounding box (if you draw it)
OCR_TEXT_COLOR = (50, 200, 255)  # Light blue/yellow for OCR text
OCR_TEXT_BG_COLOR = (0,0,0) # Black background for OCR text

# TARGET_CLASS_ID will be determined after loading YOLO model and seeing its classes

# --- Get Video Path  ---
video_path = "../data/video/53.mp4"  # Or your desired video
if video_path == '0':
    video_source = 0 # Use webcam
else:
    video_source = video_path

## Load YOLO

In [3]:
yolo_model_path = "../models/best_yolov8.pt"
yolo_model = YOLO(yolo_model_path, task="detect")

## Load EasyOCR

In [4]:
def get_config(file_path):
    with open(file_path, 'r', encoding="utf8") as stream:
        opt = yaml.safe_load(stream)
    opt = AttrDict(opt)
    if opt.lang_char == 'None':
        characters = ''
        for data in opt['select_data'].split('-'):
            csv_path = os.path.join(opt['train_data'], data, 'label.csv')
            df = pd.read_csv(csv_path, sep='^([^,]+),', engine='python', usecols=['filename', 'words'], keep_default_na=False)
            all_char = ''.join(df['words'])
            characters += ''.join(set(all_char))
        characters = sorted(set(characters))
        opt.character= ''.join(characters)
    else:
        opt.character = opt.number + opt.symbol + opt.lang_char
    return opt

In [5]:
from models.EasyOCR.model import Model

opt = get_config("../models/EasyOCR/opt.txt")

if not opt.data_filtering_off:
    print('Filtering the images containing characters which are not in opt.character')
    print('Filtering the images whose label is longer than opt.batch_max_length')

opt.select_data = opt.select_data.split('-')
opt.batch_ratio = opt.batch_ratio.split('-')

if 'CTC' in opt.Prediction:
    converter = CTCLabelConverter(opt.character)
else:
    converter = AttnLabelConverter(opt.character)
opt.num_class = len(converter.character)

if opt.rgb:
    opt.input_channel = 3

In [6]:
easyocr = Model(opt)
print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel,
      opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction,
      opt.SequenceModeling, opt.Prediction)
easyocr = torch.nn.DataParallel(easyocr).to(device)
easyocr.load_state_dict(torch.load('../models/best_easyocr_full.pth', map_location = 'cuda'), strict=False)
easyocr.eval()

No Transformation module specified
model input parameters 64 256 20 1 512 256 37 12 None VGG BiLSTM CTC


DataParallel(
  (module): Model(
    (FeatureExtraction): VGG_FeatureExtractor(
      (ConvNet): Sequential(
        (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
        (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (4): ReLU(inplace=True)
        (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (7): ReLU(inplace=True)
        (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (9): ReLU(inplace=True)
        (10): MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
        (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (12): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track

## Draw Predictions (Object Detection & OCR)

In [7]:
# Cell id: fc256f89ef200a19 (NEW CONTENT)

# --- Video Capture ---
cap = cv2.VideoCapture(video_source)

# To store YOLO class names and our target class ID for YOLO
yolo_class_names_map = None
yolo_target_cls_id_int = None


if not cap.isOpened():
    print(f"Error: Could not open video source '{video_source}'. Please check the path or camera.")
else:
    print(f"Processing video: {video_source} with YOLOv8 and Custom EasyOCR")
    window_title = f"Car Plate YOLOv8 Detection & Custom EasyOCR"
    cv2.namedWindow(window_title, cv2.WINDOW_NORMAL)

    # Define the image transformation for the custom EasyOCR model
    ocr_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((opt.imgH, opt.imgW)), # opt.imgH, opt.imgW from custom OCR config
        transforms.Grayscale(num_output_channels=1), # Ensure grayscale, opt.input_channel is 1
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5]) # For grayscale, normalizes [0,1] to [-1.0, 1.0]
    ])

    frame_count = 0
    while cap.isOpened():
        ret, frame_bgr = cap.read()

        if not ret:
            if isinstance(video_source, str):
                print("End of video file reached.")
            else:
                print("Error reading frame from webcam.")
            break

        frame_to_draw = frame_bgr.copy()
        frame_count += 1

        # --- YOLOv8 Inference ---
        # source=frame_bgr (already BGR), stream=False for single image, verbose=False for less output
        # device=device will use the torch device. YOLO handles moving data.
        yolo_results = yolo_model(frame_bgr, device=device, verbose=False, conf=CONFIDENCE_THRESHOLD)

        # Get results for the first (and only) image
        result = yolo_results[0]

        # --- Initialize YOLO class names and target ID on the first frame ---
        if yolo_class_names_map is None:
            yolo_class_names_map = result.names  # This is a dict like {0: 'person', 1: 'car', 2: 'carplate'}
            print(f"YOLO Model Classes: {yolo_class_names_map}")
            # Find the integer class ID for our TARGET_CLASS_NAME
            for cls_id_int, name_str in yolo_class_names_map.items():
                if name_str.lower() == TARGET_CLASS_NAME.lower():
                    yolo_target_cls_id_int = cls_id_int
                    break
            if yolo_target_cls_id_int is None:
                print(f"Error: YOLO Target class '{TARGET_CLASS_NAME}' not found in YOLO model's classes: {yolo_class_names_map}")
                # Optionally, break or stop processing if target class isn't found
            else:
                print(f"Targeting YOLO class '{TARGET_CLASS_NAME}' with integer ID: {yolo_target_cls_id_int}")

        # --- Process Detections and Perform OCR ---
        num_detections_in_frame = 0
        if yolo_target_cls_id_int is not None:
            # Iterate through detected boxes
            for box in result.boxes:
                cls_id_tensor = box.cls # Tensor with the class ID
                conf_tensor = box.conf # Tensor with the confidence score
                xyxy_tensor = box.xyxy[0] # Tensor with [xmin, ymin, xmax, ymax]

                cls_id_int = int(cls_id_tensor.item()) # Get Python int from tensor
                confidence = conf_tensor.item() # Get Python float from tensor

                if cls_id_int == yolo_target_cls_id_int: # No need for confidence check here, YOLO already filtered by `conf`
                    num_detections_in_frame += 1
                    xmin, ymin, xmax, ymax = map(int, xyxy_tensor.cpu().numpy()) # Get coords as integers

                    # Clamp coordinates
                    xmin = max(0, xmin); ymin = max(0, ymin)
                    xmax = min(frame_bgr.shape[1], xmax); ymax = min(frame_bgr.shape[0], ymax)

                    # Draw YOLO bounding box
                    cv2.rectangle(frame_to_draw, (xmin, ymin), (xmax, ymax), BOX_COLOR, 2)
                    yolo_label_text = f"{yolo_class_names_map[cls_id_int]}: {confidence:.2f}"

                    (text_w_yolo, text_h_yolo), base_yolo = cv2.getTextSize(yolo_label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 1)
                    cv2.rectangle(frame_to_draw, (xmin, ymin - text_h_yolo - base_yolo - 2), (xmin + text_w_yolo, ymin - base_yolo + 2), TEXT_BG_COLOR, -1)
                    cv2.putText(frame_to_draw, yolo_label_text, (xmin, ymin - base_yolo -1),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.6, TEXT_COLOR, 1, cv2.LINE_AA)

                    # --- Custom EasyOCR: Crop the plate and read text ---
                    if xmax > xmin and ymax > ymin:
                        plate_roi_bgr = frame_bgr[ymin:ymax, xmin:xmax]
                        try:
                            ocr_image_tensor = ocr_transform(plate_roi_bgr)
                            ocr_image_tensor = ocr_image_tensor.unsqueeze(0).to(device)

                            with torch.no_grad():
                                # 'easyocr' is your custom OCR model instance from cell 2adc52a5112dca56
                                ocr_preds_raw = easyocr(ocr_image_tensor, '')

                            seq_len_T = ocr_preds_raw.size(1)
                            if seq_len_T == 0:
                                recognized_text = ""
                            else:
                                _, ocr_preds_idx_batched = ocr_preds_raw.max(2)
                                ocr_preds_idx_flattened = ocr_preds_idx_batched.view(-1)
                                batch_size_ocr = ocr_image_tensor.size(0)
                                preds_size_ocr = torch.IntTensor([seq_len_T] * batch_size_ocr).to(device)

                                # 'converter' is your custom OCR model's converter
                                recognized_text_list = converter.decode_greedy(ocr_preds_idx_flattened.data, preds_size_ocr.data)
                                recognized_text = recognized_text_list[0] if recognized_text_list else ""
                                recognized_text = ''.join(filter(str.isalnum, recognized_text)).upper()

                            if recognized_text:
                                ocr_text_y_pos = ymax + 20
                                if ocr_text_y_pos + 10 > frame_to_draw.shape[0]:
                                    ocr_text_y_pos = ymin - 10

                                (text_w_ocr, text_h_ocr), base_ocr = cv2.getTextSize(recognized_text, cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2)
                                cv2.rectangle(frame_to_draw, (xmin, ocr_text_y_pos - text_h_ocr - base_ocr),
                                              (xmin + text_w_ocr, ocr_text_y_pos + base_ocr),
                                              OCR_TEXT_BG_COLOR, -1)
                                cv2.putText(frame_to_draw, recognized_text, (xmin, ocr_text_y_pos),
                                            cv2.FONT_HERSHEY_SIMPLEX, 0.7, OCR_TEXT_COLOR, 2)
                                if frame_count % 10 == 0:
                                    print(f"Frame {frame_count}: YOLO Plate (Conf: {confidence:.2f}), Custom OCR: '{recognized_text}'")
                        except Exception as e:
                            if frame_count % 10 == 0 or "Boolean value of Tensor" in str(e):
                                print(f"Frame {frame_count}: Error during Custom OCR for a plate ROI: {e}")

        if frame_count % 30 == 0:
            print(f"Frame {frame_count}: Found {num_detections_in_frame} '{TARGET_CLASS_NAME}' instances by YOLOv8.")

        cv2.imshow(window_title, frame_to_draw)

        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):
            print("Exiting...")
            break

    if cap.isOpened():
        cap.release()
    cv2.destroyAllWindows()
    print("Video processing finished and resources released.")

Processing video: data/video/53.mp4 with YOLOv8 and Custom EasyOCR
YOLO Model Classes: {0: 'license_plate'}
Targeting YOLO class 'license_plate' with integer ID: 0
Frame 10: YOLO Plate (Conf: 0.76), Custom OCR: 'QCH2377'
Frame 20: YOLO Plate (Conf: 0.62), Custom OCR: 'QCH2357'
Frame 30: Found 0 'license_plate' instances by YOLOv8.
Frame 60: Found 0 'license_plate' instances by YOLOv8.
Frame 80: YOLO Plate (Conf: 0.44), Custom OCR: 'QAV951'
Frame 90: YOLO Plate (Conf: 0.80), Custom OCR: 'QKV880'
Frame 90: Found 1 'license_plate' instances by YOLOv8.
Frame 100: YOLO Plate (Conf: 0.44), Custom OCR: 'QKV898'
Frame 120: Found 0 'license_plate' instances by YOLOv8.
Frame 150: Found 0 'license_plate' instances by YOLOv8.
Frame 180: Found 0 'license_plate' instances by YOLOv8.
Frame 210: Found 0 'license_plate' instances by YOLOv8.
Frame 240: Found 0 'license_plate' instances by YOLOv8.
Frame 270: Found 0 'license_plate' instances by YOLOv8.
Frame 300: YOLO Plate (Conf: 0.84), Custom OCR: 'QAB