# FRCNN + EasyOCR

## Import Dependencies

In [1]:
import torch
import torchvision
import cv2
import warnings
import pandas as pd
import os
from models.EasyOCR.utils import AttrDict, AttnLabelConverter, CTCLabelConverter
import yaml
from torchvision import transforms
warnings.filterwarnings("ignore")

## Configuration

In [2]:
# --- Detection settings ---
# Label list of the detector; index in this list is the class id (must mirror training).
CUSTOM_MODEL_CLASS_NAMES = ['__background__', 'carplate']
TARGET_CLASS_NAME = "carplate"
# Single device handle: CUDA when available, CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Detections scoring below this value are ignored.
CONFIDENCE_THRESHOLD = 0.5

# --- Drawing colours (OpenCV uses BGR order) ---
BOX_COLOR = (0, 0, 255)          # red plate rectangle
TEXT_COLOR = (255, 255, 255)     # white detector label
TEXT_BG_COLOR = (0, 0, 0)        # black backing for the detector label

OCR_BOX_COLOR = (0, 255, 0)      # green, reserved for an OCR bounding box
OCR_TEXT_COLOR = (50, 200, 255)  # amber OCR text
OCR_TEXT_BG_COLOR = (0, 0, 0)    # black backing for the OCR text

# Resolve the numeric id of the target class; None signals "class unknown".
_target_key = TARGET_CLASS_NAME.lower()
if _target_key in CUSTOM_MODEL_CLASS_NAMES:
    TARGET_CLASS_ID = CUSTOM_MODEL_CLASS_NAMES.index(_target_key)
    print(f"Targeting class '{TARGET_CLASS_NAME}' with ID: {TARGET_CLASS_ID} from {CUSTOM_MODEL_CLASS_NAMES}")
else:
    print(f"Error: Target class '{TARGET_CLASS_NAME}' not found in CUSTOM_MODEL_CLASS_NAMES: {CUSTOM_MODEL_CLASS_NAMES}")
    TARGET_CLASS_ID = None

# --- Video input ---
# The literal string '0' selects the default webcam; anything else is treated as a file path.
video_path = "data/video/53.mp4"
video_source = 0 if video_path == '0' else video_path

Targeting class 'carplate' with ID: 1 from ['__background__', 'carplate']


## Load FRCNN

In [3]:
# Size the detection head from the configured label list (background + carplate)
# so it cannot drift from CUSTOM_MODEL_CLASS_NAMES.
num_classes = len(CUSTOM_MODEL_CLASS_NAMES)
nun_classes = num_classes  # legacy misspelt name, kept in case another cell still reads it

frcnn = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False, num_classes=num_classes)

# Load onto the device chosen in the configuration cell instead of a hard-coded
# 'cuda', so the notebook also runs on CPU-only machines.
frcnn_checkpoint = torch.load('models/full_fasterrcnn_best.pth', map_location=device)
frcnn.load_state_dict(frcnn_checkpoint['model_state_dict'])
frcnn.to(device)
frcnn.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=1e-05)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=1e-05)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=1e-05)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=1e-05)
          (relu

## Load EasyOCR

In [4]:
def get_config(file_path):
    """Read the OCR option file and attach the recognisable character set.

    Args:
        file_path: Path to a YAML-formatted option file.

    Returns:
        An ``AttrDict`` holding the parsed options plus a ``character`` entry.
        When ``lang_char`` is the string ``'None'`` the character set is the
        sorted set of characters found in the ``words`` column of each
        ``<train_data>/<dataset>/label.csv``; otherwise it is
        ``number + symbol + lang_char``.
    """
    with open(file_path, 'r', encoding="utf8") as config_file:
        opt = AttrDict(yaml.safe_load(config_file))

    # Explicit alphabet configured: just concatenate its pieces.
    if opt.lang_char != 'None':
        opt.character = opt.number + opt.symbol + opt.lang_char
        return opt

    # No alphabet configured: collect every character used in the label files.
    seen_chars = set()
    for dataset_name in opt['select_data'].split('-'):
        label_csv = os.path.join(opt['train_data'], dataset_name, 'label.csv')
        # The regex separator splits only on the first comma, so commas inside
        # the label text stay part of the 'words' column.
        labels = pd.read_csv(
            label_csv,
            sep='^([^,]+),',
            engine='python',
            usecols=['filename', 'words'],
            keep_default_na=False,
        )
        seen_chars.update(''.join(labels['words']))

    opt.character = ''.join(sorted(seen_chars))
    return opt

In [5]:
from models.EasyOCR.model import Model

opt = get_config("models/EasyOCR/opt.txt")

# Informational notices emitted when data filtering is left enabled in the options.
if not opt.data_filtering_off:
    for notice in (
        'Filtering the images containing characters which are not in opt.character',
        'Filtering the images whose label is longer than opt.batch_max_length',
    ):
        print(notice)

# Both options are stored as '-'-joined strings; expand them into lists.
opt.select_data = opt.select_data.split('-')
opt.batch_ratio = opt.batch_ratio.split('-')

# Pick the label converter that matches the configured prediction head.
converter_cls = CTCLabelConverter if 'CTC' in opt.Prediction else AttnLabelConverter
converter = converter_cls(opt.character)
opt.num_class = len(converter.character)

# Colour input needs three channels; otherwise the configured value is kept.
if opt.rgb:
    opt.input_channel = 3

In [6]:
# Build the recognition network from the parsed options.
easyocr = Model(opt)
print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel,
      opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction,
      opt.SequenceModeling, opt.Prediction)
easyocr = torch.nn.DataParallel(easyocr).to(device)

# Map the checkpoint onto the configured device rather than a hard-coded 'cuda',
# so loading also works when CUDA is unavailable.
ocr_state_dict = torch.load('./models/best_easyocr_full.pth', map_location=device)

# strict=False tolerates key mismatches, which can hide a checkpoint that does not
# actually fit the model; surface whatever was skipped instead of dropping it silently.
ocr_load_result = easyocr.load_state_dict(ocr_state_dict, strict=False)
if ocr_load_result.missing_keys or ocr_load_result.unexpected_keys:
    print(f"Warning: OCR checkpoint did not fully match the model. "
          f"Missing keys: {ocr_load_result.missing_keys}; "
          f"unexpected keys: {ocr_load_result.unexpected_keys}")
easyocr.eval()

No Transformation module specified
model input parameters 64 256 20 1 512 256 37 12 None VGG BiLSTM CTC


DataParallel(
  (module): Model(
    (FeatureExtraction): VGG_FeatureExtractor(
      (ConvNet): Sequential(
        (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
        (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (4): ReLU(inplace=True)
        (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (7): ReLU(inplace=True)
        (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (9): ReLU(inplace=True)
        (10): MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
        (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (12): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track

## Draw Predictions (Object Detection & OCR)

In [7]:
# --- Logging cadence (in frames) ---
OCR_LOG_EVERY_N_FRAMES = 10
DETECTION_LOG_EVERY_N_FRAMES = 30


def _recognize_plate_text(plate_rgb, ocr_transform):
    """Run the OCR model on one plate crop and return the cleaned text.

    Uses the notebook-level ``easyocr`` model, ``converter`` and ``device``.

    Args:
        plate_rgb: H x W x 3 image array in RGB channel order.
        ocr_transform: Callable turning the crop into a single-image tensor.

    Returns:
        The decoded text reduced to upper-case alphanumerics, or ``None`` when
        the model returns a zero-length sequence.
    """
    ocr_input = ocr_transform(plate_rgb).unsqueeze(0).to(device)  # [1, C, H, W]

    with torch.no_grad():
        ocr_output = easyocr(ocr_input, '')

    # Dimension 1 is treated as the sequence length and dimension 2 as the
    # per-step class scores (assumption carried over from the original indexing).
    seq_len = ocr_output.size(1)
    if seq_len == 0:
        return None

    _, char_indices = ocr_output.max(2)
    pred_lengths = torch.IntTensor([seq_len] * ocr_input.size(0)).to(device)
    decoded = converter.decode_greedy(char_indices.view(-1).data, pred_lengths.data)
    raw_text = decoded[0] if decoded else ""
    return ''.join(filter(str.isalnum, raw_text)).upper()


# --- Video Capture ---
cap = cv2.VideoCapture(video_source)

if not cap.isOpened():
    print(f"Error: Could not open video source '{video_source}'. Please check the path or camera.")
else:
    print(f"Processing video: {video_source}")
    window_title = "Car Plate FRCNN Detection & Custom OCR"
    cv2.namedWindow(window_title, cv2.WINDOW_NORMAL)

    # Pre-processing for the OCR network: fixed size, single channel, scaled to [-1, 1].
    ocr_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((opt.imgH, opt.imgW)),
        transforms.Grayscale(num_output_channels=1),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5])
    ])

    frame_count = 0
    ocr_error_count = 0
    try:
        while cap.isOpened():
            ret, frame_bgr = cap.read()

            if not ret or frame_bgr is None:
                if isinstance(video_source, str):
                    print("End of video file reached.")
                else:
                    print("Error reading frame from webcam.")
                break

            frame_to_draw = frame_bgr.copy()
            frame_count += 1
            should_log_ocr = frame_count % OCR_LOG_EVERY_N_FRAMES == 0

            # The detector takes an RGB, channel-first float tensor scaled to [0, 1].
            img_rgb_frcnn = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
            img_tensor_chw_frcnn = torch.from_numpy(img_rgb_frcnn.transpose((2, 0, 1)))
            img_ready_for_frcnn = img_tensor_chw_frcnn.float().to(device) / 255.0

            with torch.no_grad():
                predictions = frcnn([img_ready_for_frcnn])[0]

            pred_boxes = predictions['boxes'].cpu().numpy()
            pred_labels = predictions['labels'].cpu().numpy()
            pred_scores = predictions['scores'].cpu().numpy()

            frame_h, frame_w = frame_bgr.shape[:2]
            num_detections_in_frame = 0
            if TARGET_CLASS_ID is not None:
                for box, label_id, score in zip(pred_boxes, pred_labels, pred_scores):
                    if not (label_id == TARGET_CLASS_ID and score >= CONFIDENCE_THRESHOLD):
                        continue

                    num_detections_in_frame += 1
                    xmin, ymin, xmax, ymax = map(int, box)

                    # Clamp the box to the frame so the crop below cannot index outside it.
                    xmin = max(0, xmin)
                    ymin = max(0, ymin)
                    xmax = min(frame_w, xmax)
                    ymax = min(frame_h, ymax)

                    # Detector box plus "<class>: <score>" label above it.
                    cv2.rectangle(frame_to_draw, (xmin, ymin), (xmax, ymax), BOX_COLOR, 2)
                    class_name_to_display = CUSTOM_MODEL_CLASS_NAMES[label_id]
                    label_text_frcnn = f"{class_name_to_display}: {score:.2f}"
                    (text_w_frcnn, text_h_frcnn), base_frcnn = cv2.getTextSize(
                        label_text_frcnn, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 1)
                    cv2.rectangle(frame_to_draw,
                                  (xmin, ymin - text_h_frcnn - base_frcnn - 2),
                                  (xmin + text_w_frcnn, ymin - base_frcnn + 2),
                                  TEXT_BG_COLOR, -1)
                    cv2.putText(frame_to_draw, label_text_frcnn, (xmin, ymin - base_frcnn - 1),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.6, TEXT_COLOR, 1, cv2.LINE_AA)

                    # Degenerate boxes have nothing to read.
                    if not (xmax > xmin and ymax > ymin):
                        continue

                    # Crop from the RGB frame: ToPILImage treats a 3-channel array as RGB,
                    # so a BGR crop would get red/blue weights swapped in Grayscale.
                    plate_roi_rgb = img_rgb_frcnn[ymin:ymax, xmin:xmax]
                    try:
                        recognized_text = _recognize_plate_text(plate_roi_rgb, ocr_transform)
                        if recognized_text is None:
                            recognized_text = ""
                            if should_log_ocr:
                                print(f"Frame {frame_count}: OCR skipped: 0-length sequence from model.")

                        if recognized_text:
                            # Put the text under the box, or above it when it would leave the frame.
                            ocr_text_y_pos = ymax + 20
                            if ocr_text_y_pos + 10 > frame_to_draw.shape[0]:
                                ocr_text_y_pos = ymin - 10
                            (text_w_ocr, text_h_ocr), base_ocr = cv2.getTextSize(
                                recognized_text, cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2)
                            cv2.rectangle(frame_to_draw,
                                          (xmin, ocr_text_y_pos - text_h_ocr - base_ocr),
                                          (xmin + text_w_ocr, ocr_text_y_pos + base_ocr),
                                          OCR_TEXT_BG_COLOR, -1)
                            cv2.putText(frame_to_draw, recognized_text, (xmin, ocr_text_y_pos),
                                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, OCR_TEXT_COLOR, 2)
                            if should_log_ocr:
                                print(f"Frame {frame_count}: OCR Result: '{recognized_text}'")
                    except Exception as e:
                        # Best effort: one unreadable plate must not stop the video. Failures
                        # are counted and the first one is always shown so they are not hidden.
                        ocr_error_count += 1
                        if ocr_error_count == 1 or should_log_ocr:
                            print(f"Frame {frame_count}: Error during OCR for a plate ROI: {e}")
                            print(f"  Plate ROI shape: {plate_roi_rgb.shape}")

            if frame_count % DETECTION_LOG_EVERY_N_FRAMES == 0:
                print(f"Frame {frame_count}: Found {num_detections_in_frame} '{TARGET_CLASS_NAME}' instances by FRCNN.")
            cv2.imshow(window_title, frame_to_draw)
            key = cv2.waitKey(1) & 0xFF
            if key == ord('q'):
                print("Exiting...")
                break
    finally:
        # Always free the capture and the window, including on errors or a kernel interrupt.
        cap.release()
        cv2.destroyAllWindows()
        if ocr_error_count:
            print(f"OCR raised an error on {ocr_error_count} plate crop(s).")
        print("Video processing finished and resources released.")

Processing video: data/video/53.mp4
Frame 10: OCR Result: 'QCH2377'
Frame 20: OCR Result: 'QCH2347'
Frame 30: OCR Result: 'QCH2347'
Frame 30: Found 1 'carplate' instances by FRCNN.
Exiting...
Video processing finished and resources released.
