# FRCNN + PaddleOCR (Inference)

## Import Dependencies

In [1]:
import torch
import torchvision
import paddle
from paddleocr import PaddleOCR
import cv2
import warnings
warnings.filterwarnings("ignore")

## Configuration

In [2]:
# Cell id: 6ab5a433226c0e38 (MODIFIED)
# --- Detection settings ---
# Label list of the custom detector; a label's id is its position in this list.
CUSTOM_MODEL_CLASS_NAMES = ['__background__', 'carplate']
TARGET_CLASS_NAME = "carplate"
CONFIDENCE_THRESHOLD = 0.5

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- Drawing colours (OpenCV channel order: B, G, R) ---
BOX_COLOR = (0, 0, 255)           # red box around each detected plate
TEXT_COLOR = (255, 255, 255)      # white detector label text
TEXT_BG_COLOR = (0, 0, 0)         # black strip behind the detector label
OCR_TEXT_COLOR = (0, 255, 0)      # green OCR text
OCR_TEXT_BG_COLOR = (0, 0, 0)     # black strip behind the OCR text

# Resolve the target label to its numeric id; None signals "not in the label list".
TARGET_CLASS_ID = next(
    (
        class_id
        for class_id, class_name in enumerate(CUSTOM_MODEL_CLASS_NAMES)
        if class_name == TARGET_CLASS_NAME.lower()
    ),
    None,
)
if TARGET_CLASS_ID is None:
    print(f"Error: Target class '{TARGET_CLASS_NAME}' not found in CUSTOM_MODEL_CLASS_NAMES: {CUSTOM_MODEL_CLASS_NAMES}")
else:
    print(f"Targeting class '{TARGET_CLASS_NAME}' with ID: {TARGET_CLASS_ID} from {CUSTOM_MODEL_CLASS_NAMES}")

# --- Video source ---
# The literal string '0' selects the default webcam; any other value is used as a file path.
video_path = "../data/video/58.mp4"  # Or your desired video
video_source = 0 if video_path == '0' else video_path

Targeting class 'carplate' with ID: 1 from ['__background__', 'carplate']


## Load FRCNN

In [3]:
# The detector head must have one output per entry in the label list from the
# configuration cell, so derive the count from it instead of repeating a literal.
num_classes = len(CUSTOM_MODEL_CLASS_NAMES)
nun_classes = num_classes  # misspelled legacy name, kept in case another cell still reads it

frcnn = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False, num_classes=num_classes)

# Map the checkpoint onto `device` (chosen in the configuration cell) rather than a
# hard-coded 'cuda', so the notebook also runs on machines without a GPU and the
# model ends up on the same device as the input tensors built in the inference cell.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
checkpoint = torch.load('../models/full_fasterrcnn_best.pth', map_location=device)
frcnn.load_state_dict(checkpoint['model_state_dict'])
frcnn.to(device)
frcnn.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=1e-05)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=1e-05)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=1e-05)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=1e-05)
          (relu

## Load PaddleOCR


In [4]:
# is_compiled_with_cuda() only reports how the installed Paddle package was built.
# Also require at least one visible CUDA device so the flag reflects whether a GPU
# can actually be used. The `and` short-circuits on CPU-only builds.
gpu_available = paddle.device.is_compiled_with_cuda() and paddle.device.cuda.device_count() > 0
print(gpu_available)

True


In [5]:
# Build the OCR engine used by the inference cell (which also reads
# `paddleocr.use_angle_cls` when calling `.ocr`).
# GPU use follows the `gpu_available` flag computed in the previous cell instead of
# being forced on, so the notebook does not request a GPU that Paddle cannot use.
paddleocr = PaddleOCR(
    use_angle_cls=True,
    lang='en',
    use_gpu=gpu_available,
)

[2025/05/11 17:31:10] ppocr DEBUG: Namespace(alpha=1.0, alphacolor=(255, 255, 255), benchmark=False, beta=1.0, binarize=False, cls_batch_num=6, cls_image_shape='3, 48, 192', cls_model_dir='C:\\Users\\User/.paddleocr/whl\\cls\\ch_ppocr_mobile_v2.0_cls_infer', cls_thresh=0.9, cpu_threads=10, crop_res_save_dir='./output', det=True, det_algorithm='DB', det_box_type='quad', det_db_box_thresh=0.6, det_db_score_mode='fast', det_db_thresh=0.3, det_db_unclip_ratio=1.5, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_east_score_thresh=0.8, det_limit_side_len=960, det_limit_type='max', det_model_dir='C:\\Users\\User/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, det_pse_thresh=0, det_sast_nms_thresh=0.2, det_sast_score_thresh=0.5, draw_img_save_dir='./inference_results', drop_score=0.5, e2e_algorithm='PGNet', e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_limit_side_len=768, e2e_limit_type='max', e2e_model_dir=None, e

## Draw Predictions (Object Detection & OCR)

In [6]:
# Cell id: dc8b44f4dc5661a3 (NEW CONTENT)
#
# Reads frames from `video_source`, detects car plates with the FRCNN model, reads
# each plate with PaddleOCR, and shows the annotated frames in an OpenCV window.
# Press 'q' in the window to stop early.


def _frame_to_tensor(frame_bgr):
    """Convert an OpenCV BGR frame into the tensor handed to the detector.

    Returns a float tensor on `device`, channels-first (C, H, W), in RGB order,
    with pixel values divided by 255. No mean/std normalisation is applied here;
    that is presumably handled by the detector's own transform (the model repr
    printed when loading lists a Normalize step).
    """
    frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    return torch.from_numpy(frame_rgb.transpose((2, 0, 1))).float().to(device) / 255.0


def _draw_detection_label(canvas, label_text, xmin, ymin):
    """Draw the detector's "class: score" label on a filled strip just above (xmin, ymin)."""
    font = cv2.FONT_HERSHEY_SIMPLEX
    (text_w, text_h), baseline = cv2.getTextSize(label_text, font, 0.6, 1)
    cv2.rectangle(canvas, (xmin, ymin - text_h - baseline - 2),
                  (xmin + text_w, ymin - baseline + 2), TEXT_BG_COLOR, -1)
    cv2.putText(canvas, label_text, (xmin, ymin - baseline - 1),
                font, 0.6, TEXT_COLOR, 1, cv2.LINE_AA)


def _draw_ocr_label(canvas, label_text, xmin, ymin, ymax):
    """Draw the OCR text below the plate box, or above it when it would fall off the frame."""
    font = cv2.FONT_HERSHEY_SIMPLEX
    text_y = ymax + 20
    if text_y + 10 > canvas.shape[0]:
        text_y = ymin - 10
    (text_w, text_h), baseline = cv2.getTextSize(label_text, font, 0.7, 2)
    cv2.rectangle(canvas, (xmin, text_y - text_h - baseline),
                  (xmin + text_w, text_y + baseline), OCR_TEXT_BG_COLOR, -1)
    cv2.putText(canvas, label_text, (xmin, text_y), font, 0.7, OCR_TEXT_COLOR, 2)


def _read_plate_text(plate_roi_bgr):
    """Run recognition-only PaddleOCR on a cropped plate.

    Returns (text, confidence). The text is upper-cased with every
    non-alphanumeric character removed; ("", 0.0) means nothing was recognised.
    """
    ocr_result = paddleocr.ocr(plate_roi_bgr, det=False, rec=True, cls=paddleocr.use_angle_cls)
    if not ocr_result or not ocr_result[0]:
        return "", 0.0
    # The first recognised line is taken as the plate; it is expected to be
    # a (text_string, confidence) pair.
    first_line = ocr_result[0][0]
    cleaned_text = ''.join(filter(str.isalnum, first_line[0])).upper()
    return cleaned_text, first_line[1]


def _annotate_frame(frame_bgr, frame_count):
    """Detect and read plates in one frame.

    Returns (annotated_copy_of_frame, number_of_target_detections). The input
    frame is not modified; crops for OCR are taken from it so that drawn
    overlays never end up inside an OCR input.
    """
    frame_to_draw = frame_bgr.copy()

    with torch.no_grad():
        predictions = frcnn([_frame_to_tensor(frame_bgr)])[0]
    pred_boxes = predictions['boxes'].cpu().numpy()
    pred_labels = predictions['labels'].cpu().numpy()
    pred_scores = predictions['scores'].cpu().numpy()

    num_detections = 0
    if TARGET_CLASS_ID is None:
        # The configuration cell could not resolve the target class; nothing to draw.
        return frame_to_draw, num_detections

    frame_h, frame_w = frame_bgr.shape[:2]
    for box, label_id, score in zip(pred_boxes, pred_labels, pred_scores):
        if not (label_id == TARGET_CLASS_ID and score >= CONFIDENCE_THRESHOLD):
            continue
        num_detections += 1

        # Clamp the box to the frame so drawing and cropping stay in bounds.
        xmin, ymin, xmax, ymax = map(int, box)
        xmin, ymin = max(0, xmin), max(0, ymin)
        xmax, ymax = min(frame_w, xmax), min(frame_h, ymax)

        cv2.rectangle(frame_to_draw, (xmin, ymin), (xmax, ymax), BOX_COLOR, 2)
        _draw_detection_label(
            frame_to_draw, f"{CUSTOM_MODEL_CLASS_NAMES[label_id]}: {score:.2f}", xmin, ymin
        )

        if not (xmax > xmin and ymax > ymin):
            continue  # degenerate box after clamping: nothing to crop

        # OCR is best-effort: a failure on one plate must not stop the video.
        # Messages are throttled to every 10th frame to avoid flooding the output.
        try:
            recognized_text, text_confidence = _read_plate_text(frame_bgr[ymin:ymax, xmin:xmax])
            if recognized_text:
                _draw_ocr_label(
                    frame_to_draw, f"{recognized_text} ({text_confidence:.2f})", xmin, ymin, ymax
                )
                if frame_count % 10 == 0:
                    print(f"Frame {frame_count}: FRCNN Plate (Score: {score:.2f}), PaddleOCR: '{recognized_text}' (Conf: {text_confidence:.2f})")
        except Exception as e:
            if frame_count % 10 == 0:
                print(f"Frame {frame_count}: Error during PaddleOCR for a plate ROI: {e}")

    return frame_to_draw, num_detections


# --- Video Capture ---
cap = cv2.VideoCapture(video_source)

if not cap.isOpened():
    print(f"Error: Could not open video source '{video_source}'. Please check the path or camera.")
    # In a notebook, avoid exit(), just print and let the cell finish.
else:
    print(f"Processing video: {video_source} with FRCNN and PaddleOCR")
    window_title = "Car Plate FRCNN Detection & PaddleOCR"
    cv2.namedWindow(window_title, cv2.WINDOW_NORMAL)  # Allows resizing

    frame_count = 0
    try:
        while cap.isOpened():
            ret, frame_bgr = cap.read()

            if not ret:
                if isinstance(video_source, str):
                    print("End of video file reached.")
                else:
                    print("Error reading frame from webcam.")
                break

            frame_count += 1
            frame_to_draw, num_detections_in_frame = _annotate_frame(frame_bgr, frame_count)

            if frame_count % 30 == 0:
                print(f"Frame {frame_count}: Found {num_detections_in_frame} '{TARGET_CLASS_NAME}' instances by FRCNN.")

            cv2.imshow(window_title, frame_to_draw)

            key = cv2.waitKey(1) & 0xFF
            if key == ord('q'):
                print("Exiting...")
                break
    finally:
        # --- Cleanup ---
        # Runs even when the loop raises or the kernel is interrupted, so the capture
        # handle and the display window are never left open.
        if cap.isOpened():
            cap.release()
        cv2.destroyAllWindows()
        print("Video processing finished and resources released.")

Processing video: data/video/58.mp4 with FRCNN and PaddleOCR
Frame 30: FRCNN Plate (Score: 0.88), PaddleOCR: 'OLA9206' (Conf: 0.33)
Frame 30: Found 1 'carplate' instances by FRCNN.
Frame 60: Found 0 'carplate' instances by FRCNN.
Frame 80: FRCNN Plate (Score: 0.51), PaddleOCR: 'DA53' (Conf: 0.72)
Frame 90: Found 0 'carplate' instances by FRCNN.
Frame 120: FRCNN Plate (Score: 0.93), PaddleOCR: 'MUT9' (Conf: 0.60)
Frame 120: Found 1 'carplate' instances by FRCNN.
Frame 140: FRCNN Plate (Score: 0.84), PaddleOCR: 'UUS2' (Conf: 0.20)
Frame 150: Found 0 'carplate' instances by FRCNN.
Frame 160: FRCNN Plate (Score: 0.85), PaddleOCR: 'DISESH' (Conf: 0.51)
Frame 160: FRCNN Plate (Score: 0.51), PaddleOCR: 'AS' (Conf: 0.46)
Frame 180: Found 0 'carplate' instances by FRCNN.
Frame 190: FRCNN Plate (Score: 0.96), PaddleOCR: '08230' (Conf: 0.51)
Frame 190: FRCNN Plate (Score: 0.73), PaddleOCR: 'DABEE' (Conf: 0.68)
Frame 210: Found 0 'carplate' instances by FRCNN.
Frame 240: Found 0 'carplate' instanc