In [1]:
import cv2
import numpy as np
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image

# Model locations are named once so the detector/recognizer configuration is
# visible in a single place and the TrOCR processor and weights cannot drift
# apart by editing only one of two duplicated string literals.
EAST_MODEL_PATH = "frozen_east_text_detection.pb"
TROCR_CHECKPOINT = "microsoft/trocr-base-handwritten"

# Load the frozen EAST model using OpenCV
net = cv2.dnn.readNet(EAST_MODEL_PATH)

# Initialize TrOCR model and processor (both from the same checkpoint)
processor = TrOCRProcessor.from_pretrained(TROCR_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(TROCR_CHECKPOINT)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# Load your image
# NOTE(review): this is an absolute, machine-specific path; consider deriving
# it from a configurable data directory so the notebook runs elsewhere.
image_path = '/Users/darwinye/myfile/NorthwesternU/499 Capstone/Data_subset_Final/train/images/139.bmp'
image = cv2.imread(image_path)
# cv2.imread reports failure by returning None rather than raising, which
# would otherwise surface as an unrelated AttributeError on image.copy().
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
orig = image.copy()  # untouched full-resolution copy used later for cropping
(H, W) = image.shape[:2]

# Define EAST model input size
newW, newH = (320, 320)
# Ratios used later to map boxes from the resized input back to the original.
rW = W / float(newW)
rH = H / float(newH)

# Resize the image and prepare it for the EAST model
image = cv2.resize(image, (newW, newH))
# The tuple is the per-channel mean subtracted by blobFromImage; swapRB swaps
# the first and last channels of the BGR image loaded by OpenCV.
blob = cv2.dnn.blobFromImage(image, 1.0, (newW, newH),
                             (123.68, 116.78, 103.94), swapRB=True, crop=False)

# Forward pass to get scores and geometry
net.setInput(blob)
(scores, geometry) = net.forward(["feature_fusion/Conv_7/Sigmoid", "feature_fusion/concat_3"])

# Decode predictions to get bounding boxes
def decode_predictions(scores, geometry, confidence_threshold=0.5):
    """Convert the detector's score and geometry maps into candidate boxes.

    Args:
        scores: 4-D array; ``scores[0, 0, y, x]`` is the confidence of the
            cell at row ``y``, column ``x``.
        geometry: 4-D array; for each cell, channels 0 and 2 are summed into
            the box height, channels 1 and 3 into the box width, and
            channel 4 is the rotation angle.
        confidence_threshold: cells whose score is below this value are
            skipped.

    Returns:
        A ``(rects, confidences)`` pair. ``rects`` is a list of
        ``(startX, startY, endX, endY)`` integer tuples and ``confidences``
        holds the score of the cell that produced each box, in the same
        order (row by row, left to right).
    """
    n_rows, n_cols = scores.shape[2:4]
    found_rects = []
    found_scores = []

    for row in range(n_rows):
        row_scores = scores[0, 0, row]
        # Pull the five geometry channels for this row in one go.
        dist0, dist1, dist2, dist3, row_angles = (
            geometry[0, channel, row] for channel in range(5)
        )

        for col in range(n_cols):
            score = row_scores[col]
            if score < confidence_threshold:
                continue

            # Cell indices are scaled by 4.0 to obtain the box anchor point.
            anchor_x = col * 4.0
            anchor_y = row * 4.0

            theta = row_angles[col]
            cos_t = np.cos(theta)
            sin_t = np.sin(theta)

            box_h = dist0[col] + dist2[col]
            box_w = dist1[col] + dist3[col]

            # Bottom-right corner first; the top-left corner is derived from
            # it by subtracting the box extent.
            end_x = int(anchor_x + (cos_t * dist1[col]) + (sin_t * dist2[col]))
            end_y = int(anchor_y - (sin_t * dist1[col]) + (cos_t * dist2[col]))
            start_x = int(end_x - box_w)
            start_y = int(end_y - box_h)

            found_rects.append((start_x, start_y, end_x, end_y))
            found_scores.append(score)

    return found_rects, found_scores

# Decode bounding boxes
rects, confidences = decode_predictions(scores, geometry)

# cv2.dnn.NMSBoxes expects each box as (x, y, width, height) and the scores as
# plain Python floats, whereas decode_predictions yields corner-format
# (startX, startY, endX, endY) tuples with NumPy scalar confidences. Passing
# the corner tuples directly makes NMS treat endX/endY as width/height.
nms_rects = [(x1, y1, x2 - x1, y2 - y1) for (x1, y1, x2, y2) in rects]
nms_scores = [float(c) for c in confidences]
# Arguments: score threshold 0.5, NMS (overlap) threshold 0.4.
boxes = cv2.dnn.NMSBoxes(nms_rects, nms_scores, 0.5, 0.4)

# Sort boxes by position (top-to-bottom, left-to-right)
def sort_boxes(rects, y_threshold=10):
    """Return the indices of ``rects`` in approximate reading order.

    Boxes are grouped into horizontal bands of ``y_threshold`` pixels using
    the floor of ``startY / y_threshold``; bands are ordered top to bottom and
    boxes inside a band left to right by ``startX``. Because the bands are
    fixed, two boxes on the same visual line can still fall into different
    bands when they straddle a band boundary.

    Args:
        rects: sequence of ``(startX, startY, endX, endY)`` boxes.
        y_threshold: height of one band, in the same units as the box
            coordinates. Must be positive.

    Returns:
        A list of indices into ``rects`` in sorted order.

    Raises:
        ValueError: if ``y_threshold`` is not positive (zero would divide by
            zero and a negative value would invert the row order).
    """
    if y_threshold <= 0:
        raise ValueError("y_threshold must be a positive number")

    return sorted(
        range(len(rects)),
        key=lambda i: (rects[i][1] // y_threshold, rects[i][0]),
    )

# Filter overlapping boxes to reduce redundancy
def filter_overlapping_boxes(rects, overlap_threshold=0.3):
    """Greedily drop boxes that overlap an already accepted box.

    Boxes are visited in the order given. A box is kept unless its
    intersection-over-union (IoU) with any previously kept box exceeds
    ``overlap_threshold``.

    Args:
        rects: iterable of ``(startX, startY, endX, endY)`` boxes.
        overlap_threshold: IoU above which a box counts as a duplicate.

    Returns:
        A list with the kept boxes, in their original relative order.
    """
    filtered_boxes = []
    for rect in rects:
        (startX, startY, endX, endY) = rect
        box_area = (endX - startX) * (endY - startY)

        # Check for overlap with boxes already in filtered_boxes
        keep = True
        for (fx1, fy1, fx2, fy2) in filtered_boxes:
            inter_w = max(0, min(endX, fx2) - max(startX, fx1))
            inter_h = max(0, min(endY, fy2) - max(startY, fy1))
            inter_area = inter_w * inter_h

            # Compute the IoU (intersection over union)
            union_area = box_area + (fx2 - fx1) * (fy2 - fy1) - inter_area
            # Two zero-area boxes give a zero union; treat that as "no
            # overlap" instead of dividing by zero.
            iou = inter_area / union_area if union_area > 0 else 0.0

            if iou > overlap_threshold:
                keep = False
                break

        if keep:
            filtered_boxes.append(rect)

    return filtered_boxes

# Sort and filter the boxes
sorted_indices = sort_boxes(rects)
filtered_rects = filter_overlapping_boxes([rects[i] for i in sorted_indices])

# Process each detected text box with TrOCR in sorted order
text_results = []
for rect in filtered_rects:
    (startX, startY, endX, endY) = rect

    # Scale bounding box back to original image size and clamp it to the
    # image. Decoded boxes can extend past the borders; a negative slice start
    # would wrap around in NumPy and yield a wrong or empty crop.
    startX = min(max(int(startX * rW), 0), W)
    startY = min(max(int(startY * rH), 0), H)
    endX = min(max(int(endX * rW), 0), W)
    endY = min(max(int(endY * rH), 0), H)

    # Nothing left to recognize once the box has collapsed to zero width or
    # height; skip it instead of handing an empty image to the recognizer.
    if endX <= startX or endY <= startY:
        continue

    # Crop the detected text region
    cropped_img = orig[startY:endY, startX:endX]

    # Convert to PIL image for TrOCR
    pil_img = Image.fromarray(cv2.cvtColor(cropped_img, cv2.COLOR_BGR2RGB))

    # Recognize text using TrOCR
    pixel_values = processor(pil_img, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    text_results.append(transcription)

# Print the results in order
full_text = "\n".join(text_results)
print("Detected Text in Order:")
print(full_text)



Detected Text in Order:
0 1
0 1
QRCo
2C O.D.E
vali
parame
E. T.E.R.D.E.F.
ehaul.
IL T.S
codes.
test
in BARC
