In [1]:
import cv2
from ultralytics import YOLO
import easyocr
import numpy as np

In [2]:
# Build the two inference objects used by the rest of the notebook:
# a YOLO detector loaded from local weights and an EasyOCR reader for the 'en' language code.
weights_path = 'model.pt'
ocr_languages = ['en']
model = YOLO(weights_path)
reader = easyocr.Reader(ocr_languages)

In [3]:
# Function to perform object detection and draw bounding boxes
def predict_and_detect(chosen_model, img, classes=None, conf=0.5, rectangle_thickness=2, text_thickness=1):
    """Run detection on ``img`` and draw every detected box and its class label onto it.

    Args:
        chosen_model: Object exposing ``predict(img, conf=..., classes=...)``
            (the notebook passes an Ultralytics ``YOLO`` model).
        img: Image to run detection on. The rectangles and labels are drawn
            directly on this object, so pass a copy if the clean pixels are
            still needed afterwards.
        classes: Optional collection of class indices to restrict detection
            to. ``None`` or an empty collection means "no filter".
        conf: Minimum confidence forwarded to ``predict``.
        rectangle_thickness: Line thickness of the drawn boxes.
        text_thickness: Stroke thickness of the drawn labels.

    Returns:
        Tuple ``(img, results)``: the annotated image (same object that was
        passed in) and whatever ``chosen_model.predict`` returned.
    """
    predict_kwargs = {"conf": conf}
    # Only forward a class filter when one was actually requested; the
    # previous version accepted ``classes`` but never used it.
    if classes is not None and len(classes) > 0:
        predict_kwargs["classes"] = list(classes)

    results = chosen_model.predict(img, **predict_kwargs)
    for result in results:
        for box in result.boxes:
            # Draw bounding box
            x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
            cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), rectangle_thickness)
            # Class label is placed just above the top-left corner of the box.
            cv2.putText(img, f"{result.names[int(box.cls[0])]}",
                        (x1, y1 - 10), cv2.FONT_HERSHEY_PLAIN, 1, (255, 0, 0), text_thickness)
    return img, results

In [4]:
# Function to crop frames based on bounding boxes and perform OCR
def run_ocr_on_boxes(frame, boxes, ocr_reader=None):
    """Crop ``frame`` to each bounding box and run OCR on every crop.

    Args:
        frame: Image array indexed as ``frame[y, x]`` (rows first).
        boxes: Iterable of objects whose ``xyxy[0]`` holds ``x1, y1, x2, y2``.
        ocr_reader: Object exposing ``readtext(image)``. Defaults to the
            notebook-level ``reader`` when omitted.

    Returns:
        A list with one dict per input box, in order. Each dict has
        ``'box'`` (the integer ``(x1, y1, x2, y2)`` as reported by the
        detector) and ``'ocr_result'`` (what ``readtext`` returned for the
        crop, or ``[]`` when the box has no area inside the frame).
    """
    if ocr_reader is None:
        ocr_reader = reader

    frame_h, frame_w = frame.shape[:2]
    ocr_results = []
    for box in boxes:
        x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())  # Convert to int for easy indexing

        # Clamp to the frame: a negative index would wrap around in numpy
        # slicing and silently crop the wrong region.
        left, right = min(max(x1, 0), frame_w), min(max(x2, 0), frame_w)
        top, bottom = min(max(y1, 0), frame_h), min(max(y2, 0), frame_h)

        if right <= left or bottom <= top:
            # Degenerate box: nothing to read, and an empty array must not
            # be handed to the OCR engine.
            ocr_result = []
        else:
            cropped_frame = frame[top:bottom, left:right]  # Crop the frame within the bounding box
            ocr_result = ocr_reader.readtext(cropped_frame)  # Run OCR on cropped frame

        ocr_results.append({
            'box': (x1, y1, x2, y2),
            'ocr_result': ocr_result
        })
    return ocr_results

In [5]:
# Open video file or capture device
video_path = 'IMG_9715.mov'  # Replace with your video file path or 0 for webcam
cap = cv2.VideoCapture(video_path)

# Fail fast: an unopened capture would otherwise make the processing loop
# exit immediately and leave the OCR text empty with no explanation.
if not cap.isOpened():
    raise IOError(f"Could not open video source: {video_path!r}")

In [6]:
# Text from the most recent frame in which OCR found anything; consumed by later cells.
output_text = ""
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break  # Exit if no more frames are available

        # Predict and detect objects. The detector draws on the image it is
        # given, so hand it a copy and keep `frame` clean for OCR.
        processed_frame, results = predict_and_detect(model, frame.copy(), conf=0.25)

        # Run OCR on the detected bounding boxes
        for result in results:
            ocr_results = run_ocr_on_boxes(frame, result.boxes)

            # Index 1 of each OCR detection is used as the recognised text.
            extracted_text = [
                detection[1]
                for ocr_result in ocr_results
                for detection in ocr_result['ocr_result']
            ]

            # Display extracted text
            frame_text = ", ".join(extracted_text)
            print(f"Extracted Text: {frame_text}")

            # Keep the last non-empty reading so that trailing frames without
            # detections do not wipe out the text already captured.
            if frame_text:
                output_text = frame_text

        # Display the processed frame with bounding boxes and labels
        cv2.imshow('YOLO + OCR Detection', processed_frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break  # Press 'q' to exit
finally:
    # Release the video capture object and close windows even if the loop
    # was interrupted or raised.
    cap.release()
    cv2.destroyAllWindows()


0: 640x384 3 titles-or-authorss, 55.6ms
Speed: 1.9ms preprocess, 55.6ms inference, 0.1ms postprocess per image at shape (1, 3, 640, 384)
Extracted Text: THE, SECRET, HISTORY, DONNA, TARTT, DONNA, TARTT

0: 640x384 3 titles-or-authorss, 69.9ms
Speed: 1.6ms preprocess, 69.9ms inference, 0.1ms postprocess per image at shape (1, 3, 640, 384)
Extracted Text: THE, SECRET, HISTORY, DONNA, TARTT, DONNA, TARTT

0: 640x384 2 titles-or-authorss, 61.3ms
Speed: 1.8ms preprocess, 61.3ms inference, 0.1ms postprocess per image at shape (1, 3, 640, 384)
Extracted Text: THE, SECRET, HISTORY, DONNA, TARTT

0: 640x384 2 titles-or-authorss, 59.5ms
Speed: 1.6ms preprocess, 59.5ms inference, 0.1ms postprocess per image at shape (1, 3, 640, 384)
Extracted Text: THE, SECRET, HISTORY, DONNA, TARTT

0: 640x384 2 titles-or-authorss, 56.0ms
Speed: 1.6ms preprocess, 56.0ms inference, 0.1ms postprocess per image at shape (1, 3, 640, 384)
Extracted Text: THE, SECRET, HISTORY, DONNA, TARTT

0: 640x384 2 titles-or-aut

In [7]:
# Show the OCR text left over from the video loop; this value is interpolated into the LLM prompt below.
output_text

'THE, SECRET, HISTORY, DONNA, TARTT'

In [8]:
import ollama

# Without any OCR text the model has nothing to ground its answer on and
# would be free to invent titles, so stop before making the call.
if not output_text.strip():
    raise ValueError("output_text is empty: no OCR text was extracted from the video")

# Instruction prompt; the OCR text is interpolated after the TEXT: marker.
prompt = f"""
        - Below is a text extracted from an OCR. The text contains mentions of famous books and their corresponding authors.
        - Some words may be slightly misspelled or out of order.
        - Your task is to identify the book titles and corresponding authors from the text.
        - Output the text in the format: '<Name of the book> : <Name of the author>'.
        - Do not generate any other text except the book title and the author.
        
        TEXT:
        {output_text}
"""
# Call the LLaMA 3 model
response = ollama.chat(
    model="llama3",
    messages=[{"role": "user", "content": prompt}]
)
# Extract the response content
response_text = response['message']['content'].strip()
print(response_text)


The Secret History : Donna Tartt
