In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd /content/drive/MyDrive/Scene text recognition/

In [None]:
!pip install -r requirements.txt

In [None]:
!pip install ultralytics
import ultralytics
ultralytics.checks()

In [None]:
import xml.etree.ElementTree as ET
import os
import yaml
import shutil

In [None]:
word_xml_path = "data/icdar2003/SceneTrialTrain/words.xml"
tree = ET.parse(word_xml_path)
root = tree.getroot()
print(root)

In [None]:
for image in root:
    imag_name = image[0].text
    for bbs in image.findall('taggedRectangles'):
        for bb in bbs:
            bbox = [
                float(bb.attrib['x']),
                float(bb.attrib['y']),
                float(bb.attrib['width']),
                float(bb.attrib['height'])
            ]
            print(bb[0].text, bbox)
    break

In [None]:
def extract_from_xml(path):
    """Parse a ``words.xml`` annotation file into parallel per-image lists.

    For every child element of the XML root the function reads the first
    sub-element as the image path, the second sub-element's ``x``/``y``
    attributes as the image size, and every rectangle found under
    ``taggedRectangles`` as a word bounding box.

    A rectangle is skipped when its transcription (text of its first child)
    is missing/empty, is not purely alphanumeric, or contains 'é' / 'ñ'.

    Args:
        path: Path to the XML annotation file.

    Returns:
        A 4-tuple ``(image_paths, image_sizes, image_labels, bounding_boxes)``
        of equal-length lists, one entry per image:
          * image path (str, as written in the XML),
          * ``(x, y)`` size as ints,
          * list of kept transcriptions,
          * list of ``[x, y, width, height]`` float boxes aligned with labels.
    """
    root = ET.parse(path).getroot()

    image_paths = []
    image_sizes = []
    image_labels = []
    bounding_boxes = []

    for image in root:
        bbs_of_image = []
        labels_of_image = []

        for bbs in image.findall('taggedRectangles'):
            for bb in bbs:
                text = bb[0].text
                # An empty tag element has text=None; treat it like any other
                # unusable transcription instead of crashing on .isalnum().
                if not text or not text.isalnum():
                    continue
                # str.isalnum() accepts accented letters, so these are
                # filtered explicitly.
                lowered = text.lower()
                if 'é' in lowered or 'ñ' in lowered:
                    continue

                bbs_of_image.append(
                    [
                        float(bb.attrib['x']),
                        float(bb.attrib['y']),
                        float(bb.attrib['width']),
                        float(bb.attrib['height'])
                    ]
                )
                labels_of_image.append(text)

        image_paths.append(image[0].text)
        image_sizes.append((int(image[1].attrib['x']), int(image[1].attrib['y'])))
        image_labels.append(labels_of_image)
        bounding_boxes.append(bbs_of_image)

    return image_paths, image_sizes, image_labels, bounding_boxes

In [None]:
words_xml_path = os.path.join('data','icdar2003','SceneTrialTrain','words.xml')
image_paths, image_sizes, image_labels, bounding_boxes = extract_from_xml(words_xml_path)

In [None]:
image_paths

In [None]:
print(bounding_boxes)

# **Convert to YOLOv8 format**
Vì YOLO format biểu diễn bounding box bằng tọa độ tâm (center x, center y) cùng với width và height, và tất cả các giá trị đều nằm trong khoảng [0, 1], nên cần normalize lại các giá trị theo kích thước ảnh.

In [None]:
def convert_to_yolo_format(image_paths, image_sizes, bounding_boxes):
    """Turn pixel-space ``[x, y, w, h]`` boxes into YOLO label strings.

    Each box becomes ``"<class> <cx> <cy> <w> <h>"`` where the centre and the
    size are divided by the image width/height. Every box gets class id 0.

    Args:
        image_paths: One path per image.
        image_sizes: One ``(width, height)`` pair per image.
        bounding_boxes: One list of ``[x, y, w, h]`` boxes per image.

    Returns:
        A list of ``(image_path, [label_string, ...])`` tuples, in input order.
    """
    class_id = 0

    def _labels_for(image_size, bboxes):
        # Normalise every box of a single image by that image's dimensions.
        image_width, image_height = image_size
        lines = []
        for x, y, w, h in bboxes:
            center_x = (x + w / 2) / image_width
            center_y = (y + h / 2) / image_height
            norm_width = w / image_width
            norm_height = h / image_height
            lines.append(
                f"{class_id} {center_x} {center_y} {norm_width} {norm_height}"
            )
        return lines

    return [
        (image_path, _labels_for(image_size, bboxes))
        for image_path, image_size, bboxes in zip(image_paths, image_sizes, bounding_boxes)
    ]

In [None]:
class_labels = ['text']

In [None]:
yolo_data = convert_to_yolo_format(image_paths, image_sizes, bounding_boxes)
yolo_data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the detection data in two stages (same seed for both calls).
seed = 0
val_size = 0.3   # fraction held out from training in the first split (val + test together)
test_size = 2/3  # fraction of that hold-out that goes to the SECOND output of the second split
is_shuffle = True
# Stage 1: 70% train, 30% hold-out (temporarily named test_data).
train_data, test_data = train_test_split(
    yolo_data,
    test_size=val_size,
    random_state=seed,
    shuffle=is_shuffle
)

# Stage 2: the hold-out is split 1/3 : 2/3. Note the output order: the smaller
# part stays in test_data and the larger part becomes val_data, i.e. roughly
# 10% test and 20% val of the whole dataset.
test_data, val_data = train_test_split(
    test_data,
    test_size=test_size,
    random_state=seed,
    shuffle=is_shuffle
)

In [None]:
train_data

In [None]:
def save_data(data, src_img_dir, save_dir):
    """Write one dataset split in the ``images/`` + ``labels/`` folder layout.

    Args:
        data: Iterable of ``(image_path, yolo_labels)`` pairs; ``image_path``
            is relative to ``src_img_dir`` and ``yolo_labels`` is a list of
            label lines.
        src_img_dir: Directory the image paths are relative to.
        save_dir: Output directory; ``images`` and ``labels`` sub-folders are
            created inside it if missing.

    Each image is copied into ``save_dir/images`` and a text file named after
    the image's base name (extension removed) is written to
    ``save_dir/labels`` with one label per line.
    """
    images_dir = os.path.join(save_dir, 'images')
    labels_dir = os.path.join(save_dir, 'labels')
    for directory in (save_dir, images_dir, labels_dir):
        os.makedirs(directory, exist_ok=True)

    for image_path, yolo_labels in data:
        shutil.copy(os.path.join(src_img_dir, image_path), images_dir)

        # Only the file name is kept, so sub-folders of image_path are flattened.
        stem, _ = os.path.splitext(os.path.basename(image_path))
        label_file = os.path.join(labels_dir, f"{stem}.txt")
        with open(label_file, 'w') as f:
            f.writelines(f"{label}\n" for label in yolo_labels)

In [None]:
save_yolo_data_dir = 'datasets/yolo_data'

os.makedirs(save_yolo_data_dir, exist_ok=True)
save_train_dir = os.path.join(save_yolo_data_dir, 'train')
save_val_dir = os.path.join(save_yolo_data_dir, 'val')
save_test_dir = os.path.join(save_yolo_data_dir, 'test')
dataset_dir = 'data/icdar2003/SceneTrialTrain'
save_data(train_data, dataset_dir, save_train_dir)
save_data(val_data, dataset_dir, save_val_dir)
save_data(test_data, dataset_dir, save_test_dir)

In [None]:
data_yaml = {
    'path': 'yolo_data',
    'train':'train/images',
    'val':'val/images',
    'test':'test/images',
    'nc': 1,
    'names':class_labels,
}

yolo_yaml_path = os.path.join(save_yolo_data_dir ,'data.yml')
with open(yolo_yaml_path, 'w') as f:
    yaml.dump(data_yaml, f,default_flow_style=False)

In [None]:
print(yolo_yaml_path)

# **Training**

In [None]:
from ultralytics import YOLO

model = YOLO('yolov8s.yaml').load('yolov8s.pt')

epochs = 200
imgsz = 1024
results = model.train(
    data = yolo_yaml_path,
    epochs = epochs,
    imgsz = imgsz,
    project = 'models',
    name = 'yolov8/detect/train',
)

# **Evaluation**

In [None]:
from ultralytics import YOLO

model_path = 'models/yolov8/detect/train/weights/best.pt'
yolo_model = YOLO(model_path)

metrics = yolo_model.val(
    project = 'models',
    name = 'yolov8/detect/val',
)

Load thử một ảnh

In [None]:
import cv2
import matplotlib.pyplot as plt
import json

In [None]:
def visualize_bbox(img_path, predictions,conf_thres = 0, font = cv2.FONT_HERSHEY_SIMPLEX):
    """Draw predicted boxes and their confidence scores on an image.

    Args:
        img_path: Path of the image to load with ``cv2.imread``.
        predictions: Iterable of dicts, each with a ``'confidence'`` value and
            a ``'box'`` dict holding ``'x1'``, ``'y1'``, ``'x2'``, ``'y2'``.
        conf_thres: Predictions with a confidence below this are not drawn.
        font: OpenCV font used for the confidence text.

    Returns:
        The annotated image as returned by ``cv2.imread`` (OpenCV loads colour
        images in BGR channel order; convert before showing with matplotlib).

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise FileNotFoundError(f"Could not read image: {img_path}")

    for prediction in predictions:
        conf_score = prediction['confidence']

        if conf_score < conf_thres:
            continue

        bbox = prediction['box']
        xmin = int(bbox['x1'])
        ymin = int(bbox['y1'])
        xmax = int(bbox['x2'])
        ymax = int(bbox['y2'])

        cv2.rectangle(img, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)

        text = f"{conf_score:.2f}"
        (text_width, text_height),_ = cv2.getTextSize(text, font, 1, 2)

        # Filled label background sized to the text and anchored at the box's
        # top-left corner, so the black text drawn on top stays readable.
        cv2.rectangle(
            img,
            (xmin, ymin - text_height - 5),
            (xmin + text_width, ymin),
            (0, 255, 0),
            cv2.FILLED
        )
        cv2.putText(img, text, (xmin, ymin - 5), font, 1, (0, 0, 0), 2)

    return img


In [None]:
model_path = 'models/yolov8/detect/train/weights/best.pt'
img_path = 'datasets/yolo_data/train/images/IMG_1263.JPG'

conf_thres = 0.75

# Run inference with the saved checkpoint rather than whatever `model`
# happens to hold in the kernel, so this cell also works when the training
# cell was skipped.
yolo_model = YOLO(model_path)
results = yolo_model(img_path, verbose= False)
predictions = json.loads(results[0].tojson())
print(predictions)

visualize_img = visualize_bbox(img_path, predictions, conf_thres)
# visualize_bbox returns the cv2.imread array (BGR order); matplotlib expects RGB.
plt.imshow(cv2.cvtColor(visualize_img, cv2.COLOR_BGR2RGB))
plt.axis('off')
plt.show()

# **Text Recognition**

Sử dụng mô hình cơ bản trong OCR là CRNN(Convolution Recurrent Neural Network)
Vì đây cũng là dạng bài sequence nhưng không nằm ở dạng text mà là ảnh --> vẫn dùng LSTM để thực hiện
Ban đầu sẽ đưa qua CNN để bóc tách đặc trưng ảnh.   
--> Khi đi qua CNN sẽ có các feature map
vd khi flatten sẽ được output cuối cùng khoảng (512,1,1) x N  
Idea lúc này: coi N feature vector là cái token thứ Xi, mỗi token sẽ fit qua RNN coi như là text bình thường.  
Paper đã chứng minh được mỗi feature vector sẽ đại diện cho 1 phần hình trong ảnh.



Với mỗi vị trí X đi vào có thể thiết kế mạng LSTM Many-to-Many. Với 1 mỗi X sẽ dự đoán được 1Y
Ảnh --> CNN --> RNN --> CTC --> Text

Nếu sử dụng CrossEntropy thì kết quả dự đoán sẽ bị lặp ký tự và sai khá nhiều.  
Collapse function: gộp các ký tự giống nhau đứng liên tiếp thành 1 ký tự.  
vd: với từ "book", chuỗi dự đoán bboookk --> bok vì collapse function gộp các chữ o liên tiếp với nhau; nó không biết được có bao nhiêu chữ o mà chỉ giữ lại 1 chữ --> sai.   
Khi dùng CTC loss, mô hình sẽ học thêm blank token ("-") để ngăn cách các ký tự lặp trước khi collapse, vd: bbo-ookk --> book.


1. Cropped images

In [None]:
import locale
# Force locale.getpreferredencoding() to report UTF-8 for the rest of the session.
# NOTE(review): presumably a Colab workaround so that `!` shell commands such as the
# pip install below do not fail on a non-UTF-8 locale -- confirm it is still needed.
def getpreferredencoding(do_setlocale = True):
    """Replacement for ``locale.getpreferredencoding`` that always returns "UTF-8"."""
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding
!pip install timm

In [None]:
from PIL import Image
import os
import numpy as np
import torch
import torch.nn.functional as F
import torchvision.transforms as transform
import torchvision.transforms.functional as TF
from torch.utils.data import Dataset, DataLoader
import torchvision
import torch.nn as nn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import timm

In [None]:
def split_bounding_boxes(img_paths, img_labels, bboxes, save_dir,
                         src_img_dir = "data/icdar2003/SceneTrialTrain"):
    """Crop every annotated word out of its image and save it for OCR training.

    Crops are written to ``save_dir`` as ``000000.jpg``, ``000001.jpg``, ...
    and a tab-separated ``labels.txt`` (``<crop path>\\t<transcription>``) is
    written next to them.

    A word is skipped when:
      * the crop is narrower or shorter than 10 pixels,
      * the crop's mean pixel value is below 35 or above 220
        (almost uniformly dark / bright), or
      * its transcription has fewer than 3 characters.

    Args:
        img_paths: Image paths relative to ``src_img_dir``.
        img_labels: For each image, the list of word transcriptions.
        bboxes: For each image, the list of ``[x, y, w, h]`` boxes aligned
            with ``img_labels``.
        save_dir: Output directory (created if missing).
        src_img_dir: Directory the image paths are relative to.
    """
    os.makedirs(save_dir, exist_ok=True)
    count = 0
    labels = []
    for img_path, img_label, bbs in zip(img_paths, img_labels, bboxes):
        # Context manager closes the source file once all crops are written.
        with Image.open(os.path.join(src_img_dir, img_path)) as img:
            for label, bb in zip(img_label, bbs):
                cropped_img = img.crop((bb[0], bb[1], bb[0]+bb[2], bb[1]+bb[3]))

                if cropped_img.size[0] < 10 or cropped_img.size[1] < 10:
                    continue

                mean_intensity = np.mean(cropped_img)
                if mean_intensity < 35 or mean_intensity > 220:
                    continue

                # Filter on the length of this word's transcription, not on
                # the number of words annotated in the image.
                if len(label) < 3:
                    continue

                filename = f"{count:06d}.jpg"
                new_img_path = os.path.join(save_dir, filename)
                cropped_img.save(new_img_path)
                labels.append(new_img_path + '\t' + label)
                count += 1

    print(f"Created {count} images")

    with open(os.path.join(save_dir, 'labels.txt'), 'w') as f:
        for label in labels:
            f.write(f"{label}\n")

In [None]:
save_dir = "datasets/ocr_data"

In [None]:
split_bounding_boxes(image_paths, image_labels, bounding_boxes, save_dir)

In [None]:
# Read back labels.txt: each line is "<crop path>\t<transcription>".
root_dir = save_dir
img_paths = []  # crop image paths (first field of every line)
labels = []     # transcriptions (second field), aligned with img_paths
with open(os.path.join(root_dir, 'labels.txt'), 'r') as f:
    for label in f:
        labels.append(label.strip().split('\t')[1])
        img_paths.append(label.strip().split('\t')[0])

In [None]:
with open(os.path.join(root_dir, 'labels.txt'), 'r') as f:
    for label in f:
        print(label)
        break

In [None]:
print(f"Total images: {len(img_paths)}")
print(f"Total labels: {len(labels)}")

Tạo bộ vocab để nhận diện (recognize) các ký tự.  
Vì CRNN dự đoán theo từng ký tự --> vocab gồm các ký tự alphabet và các chữ số. Bỏ qua các dấu.

In [None]:
letters = [char.split('.')[0].lower() for char in labels]

letters = "".join(letters)
letters = sorted(list(set(letters)))
print(letters)

In [None]:
chars = "".join(letters)
blank_char = '-'
chars += blank_char
vocab_size = len(chars)
print(f"Vocab: {chars}")
print(f"Vocab size: {vocab_size}")

In [None]:
# Map every vocabulary character to a 1-based index (sorted order); index 0 is
# left free and is the value encode_label uses for padding.
# NOTE: the largest index therefore equals len(chars), so anything that indexes
# classes by these ids needs at least len(chars) + 1 output classes.
char_to_idx = {char: idx + 1 for idx,char in enumerate(sorted(chars))}
print(char_to_idx)
# Longest transcription; used as the fixed padded length of encoded labels.
max_label_len = max([len(label) for label in labels])

In [None]:
print(max_label_len)

In [None]:
def encode_label(label, char_to_idx, max_label_len):
    """Encode a transcription as a fixed-length tensor of character indices.

    The label is lower-cased, each character is looked up in ``char_to_idx``
    and the sequence is right-padded with 0 up to ``max_label_len``.

    Args:
        label: Transcription string.
        char_to_idx: Mapping from character to integer index.
        max_label_len: Length of the padded output.

    Returns:
        ``(padded_label, lengths)``: a 1-D ``torch.long`` tensor of indices
        and a 0-dim ``torch.long`` tensor with the unpadded label length.

    Raises:
        KeyError: If a character is missing from ``char_to_idx``.
    """
    indices = []
    for char in label.lower():
        indices.append(char_to_idx[char])
    encoded_label = torch.tensor(indices, dtype = torch.long)

    label_len = len(encoded_label)
    n_pad = max_label_len - label_len
    padded_label = F.pad(encoded_label, (0, n_pad), value = 0)

    lengths = torch.tensor(label_len, dtype = torch.long)
    return padded_label, lengths

In [None]:
test = "Helao"
print(encode_label(test, char_to_idx, max_label_len))

In [None]:
idx_to_char = {idx: char for char, idx in char_to_idx.items()}
def decode(encoded_seq, idx_to_char, blank_char ='-'):
    """Map sequences of indices back to strings.

    Tokens equal to 0 (padding) and tokens that map to ``blank_char`` are
    dropped; every other token is translated with ``idx_to_char``.
    Consecutive repeated characters are kept as they are (no merging).

    Args:
        encoded_seq: Iterable of sequences of scalar tensors (for example a
            2-D integer tensor, one row per sample).
        idx_to_char: Mapping from integer index to character.
        blank_char: Character to leave out of the result.

    Returns:
        A list with one decoded string per input sequence.
    """
    decoded_seq = []
    for seq in encoded_seq:
        kept_chars = [idx_to_char[token.item()] for token in seq if token != 0]
        decoded_seq.append(''.join(c for c in kept_chars if c != blank_char))
    return decoded_seq

In [None]:
encoded_seq = torch.tensor([[19, 16, 23, 23, 26,  0,  0,  0,  0,  0,  0,  0,  0,  0]])
print(decode(encoded_seq, idx_to_char))

data preprocessing function

In [None]:
# Image pipelines for the recognition model. Both resize to 100x420, convert to
# a single grayscale channel and normalise with mean 0.5 / std 0.5; the
# training pipeline additionally applies photometric and geometric augmentation.
data_transform = {
    'train' : transform.Compose([
        transform.Resize((100,420)),
        transform.ColorJitter(
            brightness = 0.5,
            contrast = 0.5,
            saturation = 0.5
        ),
        transform.Grayscale(num_output_channels = 1),
        transform.GaussianBlur(3),
        transform.RandomAffine(degrees = 1, shear = 1),
        transform.RandomPerspective(
            distortion_scale = 0.5,
            p = 0.5,
            # Enum form of the former integer code 3 (PIL's BICUBIC);
            # torchvision documents InterpolationMode for this argument.
            interpolation = transform.InterpolationMode.BICUBIC
        ),
        transform.RandomRotation(degrees = 2),
        transform.ToTensor(),
        transform.Normalize((0.5,), (0.5,))
    ]),
    # No augmentation at validation / test / inference time.
    'val' : transform.Compose([
        transform.Resize((100,420)),
        transform.Grayscale(num_output_channels=1),
        transform.ToTensor(),
        transform.Normalize((0.5,), (0.5,))
    ]),
}


In [None]:
class STRDataset(Dataset):
    """Dataset of cropped word images and their transcriptions.

    Args:
        X: Sequence of image file paths.
        y: Sequence of transcriptions aligned with ``X``.
        char_to_idx: Character-to-index mapping passed to ``label_encoder``.
        max_label_len: Padded label length passed to ``label_encoder``.
        label_encoder: Optional callable
            ``(label, char_to_idx, max_label_len) -> (encoded_label, label_len)``.
        transform: Optional callable applied to the loaded RGB PIL image.

    Raises:
        ValueError: If ``X`` and ``y`` have different lengths.
    """
    def __init__(self, X,y,char_to_idx, max_label_len, label_encoder = None, transform = None):
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.transform = transform
        self.img_paths = X
        self.labels = y
        self.char_to_idx = char_to_idx
        self.max_label_len = max_label_len
        self.label_encoder = label_encoder

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, idx):
        """Return ``(image, encoded_label, label_len)`` for sample ``idx``.

        Without a ``label_encoder`` the raw transcription string and its
        character count are returned in place of the encoded values.
        """
        label = self.labels[idx]
        img_path = self.img_paths[idx]
        img = Image.open(img_path).convert("RGB")

        if self.transform:
            img = self.transform(img)

        if self.label_encoder:
            encoded_label, label_len = self.label_encoder(
                label,
                self.char_to_idx,
                self.max_label_len
            )
        else:
            # label_encoder defaults to None; without this branch the return
            # statement would reference names that were never assigned.
            encoded_label, label_len = label, len(label)
        return img, encoded_label, label_len


In [None]:
seed = 0
val_size = 0.3
test_size = 1/3
is_shuffle = True
X_train,X_val, y_train, y_val = train_test_split(
    img_paths,
    labels,
    test_size=val_size,
    random_state=seed,
    shuffle=is_shuffle
)

X_val, X_test, y_val, y_test = train_test_split(
    X_val,
    y_val,
    test_size=test_size,
    random_state=seed,
    shuffle=is_shuffle
)

In [None]:
X_train

In [None]:
y_train

In [None]:
train_dataset = STRDataset(
    X_train,
    y_train,
    char_to_idx,
    max_label_len,
    encode_label,
    data_transform['train']
)
val_dataset = STRDataset(
    X_val,
    y_val,
    char_to_idx,
    max_label_len,
    encode_label,
    data_transform['val']
)
test_dataset = STRDataset(
    X_test,
    y_test,
    char_to_idx,
    max_label_len,
    encode_label,
    data_transform['val']
)

In [None]:
print(f"Train dataset: {len(train_dataset)}")
print(f"Val dataset: {len(val_dataset)}")
print(f"Test dataset: {len(test_dataset)}")

In [None]:
train_batch_size = 32
test_batch_size = 8

train_loader = DataLoader(
    train_dataset,
    batch_size = train_batch_size,
    shuffle = True,
)

val_loader = DataLoader(
    val_dataset,
    batch_size = test_batch_size,
    shuffle = False,
)

test_loader = DataLoader(
    test_dataset,
    batch_size = test_batch_size,
    shuffle = False,
)

In [None]:
train_features, train_labels, train_lengths = next(iter(train_loader))

def show_batch(imgs,labels):
    """Print the encoded and decoded labels of a batch and plot its images.

    Args:
        imgs: Batch of image tensors, passed to ``torchvision.utils.make_grid``.
        labels: Batch of encoded labels, decoded with the notebook-level
            ``decode`` function and ``idx_to_char`` mapping.
    """
    print(labels)
    labels = decode(labels, idx_to_char)
    print(labels)
    # Tile the batch 4 images per row; normalize=True rescales values for display.
    grid = torchvision.utils.make_grid(imgs, nrow = 4, normalize = True)
    plt.figure(figsize = (10,20))
    # Move the channel axis last, the layout plt.imshow expects.
    plt.imshow(np.transpose(grid, (1,2,0)))
    plt.axis('off')
    plt.show()

show_batch(train_features, train_labels)

In [None]:
class CRNN(nn.Module):
    """CNN + bidirectional LSTM recogniser producing per-timestep log-probabilities.

    Args:
        vocab_size: Number of symbols in the character mapping (including the
            CTC blank). The mapping used in this notebook is 1-based with 0
            reserved for padding, so the classifier is built with
            ``vocab_size + 1`` outputs to make every index ``0..vocab_size``
            a valid class.
        hidden_size: Hidden size of each LSTM direction.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout used after the feature projection and between LSTM
            layers (LSTM dropout is disabled when ``n_layers == 1``).
        unfreeze_layers: Number of trailing backbone modules whose parameters
            are marked trainable.

    The forward pass returns a tensor of shape
    ``(sequence_length, batch, vocab_size + 1)``, the time-major layout
    ``nn.CTCLoss`` takes as input.
    """
    def __init__(self, vocab_size, hidden_size, n_layers, dropout = 0.2, unfreeze_layers = 3):
        super(CRNN, self).__init__()
        backbone = timm.create_model(
            'resnet101',
            pretrained = True,
            in_chans = 1
        )

        # Drop the last two child modules (presumably the global pooling and
        # classifier head of the timm ResNet) and pool only over height, so
        # the width axis survives as the sequence axis.
        modules = list(backbone.children())[:-2]
        modules.append(nn.AdaptiveAvgPool2d((1,None)))
        self.backbone = nn.Sequential(*modules)

        # NOTE(review): nothing in this class sets requires_grad=False first,
        # so this loop only has an effect if parameters were frozen elsewhere.
        for parameter in self.backbone[-unfreeze_layers:].parameters():
            parameter.requires_grad = True

        # Project the 2048-channel backbone features to the LSTM input size.
        self.mapSeq = nn.Sequential(
            nn.Linear(2048, 512),
            nn.ReLU(),
            nn.Dropout(dropout),
        )

        self.lstm = nn.LSTM(
            512,
            hidden_size,
            n_layers,
            batch_first = True,
            dropout = dropout if n_layers > 1 else 0,
            bidirectional = True
        )

        self.layer_norm = nn.LayerNorm(hidden_size * 2)

        # Character ids run from 1 to vocab_size (0 is padding), so
        # vocab_size + 1 classes are needed; with only vocab_size outputs the
        # highest id would be out of range for the loss and never predictable.
        self.out = nn.Sequential(
            nn.Linear(hidden_size * 2, vocab_size + 1),
            nn.LogSoftmax(dim = 2)
        )

    def forward(self,x):
        """Return log-probabilities of shape ``(seq_len, batch, n_classes)``."""
        x = self.backbone(x)            # (batch, channels, 1, width)
        x = x.permute(0, 3, 1, 2)       # (batch, width, channels, 1)
        x = x.flatten(start_dim = 2)    # (batch, width, channels)
        x = self.mapSeq(x)
        x, _ = self.lstm(x)             # (batch, width, 2 * hidden_size)
        x = self.layer_norm(x)
        x = self.out(x)                 # (batch, width, n_classes)
        x = x.permute(1, 0, 2)          # time-major for CTC
        return x


In [None]:
hidden_size = 256
n_layers = 3
dropout = 0.2
unfreeze_layers = 3
model = CRNN(vocab_size, hidden_size, n_layers, dropout, unfreeze_layers)

In [None]:
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dummy_tensor = torch.rand((32,1,100,420)).to(device)
model.to(device)
with torch.no_grad():
    output = model(dummy_tensor)

print('Output shape', output.shape)

# **Training**

In [None]:
def evaluate(model, dataloader, criterion, device):
    """Compute the mean per-batch loss of ``model`` over ``dataloader``.

    The model is switched to eval mode and gradients are disabled.

    Args:
        model: Module whose output has the sequence length on dim 0 and the
            batch size on dim 1.
        dataloader: Iterable of ``(inputs, labels, labels_len)`` batches.
        criterion: Callable ``(outputs, labels, logits_lens, labels_len)``
            returning a scalar loss tensor.
        device: Device the batch tensors are moved to.

    Returns:
        ``np.mean`` of the per-batch loss values.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for inputs, labels, labels_len in dataloader:
            inputs, labels, labels_len = (
                tensor.to(device) for tensor in (inputs, labels, labels_len)
            )

            outputs = model(inputs)

            # Every sample in the batch uses the full output sequence length,
            # so the length vector is constant: one entry per batch element.
            seq_len, batch_size = outputs.size(0), outputs.size(1)
            logits_lens = torch.full(
                (batch_size,),
                seq_len,
                dtype=torch.long
            ).to(device)

            batch_loss = criterion(outputs, labels, logits_lens, labels_len)
            batch_losses.append(batch_loss.item())
        mean_loss = np.mean(batch_losses)
    return mean_loss

In [None]:
def fit(model, dataloader, criterion, optimizer, device, scheduler, epochs, val_dataloader = None):
    """Train ``model`` and evaluate it on validation data after every epoch.

    Args:
        model: Module whose output has the sequence length on dim 0 and the
            batch size on dim 1.
        dataloader: Training batches of ``(inputs, labels, labels_len)``.
        criterion: Loss called as ``(outputs, labels, logits_lens, labels_len)``.
        optimizer: Optimizer updating the model parameters.
        device: Device the model and batches are moved to.
        scheduler: Learning-rate scheduler stepped once per epoch.
        epochs: Number of epochs to run.
        val_dataloader: Validation batches. When omitted, the notebook-level
            ``val_loader`` variable is used (the behaviour before this
            parameter existed).

    Returns:
        ``(train_losses, val_losses)``: per-epoch mean losses.
    """
    if val_dataloader is None:
        # Backward-compatible fallback to the notebook global.
        val_dataloader = val_loader

    train_losses = []
    val_losses = []
    model.to(device)
    for epoch in range(epochs):
        batch_train_losses = []
        model.train()
        for inputs, labels, labels_len in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            labels_len = labels_len.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)

            # All samples use the full output sequence length.
            logits_lens = torch.full(
                size = (outputs.size(1),),
                fill_value = outputs.size(0),
                dtype = torch.long
            ).to(device)

            loss = criterion(outputs, labels, logits_lens, labels_len)
            loss.backward()
            # Clip the gradient norm to 5 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()

            batch_train_losses.append(loss.item())

        train_loss = np.mean(batch_train_losses)
        train_losses.append(train_loss)

        val_loss = evaluate(
            model,
            val_dataloader,
            criterion,
            device
        )
        val_losses.append(val_loss)

        print(f"epoch {epoch + 1}:\tTrain loss: {train_loss:.4f}\t Val loss: {val_loss:.4f}")

        # Only ReduceLROnPlateau takes a metric. For epoch-based schedulers
        # such as StepLR the positional argument of step() is the (deprecated)
        # epoch index, so passing the loss there would corrupt the schedule.
        if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            scheduler.step(val_loss)
        else:
            scheduler.step()
    return train_losses, val_losses

In [None]:
epochs = 100
lr = 0.001
weight_decay = 1e-5
# StepLR documents step_size as an integer number of epochs; epochs * 0.4 on
# its own is a float.
scheduler_step_size = int(epochs * 0.4)
# CTC loss with the blank symbol at its index in char_to_idx; zero_infinity
# replaces infinite losses (and their gradients) with zero.
criterion = nn.CTCLoss(
    blank = char_to_idx[blank_char],
    zero_infinity = True
)
optimizer = torch.optim.Adam(
    model.parameters(),
    lr = lr,
    weight_decay = weight_decay
)
# Multiply the learning rate by 0.1 every scheduler_step_size epochs.
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size = scheduler_step_size,
    gamma = 0.1
)

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
train_losses, val_losses = fit(
    model,
    train_loader,
    criterion,
    optimizer,
    device,
    scheduler,
    epochs
)

In [None]:
plt.figure(figsize = (10,5))
plt.subplot(1,2,1)
plt.title('train loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.plot(train_losses, label = 'train')
plt.subplot(1,2,2)
plt.title('val loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.plot(val_losses, label = 'val')
plt.show()

In [None]:
train_loss = evaluate(
    model,
    train_loader,
    criterion,
    device
)
val_loss = evaluate(
    model,
    val_loader,
    criterion,
    device
)

test_loss = evaluate(
    model,
    test_loader,
    criterion,
    device
)

In [None]:
print(train_loss)
print(val_loss)
print(test_loss)

In [None]:
save_path = "models/ocr_crnn_base_best.pt"
torch.save(
    model.state_dict(),
    save_path
)

# End-to-End Pipeline

In [None]:
# Rebuild the recogniser and restore the weights saved above.
model = CRNN(vocab_size, hidden_size, n_layers, dropout, unfreeze_layers)
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
model.load_state_dict(torch.load(save_path, map_location = device))
model.eval()

In [None]:
# Vocabulary and hyper-parameters for inference.
# NOTE(review): these must match the values used when the checkpoint was
# trained, otherwise load_state_dict fails or indices decode to wrong characters.
chars = '0123456789abcdefghijklmnopqrstuvwxyz-'
vocab_size = len(chars)
char_to_idx = {char: idx + 1 for idx,char in enumerate(sorted(chars))}
idx_to_char = {idx: char for char,idx in char_to_idx.items()}

hidden_size = 256
n_layers = 3
unfreeze_layers = 3
dropout_prob = 0.2
device = 'cuda' if torch.cuda.is_available() else 'cpu'

crnn_model = CRNN(
    vocab_size,
    hidden_size,
    n_layers,
    dropout_prob,
    unfreeze_layers
).to(device)
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
crnn_model.load_state_dict(torch.load(save_path, map_location = device))
crnn_model.eval()

In [None]:
def text_detection(img_path, text_det_model):
    """Run the detector on one image and unpack its first result.

    Args:
        img_path: Image passed to the detector.
        text_det_model: Callable invoked as ``model(img_path, verbose=False)``
            that returns a sequence of result objects.

    Returns:
        ``(bboxes, classes, names, scores)``: the ``xyxy`` boxes, class ids and
        confidences of the first result converted with ``.tolist()``, plus the
        result's ``names`` attribute unchanged.
    """
    first_result = text_det_model(img_path, verbose = False)[0]
    detected = first_result.boxes

    return (
        detected.xyxy.tolist(),
        detected.cls.tolist(),
        first_result.names,
        detected.conf.tolist(),
    )

def text_recognition(img, data_transforms, text_reg_model, idx_to_char, device):
    """Transcribe one cropped word image with greedy CTC decoding.

    Args:
        img: Image accepted by ``data_transforms``.
        data_transforms: Callable turning ``img`` into a tensor without a
            batch dimension.
        text_reg_model: Recognition model returning scores with the sequence
            length on dim 0, the batch on dim 1 and the classes on dim 2.
        idx_to_char: Mapping from class index to character.
        device: Device the input is moved to.

    Returns:
        A list with one decoded string (batch of size 1).
    """
    transformed_image = data_transforms(img).unsqueeze(0).to(device)
    text_reg_model.eval()

    with torch.no_grad():
        logits = text_reg_model(transformed_image).detach().cpu()

    # Best class per timestep, batch-major: (batch, seq_len).
    best_path = logits.permute(1,0,2).argmax(2)

    # Greedy CTC decoding merges consecutive repeats BEFORE blanks are removed;
    # decode() only strips padding and blanks, so the merge happens here.
    collapsed = [torch.unique_consecutive(seq) for seq in best_path]
    text = decode(collapsed, idx_to_char)

    return text


In [None]:
def visualize_detections(img, detections):
    """Show an image with a red box and caption for every detection.

    Args:
        img: Image accepted by ``plt.imshow``.
        detections: Iterable of
            ``(bbox, detected_class, confidence, transcribed_text)`` where
            ``bbox`` is ``(x1, y1, x2, y2)``.
    """
    plt.figure(figsize = (12,8))
    plt.imshow(img)
    plt.axis('off')

    axes = plt.gca()
    for (x1, y1, x2, y2), detected_class, confidence, transcribed_text in detections:
        outline = plt.Rectangle(
            (x1, y1),
            x2 - x1,
            y2 - y1,
            fill=False,
            edgecolor='red',
            linewidth=2
        )
        axes.add_patch(outline)

        # Caption sits slightly above the top-left corner of the box.
        caption = f"{detected_class}: {confidence:.2f}\n{transcribed_text}"
        plt.text(
            x1, y1 - 10,
            caption,
            fontsize = 9,
            bbox = dict(facecolor='red', alpha = 0.5)
        )
    plt.show()

In [None]:
def predict(img_path, data_transform, text_det_model, text_reg_model, idx_to_char, device):
    """Detect text regions in an image, transcribe each one and plot the result.

    Args:
        img_path: Path of the image to process.
        data_transform: Transform applied to every crop before recognition.
        text_det_model: Detection model passed to ``text_detection``.
        text_reg_model: Recognition model passed to ``text_recognition``.
        idx_to_char: Mapping from class index to character.
        device: Device used for recognition.

    Returns:
        A list of ``(bbox, class_name, confidence, transcribed_text)`` tuples,
        one per detection.
    """
    bboxes, classes, names, confs = text_detection(img_path, text_det_model)

    # Same RGB conversion the recognition dataset applies when loading crops,
    # so non-RGB files go through the transform the same way.
    img = Image.open(img_path).convert("RGB")

    predictions = []

    for bbox, cls, conf in zip(bboxes, classes, confs):
        x1, y1, x2, y2 = bbox
        name = names[int(cls)]

        cropped_image = img.crop((x1,y1,x2,y2))

        transcribed_text = text_recognition(
            cropped_image,
            data_transform,
            text_reg_model,
            idx_to_char,
            device
        )

        predictions.append((bbox, name, conf, transcribed_text))
    visualize_detections(img, predictions)
    return predictions

In [None]:
from ultralytics import YOLO

model_path = 'models/yolov8/detect/train/weights/best.pt'
yolo_model = YOLO(model_path)

metrics = yolo_model.val(
    project = 'models',
    name = 'yolov8/detect/val',
)

In [None]:
img_dir = 'data/icdar2003/SceneTrialTrain/lfsosa_12.08.2002'
inf_transforms = data_transform['val']
image_extensions = ('.jpg', '.jpeg', '.png', '.bmp')

# Sorted for a reproducible order; anything that is not an image file
# (thumbnail caches, annotation files, ...) is skipped instead of crashing.
for img_name in sorted(os.listdir(img_dir)):
    if not img_name.lower().endswith(image_extensions):
        continue
    img_path = os.path.join(img_dir, img_name)

    predictions = predict(
        img_path,
        data_transform = inf_transforms,
        text_det_model = yolo_model,
        text_reg_model = crnn_model,
        idx_to_char = idx_to_char,
        device = device
    )

In [None]:
!pip install pyspellchecker