In [18]:
import pandas as pd

# Load the caption table: column 0 is the image stem, column 1 the LaTeX caption.
# Incomplete rows are dropped *before* the extension is appended (a missing
# file name would otherwise raise on `NaN + '.jpg'`), and the index is rebuilt
# afterwards so that row positions and index labels agree for consumers that
# look rows up by integer.
df = (
    pd.read_table('./data/2014/caption.txt', header=None)  # 2014
    .rename(columns={0: "file_name", 1: "text"})
    .dropna()
    .assign(file_name=lambda frame: frame["file_name"] + ".jpg")
    .reset_index(drop=True)
)
df.head()


Unnamed: 0,file_name,text
0,18_em_0.jpg,x _ { k } x x _ { k } + y _ { k } y x _ { k }
1,18_em_10.jpg,2 6
2,18_em_11.jpg,q _ { t } = 2 q
3,18_em_12.jpg,\frac { p e ^ { t } } { 1 - ( 1 - p ) e ^ { t } }
4,18_em_13.jpg,4 ^ { 2 } + 4 ^ { 2 } + \frac { 4 } { 4 }


In [19]:
import torch
from torch.utils.data import Dataset
from PIL import Image

class IAMDataset(Dataset):
    """Map-style dataset pairing each image file with its tokenized caption.

    Args:
        root_dir: Prefix prepended verbatim to every ``file_name``. It is
            joined by plain string concatenation, so it must end with a path
            separator.
        df: DataFrame with ``file_name`` and ``text`` columns. Rows are
            addressed by position, so the index does not have to be a clean
            ``0..n-1`` range (e.g. after ``dropna``).
        processor: Callable that converts an image into ``pixel_values`` and
            exposes a ``tokenizer`` attribute providing ``pad_token_id``.
        max_target_length: Length every label sequence is padded or truncated
            to, so that items can be stacked into a batch.
    """

    def __init__(self, root_dir, df, processor, max_target_length=490):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{"pixel_values", "labels"}`` for the row at position ``idx``.

        ``labels`` has exactly ``max_target_length`` entries; padding
        positions are replaced by -100.
        """
        # Positional lookup: label-based `df[col][idx]` raises KeyError as soon
        # as the frame's index has gaps.
        file_name = self.df['file_name'].iloc[idx]
        text = self.df['text'].iloc[idx]

        # prepare image (i.e. resize + normalize); the context manager closes
        # the underlying file once the RGB copy has been made
        with Image.open(self.root_dir + file_name) as source_image:
            image = source_image.convert("RGB")
        pixel_values = self.processor(image, return_tensors="pt").pixel_values

        # add labels (input_ids) by encoding the text; truncation keeps
        # over-long captions at max_target_length so every item has the same
        # label length
        tokenizer = self.processor.tokenizer
        input_ids = tokenizer(text,
                              padding="max_length",
                              truncation=True,
                              max_length=self.max_target_length).input_ids

        # important: make sure that PAD tokens are ignored by the loss function
        pad_token_id = tokenizer.pad_token_id
        labels = [token if token != pad_token_id else -100 for token in input_ids]

        # squeeze(0) removes only the leading batch axis added by the processor
        return {"pixel_values": pixel_values.squeeze(0), "labels": torch.tensor(labels)}

In [20]:
from transformers import TrOCRProcessor

# The processor supplies both the image preprocessing and the tokenizer used
# by the dataset.
# NOTE(review): it is fetched from the "trocr-base-handwritten" hub entry while
# the model is restored from a local checkpoint — confirm this is the same
# processor that was used when that checkpoint was trained.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")

# Images are resolved relative to the same folder the captions were read from.
test_dataset = IAMDataset(df=df, processor=processor, root_dir='./data/2014/')

In [21]:
from torch.utils.data import DataLoader

# One sample per batch, served in dataset order (no shuffling requested).
test_dataloader = DataLoader(dataset=test_dataset, batch_size=1)

In [22]:
# Pull a single batch off a fresh iterator to sanity-check the pipeline.
first_batch_iterator = iter(test_dataloader)
batch = next(first_batch_iterator)

In [23]:
# Show the tensor shape stored under each key of the sample batch.
for name, tensor in batch.items():
    print(name, tensor.shape)

pixel_values torch.Size([1, 3, 384, 384])
labels torch.Size([1, 490])


In [24]:
# Undo the loss masking so the ids can be decoded: every -100 is overwritten
# with the tokenizer's pad id (in place, on the batch's own tensor), then the
# ids are turned back into text with special tokens stripped.
labels = batch["labels"]
labels.masked_fill_(labels == -100, processor.tokenizer.pad_token_id)
label_str = processor.batch_decode(labels, skip_special_tokens=True)
label_str

['x _ { k } x x _ { k } + y _ { k } y x _ { k }']

In [25]:
from transformers import VisionEncoderDecoderModel
import torch

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Restore the fine-tuned encoder-decoder weights from the local checkpoint
# directory and move them to the selected device (the call's return value is
# displayed as the cell output).
checkpoint_dir = './checkpoint_eval_2014_small_stage1_new_image/checkpoint-15000'
model = VisionEncoderDecoderModel.from_pretrained(checkpoint_dir)
model.to(device)

VisionEncoderDecoderModel(
  (encoder): DeiTModel(
    (embeddings): DeiTEmbeddings(
      (patch_embeddings): PatchEmbeddings(
        (projection): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): DeiTEncoder(
      (layer): ModuleList(
        (0): DeiTLayer(
          (attention): DeiTAttention(
            (attention): DeiTSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): DeiTSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): DeiTIntermediate(
            (dense): Linear(in_features=

In [30]:
from datasets import load_metric

# Character error rate metric; predictions and references are accumulated
# batch by batch and the score is computed once at the end of evaluation.
cer_metric = load_metric(path="cer")

Downloading:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

In [36]:
from tqdm.notebook import tqdm
import numpy as np

print("Running evaluation...")

total = 0       # number of samples scored so far
pred_label = 0  # number of samples whose decoded prediction equals its reference exactly

for batch in tqdm(test_dataloader):
    # predict using generate
    # NOTE(review): no length limit is passed here, so the checkpoint's own
    # generation settings apply — confirm they allow outputs as long as the
    # reference captions.
    pixel_values = batch["pixel_values"].to(device)
    outputs = model.generate(pixel_values)

    # decode predictions, and references after restoring the pad id that the
    # dataset replaced with -100 for the loss
    pred_str = processor.batch_decode(outputs, skip_special_tokens=True)
    labels = batch["labels"]
    labels[labels == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(labels, skip_special_tokens=True)

    # Score sample by sample. Comparing the two lists as a whole (and counting
    # one per batch) is only correct when batch_size == 1.
    pred_label += sum(pred == ref for pred, ref in zip(pred_str, label_str))
    total += len(label_str)

    # add batch to metric
    cer_metric.add_batch(predictions=pred_str, references=label_str)

# Exact-match accuracy over all scored samples, and corpus-level CER.
Accuracy_score = pred_label/total
final_score = cer_metric.compute()

Running evaluation...


  0%|          | 0/986 [00:00<?, ?it/s]

In [27]:
#from tqdm.notebook import tqdm
#import numpy as np

#print("Running evaluation...")

#for batch in tqdm(test_dataloader):
    # predict using generate
#    pixel_values = batch["pixel_values"].to(device)
#    outputs = model.generate(pixel_values)

    # decode
#    pred_str = processor.batch_decode(outputs, skip_special_tokens=True)
#    labels = batch["labels"]
#    labels[labels == -100] = processor.tokenizer.pad_token_id
#    label_str = processor.batch_decode(labels, skip_special_tokens=True)

    # add batch to metric
#    cer_metric.add_batch(predictions=pred_str, references=label_str)
#
#final_score = cer_metric.compute()

Running evaluation...


  0%|          | 0/986 [00:00<?, ?it/s]

In [37]:
# Report the character error rate computed by the evaluation loop.
cer_caption = "Character error rate on test set:"
print(cer_caption, final_score)

Character error rate on test set: 0.15821501014198783


In [39]:
# Report the exact-match accuracy computed by the evaluation loop.
accuracy_caption = "Accuracy rate on test set:"
print(accuracy_caption, Accuracy_score)

Accuracy rate on test set: 0.15821501014198783
156
986
