In [1]:
from data.loader.custom_loader import CustomLoader
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import torch
from jiwer import cer
from tqdm import tqdm
from PIL import Image
from collections import defaultdict

# Load the pre-trained TrOCR model and processor
small_medium_model = VisionEncoderDecoderModel.from_pretrained("../custom_models/trocr-printed/w_augmentation/small_medium/1.0/vision_model")
small_medium_processor = TrOCRProcessor.from_pretrained("../custom_models/trocr-printed/w_augmentation/small_medium/1.0/processor")

kubhi_paths = ["../datasets/printed/Histrorical_News_Paper/combined.csv" ]
kubhist_cl = CustomLoader(kubhi_paths)
kubhist_cl.generate_dataframe()
#put ../ in every file name in the dataframe
kubhist_df = kubhist_cl.get_dataframe()
kubhist_df["file_name"] = "../" + kubhist_df["file_name"]
kubhist_df.head()

File exists: ../datasets/printed/Histrorical_News_Paper/combined.csv
Encoding: utf-8


Calculating max length: 100%|██████████| 8393/8393 [00:00<00:00, 22428.12it/s]


Unnamed: 0,file_name,text
0,../datasets/printed/Histrorical_News_Paper/TES...,denna ſekt.
1,../datasets/printed/Histrorical_News_Paper/TES...,﻿Antiomianer kallas ſå af Gre=
2,../datasets/printed/Histrorical_News_Paper/TES...,De antaga ej goda gerningar ſåſom
3,../datasets/printed/Histrorical_News_Paper/TES...,"nödwändiga medel till ſaligheten, och på="
4,../datasets/printed/Histrorical_News_Paper/TES...,"ſtå, att de utwalde ingenting kunna"


In [2]:
# Function to evaluate CER and show frequency of mismatched characters
def evaluate_cer_and_mismatched_chars(_model, _processor, _dataset):
    _model.eval()
    cer_scores = []
    mismatched_chars = defaultdict(int)

    for _, example in tqdm(_dataset.iterrows(), total=len(_dataset)):
        image_path = example["file_name"]
        ground_truth_text = example["text"]

        # Load the image
        image = Image.open(image_path).convert("RGB")

        # Preprocess the image
        pixel_values = _processor(images=image, return_tensors="pt").pixel_values

        # Generate prediction
        with torch.no_grad():
            generated_ids = _model.generate(pixel_values)
        predicted_text = _processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        # Calculate CER
        cer_score = cer(ground_truth_text, predicted_text)
        cer_scores.append(cer_score)

        # Compare ground truth and predicted text character by character
        for gt_char, pred_char in zip(ground_truth_text, predicted_text):
            if gt_char != pred_char:
                mismatched_chars[(gt_char, pred_char)] += 1

    # Return the average CER and mismatched characters frequency
    _average_cer = sum(cer_scores) / len(cer_scores)
    return _average_cer, dict(mismatched_chars)

In [3]:
# Evaluate the model and get mismatched characters frequency
small_medium_model_average_cer, small_medium_model_mismatched_chars_freq = evaluate_cer_and_mismatched_chars(small_medium_model, small_medium_processor, kubhist_df[:2000])
print(f"Small medium model trained on 250k synthetic datasetAverage CER: {small_medium_model_average_cer}")
print("Mismatched Characters Frequency:")
for chars, freq in small_medium_model_mismatched_chars_freq.items():
    print(f"{chars}: {freq}")

100%|██████████| 2000/2000 [44:10<00:00,  1.33s/it] 

Small medium model trained on 250k synthetic datasetAverage CER: 0.9171647146017876
Mismatched Characters Frequency:
('e', 'o'): 337
('n', 'i'): 315
('n', ' '): 1115
('a', ':'): 49
('ſ', '1'): 18
('e', '0'): 40
('k', '.'): 23
('t', ' '): 980
('.', '1'): 22
('\ufeff', 'E'): 40
('A', 'n'): 13
('t', 's'): 225
('i', 't'): 177
('m', 'r'): 149
('i', ' '): 633
('n', 'v'): 103
('e', ' '): 1384
('r', 'd'): 141
(' ', 'e'): 768
('k', 'm'): 38
('a', ' '): 1153
('l', 'b'): 64
('l', 'o'): 145
('a', 'r'): 356
('s', ' '): 331
(' ', 'i'): 612
('ſ', ' '): 409
('å', 'O'): 1
(' ', 's'): 498
('a', 'l'): 250
('f', 'o'): 63
('G', 'p'): 3
('r', 'å'): 45
('=', 'V'): 3
('a', 'v'): 112
('n', 'a'): 419
('t', 'n'): 270
('g', 'i'): 138
('a', 'g'): 153
(' ', 'a'): 703
('j', 'p'): 8
(' ', 'å'): 147
('g', ' '): 338
('o', 'd'): 67
('d', 'a'): 204
('g', 'n'): 145
('r', 'j'): 56
('i', 'n'): 190
('n', 'u'): 127
('g', 'a'): 174
('r', 'i'): 280
('ſ', 'j'): 52
('å', 'u'): 17
('ſ', 'n'): 97
('o', 'i'): 132
('m', '.'): 30
('n'




In [4]:
# Load the pre-trained TrOCR model and processor
small_model = VisionEncoderDecoderModel.from_pretrained("../custom_models/trocr-printed/w_augmentation/small/1.1/vision_model")
small_processor = TrOCRProcessor.from_pretrained("../custom_models/trocr-printed/w_augmentation/small/1.1/processor")

small_model_average_cer, small_model_mismatched_chars_freq = evaluate_cer_and_mismatched_chars(small_model, small_processor, kubhist_df[:2000])
print(f"Small model trained on 120k synthetic dataset average CER: {small_model_average_cer}")
print("Mismatched Characters Frequency:")
for chars, freq in small_model_mismatched_chars_freq.items():
    print(f"{chars}: {freq}")

100%|██████████| 2000/2000 [43:45<00:00,  1.31s/it]

Small model trained on 120k synthetic dataset average CER: 0.9763435173651054
Mismatched Characters Frequency:
('d', 'u'): 67
('e', 't'): 369
('n', 'g'): 145
('n', '.'): 48
('a', ' '): 1135
(' ', ']'): 34
('ſ', ' '): 400
('e', ')'): 12
('k', ','): 63
('t', ' '): 992
('.', 's'): 50
('\ufeff', '1'): 39
('A', '6'): 1
('n', ' '): 1059
('t', 'k'): 141
('i', 'm'): 76
('o', ' '): 528
('m', 'v'): 60
('a', 'n'): 382
('e', 's'): 386
('r', ' '): 1009
(' ', 'a'): 841
('k', 'v'): 31
('l', 'A'): 20
('l', 'n'): 228
('a', 't'): 326
('s', 'o'): 71
(' ', 'n'): 708
('ſ', 'i'): 122
('å', 'n'): 49
('f', 'O'): 1
(' ', 'r'): 808
('G', 'd'): 3
('r', 'i'): 325
('e', 'n'): 451
('=', 'a'): 29
('a', 'v'): 146
('n', 'a'): 431
('t', 'n'): 309
('g', ' '): 396
('a', '4'): 9
('e', '-'): 19
('j', ' '): 70
(' ', '1'): 86
('g', ','): 50
('d', '1'): 28
(' ', '-'): 65
('e', '0'): 38
('r', ','): 165
('i', '1'): 21
('g', 'p'): 50
('a', 'o'): 241
('r', 'ä'): 75
('ſ', 'g'): 46
('å', ','): 29
('o', '0'): 14
('m', ' '): 334
('n'




In [5]:
# transfer learning
# Load the pre-trained TrOCR model and processor
tl_small_model = VisionEncoderDecoderModel.from_pretrained("../custom_models/trocr-printed/transfer-learning/small/1.1/vision_model")
tl_small_processor = TrOCRProcessor.from_pretrained("../custom_models/trocr-printed/transfer-learning/small/1.1/processor")

tl_small_model_average_cer, tl_small_model_mismatched_chars_freq = evaluate_cer_and_mismatched_chars(tl_small_model, tl_small_processor, kubhist_df[:2000])
print(f"transfer learned HTR with small synthetic dataset 120k model average CER: {tl_small_model_average_cer}")
print("Mismatched Characters Frequency:")
for chars, freq in tl_small_model_mismatched_chars_freq.items():
    print(f"{chars}: {freq}")

100%|██████████| 2000/2000 [46:44<00:00,  1.40s/it] 

transfer learned HTR with small synthetic dataset 120k model average CER: 0.8310155748062448
Mismatched Characters Frequency:
('d', ')'): 42
('e', ','): 214
('n', '.'): 93
('\ufeff', '3'): 12
('A', '1'): 5
('n', ' '): 996
('t', '%'): 3
('i', ' '): 675
('o', 'a'): 175
('m', 'v'): 39
('a', 'ö'): 44
('e', 's'): 298
('r', ' '): 1009
(' ', 'B'): 41
('k', 'N'): 3
('a', 'P'): 9
('l', ' '): 736
('l', 'f'): 75
('s', 'l'): 92
(' ', 'l'): 557
('ſ', 'e'): 112
('å', 'r'): 61
('a', 'p'): 98
('f', 'å'): 29
('G', 'o'): 7
('r', 'l'): 222
('e', 'j'): 57
('=', 'a'): 25
('D', 'B'): 2
('e', 'd'): 261
('a', 'v'): 101
('n', 'o'): 236
('t', 'k'): 147
('g', 'l'): 98
('a', 'e'): 443
(' ', 'r'): 603
('e', ' '): 1347
('j', '/'): 3
('g', '/'): 8
('o', ' '): 532
('d', 'g'): 51
('a', 'å'): 56
('g', ' '): 369
('e', 'v'): 169
('r', 'a'): 346
('i', 'l'): 253
('n', 'i'): 348
('r', 'n'): 325
('ſ', 'i'): 135
('å', ' '): 234
('ſ', 'f'): 61
('o', 'ä'): 28
('m', 'r'): 158
('n', 'd'): 176
('ö', 'u'): 14
('d', 'a'): 211
('w', 




In [6]:
# transfer learning
# Load the pre-trained TrOCR model and processor
tl_tiny_model = VisionEncoderDecoderModel.from_pretrained("../custom_models/trocr-printed/transfer-learning/tiny/1.0/vision_model")
tl_tiny_processor = TrOCRProcessor.from_pretrained("../custom_models/trocr-printed/transfer-learning/tiny/1.0/processor")

tl_tiny_model_average_cer, tl_tiny_model_mismatched_chars_freq = evaluate_cer_and_mismatched_chars(tl_tiny_model, tl_tiny_processor, kubhist_df[:2000])
print(f"transfer learned HTR with tiny synthetic dataset 42k model average CER: {tl_tiny_model_average_cer}")
print("Mismatched Characters Frequency:")
for chars, freq in tl_tiny_model_mismatched_chars_freq.items():
    print(f"{chars}: {freq}")

100%|██████████| 2000/2000 [47:32<00:00,  1.43s/it] 

transfer learned HTR with tiny synthetic dataset 42k model average CER: 0.9378913868262783
Mismatched Characters Frequency:
('d', '6'): 13
('e', ' '): 1412
('n', 'm'): 149
('n', 'a'): 417
('a', 'r'): 375
(' ', 's'): 538
('ſ', ' '): 385
('e', 'C'): 19
('k', '.'): 21
('\ufeff', '1'): 27
('A', '0'): 5
('n', ' '): 1020
('t', 'k'): 156
('i', 'm'): 60
('o', ' '): 531
('m', 'v'): 43
('a', 'n'): 360
('e', 's'): 394
('r', ' '): 1022
(' ', 'a'): 654
('k', 'v'): 23
('a', ' '): 1142
('l', 'K'): 16
('l', 'a'): 215
('a', 'l'): 281
('s', 'l'): 108
(' ', 'e'): 786
('å', 'M'): 3
(' ', 'o'): 406
('f', 'a'): 108
('G', 'p'): 4
('r', 'å'): 54
('=', 'M'): 1
('D', '1'): 5
('e', '5'): 5
('a', 'm'): 130
('t', 'r'): 258
('a', 's'): 322
('g', ' '): 418
('a', 'J'): 15
(' ', '.'): 250
('j', 'A'): 3
('o', 'C'): 12
('d', 'o'): 136
('a', 'o'): 241
(' ', 'k'): 267
('g', 'e'): 140
('e', ','): 207
('n', 'b'): 76
('i', 'r'): 207
('n', 'i'): 334
('g', 't'): 126
('a', 't'): 290
('r', 'i'): 290
('ſ', 'k'): 57
('å', ' '): 25


