In [1]:
import sys
sys.path.append('../src')

import os
import torch
import glob
import numpy as np
import torch.optim as optim
import torch.nn.functional as F

from ocr_data_loader import *
from ocr_utils import *
from ocr_image_transformations import *
from ocr_model import OCRModel
import matplotlib.pyplot as plt
from torchvision import transforms
from torch import nn
import torchvision
from PIL import Image
import torch
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
import random
import time
from Levenshtein import distance
from ocr_utils import *
from torch import nn

# Load & Predict
load a model and get its predictions on the hold out book

In [2]:
BASE_DIR = '../../GT4HistOCR/corpus'
DATA_SET_NAME = 'RefCorpus-ENHG-Incunabula'
HOLDOUT_BOOK = '1499-CronicaCoellen'
MODEL_NAME = '17/model_3a20e32bea03553c26a0989a9ff9892d7dcd5918_dict'
FRAME_SIZE = 1
HIDDEN_LAYER_SIZE = 200
HIDDEN_LAYERS_NUM = 3
ALPHABET_SIZE = 92
BATCH_SIZE = 10

MAX_IMAGE_WIDTH=500
MAX_IMAGE_HEIGHT=20

INPUT_DIMENSION = MAX_IMAGE_HEIGHT * FRAME_SIZE

transformation = transforms.Compose([
    ImageThumbnail(MAX_IMAGE_HEIGHT, MAX_IMAGE_WIDTH),
    transforms.ToTensor(),
    ImageTensorPadding(MAX_IMAGE_HEIGHT, MAX_IMAGE_WIDTH),
    UnfoldImage(1, FRAME_SIZE)
])

train_data, test_data, holdout_data , dataset = load_data(base_dir = BASE_DIR, dataset_name = DATA_SET_NAME, 
                                           holdout_book = HOLDOUT_BOOK, transformation=transformation,
                                           batch_size=BATCH_SIZE, train_test_split=.8)

model = OCRModel(INPUT_DIMENSION, HIDDEN_LAYER_SIZE, HIDDEN_LAYERS_NUM, ALPHABET_SIZE)

state_dict = torch.load(f'../../models/RefCorpus-ENHG-Incunabula/{MODEL_NAME}.pth',
                       map_location=torch.device('cpu'))


model.load_state_dict(state_dict)
model.eval()
predictions = model.get_predictions(holdout_data, dataset.alphabet)

1628
23158


# Time Savings

In [None]:
def get_wrong_predictions_number(predictions, error_threshold):
    wrong = 0
    for true_text, predicted_text in zip(predictions['true_texts'], predictions['predicted_texts']):
        sample_error = distance(true_text, predicted_text)/ len(true_text)
        if sample_error > error_threshold: wrong+=1
    
    return wrong

def calculate_time_saving(samples_number, human_time_per_sample, wrong_predictions, acceptable_error_rate):
    human_time_per_sample = 5 # 5 seconds
    total_human_time = samples_number*human_time_per_sample

    prediction_error_rate = wrong_predictions*100/samples_number
    
    samples_done_by_human = max(prediction_error_rate - acceptable_error_rate, 0)*samples_number / 100
    
    human_time = samples_done_by_human * human_time_per_sample
    time_saving = (total_human_time - human_time)*100/total_human_time
    
    return time_saving

samples_number = len(predictions['predicted_texts'])
acceptable_error_rates = np.arange(0, 22, 1)

error_thersholds = np.arange(0, 0.15, 0.02)
fig, axs = plt.subplots(1, 1, figsize=(10, 10))

for error_threshold in error_thersholds:
    wrong_predictions = get_wrong_predictions_number(predictions, error_threshold)
    model_error_rate = wrong_predictions*100/samples_number
    print(model_error_rate)
    time_savings = []
    for acceptable_error_rate in acceptable_error_rates:
        time_saving = calculate_time_saving(samples_number, 5, wrong_predictions, acceptable_error_rate)
        time_savings.append(int(time_saving))
    axs.set_xlabel("Acceptable Error Rate %") 
    axs.set_ylabel("Time Saving %") 
    axs.plot(acceptable_error_rates, time_savings, label= f'Error Rate = {format(model_error_rate, ".2f")}%')
    axs.legend(loc='lower right')

plt.savefig('time_saving.png')


# Confidence

In [5]:
texts_confidence = np.array(predictions['texts_confidence'])
chars_confidence = np.array(predictions['chars_confidence'])
chars = { i: [] for i,char in enumerate(dataset.alphabet) }
for item in chars_confidence:
    for pair in item:
            chars[pair[0]].append(pair[1])
print(len(chars))
print(texts_confidence.min())
print(texts_confidence.max())

92
0.726382000092417
0.9937453635220709


In [6]:
print(texts_confidence.min())
print(texts_confidence.max())
print(np.array(chars[1]).mean())

0.726382000092417
0.9937453635220709
0.9693175922091773


In [7]:
for key, value in chars.items():
    print(f'{dataset.alphabet[key]} : {format(np.array(value).mean(), ".2f")}')

$ : nan
  : 0.97
! : 0.42
& : nan
( : nan
) : nan
, : nan
. : 0.90
/ : 0.88
3 : nan
4 : nan
: : 0.60
; : nan
? : 0.51
A : 0.83
B : 0.73
C : 0.89
D : 0.90
E : 0.88
F : 0.53
G : 0.81
H : 0.56
I : nan
J : 0.91
K : nan
L : 0.48
M : 0.78
N : 0.72
O : 0.82
P : 0.87
Q : nan
R : 0.66
S : 0.85
T : 0.75
U : nan
V : 0.88
W : 0.89
X : nan
Y : 0.37
Z : 0.51
a : 0.98
b : 0.96
c : 0.98
d : 0.98
e : 0.98
f : 0.97
g : 0.98
h : 0.98
i : 0.98
j : 0.90
k : 0.96
l : 0.98
m : 0.98
n : 0.98
o : 0.98
p : 0.96
q : 0.93
r : 0.97
s : 0.98
t : 0.98
u : 0.96
v : 0.98
w : 0.98
x : 0.87
y : 0.98
z : 0.97
¶ : 0.90
· : 0.85
ß : 0.93
ã : 0.90
ä : nan
ð : 0.91
ñ : 0.96
õ : 0.79
ö : 0.44
ü : 0.80
ÿ : 0.69
ĩ : nan
ũ : 0.86
ů : 0.79
ſ : 0.98
̃ : 0.56
̈ : 0.43
ͤ : 0.86
ᷣ : 0.62
ṽ : nan
ẽ : 0.96
ỹ : 0.67
⸗ : 0.85
ꝓ : nan
ꝭ : nan
ꝰ : nan


  


# Confusion Matrix
Calculate the confusion matrix on char level and report (Accuracy, Recall, Precision)

In [9]:
# use the blank as padding char since we don't care for it
padding_char = '$'

chars = { char : i for i,char in enumerate(dataset.alphabet) }

confusion = get_confusion_matrix(predictions['true_texts'], predictions['predicted_texts'], chars, padding_char)

tp = np.diag(confusion) # True positive
fp = confusion.sum(axis=0) - tp # False positive
fn = confusion.sum(axis=1) - tp # False negative
tn = confusion.trace() - tp # True negative

print(confusion[48][52])
print(confusion[52][48])
for char, i in chars.items():
    accuracy = (tp[i] + tn[i])/ (tp[i] + tn[i] + fp[i] + fn[i])
    recall = tp[i]/ (tp[i] + fn[i])
    precision = tp[i] / (tp[i] + fp[i])
    print(f'{char} : Accuracy {format(accuracy, ".2f")} - Recall {format(recall, ".2f")} - Precision {format(precision, ".2f")}')      


15.0
5.0
$ : Accuracy 1.00 - Recall nan - Precision nan
  : Accuracy 0.94 - Recall 0.84 - Precision 0.83
! : Accuracy 1.00 - Recall nan - Precision nan
& : Accuracy 1.00 - Recall nan - Precision nan
( : Accuracy 1.00 - Recall nan - Precision nan
) : Accuracy 1.00 - Recall nan - Precision nan
, : Accuracy 1.00 - Recall 0.00 - Precision nan
. : Accuracy 0.99 - Recall 0.68 - Precision 0.77
/ : Accuracy 1.00 - Recall 0.47 - Precision 0.83
3 : Accuracy 1.00 - Recall nan - Precision nan
4 : Accuracy 1.00 - Recall nan - Precision nan
: : Accuracy 1.00 - Recall nan - Precision 0.00
; : Accuracy 1.00 - Recall nan - Precision nan
? : Accuracy 1.00 - Recall 0.00 - Precision 0.00
A : Accuracy 1.00 - Recall 0.64 - Precision 0.79
B : Accuracy 1.00 - Recall 0.55 - Precision 0.73
C : Accuracy 1.00 - Recall 0.76 - Precision 0.77
D : Accuracy 1.00 - Recall 0.73 - Precision 0.71
E : Accuracy 1.00 - Recall 0.88 - Precision 0.68
F : Accuracy 1.00 - Recall 0.50 - Precision 0.71
G : Accuracy 1.00 - Recall 0.



In [None]:
DATA_SET_NAME = 'EarlyModernLatin'
HOLDOUT_BOOK = '1483-Decades-Biondo'
MODEL_NAME = '1_150_2/model_1_150_2_50_dict'
FRAME_SIZE = 1
HIDDEN_LAYER_SIZE = 150
HIDDEN_LAYERS_NUM = 2
ALPHABET_SIZE = 181
BATCH_SIZE = 300

MAX_IMAGE_WIDTH=1657
MAX_IMAGE_HEIGHT=50

INPUT_DIMENSION = MAX_IMAGE_HEIGHT * FRAME_SIZE

transformation = transforms.Compose([
    ImageThumbnail(MAX_IMAGE_HEIGHT, MAX_IMAGE_WIDTH),
    transforms.ToTensor(),
    ImageTensorPadding(MAX_IMAGE_HEIGHT, MAX_IMAGE_WIDTH),
    UnfoldImage(1, FRAME_SIZE)
])

train_data, test_data, holdout_data , dataset = load_data(base_dir = BASE_DIR, dataset_name = DATA_SET_NAME, 
                                           holdout_book = HOLDOUT_BOOK, transformation=transformation,
                                           batch_size=BATCH_SIZE, train_test_split=.8)

model = OCRModel(INPUT_DIMENSION, HIDDEN_LAYER_SIZE, HIDDEN_LAYERS_NUM, ALPHABET_SIZE)

state_dict = torch.load(f'../../models/EarlyModelLatin/{MODEL_NAME}.pth',
                       map_location=torch.device('cpu'))


model.load_state_dict(state_dict)
model.eval()
predictions = model.get_predictions(holdout_data, dataset.alphabet)