### Error Analysis of PARSeq


In [40]:
import pandas as pd
import cv2
import os
from tqdm import tqdm
from PIL import Image
from editdistance import eval as edit_distance

In [2]:
df = pd.read_csv('./output_with_conf.csv')
df.head()

Unnamed: 0,path,gt,pred,confidence,confidence_string,correct
0,b'/Dataset/modern/Google_books/images/I1KG1259...,པའི་རྩ་བ་ཉམས་པ།བླ་མ་དང་མཆེད་གྲོགས་ལ་དམ་ཚིག་ཉམས་པ་,པའི་རྩ་བ་ཉམས་པ།བླ་མ་དང་མཆེད་གྲོགས་ལ་དམ་ཚིག་ཉམས་པ་,0.724866,"tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000...",True
1,b'/Dataset/modern/Google_books/images/I2PD1835...,སྐྱིད།པོ་ནི་བོད་ཡུལ་མུན་སེལ་གཅིག་པོ་ཡོང་བའི་པོ...,སྐྱིད།པོ་ནི་བོད་ཡུལ་མུན་སེལ་གཅིག་པོ་ཡོང་བའི་པོ...,0.995991,"tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000...",True
2,b'/Dataset/modern/Google_books/images/I1KG1259...,གོས་མེད་གཅེར་བུར་འཁྱགས་སྦུབས་རླུང་མི་བཟད་པས་ཉེ...,གོས་མེད་གཅེར་བུར་འཁྱགས་སྦུབས་རླུང་མི་བཟད་པས་ཉེ...,0.89609,"tensor([1.0000, 0.9999, 1.0000, 1.0000, 1.0000...",True
3,b'/Dataset/modern/Google_books/images/I1KG1259...,།བསམ་མི་ཁྱབ་,།བསམ་མི་ཁྱབ་,0.974782,"tensor([0.9963, 0.9888, 0.9998, 1.0000, 1.0000...",True
4,b'/Dataset/modern/Google_books/images/I1KG1260...,དབང་པོ་གང་དང་གང་ལམ་ཉིད།།,དབང་པོ་གང་དང་གང་ལམ་ཉིད།།,0.97497,"tensor([1.0000, 1.0000, 1.0000, 0.9999, 1.0000...",True


In [3]:
print('Total rows: ', len(df))
print('Total correct: ', len(df[df['correct'] == True]))
print('Total incorrect: ', len(df[df['correct'] == False]))
print('Accuracy: ', len(df[df['correct'] == True]) / len(df))


Total rows:  245790
Total correct:  200935
Total incorrect:  44855
Accuracy:  0.817506814760568


In [7]:
from io import BytesIO

confidence_threshold_buckets = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
# ... existing code until the Excel writing part ...
for i in range(len(confidence_threshold_buckets)-1):
    th1 = confidence_threshold_buckets[i]
    th2 = confidence_threshold_buckets[i+1]
    # incorrect_df is df[df['confidence'] > th1] and df[df['confidence'] < th2]
    incorrect_df = df[(df['confidence'] > th1) & (df['confidence'] < th2)]
    incorrect_df = incorrect_df[incorrect_df['confidence'] < th2]
    incorrect_df = incorrect_df[incorrect_df['correct'] == False]
    # pick max(50, len(incorrect_df))
    incorrect_df = incorrect_df.head(min(50, len(incorrect_df)))
    with pd.ExcelWriter(
        f'./visualized/visual_analysis_{th1}.xlsx', 
        engine='xlsxwriter'
    ) as writer:
        workbook = writer.book
        worksheet = workbook.add_worksheet('Analysis')
        
        # Set column widths
        worksheet.set_column('A:A', 50)  # Image column
        worksheet.set_column('B:D', 20)  # Other columns
        
        # Write headers
        headers = ['Image', 'Ground Truth', 'Prediction', 'Confidence', 'Edit Distance', 'Label Length']
        for col, header in enumerate(headers):
            worksheet.write(0, col, header)
        
        row = 1
        for index, row_data in incorrect_df.iterrows():
            path, gt, pred, conf = row_data['path'], row_data['gt'], row_data['pred'], row_data['confidence']
            path = path.replace('b\'', '').replace('\'', '')
            img_name = path.split('/')[-1]
            path = os.path.join('/Dataset/monlam-data/monlam.ai.ocr/OCR/training_images/', img_name)
            edit_d = edit_distance(gt, pred)
            label_len = len(gt)
            # Read and process image
            img = cv2.imread(path)
            img = cv2.resize(img, (512, 32))
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
            
            # Save image to memory buffer
            image_buffer = BytesIO()
            Image.fromarray(img).save(image_buffer, format='PNG')
            
            # Insert image into worksheet
            worksheet.insert_image(row, 0, '', {'image_data': image_buffer})
            worksheet.set_row(row, 40)  # Set row height to accommodate image
            
            # Write other data
            worksheet.write(row, 1, gt)
            worksheet.write(row, 2, pred)
            worksheet.write(row, 3, float(conf))
            worksheet.write(row, 4, edit_d)
            worksheet.write(row, 5, label_len)
            
            row += 1

### Edit Distance Analysis

In [38]:
"""
Edit distance = minimum( insertion, deletion, replacements)
While computing edit distance let’s track the most optimal path and note down the frequency insertions, deletions and replacements for each character. 
"""
# import edit distance from editdistance
import editdistance
import pandas as pd

insertions = {}
deletions = {}
replacements = {}

def edit_distance_dp(s1, s2):
    if len(s1) > len(s2):
        s1, s2 = s2, s1
    
    dp = [[0] * (len(s2) + 1) for _ in range(len(s1) + 1)]
    path = [[None] * (len(s2) + 1) for _ in range(len(s1) + 1)]
    for i in range(len(s1) + 1):
        dp[i][0] = i
    for j in range(len(s2) + 1):
        dp[0][j] = j

    for i in range(1, len(s1) + 1):
        for j in range(1, len(s2) + 1):
            if s1[i-1] == s2[j-1]:
                dp[i][j] = dp[i-1][j-1]
                path[i][j] = (i-1, j-1)
            else:
                dp[i][j] = min(dp[i-1][j], dp[i][j-1], dp[i-1][j-1]) + 1
                # here dp[i-1][j] is deletion, dp[i][j-1] is insertion, dp[i-1][j-1] is replacement
                if dp[i][j] == dp[i-1][j] + 1:
                    path[i][j] = (i-1, j)
                elif dp[i][j] == dp[i][j-1] + 1:
                    path[i][j] = (i, j-1)
                else:
                    path[i][j] = (i-1, j-1)
            
        # backtrack to get the path
        i, j = len(s1), len(s2)
        while path[i][j] is not None:
            prev = path[i][j]
            if prev[0] == i-1 and prev[1] == j-1 and s1[i-1] != s2[j-1]:
                replacement_pair = (s1[i-1], s2[j-1])
                replacements[replacement_pair] = replacements.get(replacement_pair, 0) + 1
            elif prev[0] == i-1 and prev[1] == j:
                deletions[s1[i-1]] = deletions.get(s1[i-1], 0) + 1
            elif prev[0] == i and prev[1] == j-1:
                insertions[s2[j-1]] = insertions.get(s2[j-1], 0) + 1
            i, j = prev
    return dp[-1][-1]


output_df = pd.read_csv('./output_with_conf.csv')

# apply edit distance dp to gt and pred, but remove these two chars before computing edit distance: ་, །
output_df['edit_distance'] = output_df.apply(lambda row: editdistance.eval(row['gt'].replace('་', '').replace('།', ''), row['pred'].replace('་', '').replace('།', '')), axis=1)
output_df.to_csv('./output_with_edit_distance_no_diacritics.csv', index=False)

In [42]:
# calculate these values:, Accuracy, 1 - NED, Confidence, Label Length
no_diacritics_df = pd.read_csv('./output_with_edit_distance_no_diacritics.csv')
total_acc = 0
total_ned = 0
total_conf = 0
total_label_len = 0
for i in tqdm(range(len(no_diacritics_df))):
    gt, pred = no_diacritics_df.iloc[i]['gt'], output_df.iloc[i]['pred']
    if gt.replace('་', '').replace('།', '') == pred.replace('་', '').replace('།', ''):
        total_acc += 1     
    total_ned += 1 - no_diacritics_df.iloc[i]['edit_distance'] / max(len(gt), len(pred))
    total_conf += no_diacritics_df.iloc[i]['confidence']
    total_label_len += len(gt)
    
print('Results without symbols: tsek and shey')
print('Total samples: ', len(no_diacritics_df))
print('Accuracy: ', total_acc / len(output_df))
print('1 - NED: ', total_ned / len(output_df))
print('Confidence: ', total_conf / len(output_df))
print('Label Length: ', total_label_len / len(output_df))

100%|██████████| 245790/245790 [00:35<00:00, 6839.96it/s]

Results without symbols: tsek and shey
Total samples:  245790
Accuracy:  0.89003620977257
1 - NED:  0.9969187756614225
Confidence:  0.8958869955094385
Label Length:  57.43771512266569





In [14]:
# sort dictionaries with value
insertions = dict(sorted(insertions.items(), key=lambda item: item[1], reverse=True))
deletions = dict(sorted(deletions.items(), key=lambda item: item[1], reverse=True))
replacements = dict(sorted(replacements.items(), key=lambda item: item[1], reverse=True))

# update value with (value, unicode(key))
insertions_new = {k: (v, ord(k)) for k, v in insertions.items()}
deletions = {k: (v, ord(k)) for k, v in deletions.items()}
replacements = {k: (v, ord(k)) for k, v in replacements.items()}


TypeError: ord() expected string of length 1, but tuple found

In [24]:
# get top 10 
print(list(insertions.items())[:10])
print(list(deletions.items())[:10])
print(list(replacements.items())[:10])

[('་', (40211, 3851)), ('།', (5468, 3853)), ('ུ', (1599, 3956)), ('ོ', (876, 3964)), ('ི', (621, 3954)), ('ྲ', (605, 4018)), ('ེ', (541, 3962)), ('ས', (392, 3942)), ('ྱ', (387, 4017)), ('ཱ', (370, 3953))]
[('་', (237, 3851)), ('།', (173, 3853)), ('ུ', (70, 3956)), ('ོ', (62, 3964)), ('ས', (61, 3942)), ('ེ', (60, 3962)), ('ི', (50, 3954)), ('ད', (45, 3921)), ('ག', (42, 3906)), ('ྲ', (38, 4018))]
[(('༌', '་'), 9668), (('་', '༌'), 3327), (('༑', '།'), 2491), (('།', '་'), 1025), (('་', '།'), 476), (('།', '༑'), 406), (('ཪ', 'ར'), 292), (('ེ', 'ི'), 257), (('ི', 'ེ'), 231), (('༎', '།'), 210)]


In [34]:
print(ord('ཪ'))

3946


In [37]:
# print dict values for Tibetan numerals
tib_numerals = [chr(i) for i in range(0x0F20, 0x0F33)]
print('insertions:')
for num in tib_numerals:
    print(num, insertions.get(num, 0))
print('deletions:')
for num in tib_numerals:    
    print(num, deletions.get(num, 0))
print('replacements:')
for num in tib_numerals:
    print(num, replacements.get(num, 0))

insertions:
༠ 0
༡ 0
༢ 0
༣ 0
༤ 0
༥ 0
༦ 0
༧ 0
༨ 0
༩ 0
༪ 0
༫ 0
༬ 0
༭ 0
༮ 0
༯ 0
༰ 0
༱ 0
༲ 0
deletions:
༠ 0
༡ 0
༢ 0
༣ 0
༤ 0
༥ 0
༦ 0
༧ 0
༨ 0
༩ 0
༪ 0
༫ 0
༬ 0
༭ 0
༮ 0
༯ 0
༰ 0
༱ 0
༲ 0
replacements:
༠ 0
༡ 0
༢ 0
༣ 0
༤ 0
༥ 0
༦ 0
༧ 0
༨ 0
༩ 0
༪ 0
༫ 0
༬ 0
༭ 0
༮ 0
༯ 0
༰ 0
༱ 0
༲ 0
