# Optical Character Recognition (OCR)

## Libraries and settings

In [12]:
# Prerequisites:
# 1.) Install tesseract on your computer 
#     Windows: https://linuxhint.com/install-tesseract-windows
#     (dont forget to set the path to your installation of tesseract)
#     Mac: brew install tesseract
#
# 2.) Download german language model (default is english):
#     Go to: https://github.com/tesseract-ocr/tessdata
#     Download: 'deu.traineddata'
#     Copy the file into your tesseract installation folder under
#     ...\Tesseract-OCR\tessdata\deu.traineddata

# Libraries
import os
import re
import cv2
import pytesseract
from pytesseract import Output
import PIL
from PIL import Image

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Show current working directory
print(os.getcwd())

/Users/ivesbrunner/Documents/Studium/01_Bachelor/04_Semester/04_ScientificProgramming/scientific_programming/Week_07/exercises


## Read and plot digital image of a receipt

In [13]:
# Filename to read
img_file = 'receipt_eng.jpg'

# Import and plot image of the receipt
img = cv2.imread(img_file)

cv2.imshow('img', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

## Image processing to improve the image quality

In [14]:
# Increase brightness function
def increase_brightness(img, value=30):
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    h, s, v = cv2.split(hsv)

    lim = 255 - value
    v[v > lim] = 255
    v[v <= lim] += value

    final_hsv = cv2.merge((h, s, v))
    img = cv2.cvtColor(final_hsv, cv2.COLOR_HSV2BGR)
    return img

# Improve image quality step by step
img_01 = cv2.imread(img_file)
img    = increase_brightness(img_01, value=30) 

cv2.imshow('img', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

## Getting boxes around the text

In [15]:
h, w, c = img.shape
boxes = pytesseract.image_to_boxes(img, lang='eng')

for b in boxes.splitlines():
    b = b.split(' ')
    img = cv2.rectangle(img, (int(b[1]), h - int(b[2])), (int(b[3]), h - int(b[4])), (0, 255, 0), 1)

d = pytesseract.image_to_data(img, output_type=Output.DICT, lang='eng')

cv2.imshow('img', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

## Template matching

In [16]:
img_01 = cv2.imread(img_file)
img    = increase_brightness(img_01, value=30)

d = pytesseract.image_to_data(img, output_type=Output.DICT, lang='eng')
keys = list(d.keys())
vals = list(d.values())

# Try to find all numbers using regex
pattern = r'\d+'

n_boxes = len(d['text'])
for i in range(n_boxes):
    if int(d['conf'][i]) >= 0:
        if re.match(pattern, d['text'][i]):
            (x, y, w, h) = (d['left'][i], d['top'][i], d['width'][i], d['height'][i])
            img = cv2.rectangle(img, (x, y), (x + w, y + h), (0,128,255), 2)

# Plot image in a separate window
cv2.imshow('img', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

## Print content of boxes around the text

In [17]:
# Total number of text-boxes
print('Number of boxes:', n_boxes, '\n')

# Boxes
boxes = pytesseract.image_to_boxes(img, lang='eng')
print(boxes[0:180])

# Keys
print(d.keys())

# Values
list(d.values())[11][17:21]

Number of boxes: 77 

> 70 332 88 368 0
# 93 340 103 362 0
O 106 320 120 366 0
C 118 317 136 370 0
K 136 328 153 373 0
E 157 326 170 371 0
N 172 326 187 370 0
G 188 327 202 368 0
L 205 329 216 367 0
I 2
dict_keys(['level', 'page_num', 'block_num', 'par_num', 'line_num', 'word_num', 'left', 'top', 'width', 'height', 'conf', 'text'])


['', 'GIBRALTAR', '', 'TEL']

In [18]:
def replace_chars(text):
    """
    Replaces all characters instead of numbers from 'text'.
    :param text: Text string to be filtered
    :return: Resulting number
    """
    list_of_numbers = re.findall(r'\d+', text)
    result_number = '|'.join(list_of_numbers)
    return result_number

# Content of the receipt
ocr_result = pytesseract.image_to_string(Image.open(img_file), lang='eng')
print(ocr_result)

# Numbers only
ocr_result = replace_chars(ocr_result)
print('Numbers only:')
print(ocr_result)

>OQKENGLIs"

FISH a CHIPS

1 CASEMATES square
GIBRALTAR
TEL +350 20051218

REG 29505-2016 13:33
CLERK 4 Mca 2373

TABLE Noz107

1 LRG GOD & chips
£11.95

1 REG GOD & CHIPS £7.55
1 BREAD & BUTTER £1.00
3 SOFT DRINK £4.50
TOTAL... &25. 00
(37.50)


Numbers only:
1|350|20051218|29505|2016|13|33|4|2373|107|1|11|95|1|7|55|1|1|00|3|4|50|25|00|37|50


## Write extracted text to file

In [19]:
parse_text = []
word_list  = []
last_word  = ''

for word in d['text']:
    if word!='':
        word_list.append(word)
        last_word = word
    if (last_word!='' and word == '') or (word==d['text'][-1]):
        parse_text.append(word_list)
        word_list = []

print(parse_text)

# Write to .csv-file
import csv
with open('result_text_eng.txt',  'w', newline="") as file:
          csv.writer(file, delimiter=" ").writerows(parse_text)

[[], [], [], [], ['>#OCKENGLIS"'], [], [], ['FISH', 'g', 'CHIPS'], [], [], ['1', 'CASEMATES', 'square'], ['GIBRALTAR'], ['TEL', '+360', '20051218'], [], [], ['REG', '23205-2016', '13'], ['CLERK', '4', 'Mc#0'], [], [], ['TABLE', 'Noz107'], [], [], ['1', 'LRG', 'GOD', 'z', 'cHiPs'], ['£11.95'], ['1', 'REG', 'GOD', '&', 'CHIPS', '47.55'], ['1', 'BREAD', '£', 'BUTTER', '£1.00'], ['3', 'SOFT', 'DRINK', '£4.', '60'], ['TOTAL...', '£25.', 'G0'], ['(€37.50)'], [], [], []]


### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [20]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
POSIX
Darwin | 23.4.0
Datetime: 2024-04-03 15:23:14
Python Version: 3.10.13
-----------------------------------
