# Optical Character Recognition (OCR)

## Libraries and settings

In [10]:
# Prerequisites:
# 1.) Install tesseract on your computer 
#     Windows: https://linuxhint.com/install-tesseract-windows
#     (don't forget to set the path to your installation of tesseract)
#     Mac: brew install tesseract
# 2.) Download the German language model (the default is English):
#     Go to: https://github.com/tesseract-ocr/tessdata
#     Download: 'deu.traineddata'
#     Copy the file into your tesseract installation folder under
#     ...\Tesseract-OCR\tessdata\deu.traineddata

# Libraries
import os
import re
import cv2
import pytesseract
from pytesseract import Output
import PIL
from PIL import Image

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Show current working directory
print(os.getcwd())

u:\Lektionen\Scientific_Programming_FS2023\scientific_programming\Week_07\exercises


## Read and plot digital image of a receipt

In [11]:
# Filename to read (relative to the current working directory)
img_file = 'receipt.png'

# Import and plot image of the receipt
img = cv2.imread(img_file)

# cv2.imread does not raise when the file is missing or unreadable, it
# returns None; fail here with a clear message instead of inside cv2.imshow
if img is None:
    raise FileNotFoundError('Could not read image file: ' + img_file)

# Show the image in a separate window and wait until a key is pressed
cv2.imshow('img', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

## Image processing to improve the image quality

In [12]:
# Increase brightness function
def increase_brightness(img, value=30):
    """
    Shifts the brightness of an image by a fixed offset.

    The image is converted from BGR to HSV, the offset is added to the
    V channel with saturation at the 0/255 limits, and the result is
    converted back to BGR. The limit of 255 assumes an 8-bit image.

    :param img: Image in BGR channel order
    :param value: Offset added to the V channel. Positive values brighten,
                  negative values darken. Clamped to the range -255..255.
    :return: New image in BGR channel order with the adjusted brightness
    """
    # Clamp the offset so it always fits into 8-bit arithmetic; without this
    # an out-of-range offset wraps around or raises an overflow error
    value = max(-255, min(255, value))

    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    h, s, v = cv2.split(hsv)

    if value >= 0:
        # Pixels that would exceed 255 are set to 255, the rest are shifted up
        lim = 255 - value
        saturated = v > lim
        v[saturated] = 255
        v[~saturated] += value
    else:
        # Pixels that would drop below 0 are set to 0, the rest are shifted down
        drop = -value
        clipped = v < drop
        v[clipped] = 0
        v[~clipped] -= drop

    final_hsv = cv2.merge((h, s, v))
    img = cv2.cvtColor(final_hsv, cv2.COLOR_HSV2BGR)
    return img

# Reload the receipt from disk and brighten it by 30
img_01 = cv2.imread(img_file)
img = increase_brightness(img_01, 30)

# Display the brightened receipt in a separate window until a key is pressed
cv2.imshow('img', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

## Getting boxes around the text

In [13]:
# The image height is used below to flip the y-coordinates of the boxes
h, w, c = img.shape

# Character-level boxes, one line of space-separated fields per character
boxes = pytesseract.image_to_boxes(img, lang='deu')

# Word-level OCR data; taken before any box is drawn so that the green
# rectangles are not part of the image that is analysed
d = pytesseract.image_to_data(img, output_type=Output.DICT, lang='deu')

# Draw on a copy so that `img` itself stays free of annotations and the
# cell can be re-run without feeding an annotated image back into the OCR
img_boxes = img.copy()
for b in boxes.splitlines():
    b = b.split(' ')
    # Fields 1-4 hold the two corner points of the box; y is mirrored with h
    x1, y1, x2, y2 = (int(n) for n in b[1:5])
    cv2.rectangle(img_boxes, (x1, h - y1), (x2, h - y2), (0, 255, 0), 1)

# Show the annotated copy in a separate window until a key is pressed
cv2.imshow('img', img_boxes)
cv2.waitKey(0)
cv2.destroyAllWindows()

## Text template matching (finding numbers with a regular expression)

In [14]:
# Start again from a freshly loaded and brightened image
img_01 = cv2.imread(img_file)
img    = increase_brightness(img_01, value=30)

# Word-level OCR result as a dict of equally long lists (one entry per box)
d = pytesseract.image_to_data(img, output_type=Output.DICT, lang='deu')
keys = list(d.keys())
vals = list(d.values())

# Regex for words that begin with at least one digit
# (re.match only anchors at the start of the string)
pattern = r'\d+'

n_boxes = len(d['text'])
for i in range(n_boxes):
    # The confidence can arrive as an int or as a numeric string such as
    # '96.06', depending on the pytesseract/Tesseract versions; float()
    # accepts both, whereas int() fails on the decimal form.
    # Entries with a negative confidence are skipped.
    if float(d['conf'][i]) >= 0:
        if re.match(pattern, d['text'][i]):
            (x, y, w, h) = (d['left'][i], d['top'][i], d['width'][i], d['height'][i])
            # Mark the matching word with an orange rectangle (drawn in place)
            img = cv2.rectangle(img, (x, y), (x + w, y + h), (0,128,255), 2)

# Plot image in a separate window
cv2.imshow('img', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

## Print content of boxes around the text

In [15]:
# Total number of text-boxes
print('Number of boxes:', n_boxes, '\n')

# Character-level boxes; only the first 180 characters of the result are printed
boxes = pytesseract.image_to_boxes(img, lang='deu')
print(boxes[0:180])

# Keys
print(d.keys())

# Sample of the recognised words (entries 17 to 20), addressed by the
# 'text' key instead of by the position of that key in the dict
d['text'][17:21]

Number of boxes: 106 

B 214 827 229 853 0
e 233 827 247 848 0
r 251 827 263 847 0
q 270 824 284 847 0
h 288 827 303 853 0
o 307 827 321 847 0
t 325 827 340 853 0
e 344 827 358 847 0
l 366 826 375 852 0

dict_keys(['level', 'page_num', 'block_num', 'par_num', 'line_num', 'word_num', 'left', 'top', 'width', 'height', 'conf', 'text'])


['Rech.', 'Nr.', '4572', '30.']

In [16]:
def replace_chars(text):
    """
    Keeps only the digit sequences found in 'text'.
    :param text: Text string to be filtered
    :return: String with all digit sequences of 'text', in order of
             appearance, joined by '|' (empty string if there are none)
    """
    return '|'.join(match.group() for match in re.finditer(r'\d+', text))

# Content of the receipt, read from the original (not brightened) image file.
# The context manager closes the file handle that Image.open keeps open.
with Image.open(img_file) as receipt_image:
    ocr_text = pytesseract.image_to_string(receipt_image, lang='deu')
print(ocr_text)

# Numbers only
ocr_result = replace_chars(ocr_text)
print('Numbers only:')
print(ocr_result)

Berqhotel
Grosse Scheidegg
3818 Grindelwald
Familie R.Müller

Rech. Nr. 4572 30.07.2007/13:29:17
Bar Tisch 7/01
2><Latte Macchiato & 4. 50 CHF 9. 00
lelok1' & 5.00 CHF 5.00
1xSchweinschnitze\ ?: 22.00 CHF 22.00
1><Chässpätz H a 18. 50 CHF 18. 50

Total : CHF 54 ‚ 50
Inc]. 7.8% MwSt 54.50 CHF: 3.85

Entspricht in Euro 36. 33 EUR
Es bediente Sie: Ursula

MwSt Nr.: 430 234
Tel.: 033 853 67 15
Fax.: 033 853 67 @
E—mai ] : grossescheidegglbluewin. ch

Numbers only:
3818|4572|30|07|2007|13|29|17|7|01|2|4|50|9|00|1|5|00|5|00|1|22|00|22|00|1|18|50|18|50|54|50|7|8|54|50|3|85|36|33|430|234|033|853|67|15|033|853|67


## Write extracted text to file

In [17]:
# Group the recognised words into rows: non-empty entries of d['text'] are
# collected, and an empty entry closes the current row. Consecutive empty
# entries therefore produce empty rows, which end up as blank lines in the file.
parse_text = []
word_list  = []
last_word  = ''
last_index = len(d['text']) - 1

for index, word in enumerate(d['text']):
    if word != '':
        word_list.append(word)
        last_word = word
    # Close the row at an empty entry (once at least one word has been seen)
    # or at the very last entry. The end is detected by position, not by
    # comparing the word's text, so an earlier word that happens to equal
    # the last word no longer splits its row.
    if (last_word != '' and word == '') or (index == last_index and word != ''):
        parse_text.append(word_list)
        word_list = []

print(parse_text)

# Write one row per line to a space-delimited text file. The encoding is
# set explicitly so that non-ASCII characters are written the same way on
# every platform.
import csv
with open('result_text.txt', 'w', newline="", encoding='utf-8') as file:
    csv.writer(file, delimiter=" ").writerows(parse_text)

[['Berqhotel'], ['Grosse', 'Scheidegg'], ['38l8', 'Grindelwald'], ['Familie', 'R.Müller'], [], [], ['Rech.', 'Nr.', '4572', '30.', '07.', '2007/1329:', '17'], ['Bar', 'Tisch', '7/01'], ['2><Latte', 'Macchiato', '&', '4.', '50', 'CHF', '9.00'], ['leloki', 'a', '5.00', 'CHF', '5.00'], ['1xSchweinschnitzel', '?:', '22.00', 'CHF', '22.00'], ['GChässpätz', 'li', '&', '18.', '50', 'CHF', '18.', '50'], [], [], ['um]:', 'CHF', '5450'], ['lncl.l.ß%lint', '54.50', 'CHF:', '385'], [], [], ['Entspricht', 'in', 'Euro', '36.33', 'EUR'], ['Es', 'bediente', 'Sie:', 'Ursula'], [], [], ['MwSt', 'Nr.:', '430', '234'], ['Tel.:', '033', '853', '67', '16'], ['Fax.:', '033', '853', '67', '19'], ['E—mai', 'l', ':', 'grossescheidegg@bluewim', 'ch']]


### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [18]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Environment summary: OS family, system and release, current timestamp
# and Python version, framed by two separator lines
print('\n'.join([
    '-----------------------------------',
    os.name.upper(),
    platform.system() + ' | ' + platform.release(),
    'Datetime: ' + datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    'Python Version: ' + python_version(),
    '-----------------------------------',
]))

-----------------------------------
NT
Windows | 10
Datetime: 2023-04-05 17:01:03
Python Version: 3.10.9
-----------------------------------
