# Procedural Programming Example: Optical Character Recognition (OCR)

## Install & Import Python libraries

In [1]:
# Libraries
import cv2
import re
import pytesseract
from pytesseract import Output
import nltk
import PIL
from PIL import Image

# Standard-library modules used by this cell; `os` is required by the
# os.getcwd() call at the end of the cell.
import os
import warnings

# Report the versions of the libraries used in this notebook
print("Version of package cv2:", cv2.__version__)
print("Version of package re:", re.__version__)
print("Version of package pytesseract:", pytesseract.__version__)
print("Version of package nltk:", nltk.__version__)
print("Version of PIL:", PIL.__version__)

# Ignore warnings
warnings.filterwarnings("ignore")

# Get current working directory
print(os.getcwd())

ModuleNotFoundError: No module named 'pytesseract'

## Read and plot digital image of the receipt

In [2]:
# Import and plot image of the receipt
img = cv2.imread('receipt.png')

# cv2.imread does not raise on a missing or unreadable file, it returns None;
# fail here with a clear message instead of inside cv2.imshow.
if img is None:
    raise FileNotFoundError("Could not read image file 'receipt.png'")

cv2.imshow('img', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

## Image processing to improve the image quality

In [3]:
# Increase brightness function
def increase_brightness(img, value=30):
    """
    Shift the brightness of a BGR image by 'value'.

    The image is converted to HSV, 'value' is added to the V channel and the
    result is converted back to BGR. The addition saturates at 0 and 255
    instead of wrapping around (an 8-bit V channel is assumed, as implied by
    the 255 limit).

    :param img: BGR image as accepted by cv2.cvtColor(..., cv2.COLOR_BGR2HSV)
    :param value: Amount added to the V channel; a negative amount darkens
                  the image. Clamped to the range -255..255.
    :return: New BGR image with the shifted brightness
    """
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    h, s, v = cv2.split(hsv)

    # Keep the offset inside the range of the channel so that neither the
    # limit nor the in-place arithmetic below can leave 0..255.
    value = max(-255, min(255, int(value)))

    if value >= 0:
        # Pixels that would exceed 255 are pinned to 255, the rest are shifted
        lim = 255 - value
        v[v > lim] = 255
        v[v <= lim] += value
    else:
        # Pixels that would drop below 0 are pinned to 0, the rest are shifted
        lim = -value
        v[v < lim] = 0
        v[v >= lim] -= lim

    final_hsv = cv2.merge((h, s, v))
    img = cv2.cvtColor(final_hsv, cv2.COLOR_HSV2BGR)
    return img

# Improve image quality step by step
img_01 = cv2.imread('receipt.png')

# cv2.imread returns None (it does not raise) when the file cannot be read
if img_01 is None:
    raise FileNotFoundError("Could not read image file 'receipt.png'")

img = increase_brightness(img_01, value=30)

cv2.imshow('img', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

## Getting boxes around the text

In [4]:
h, w, c = img.shape

# Run both OCR queries before anything is drawn: cv2.rectangle paints onto
# 'img' itself, so querying afterwards would recognise text on an image that
# is already covered with rectangles.
boxes = pytesseract.image_to_boxes(img, lang='deu')
d = pytesseract.image_to_data(img, output_type=Output.DICT, lang='deu')

# Each line of 'boxes' has the form "<char> <x1> <y1> <x2> <y2> <page>".
# The y values are subtracted from the image height because the box
# coordinates run bottom-up while OpenCV pixel rows run top-down.
for box_line in boxes.splitlines():
    fields = box_line.split(' ')
    corner_a = (int(fields[1]), h - int(fields[2]))
    corner_b = (int(fields[3]), h - int(fields[4]))
    img = cv2.rectangle(img, corner_a, corner_b, (0, 255, 0), 1)

cv2.imshow('img', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

## Text Template Matching (regular expression on the recognised words)

In [5]:
img_01 = cv2.imread('receipt.png')

# cv2.imread returns None (it does not raise) when the file cannot be read
if img_01 is None:
    raise FileNotFoundError("Could not read image file 'receipt.png'")

img    = increase_brightness(img_01, value=30)

d = pytesseract.image_to_data(img, output_type=Output.DICT, lang='deu')
keys = list(d.keys())
vals = list(d.values())

# Mark numbers using a regular expression: re.match anchors at the start of
# the string, so every word that begins with two digits is selected.
pattern = '[0-9][0-9]'

n_boxes = len(d['text'])
for i in range(n_boxes):
    # The confidence may be an int, a float or a numeric string such as
    # '96.5' depending on the pytesseract/Tesseract version; going through
    # float() accepts all of them, whereas int('96.5') raises ValueError.
    if int(float(d['conf'][i])) >= 20:
        if re.match(pattern, d['text'][i]):
            (x, y, w, h) = (d['left'][i], d['top'][i], d['width'][i], d['height'][i])
            img = cv2.rectangle(img, (x, y), (x + w, y + h), (0,128,255), 2)

cv2.imshow('img', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

# Box information
# print(n_boxes)
# print(boxes)

## Print content of boxes around the text

In [6]:
# Total number of text-boxes
# print(n_boxes)

# Boxes: first 180 characters of the character-level box listing
boxes = pytesseract.image_to_boxes(img, lang='deu')
print(boxes[0:180])

# Keys
# print(d.keys())

# Recognised words 17..20, looked up by column name rather than by the
# position of the 'text' column inside the result dictionary
d['text'][17:21]

B 214 827 229 853 0
e 233 827 247 848 0
r 251 827 263 847 0
g 270 824 284 847 0
h 288 827 303 853 0
o 307 827 321 847 0
t 325 827 340 853 0
e 344 827 358 847 0
l 366 826 375 852 0



['Rech.', 'Nr.', '4572', '30.07.2007/13:29:']

In [7]:
def replace_chars(text):
    """
    Extract every run of digits from 'text' and join the runs with '|'.
    :param text: Text string to be filtered
    :return: String with the digit runs separated by '|' (empty string if
             'text' contains no digits)
    """
    digit_runs = (match.group() for match in re.finditer(r'\d+', text))
    return '|'.join(digit_runs)


# Run OCR on the original receipt image. The context manager closes the
# image file as soon as the text has been extracted.
with Image.open('receipt.png') as receipt_image:
    ocr_result = pytesseract.image_to_string(receipt_image, lang='deu')

print(ocr_result)

Berdhotel
Grosse Scheidegg
3818 Grindelwald
Familie R.Müller

Rech. Nr. 4572 30.07. 2007/13:29: 17
Bar Tisch (/0M
2xLatte Macchiato ä 4.50 CHF 9.00
1xGloki 8 500 HE 5.00
1xSchweinschnitzel ä 22.00 CHF 22.00
1xChässpätz 1i &a 1850° CHE 18050

Total : CHF 54.50
Incl. 7.6% MwSt 54.50 CHF: 3.85

Entspricht in Euro 36.33 EUR
Es bediente Sie: Ursula

MwSt Nr. : 430 234
Tel. : 033 853 67 16
Fax.: 053853 649
E-mail: grossescheidegg@bluewin. ch




## Write Text to File

In [8]:
parse_text = []
word_list  = []

# Group consecutive non-empty entries of d['text']; an empty string ends the
# current group. Groups are only stored when they contain at least one word,
# and the end of the list is detected by position rather than by comparing
# each word with the value of the last entry.
for word in d['text']:
    if word != '':
        word_list.append(word)
    elif word_list:
        parse_text.append(word_list)
        word_list = []

# Store the final group if the list did not end with an empty string
if word_list:
    parse_text.append(word_list)

print(parse_text[1:4])

# Write the word groups to a space-delimited text file, one group per row.
# The encoding is fixed because the recognised text contains non-ASCII
# characters and the platform default encoding varies.
import csv
with open('result_text.txt', 'w', newline="", encoding='utf-8') as file:
    csv.writer(file, delimiter=" ").writerows(parse_text)

[['Grosse', 'Scheidegg'], ['3818', 'Grindelwald'], ['Familie', 'R.Müller']]


## Simple Language-Model

In [15]:
import nltk
# nltk.download('reuters')
# nltk.download('punkt')
from nltk.corpus import reuters
from nltk import bigrams, trigrams
from collections import Counter, defaultdict

# Nested mapping (w1, w2) -> {w3: count}; unseen entries start at 0
model = defaultdict(lambda: defaultdict(int))

# Count how often each word follows each pair of preceding words
for sentence in reuters.sents():
    for first, second, third in trigrams(sentence, pad_right=True, pad_left=True):
        model[first, second][third] += 1

# Turn the counts of every word pair into relative frequencies
for followers in model.values():
    total_count = float(sum(followers.values()))
    for follower in followers:
        followers[follower] /= total_count

In [13]:
# Look up the follow-up words stored in the model for the pair ("today", "the")
context = ("today", "the")
terms = dict(model[context])
sorted_terms = sorted(terms.items(), key=lambda item: item[1], reverse=True)

# Print the ten most probable next words together with their probability
for next_word, probability in sorted_terms[:10]:
    print(next_word, probability)

company 0.16666666666666666
price 0.1111111111111111
public 0.05555555555555555
European 0.05555555555555555
Bank 0.05555555555555555
emirate 0.05555555555555555
overseas 0.05555555555555555
newspaper 0.05555555555555555
Turkish 0.05555555555555555
increase 0.05555555555555555


### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [None]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Footer: operating system, timestamp and Python version of this run
separator = '-----------------------------------'
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

print(separator)
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', timestamp)
print('Python Version:', python_version())
print(separator)