# Optical Character Recognition (OCR)

## Libraries and settings

In [None]:
# Prerequisites:
# 1.) Install tesseract on your computer 
#     Windows: https://linuxhint.com/install-tesseract-windows
#     (don't forget to set the path to your installation of tesseract)
#     Mac: brew install tesseract
#
# 2.) Download the German language model (default is English):
#     Go to: https://github.com/tesseract-ocr/tessdata
#     Download: 'deu.traineddata'
#     Copy the file into your tesseract installation folder under
#     ...\Tesseract-OCR\tessdata\deu.traineddata

# Libraries
import os
import re
import cv2
import pytesseract
from pytesseract import Output
import PIL
from PIL import Image

# Suppress every warning for the remainder of the session
import warnings
warnings.simplefilter('ignore')

# Report the directory that relative file paths are resolved against
print(os.getcwd())

## Read and plot digital image of a receipt

In [None]:
# Filename to read
img_file = 'receipt.png'

# Import the image of the receipt. cv2.imread does not raise when the file is
# missing or cannot be decoded, it returns None, so fail early with a clear message.
img = cv2.imread(img_file)
if img is None:
    raise FileNotFoundError("Could not read image file: {}".format(img_file))

# Plot the image in a separate window; waitKey(0) blocks until a key is pressed
cv2.imshow('img', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

## Image processing to improve the image quality

In [None]:
# Increase brightness function
def increase_brightness(img, value=30):
    """
    Shifts the brightness (V channel in HSV colour space) of an image by 'value'.
    :param img: Image array in BGR channel order (it is converted with COLOR_BGR2HSV);
                assumed to be an 8-bit image as returned by cv2.imread
    :param value: Amount added to the V channel of every pixel; negative values darken
    :return: New BGR image whose V channel was shifted and limited to 0..255
    """
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    h, s, v = cv2.split(hsv)

    # Add in a wider integer type and clip afterwards, so that neither bright
    # pixels nor a negative or very large 'value' wrap around the channel's range
    v = (v.astype('int32') + value).clip(0, 255).astype(v.dtype)

    final_hsv = cv2.merge((h, s, v))
    img = cv2.cvtColor(final_hsv, cv2.COLOR_HSV2BGR)
    return img

# Improve image quality step by step:
# img_01 is the unmodified image from disk, img is the brightened version
img_01 = cv2.imread(img_file)
if img_01 is None:
    # cv2.imread signals a missing/unreadable file with None instead of an exception
    raise FileNotFoundError("Could not read image file: {}".format(img_file))
img    = increase_brightness(img_01, value=30)

# Plot the brightened image in a separate window; waitKey(0) blocks until a key is pressed
cv2.imshow('img', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

## Getting boxes around the text

In [None]:
# The image height is needed below to flip the y-axis of the character boxes
h, w, c = img.shape

# Run both OCR calls on the clean image, before anything is drawn
boxes = pytesseract.image_to_boxes(img, lang='deu')
d = pytesseract.image_to_data(img, output_type=Output.DICT, lang='deu')

# Draw on a copy so that 'img' itself stays free of annotations
img_boxes = img.copy()
for b in boxes.splitlines():
    b = b.split(' ')
    # Fields 1..4 of a box line are pixel coordinates; the y values are subtracted
    # from the image height because OpenCV measures y from the top edge
    x1, y1, x2, y2 = (int(coord) for coord in b[1:5])
    cv2.rectangle(img_boxes, (x1, h - y1), (x2, h - y2), (0, 255, 0), 1)

# Plot the annotated copy in a separate window; waitKey(0) blocks until a key is pressed
cv2.imshow('img', img_boxes)
cv2.waitKey(0)
cv2.destroyAllWindows()

## Template matching

In [None]:
# Start again from a freshly loaded and brightened image
img_01 = cv2.imread(img_file)
if img_01 is None:
    # cv2.imread signals a missing/unreadable file with None instead of an exception
    raise FileNotFoundError("Could not read image file: {}".format(img_file))
img    = increase_brightness(img_01, value=30)

d = pytesseract.image_to_data(img, output_type=Output.DICT, lang='deu')
keys = list(d.keys())
vals = list(d.values())

# Try to find all numbers using regex. re.match only matches at the start of
# the string, so this selects every recognised word that begins with a digit.
pattern = r'\d+'

# Draw on a copy so that 'img' stays clean for OCR calls in later cells
img_numbers = img.copy()

n_boxes = len(d['text'])
for i in range(n_boxes):
    # Depending on the pytesseract/Tesseract version, 'conf' can arrive as an int,
    # a float or a numeric string such as '96.5'; float() parses all of them,
    # whereas int('96.5') would raise a ValueError.
    if float(d['conf'][i]) >= 0:
        if re.match(pattern, d['text'][i]):
            (x, y, w, h) = (d['left'][i], d['top'][i], d['width'][i], d['height'][i])
            cv2.rectangle(img_numbers, (x, y), (x + w, y + h), (0,128,255), 2)

# Plot image in a separate window; waitKey(0) blocks until a key is pressed
cv2.imshow('img', img_numbers)
cv2.waitKey(0)
cv2.destroyAllWindows()

## Print content of boxes around the text

In [None]:
# Total number of text-boxes
print('Number of boxes:', n_boxes, '\n')

# Boxes (only the first 180 characters of the raw character-box output)
boxes = pytesseract.image_to_boxes(img, lang='deu')
print(boxes[0:180])

# Keys
print(d.keys())

# Values: a few recognised words, looked up by key instead of by the
# position of the 'text' column in the result dictionary
d['text'][17:21]

In [None]:
def replace_chars(text):
    """
    Keeps only the digit sequences contained in 'text' and drops everything else.
    :param text: Text string to be filtered
    :return: String with all digit sequences in order of appearance, separated by '|'
             (empty string if 'text' contains no digits)
    """
    digit_runs = (match.group() for match in re.finditer(r'\d+', text))
    return '|'.join(digit_runs)

# Content of the receipt (the context manager closes the image file after the OCR call)
with Image.open(img_file) as receipt_image:
    ocr_result = pytesseract.image_to_string(receipt_image, lang='deu')
print(ocr_result)

# Numbers only: digit sequences of the recognised text, joined by '|'
ocr_result = replace_chars(ocr_result)
print('Numbers only:')
print(ocr_result)

## Write extracted text to file

In [None]:
import csv

# Group the recognised words: a run of non-empty entries in d['text'] forms one
# list of words, and empty entries act as separators between the runs.
parse_text = []
word_list  = []

for word in d['text']:
    if word != '':
        word_list.append(word)
    elif word_list:
        # First separator after a run of words closes the current group; further
        # separators in a row are skipped, so no empty groups are collected.
        parse_text.append(word_list)
        word_list = []

# Flush whatever is left after the loop. Doing this by position (instead of
# comparing every word with the text of the last entry) keeps a word that
# occurs several times from splitting a group early.
if word_list:
    parse_text.append(word_list)
    word_list = []

print(parse_text)

# Write one row of space-separated words per group (csv format, stored as .txt).
# The encoding is set explicitly so the output does not depend on the platform default.
with open('result_text.txt', 'w', newline="", encoding='utf-8') as file:
    csv.writer(file, delimiter=" ").writerows(parse_text)

### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [None]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Footer: operating system, platform release, current timestamp and Python version
separator = '-----------------------------------'
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

print(separator)
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', timestamp)
print('Python Version:', python_version())
print(separator)