In [16]:
import shutil

import cv2
import numpy as np
import pytesseract

# Use whichever tesseract executable is on PATH so the notebook is not tied to
# one machine; fall back to the Homebrew location when none is found there.
pytesseract.pytesseract.tesseract_cmd = shutil.which('tesseract') or r'/opt/homebrew/bin/tesseract'

### Functions for image preprocessing

In [17]:
# get grayscale image
def get_grayscale(image):
    """Convert a colour image to grayscale.

    Args:
        image: Image handed straight to ``cv2.cvtColor``. The
            ``COLOR_BGR2GRAY`` conversion code is used, so the channels are
            interpreted in BGR order.

    Returns:
        Whatever ``cv2.cvtColor`` produces for that conversion.
    """
    conversion_code = cv2.COLOR_BGR2GRAY
    gray_image = cv2.cvtColor(image, conversion_code)
    return gray_image

# noise removal
def remove_noise(image):
    """Smooth ``image`` with a median blur.

    Args:
        image: Image handed straight to ``cv2.medianBlur``.

    Returns:
        The result of ``cv2.medianBlur`` with an aperture size of 5.
    """
    aperture_size = 5
    blurred = cv2.medianBlur(image, aperture_size)
    return blurred
 
#thresholding
def thresholding(image):
    """Binarise ``image`` with Otsu thresholding.

    Args:
        image: Image handed straight to ``cv2.threshold``.

    Returns:
        The thresholded image, i.e. the second element of the pair returned
        by ``cv2.threshold``. The first element (the threshold value) is
        discarded.
    """
    # THRESH_OTSU is combined with THRESH_BINARY; 0 and 255 are passed as the
    # threshold and maximum value arguments.
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _threshold_value, binary = cv2.threshold(image, 0, 255, mode)
    return binary

#dilation
def dilate(image, kernel_size=5, iterations=1):
    """Dilate ``image`` with a square structuring element of ones.

    Args:
        image: Image handed straight to ``cv2.dilate``.
        kernel_size: Side length of the square ``uint8`` all-ones kernel.
            Defaults to 5, the size that was previously hard-coded.
        iterations: Number of times the dilation is applied. Defaults to 1.

    Returns:
        The result of ``cv2.dilate``.
    """
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    return cv2.dilate(image, kernel, iterations=iterations)
    
#erosion
def erode(image, kernel_size=5, iterations=1):
    """Erode ``image`` with a square structuring element of ones.

    Args:
        image: Image handed straight to ``cv2.erode``.
        kernel_size: Side length of the square ``uint8`` all-ones kernel.
            Defaults to 5, the size that was previously hard-coded.
        iterations: Number of times the erosion is applied. Defaults to 1.

    Returns:
        The result of ``cv2.erode``.
    """
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    return cv2.erode(image, kernel, iterations=iterations)

#opening - erosion followed by dilation
def opening(image, kernel_size=5, iterations=1):
    """Apply a morphological opening (erosion followed by dilation).

    Args:
        image: Image handed straight to ``cv2.morphologyEx``.
        kernel_size: Side length of the square ``uint8`` all-ones kernel.
            Defaults to 5, the size that was previously hard-coded.
        iterations: Forwarded to ``cv2.morphologyEx``. Defaults to 1.

    Returns:
        The result of ``cv2.morphologyEx`` with ``cv2.MORPH_OPEN``.
    """
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel, iterations=iterations)

#canny edge detection
def canny(image):
    """Run Canny edge detection on ``image``.

    Args:
        image: Image handed straight to ``cv2.Canny``.

    Returns:
        The result of ``cv2.Canny`` called with thresholds 100 and 200.
    """
    first_threshold, second_threshold = 100, 200
    edges = cv2.Canny(image, first_threshold, second_threshold)
    return edges

#skew correction
def deskew(image):
    """Rotate ``image`` about its centre to compensate for estimated skew.

    The skew is estimated from the angle of the minimum-area rectangle that
    ``cv2.minAreaRect`` fits around the coordinates of all pixels greater
    than zero.

    Args:
        image: Numpy image array. NOTE(review): the coordinate stacking
            yields one column per array axis, so this presumably expects a
            single-channel (2-D) image - confirm against callers.

    Returns:
        A rotated image of the same width and height, produced by
        ``cv2.warpAffine`` with cubic interpolation and replicated borders.
        If the image has no pixels greater than zero there is nothing to
        fit, and an unrotated copy is returned.
    """
    coords = np.column_stack(np.where(image > 0))
    if coords.shape[0] == 0:
        # No foreground pixels: no rectangle can be fitted, so skip rotation.
        return image.copy()

    angle = cv2.minAreaRect(coords)[-1]
    # minAreaRect reports the angle in [-90, 0) on older OpenCV releases and
    # in (0, 90] on newer ones (the same rectangle, shifted by 90 degrees).
    # Map either convention onto the same small correction angle.
    if angle < -45:
        angle = -(90 + angle)
    elif angle >= 45:
        angle = 90 - angle
    else:
        angle = -angle

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated

#template matching
def match_template(image, template):
    """Match ``template`` against ``image`` using ``cv2.matchTemplate``.

    Args:
        image: Image handed to ``cv2.matchTemplate`` as the search image.
        template: Template handed to ``cv2.matchTemplate``.

    Returns:
        The result of ``cv2.matchTemplate`` with the ``TM_CCOEFF_NORMED``
        method.
    """
    method = cv2.TM_CCOEFF_NORMED
    match_result = cv2.matchTemplate(image, template, method)
    return match_result

In [None]:
image_path = '../receipts/receipt2.jpg'
img = cv2.imread(image_path)
# cv2.imread returns None for a missing or unreadable file instead of raising;
# fail here with a clear message rather than later inside the preprocessing.
if img is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

gray = get_grayscale(img)
thresh = thresholding(gray)

# TODO: are there more useful image preprocessing methods?

# The OCR cell reads `img`, so point it at the binarised result.
img = thresh

In [30]:
# Run OCR on the preprocessed image, requesting the 'deu' language.
pytesseract.image_to_string(
    img,
    lang="deu",
)

'zZ, Die Gastronorni eirußppe\n\nHochschule Luzern\nInformatik & Wirtschaft\nc/o ZFV-Unternehmungen\nFlüelastrasse 51\n8047 Zürich\n\nCHF\n\nPizza Neapolitana 10.50\nRamseier äpfelschor) 3.00\n\nTotal CHF 13,50\n\nZ MASTERCARD 13.50\n\nMwSt\n\nMwSt Prozent Netto Brutto\n(1) 8.10 12.49 13.50 1.01\n\nProfit Center: InHouse\n23.11.2024 11:36:49 #:2450 0p:568071 Cı56\n8002 $:568\n\nEs bediente Sie - selflheck\n\nKopie # |\n\nCHE-105..827.102 MWST\nVielen Dank und auf Wiedersehen\n\n'