At first, we import the necessary libraries.
**It is important that you adjust the path you have installed tesseract to accordingly!**

In [107]:
import pytesseract
from PIL import Image

# Due to TesseractNotFoundError
# https://stackoverflow.com/questions/50951955/pytesseract-tesseractnotfound-error-tesseract-is-not-installed-or-its-not-i
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

Now comes the main method that takes a screenshot of ingredients as input and outputs a dictionary in the form of *{'product_name':['quantity','unit'], ...}*.

In [132]:
def createIngredientDict(image):
    """
    Outputs a dictionary in the form of {'product_name':['quantity','unit'], ...}
    Input: a screenshot of ingredients
    """
    ingredientList = createIngredientList(image)
    ingredientDict = {}
    
    for elem in ingredientList:
        quantity = isolateQuantity(elem)
        unit = isolateUnit(elem)
        productName = isolateProductName(ingredient=elem, quantity=quantity, unit=unit)
        ingredientDict[productName]=[quantity, unit]
    
    return ingredientDict

This method converts the image to text and then seperates the text row by row into a list.

In [144]:
def createIngredientList(imagePath):
    """
    Creates a list of ingredients from a screenshot
    """
    image = Image.open(imagePath)
    text = cleanUp(pytesseract.image_to_string(image)).lower()
    print(text)
    ingredients = text.splitlines()
    if '' in ingredients:
        ingredients.remove('')
    return ingredients

In [110]:
def cleanUp(text):
    """
    Function to counter common Tesseract mistakes
    """
    # Replacing '%' wit ',5' (e.g. in 1,5 Liter) fits most of the time
    text = text.replace('%',',5 ')
    text = text.replace('ii','ü')
    return text


The next three methods filter the quantity, unit and the product name from a row of ingredients.

In [152]:
def isolateQuantity(ingredient):
    """
    Isolates the digits from the input String
    """
    set = '0123456789'
    if "n.B." in ingredient:
        return "nach Belieben"
    text = ''.join([c for c in ingredient if c in set])
    if text.strip() == ",5":
        return "0,5"
    return text

In [142]:
def isolateUnit(ingredient):
    """
    Isolates the unit from the input String
    """
    ingredient = ingredient.lower()
    measurementUnitInt = ['gramm','g ', 'dekagramm', 'dag', 'kilogramm', 'kg ', 'pfd', 'pfund', 'deciliter', 'dl ', 'centiliter', 'cl ', 'ml ', 'liter',
                       'esslöffel', 'el ', 'tl ', 'ssp.' 'tr', 'tropfen', 'sp', 'spritzer', 'schuss', 'messerspitze', 'msp', 'tasse', 'tas', 'scheibe', 'sc',
                       'kleine', 'kleines' 'große', 'großes'
                       'etwas', 'priese', 'priesen', 'priese(n)',
                       'bund', 'bd ', 'dose', 'dosen', 'dose(n)', 'glas', 'gläser', 'packung', 'packungen', 'pck.', 'rolle', 'rollen', 'rolle(n)' 'würfel']
    
    measurementUnitUS = ['teaspoons','tablespoons','cups','containers','packets','bags','quarts','pounds','cans','bottles',
                          'pints','packages','ounces','jars','heads','gallons','drops','envelopes','bars','boxes','pinches',
                          'dashes','bunches','recipes','layers','slices','links','bulbs','stalks','squares','sprigs',
                          'fillets','pieces','legs','thighs','cubes','granules','strips','trays','leaves','loaves','halves']
    
    for elem in measurementUnitInt :
        if elem in ingredient:
            return elem.rstrip()
    
    for elem in measurementUnitUS:
        if elem in ingredient:
            return elem
        
    return ''

In [113]:
def isolateProductName(ingredient, quantity, unit):
    """
    Stripping the quantity and unit from the ingredient, leaving the product name
    """
    return ingredient.replace(str(quantity), '').replace(unit, '').strip()

Lastly, there is the call of the main function.

In [145]:
InDict = createIngredientDict(image="testbild4.png")
print(InDict)

1. grore knoblauchzehe(n), klein gewirfelt
olivenol
pfeffer, schwarzer, frisch aus der mahle
fleur de sel
etwas milch zum bestreichen

n.b. basilikum, gehacktes
{'. grore knoblauchzehe(n), klein gewirfelt': ['1', ''], 'olivenol': ['', ''], 'pfeffer, hwarzer, frih aus der mahle': ['', 'sc'], 'fleur de sel': ['', ''], 'etwas milch zum bestreichen': ['', ''], 'n.b. basilikum, gehacktes': ['', '']}


In [155]:
InDict = createIngredientDict(image="testbild1.png")
print(InDict)

500 ml tomate(n), passierte
3el butter

2el mehl
{'tomate(n), passierte': ['500', 'ml'], 'butter': ['3', 'el'], 'mehl': ['2', 'el']}


In [156]:
InDict = createIngredientDict(image="testbild4.png")
print(InDict)

1. grore knoblauchzehe(n), klein gewirfelt
olivenol
pfeffer, schwarzer, frisch aus der mahle
fleur de sel
etwas milch zum bestreichen

n.b. basilikum, gehacktes
{'. grore knoblauchzehe(n), klein gewirfelt': ['1', ''], 'olivenol': ['', ''], 'pfeffer, hwarzer, frih aus der mahle': ['', 'sc'], 'fleur de sel': ['', ''], 'etwas milch zum bestreichen': ['', ''], 'n.b. basilikum, gehacktes': ['', '']}


In [157]:
InDict = createIngredientDict(image="testbild5.png")
print(InDict)

1409

6d

,5  liter

zucker
vanilleschote(n)
eigelb
kurbiskernol

schlagsahne
{'': ['', ''], 'd': ['6', ''], ',': ['5', 'liter'], 'zucker': ['', ''], 'vanillehote(n)': ['', 'sc'], 'eigelb': ['', ''], 'kurbiskernol': ['', ''], 'hlagsahne': ['', 'sc']}


In [153]:
image = Image.open("testbild5.png")
text = cleanUp(pytesseract.image_to_string(image)).lower()
print(text)

1409

6d

,5  liter

zucker
vanilleschote(n)
eigelb
kurbiskernol

schlagsahne
