In [2]:
import re
import pandas as pd
import numpy as np
import os

In [396]:
# Per-site extraction recipes.
#
# Layout: site key -> field name ("title" / "servings") -> ordered list of
# steps. Each step is a one-entry dict {step type: pattern}; the step types
# used here ("findallFirst", "findall3", "sub") are the ones understood by
# applyCustomRegex. For "sub" the pattern is a (regex, replacement) tuple.
EXTRACT_PATTERNS = {
    # --- BBC Food ---
    "bbc_food": {
        "title": [
            {"findallFirst": "<h1 class=\"gel-trafalgar content-title__text\".*?>(.+?)</h1>"},
        ],
        "servings": [
            {"findallFirst": "<p class=\"recipe-metadata__serving\".*?>(.+?)</p>"},
            # keep only the digits of the matched text
            {"sub": ("[^0-9]", "")},
        ],
    },
    # --- "british" ---
    "british": {
        "title": [
            {"findallFirst": "<h1 itemprop=\"name\">(.+?)</h1>"},
        ],
        "servings": [
            {"findallFirst": "<span class=\"header-attribute-text\">(.+?)</span>"},
            {"sub": ("[^0-9]", "")},
        ],
    },
    # --- "italian" (same patterns as "british") ---
    "italian": {
        "title": [
            {"findallFirst": "<h1 itemprop=\"name\">(.+?)</h1>"},
        ],
        "servings": [
            {"findallFirst": "<span class=\"header-attribute-text\">(.+?)</span>"},
            {"sub": ("[^0-9]", "")},
        ],
    },
    # --- "delish" ---
    "delish": {
        "title": [
            {"findallFirst": "<h1 class=\"content-hed recipe-hed\">(.+?)</h1>"},
        ],
        "servings": [
            {"findallFirst": "<span class=\"yields-amount\">(.+?)</span>"},
            {"sub": ("[^0-9]", "")},
        ],
    },
    # --- "good_food" ---
    "good_food": {
        "title": [
            {"findallFirst": "<h1 class=\"heading-1\">(.+?)</h1>"},
        ],
        "servings": [
            # the third match of this pattern is taken as the servings text
            {"findall3": "<div class=\"icon-with-text__children\">(.+?)</div>"},
            {"sub": ("[^0-9]", "")},
        ],
    },
    # --- "jamie" ---
    "jamie": {
        "title": [
            {"findallFirst": "<h3 class=\"h1 single-recipe-title\">(.+?)</h3>"},
        ],
        "servings": [
            {"findallFirst": "<div class=\"recipe-detail serves\">(.+?)</div>"},
            {"sub": ("[^0-9]", "")},
        ],
    },
    # --- "serious" ---
    "serious": {
        "title": [
            {"findallFirst": "<h1 class=\"heading__title\">(.+?)</h1>"},
        ],
        "servings": [
            # the third match of this pattern is taken as the servings text
            {"findall3": "<span class=\"meta-text__data\">(.+?)</span>"},
            {"sub": ("[^0-9]", "")},
        ],
    },
    # --- "simply" (same patterns as "serious") ---
    "simply": {
        "title": [
            {"findallFirst": "<h1 class=\"heading__title\">(.+?)</h1>"},
        ],
        "servings": [
            {"findall3": "<span class=\"meta-text__data\">(.+?)</span>"},
            {"sub": ("[^0-9]", "")},
        ],
    },
    # --- "spoon" ---
    "spoon": {
        "title": [
            {"findallFirst": "<h1 itemprop=\"name\">(.+?)</h1>"},
        ],
        "servings": [
            {"findallFirst": "<input style=float:left type=number size=2 id=spoonacular-serving-stepper value=(.+?)>"},
            {"sub": ("[^0-9]", "")},
        ],
    },
}


In [281]:
def applyCustomRegex(regexType, regexPattern, text):
    """Apply one named regex step to ``text`` and return the result.

    Parameters
    ----------
    regexType : str
        Which operation to run:

        * ``"findall"``      - every match of ``regexPattern`` (a list).
        * ``"findallFirst"`` - the first match only.
        * ``"findall3"``     - the third match only (index 2).
        * ``"sub"``          - ``text`` with every match replaced.
        * ``"subSplit"``     - like ``"sub"``, then split on newlines.
    regexPattern : str or sequence
        A regex string for the ``findall*`` types. For ``"sub"`` and
        ``"subSplit"`` a sequence whose item 0 is the regex and item 1 the
        replacement string.
    text : str
        The text to search or rewrite.

    Returns
    -------
    list or str
        A list for ``"findall"`` and ``"subSplit"``; a single ``re.findall``
        item for ``"findallFirst"`` / ``"findall3"``; a string for ``"sub"``.

    Raises
    ------
    IndexError
        If ``"findallFirst"`` / ``"findall3"`` find fewer matches than they
        need (the message names the pattern).
    ValueError
        If ``regexType`` is not one of the types listed above.
    """
    if regexType == "findall":
        return re.findall(regexPattern, text)

    if regexType in ("findallFirst", "findall3"):
        wanted = 0 if regexType == "findallFirst" else 2
        matches = re.findall(regexPattern, text)
        if len(matches) <= wanted:
            # Same exception type as plain list indexing, but say which
            # pattern failed so a changed page layout is easy to diagnose.
            raise IndexError(
                "{}: pattern {!r} matched {} time(s), at least {} needed".format(
                    regexType, regexPattern, len(matches), wanted + 1
                )
            )
        return matches[wanted]

    if regexType == "sub":
        # An empty replacement needs no special case: re.sub handles "".
        return re.sub(regexPattern[0], regexPattern[1], text)

    if regexType == "subSplit":
        replaceWith = regexPattern[1]
        pieces = re.sub(regexPattern[0], replaceWith, text).split("\n")
        if replaceWith == "":
            # Only the "delete matches" form trims pieces and drops empty
            # ones; with a non-empty replacement the raw split is returned.
            return [piece.strip() for piece in pieces if piece != ""]
        return pieces

    # Previously an unknown type fell through and returned None, which only
    # surfaced later as a confusing error in the next step.
    raise ValueError("unknown regexType: {!r}".format(regexType))

In [462]:
def getTitleFromSiteContent(title_patterns, site_content):
    """Run the title extraction steps over ``site_content``.

    ``title_patterns`` is an ordered sequence of dicts mapping a step type
    to its pattern. Every (type, pattern) pair is passed to
    ``applyCustomRegex`` in order, each step receiving the previous step's
    result. With no steps the input is returned unchanged.
    """
    # Flatten the steps lazily so each pair is applied in sequence order.
    ordered_steps = (
        pair
        for step in title_patterns
        for pair in step.items()
    )
    result = site_content
    for regex_type, regex_pattern in ordered_steps:
        result = applyCustomRegex(regex_type, regex_pattern, result)
    return result

def getServingsFromSiteContent(servings_patterns, site_content):
    """Run the servings extraction steps over ``site_content``.

    ``servings_patterns`` is an ordered sequence of dicts mapping a step
    type to its pattern. Every (type, pattern) pair is passed to
    ``applyCustomRegex`` in order, each step receiving the previous step's
    result. With no steps the input is returned unchanged.
    """
    # Flatten the steps lazily so each pair is applied in sequence order.
    ordered_steps = (
        pair
        for step in servings_patterns
        for pair in step.items()
    )
    result = site_content
    for regex_type, regex_pattern in ordered_steps:
        result = applyCustomRegex(regex_type, regex_pattern, result)
    return result

# Ingredient extraction rules for the sites that yield a plain list:
# site key -> (block regex, closing tag that separates items inside a block,
#              whether runs of spaces inside an item are collapsed to one).
_INGREDIENT_RULES = {
    "bbc_food": (r'<ul class="recipe-ingredients__list".*?>(.+?)</ul>', "</li>", False),
    "british": (r'<li class="IngredientsList__IngredientItem" itemprop="recipeIngredient">(.+?)</li>', "</li>", True),
    "italian": (r'<li class="IngredientsList__IngredientItem" itemprop="recipeIngredient">(.+?)</li>', "</li>", True),
    "delish": (r'<div class="ingredient-item">(.+?)</div>', "</div>", True),
    "good_food": (r'<li class="pb-xxs pt-xxs list-item list-item--separator">(.+?)</li>', "</div>", True),
    "jamie": (r'<div class="col-md-12 ingredient-wrapper">(.+?)</div>', "</li>", True),
    "simply": (r'<ul class="structured-ingredients__list text-passage">(.+?)</ul>', "</li>", False),
    "serious": (r'<ul id="ingredient-list_1-0" class="comp ingredient-list simple-list simple-list--circle ">(.+?)</ul>', "</li>", False),
}


def _ingredientFragments(site_content, pattern, split_tag, collapse_spaces):
    """Return the cleaned text fragments captured by ``pattern``.

    For every match of ``pattern`` in ``site_content``: each ``split_tag``
    becomes a line break, all remaining HTML tags are removed, and the text
    is split into lines. Each line is stripped; lines that are empty after
    stripping are skipped. When ``collapse_spaces`` is true, runs of spaces
    inside a line are reduced to a single space.
    """
    fragments = []
    for block in re.findall(pattern, site_content):
        text = re.sub("<[^>]*>", "", re.sub(split_tag, "\n", block))
        for line in text.split("\n"):
            line = line.strip()
            if not line:
                # Whitespace-only pieces (e.g. the gap between "</li>" and
                # the next "<li>") used to leak out as "" entries.
                continue
            fragments.append(re.sub(" +", " ", line) if collapse_spaces else line)
    return fragments


def getIngredientsFromSiteContent(site_content,website):
    """Extract the ingredient lines of a recipe page.

    Parameters
    ----------
    site_content : str
        The page HTML. The patterns use ``.`` without DOTALL, so a block
        must not span a line break to be matched.
    website : str
        Site key: "bbc_food", "british", "italian", "delish", "good_food",
        "jamie", "simply", "serious" or "spoon".

    Returns
    -------
    list of str, or None
        One cleaned string per ingredient. For "spoon" each entry is
        "<amount> <name>", pairing the n-th amount with the n-th name.
        ``None`` is returned for an unrecognised ``website``.

    Raises
    ------
    IndexError
        For "spoon", if fewer names than amounts are found.
    """
    if website == "spoon":
        names = _ingredientFragments(
            site_content, r"<div class=spoonacular-name>(.+?)</div>", "</div>", False
        )
        amounts = _ingredientFragments(
            site_content, r"<div class=spoonacular-ingredient>(.+?)</div>", "</div>", False
        )
        # Amounts drive the pairing; a missing name raises IndexError.
        return [amounts[i] + " " + names[i] for i in range(len(amounts))]

    rule = _INGREDIENT_RULES.get(website)
    if rule is None:
        # Unknown site: no result.
        return None
    pattern, split_tag, collapse_spaces = rule
    return _ingredientFragments(site_content, pattern, split_tag, collapse_spaces)

    
# Method-step extraction rules:
# site key -> (block regex, closing tag that separates steps inside a block).
# Raw strings keep "\s" a regex escape instead of an invalid string escape.
_STEP_RULES = {
    "bbc_food": (r'<ol class="recipe-method__list".*?>(.+?)</ol>', "</li>"),
    "british": (r'<div class="MethodList__StepText" itemprop="recipeInstructions">\s*(.+?)</div>', "</li>"),
    "italian": (r'<div class="MethodList__StepText" itemprop="recipeInstructions">\s*(.+?)</div>', "</li>"),
    "delish": (r'<div class="direction-lists">\s*(.+?)</div>', "</li>"),
    "good_food": (r'<li class="pb-xs pt-xs list-item">(.+?)</div>', "</li>"),
    "jamie": (r'<ol class="recipeSteps">(.+?)</div>', "</li>"),
    "serious": (r'<OL id="mntl-sc-block_3-0" class="comp mntl-sc-block-group--OL mntl-sc-block mntl-sc-block-startgroup">(.*)</OL>', "</p>"),
    "simply": (r'<OL id="mntl-sc-block_3-0" class="comp mntl-sc-block-group--OL mntl-sc-block mntl-sc-block-startgroup">(.*)</OL>', "</p>"),
    "spoon": (r'<div class="recipeInstructions" itemprop="recipeInstructions">(.+?)</div>', "</li>"),
}


def getStepsFromSiteContent(site_content,website):
    """Extract the method steps of a recipe page.

    For every block matched by the site's pattern, each separator tag
    becomes a line break, all remaining HTML tags are removed, and the text
    is split into lines. Each line is stripped; lines that are empty after
    stripping are skipped.

    Parameters
    ----------
    site_content : str
        The page HTML. The patterns use ``.`` without DOTALL, so a block
        must not span a line break to be matched.
    website : str
        Site key: "bbc_food", "british", "italian", "delish", "good_food",
        "jamie", "serious", "simply" or "spoon".

    Returns
    -------
    list of str, or None
        The cleaned step texts in page order; ``None`` for an unrecognised
        ``website``.
    """
    rule = _STEP_RULES.get(website)
    if rule is None:
        # Unknown site: no result.
        return None
    pattern, split_tag = rule

    steps = []
    for block in re.findall(pattern, site_content):
        text = re.sub("<[^>]*>", "", re.sub(split_tag, "\n", block))
        for line in text.split("\n"):
            line = line.strip()
            # Whitespace-only pieces (e.g. the gap between "</li>" and the
            # next "<li>") used to leak out as "" entries; skip them.
            if line:
                steps.append(line)
    return steps


In [463]:
# Other sample pages this cell has been pointed at (kept for reference):
#html_path = r"C:\Users\ramang\Developer\Recipe-Extractor\RecipeHtmlExtractor\DATA\british\‘Viennoise’ Plaice Recipe - Great British Chefs.html"
#html_path = r"C:\Users\ramang\Developer\Recipe-Extractor\RecipeHtmlExtractor\DATA\bbc\1970s-style chicken curry recipe - BBC Food.html"
#html_path = r"C:\Users\ramang\Developer\Recipe-Extractor\RecipeHtmlExtractor\DATA\italian\Amaretti Biscuits Recipe - Great Italian Chefs.html"
#html_path = r"C:\Users\ramang\Developer\Recipe-Extractor\RecipeHtmlExtractor\DATA\delish\Apricot-and-Cheddar Chicken Melt Recipe.html"
#html_path = r"C:/Users/ramang/Developer/Recipe-Extractor/RecipeHtmlExtractor/DATA/good_food/Allotment cake recipe.html"
html_path = r"C:\Users\ramang\Developer\Recipe-Extractor\RecipeHtmlExtractor\DATA\spoon/Zucchini and Carrot Crunch.html"

# Site key for the page above. It selects the EXTRACT_PATTERNS entry and the
# branch used by the ingredient/step extractors, so it is set in one place
# and must be changed together with html_path.
website = 'spoon'

# Only the read needs the file handle; release it before processing.
with open(html_path, "r", encoding='utf-8') as f:
    site_content = f.read()

# Remove every whitespace character except the plain space (newlines, tabs,
# ...), so the single-line patterns can match across the original line
# breaks. Raw string: "\S" is a regex escape, not a valid string escape.
site_content = re.sub(r"[^\S ]+","",site_content)
site_patterns = EXTRACT_PATTERNS[website]
# title
title_patterns = site_patterns['title']

# number of servings
servings_patterns = site_patterns['servings']


title = getTitleFromSiteContent(title_patterns, site_content)
print(title)
servings = getServingsFromSiteContent(servings_patterns, site_content)
print(servings)
ingredients = getIngredientsFromSiteContent(site_content,website)
print(ingredients)
steps = getStepsFromSiteContent(site_content,website)
print(steps)

Zucchini and Carrot Crunch
18
['128 g carrots', '124 g yellow zucchini', '150 g parmesan cheese', '1 egg white', '1 garlic clove', '1 pinch sea salt', '0.5 tsps dried rosemary', '0.5 tsps black pepper']
['Preheat oven to 400 FCombine all ingredients together and with a tablespoon drop batter on a baking sheet lined with parchment paperBake until slightly golden about 15 minutes depending on your oven']
