In [20]:
import extruct
import requests
from pprint import pprint
from bs4 import BeautifulSoup
from w3lib.html import get_base_url
from recipe_scrapers import scrape_me

In [43]:
# Sample recipe pages used by the cells below, which index them by position:
# 0 = cookieandkate, 1 = bonappetit, 2 = delish.
urls = [
    'https://cookieandkate.com/best-tahini-sauce-recipe/',
    'https://www.bonappetit.com/recipe/chicken-tikka-masala',
    'https://www.delish.com/cooking/recipe-ideas/recipes/a46330/skillet-sicilian-chicken-recipe/'
]

In [3]:
def get_html(url: str, headers: dict=None, timeout: float=30) -> str:
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        headers: Optional extra request headers. They are merged over the
            built-in defaults, so a caller-supplied key wins on conflict.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled server cannot block the notebook indefinitely.

    Returns:
        ``response.text`` -- the decoded body (a ``str``, not ``bytes``).
    """
    # Built fresh on every call so nothing is shared between invocations.
    # NOTE(review): the Access-Control-* entries are CORS *response* headers;
    # they are kept only so the outgoing request is unchanged -- confirm
    # whether they can simply be dropped.
    request_headers = {
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Methods': 'GET',
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Max-Age': '3600',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }
    if headers:
        # Previously the argument was silently overwritten; honour it now.
        request_headers.update(headers)
    response = requests.get(url, headers=request_headers, timeout=timeout)
    return response.text

In [4]:
def get_metadata(html: str, url: str) -> dict:
    """Extract the first JSON-LD record embedded in a page.

    Args:
        html: Markup of the page.
        url: Address the markup was fetched from; used as the fallback base
            URL when the markup does not declare one.

    Returns:
        The first JSON-LD item found by ``extruct``, or an empty dict when
        the page carries none (so callers can always use ``.get``).
    """
    records = extruct.extract(
        html,
        # get_base_url takes the markup first and the page URL second; passing
        # only the URL made it look for a <base> tag inside the URL string and
        # fall back to an empty base.
        base_url=get_base_url(html, url),
        syntaxes=['json-ld'],
        uniform=True
    )['json-ld']
    if isinstance(records, list):
        # Only the first item is kept, as before; an empty list becomes {}.
        return records[0] if records else {}
    return records

In [5]:
def scrape(url: str, pretty_print: bool=False) -> dict:
    """Fetch a page and return the structured data found in it.

    Args:
        url: Address of the page to scrape.
        pretty_print: When true, also pretty-print the result before
            returning it.

    Returns:
        Whatever ``get_metadata`` produced for the downloaded markup.
    """
    structured = get_metadata(get_html(url), url)
    if not pretty_print:
        return structured
    pprint(structured, indent=2, width=150)
    return structured

In [7]:
# Scrape every sample page once; results keep the same order as `urls`.
metadata = list(map(scrape, urls))

In [11]:
# Top-level JSON-LD keys returned for the first URL (cookieandkate).
metadata[0].keys()

dict_keys(['@context', '@graph'])

In [12]:
# Top-level JSON-LD keys returned for the second URL (bonappetit).
metadata[1].keys()

dict_keys(['@context', '@type', 'articleBody', 'alternativeHeadline', 'keywords', 'thumbnailUrl', 'publisher', 'isPartOf', 'isAccessibleForFree', 'author', 'aggregateRating', 'description', 'image', 'headline', 'name', 'recipeIngredient', 'recipeInstructions', 'recipeYield', 'url', 'dateModified', 'datePublished'])

In [50]:
# Ingredient list read directly from the top-level record of the second URL.
metadata[1].get('recipeIngredient')

['6 garlic cloves, finely grated',
 '4 tsp. finely grated peeled ginger',
 '4 tsp. ground turmeric',
 '2 tsp. garam masala',
 '2 tsp. ground coriander',
 '2 tsp. ground cumin',
 '1½ cups whole-milk yogurt (not Greek)',
 '1 Tbsp. kosher salt',
 '2 lb. skinless, boneless chicken breasts, halved lengthwise',
 '3 Tbsp. ghee (clarified butter) or vegetable oil',
 '1 small onion, thinly sliced',
 '¼ cup tomato paste',
 '6 cardamom pods, crushed',
 '2 dried chiles de árbol or ½ tsp. crushed red pepper flakes',
 '1 28-oz. can whole peeled tomatoes, like San Marzano',
 '2 cups heavy cream',
 '¾ cup chopped cilantro, plus sprigs for garnish',
 'Steamed basmati rice (for serving)']

In [13]:
# Top-level JSON-LD keys returned for the third URL (delish).
metadata[2].keys()

dict_keys(['@graph', '@context', 'url', 'publisher', '@type', 'author', 'datePublished', 'headline', 'image', 'mainEntityOfPage', 'thumbnailUrl', 'dateModified', 'isAccessibleForFree', 'hasPart', 'name', 'prepTime', 'cookTime', 'totalTime', 'recipeIngredient', 'recipeInstructions', 'video', 'recipeCuisine', 'aggregateRating', 'review', 'recipeCategory', 'recipeYield', 'description', 'keywords'])

In [49]:
# Ingredient list of the third URL; the recorded strings still contain
# literal <p>...</p> markup.
metadata[2].get('recipeIngredient')

['1 tbsp. <p>extra-virgin olive oil</p>',
 '6 <p>bone-in, skin-on chicken thighs (about 2 pounds)</p>',
 '<p>Kosher salt</p>',
 '<p>Freshly ground black pepper</p>',
 '2 <p>cloves garlic, minced</p>',
 '1 tbsp. <p>fresh thyme leaves</p>',
 '1 tsp. <p>crushed red pepper flakes</p>',
 '3/4 c. <p>low-sodium chicken broth</p>',
 '1/2 c. <p>heavy cream</p>',
 '1/2 c. <p>chopped sun-dried tomatoes</p>',
 '1/4 c. <p>freshly grated Parmesan</p>',
 '<p>Freshly torn basil, for serving</p>']

In [48]:
# Defined inside this cell so it also works on a fresh kernel run top to
# bottom, without relying on a cell that appears later in the notebook.
tags = ['totalTime', 'recipeIngredient']

# The first URL keeps its nodes under '@graph'; print the ingredient list of
# every node that carries one.
for dic in metadata[0]['@graph']:
    if tags[1] in dic:
        print(dic.get(tags[1]))

['4 medium-to-large cloves garlic, pressed or minced', '1/4 cup lemon juice', '1/2 cup tahini', '1/2 teaspoon fine sea salt', 'Pinch of ground cumin', '6 tablespoons ice water, more as needed']


In [47]:
# Record keys of interest: total time and the ingredient list.
tags = ['totalTime', 'recipeIngredient']

In [51]:
# '@graph' of the third URL; the recorded output holds a single Question
# node rather than recipe data.
metadata[2].get('@graph')

[{'@type': 'Question',
  'text': 'Do you prefer cooking chicken thighs or breasts?',
  'suggestedAnswer': [{'@type': 'Answer', 'text': 'Thighs.'},
   {'@type': 'Answer', 'text': 'Breasts.'}]}]

In [55]:
# JSON-LD '@context' declared by the first URL's record.
metadata[0]['@context']

'https://schema.org'

## Metadata Structure

In [104]:
# The first URL's record has only '@context' and '@graph' at top level, so
# there is no '@type' here and .get() yields None (no output is shown).
metadata[0].get('@type')

In [105]:
# Top-level '@type' of the second URL's record.
metadata[1].get('@type')

'Recipe'

In [106]:
# Top-level '@type' of the third URL's record.
metadata[2].get('@type')

'Recipe'

In [119]:
def _is_recipe(node) -> bool:
    """Return True when a JSON-LD node declares the type 'Recipe'.

    '@type' may be a single string or a list of strings; the comparison is
    case-insensitive. Anything that is not a dict is never a recipe.
    """
    if not isinstance(node, dict):
        return False
    declared = node.get('@type', '')
    if isinstance(declared, str):
        declared = [declared]
    elif not isinstance(declared, (list, tuple)):
        return False
    return any(isinstance(name, str) and name.lower() == 'recipe' for name in declared)


meta = scrape(urls[1])
# Kept as top-level names in case later cells inspect them.
mdtype = meta.get('@type', '')
gtype = meta.get('@graph', None)

if _is_recipe(meta):
    # The record itself is the recipe.
    print('a')
    data = meta

elif isinstance(gtype, list):
    # The recipe is nested among the '@graph' nodes; take the first match.
    print('b')
    data = next((tag for tag in gtype if _is_recipe(tag)), None)
    if data is None:
        # Previously this fell through silently, leaving `data` unset or
        # stale from an earlier run.
        raise ValueError('Recipe not found')

else:
    raise ValueError('Recipe not found')

a


In [120]:
# Top-level '@type' of the record held in `meta`, defaulting to ''.
meta.get('@type', '')

'Recipe'