In [65]:
import extruct
import requests
from pprint import pprint
from bs4 import BeautifulSoup
from w3lib.html import get_base_url

In [66]:
# Example recipe page used as the default scrape target in the cells below.
url = "https://cookieandkate.com/best-tahini-sauce-recipe/"

In [67]:
def get_html(url: str, headers: dict = None, timeout: float = 30) -> str:
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        headers: Optional extra request headers. They are merged over the
            built-in defaults below, so a caller-supplied key wins.
        timeout: Seconds passed to ``requests.get`` as ``timeout`` so a
            stalled server raises instead of hanging the kernel.

    Returns:
        ``response.text`` (a ``str``), returned regardless of the HTTP
        status code.
    """
    # NOTE(review): the Access-Control-* entries are CORS *response* headers;
    # they are kept because the original request sent them, but they look
    # unnecessary on an outgoing request -- confirm and drop.
    request_headers = {
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Methods': 'GET',
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Max-Age': '3600',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }
    # Previously the `headers` argument was overwritten and never used.
    if headers:
        request_headers.update(headers)
    response = requests.get(url, headers=request_headers, timeout=timeout)
    return response.text

In [68]:
def get_metadata(html: str, url: str) -> dict:
    """Extract the first JSON-LD item embedded in a page.

    Args:
        html: Markup of the page, as returned by ``get_html``.
        url: Address the markup was fetched from; used as the fallback
            base URL when the document declares no ``<base>`` tag.

    Returns:
        The first entry of extruct's ``'json-ld'`` result when that result
        is a non-empty list, the result itself when it is some other
        non-empty value, or an empty dict when nothing was found.
    """
    items = extruct.extract(
        html,
        # get_base_url takes the document text first and the page URL second;
        # passing only the URL made it scan the URL string as markup and
        # fall back to an empty base URL.
        base_url=get_base_url(html, url),
        syntaxes=['json-ld'],
        uniform=True
    )['json-ld']
    if not items:
        # Honour the declared dict return type when no JSON-LD is present.
        return {}
    if isinstance(items, list):
        return items[0]
    return items

In [73]:
def scrape(url: str, pretty_print: bool=False) -> dict:
    """Fetch a page and return the structured data found in it.

    Args:
        url: Address of the page to download and parse.
        pretty_print: When true, also pretty-print the result before
            returning it.

    Returns:
        Whatever ``get_metadata`` produces for the downloaded markup.
    """
    page_source = get_html(url)
    structured = get_metadata(page_source, url)
    if not pretty_print:
        return structured
    pprint(structured, indent=2, width=150)
    return structured

In [74]:
# Scrape both example pages: `metadata` holds the result for `url`,
# `ba_metadata` the result for the Bon Appétit recipe.
metadata = scrape(url=url)
ba_metadata = scrape(url='https://www.bonappetit.com/recipe/chicken-tikka-masala')

In [77]:
# Top-level keys of the structured data scraped from the Bon Appétit page.
ba_metadata.keys()

dict_keys(['@context', '@type', 'articleBody', 'alternativeHeadline', 'keywords', 'thumbnailUrl', 'publisher', 'isPartOf', 'isAccessibleForFree', 'author', 'aggregateRating', 'description', 'image', 'headline', 'name', 'recipeIngredient', 'recipeInstructions', 'recipeYield', 'url', 'dateModified', 'datePublished'])

In [76]:
# Top-level keys of the structured data scraped from `url`.
metadata.keys()

dict_keys(['@context', '@graph'])

In [80]:
# Inspect the '@graph' entry of the structured data scraped from `url`.
metadata['@graph']

[{'@type': 'Organization',
  '@id': 'https://cookieandkate.com/#organization',
  'name': 'Cookie and Kate',
  'url': 'https://cookieandkate.com/',
  'sameAs': ['https://www.facebook.com/cookieandkate',
   'https://instagram.com/cookieandkate',
   'https://www.linkedin.com/in/kathryne-taylor-20012213/',
   'https://www.youtube.com/user/cookieandkate/',
   'https://www.pinterest.com/cookieandkate/',
   'https://twitter.com/cookieandkate'],
  'logo': {'@type': 'ImageObject',
   '@id': 'https://cookieandkate.com/#logo',
   'inLanguage': 'en-US',
   'url': 'https://cookieandkate.com/images/2017/04/header-google.jpg',
   'contentUrl': 'https://cookieandkate.com/images/2017/04/header-google.jpg',
   'width': 782,
   'height': 336,
   'caption': 'Cookie and Kate'},
  'image': {'@id': 'https://cookieandkate.com/#logo'}},
 {'@type': 'WebSite',
  '@id': 'https://cookieandkate.com/#website',
  'url': 'https://cookieandkate.com/',
  'name': 'Cookie and Kate',
  'description': 'Whole Foods and Veget

In [None]:
# Empty list; no cell visible here reads it -- presumably a placeholder
# for later work (TODO confirm or remove).
tags = []

In [54]:
# Recipe fields live on the Bon Appétit result; `metadata` only exposes
# '@context' and '@graph', so indexing it here raises KeyError on a fresh run.
ba_metadata['recipeYield']

'6 servings'

In [30]:
# Ingredient list of the Bon Appétit recipe (the key is absent from `metadata`,
# which only has '@context' and '@graph').
ba_metadata['recipeIngredient']

['6 garlic cloves, finely grated',
 '4 tsp. finely grated peeled ginger',
 '4 tsp. ground turmeric',
 '2 tsp. garam masala',
 '2 tsp. ground coriander',
 '2 tsp. ground cumin',
 '1½ cups whole-milk yogurt (not Greek)',
 '1 Tbsp. kosher salt',
 '2 lb. skinless, boneless chicken breasts, halved lengthwise',
 '3 Tbsp. ghee (clarified butter) or vegetable oil',
 '1 small onion, thinly sliced',
 '¼ cup tomato paste',
 '6 cardamom pods, crushed',
 '2 dried chiles de árbol or ½ tsp. crushed red pepper flakes',
 '1 28-oz. can whole peeled tomatoes, like San Marzano',
 '2 cups heavy cream',
 '¾ cup chopped cilantro, plus sprigs for garnish',
 'Steamed basmati rice (for serving)']

In [32]:
# Step-by-step instructions of the Bon Appétit recipe (the key is absent from
# `metadata`, which only has '@context' and '@graph').
ba_metadata['recipeInstructions']

[{'@type': 'HowToStep',
  'text': 'Combine garlic, ginger, turmeric, garam masala, coriander, and cumin in a small bowl. Whisk yogurt, salt, and half of spice mixture in a medium bowl; add chicken and turn to coat. Cover and chill 4-6 hours. Cover and chill remaining spice mixture.'},
 {'@type': 'HowToStep',
  'text': 'Heat ghee in a large heavy pot over medium heat. Add onion, tomato paste, cardamom, and chiles and cook, stirring often, until tomato paste has darkened and onion is soft, about 5 minutes. Add remaining half of spice mixture and cook, stirring often, until bottom of pot begins to brown, about 4 minutes.'},
 {'@type': 'HowToStep',
  'text': 'Add tomatoes with juices, crushing them with your hands as you add them. Bring to a boil, reduce heat, and simmer, stirring often and scraping up browned bits from bottom of pot, until sauce thickens, 8-10 minutes.'},
 {'@type': 'HowToStep',
  'text': 'Add cream and chopped cilantro. Simmer, stirring occasionally, until sauce thickens

In [34]:
# Description of the Bon Appétit recipe (the key is absent from `metadata`,
# which only has '@context' and '@graph').
ba_metadata['description']

'For this chicken tikka masala recipe, the yogurt helps tenderize the chicken; the garlic, ginger, and spices in the marinade infuse it with lots of flavor.'

In [38]:
# Image URLs of the Bon Appétit recipe (the key is absent from `metadata`,
# which only has '@context' and '@graph').
ba_metadata['image']

['https://assets.bonappetit.com/photos/5b69f163d3d14670539a2174/5:7/w_1900,h_2660,c_limit/ba-tikka-masala-2.jpg',
 'https://assets.bonappetit.com/photos/5b69f163d3d14670539a2174/6:9/w_1776,h_2664,c_limit/ba-tikka-masala-2.jpg',
 'https://assets.bonappetit.com/photos/5b69f163d3d14670539a2174/5:4/w_3330,h_2664,c_limit/ba-tikka-masala-2.jpg',
 'https://assets.bonappetit.com/photos/5b69f163d3d14670539a2174/8:5/w_4264,h_2665,c_limit/ba-tikka-masala-2.jpg',
 'https://assets.bonappetit.com/photos/5b69f163d3d14670539a2174/3:2/w_3999,h_2666,c_limit/ba-tikka-masala-2.jpg',
 'https://assets.bonappetit.com/photos/5b69f163d3d14670539a2174/16:9/w_4736,h_2664,c_limit/ba-tikka-masala-2.jpg',
 'https://assets.bonappetit.com/photos/5b69f163d3d14670539a2174/4:3/w_3552,h_2664,c_limit/ba-tikka-masala-2.jpg',
 'https://assets.bonappetit.com/photos/5b69f163d3d14670539a2174/1:1/w_2666,h_2666,c_limit/ba-tikka-masala-2.jpg']

In [39]:
# Author entries of the Bon Appétit recipe (the key is absent from `metadata`,
# which only has '@context' and '@graph').
ba_metadata['author']

[{'@type': 'Person',
  'name': 'Alison Roman',
  'sameAs': 'https://www.bonappetit.com/contributor/alison-roman'}]