In [39]:
import extruct
import requests
from pprint import pprint
from bs4 import BeautifulSoup
from w3lib.html import get_base_url
from recipe_scrapers import scrape_me
from bs4 import BeautifulSoup

In [40]:
SYNTAXES = ["json-ld", "microdata"]

def get_html(url: str, headers: dict={}) -> bytes:
    headers = {
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Methods': 'GET',
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Max-Age': '3600',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }
    response = requests.get(url, headers=headers)
    return response.text

def get_metadata(html: bytes, url: str) -> dict:
    metadata = extruct.extract(html, syntaxes=SYNTAXES, errors="log", uniform=True)
    return metadata

def scrape(url: str, pretty_print: bool=False) -> dict:
    """Parse structured data from a target page."""
    html = get_html(url)
    metadata = get_metadata(html, url)
    if pretty_print:
        pprint(metadata, indent=2, width=150)
    return metadata

## Serious Eats

In [218]:
url = 'https://www.allrecipes.com/recipe/228238/goat-stew/'
page = get_html(url)
soup = BeautifulSoup(page, "html.parser")

In [219]:
soup.find('meta', {'name': 'og:rating'})

<meta content="4.5777777777777775" name="og:rating"/>

In [160]:
tags = soup.find('ul', {'class': 'instructions-section'})

In [164]:
tags.find_all('p')[0].get_text()

'Mix goat meat with vinegar, soy sauce, and garlic in a large bowl; cover and refrigerate from 1 to 8 hours. For best flavor, marinate at least 6 hours. Remove meat from marinade and pat dry with paper towels; reserve marinade and garlic cloves.'

In [172]:
text = soup.find('div', 
          {'class': 'recipe-nutrition-section'}
         ).find('div', {'class': 'section-body'}).get_text().strip()

In [174]:
text.endswith('Full Nutrition')

True

In [180]:
text = text.replace('. Full Nutrition', '')
text2 = text.split(';')
nutrition = {}
nutrition['Calories'] = float(text2[0].split(' ')[0])

In [185]:
{t.strip().split(' ')[0]: t.strip().split(' ')[1] for t in text2[1:]}

{'protein': '26.2g',
 'carbohydrates': '27g',
 'fat': '6.5g',
 'cholesterol': '53.1mg',
 'sodium': '1670.3mg'}

In [189]:
import re

In [200]:
pattern = re.compile('total')
columns = soup.find_all('div', text='total', attrs={'class' : 'recipe-meta-item'})

In [201]:
columns

[]

In [203]:
soup.find_all('meta')

[<meta charset="utf-8"/>,
 <meta content="IE=edge" http-equiv="X-UA-Compatible"/>,
 <meta content="width=device-width,initial-scale=1" name="viewport"/>,
 <meta content="This goat meat stew is a version of a beloved Filipino dish (calderata) with carrots, potatoes, and peas in a flavorful tomato sauce. Serve it hot over cooked rice." name="description"/>,
 <meta content="Goat Stew" property="og:title"/>,
 <meta content="article" property="og:type"/>,
 <meta content="Allrecipes" property="og:site_name"/>,
 <meta content="https://www.allrecipes.com/recipe/228238/goat-stew/" property="og:url"/>,
 <meta content="This goat meat stew is a version of a beloved Filipino dish (calderata) with carrots, potatoes, and peas in a flavorful tomato sauce. Serve it hot over cooked rice. " property="og:description"/>,
 <meta content="https://www.allrecipes.com/recipe/228238/goat-stew/" name="pinterest:url"/>,
 <meta content="summary_large_image" name="twitter:card"/>,
 <meta content="https://www.allreci

In [212]:
soup.find('a', {'class': 'breadcrumbs__link--last'}).find('span').get_text()

'Stews'