In [5]:
### Defining functions to scrape websites & food blogs
## 1. With application/ld+json
## 2. With schema tags itemprop
## 3. With unstructured format 




In [186]:
# Sample recipe pages, grouped under the name of the markup category they are
# used to exercise (grouping taken from the variable names; not verified here).
# These must be assignments: `name: [...]` is only a bare variable annotation
# in Python and leaves the name unbound.
json_ld = [
    "https://www.marmiton.org/recettes/recette_gratin-dauphinois-tres-facile_58956.aspx",
    "https://clemfoodie.com/2021/08/22/crumble-aux-prunes-rouges-et-amandes/",
    "https://tangerinezest.com/pizza-tomate-burrata-basilic-et-mortadelle/",
    "https://recettesdejulie.fr/10351/crumble-mirabelles-avoine-noisettes/"
]

microdata = [
    "https://mesbrouillonsdecuisine.fr/pancakes-sales-a-la-farine-de-pois-chiches-jeunes-pousses-depinards-et-tomates-sechees/"
]

hard_test_urls = [
    "https://www.undejeunerdesoleil.com/2018/05/tramezzini-thon-artichauts-venise.html",
    "http://www.chezmisa.com/burgers-de-boeuf-persilles-et-sauce-au-miel/",
    "https://cookingjulia.blogspot.com/2021/08/poulet-la-mexicaine.html",
    "https://wernerhappyeats.com/djeunerdner/2017/12/29/grilled-chicken-lunch-bento-yxbn2",
    "https://doriannn.blogspot.com/2021/08/knackinkorea-parce-que-decidement-je-ne.html",
    "https://madamcadamia.com/2021/09/02/crumble-aux-mures-et-noisettes/",
    "https://www.plusunemiettedanslassiette.fr/moules-marinara/"
]

In [163]:
import pandas as pd
import numpy as np
import requests
import extruct
import pprint
from w3lib.html import get_base_url

In [402]:
# Smoke-test the scraper on a single recipe page and print the raw result.
# NOTE(review): `scrape` is defined in a cell further down the notebook, so on
# a fresh kernel that cell has to be executed before this one.
# NOTE(review): the saved output under this cell shows a different recipe
# (pancakes) than the URL requested here; it looks stale, so re-run to confirm.
url = 'https://www.marmiton.org/recettes/recette_gratin-dauphinois-tres-facile_58956.aspx'
recipe = scrape(url)
print(recipe)
#print(recipe['recipe']['yield'].item())

{'recipe': {'name': 0    Pancakes salés à la farine de pois chiches, je...
Name: name, dtype: object, 'yield': 0    pour 2 personnes
Name: recipeYield, dtype: object, 'ingredients': 0    [pour 6 pancakes :, 140g de farine de pois chi...
Name: recipeIngredient, dtype: object}}


# 1. Scraping JSON-LD or Microdata

In [393]:
"""Fetch structured JSON-LD OR MICRODATA data from a given URL."""
from typing import List, Optional, Union

import extruct
import requests
from bs4 import BeautifulSoup
from w3lib.html import get_base_url


def scrape(url: str) -> Union[dict, set]:
    """Scrape the recipe name, yield and ingredients from a page.

    The download is delegated to ``get_html``; structured data is then pulled
    out by ``get_metadata`` and narrowed down by ``get_recipe_df``. This
    function performs no HTTP request of its own.

    Args:
        url: Address of the recipe page.

    Returns:
        ``{"recipe": {"name": ..., "yield": ..., "ingredients": ...}}`` where
        the three values are the ``name``, ``recipeYield`` and
        ``recipeIngredient`` attributes of the object returned by
        ``get_recipe_df`` (columns, when it is a DataFrame).
        When ``get_metadata`` returns nothing, the one-element set
        ``{"no results"}`` is returned instead.
    """
    html = get_html(url)
    metadata = get_metadata(html, url)

    if not metadata:
        # A set literal, not a dict: kept as-is so that existing callers
        # comparing against {"no results"} keep working.
        return {"no results"}

    recipe_df = get_recipe_df(metadata)
    return {
        "recipe": {
            "name": recipe_df.name,
            "yield": recipe_df.recipeYield,
            "ingredients": recipe_df.recipeIngredient,
        }
    }


def get_html(url: str, timeout: float = 30.0) -> str:
    """Download a page and return its body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as ``timeout``. Without
            one, ``requests`` waits indefinitely on an unresponsive server.

    Returns:
        ``Response.text`` of the GET request. The HTTP status code is not
        checked, so the body of an error page is returned as-is.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests``
            on connection failures or when the timeout expires.
    """
    # Only a browser-like User-Agent is sent. Access-Control-* headers are
    # CORS *response* headers and carry no meaning on an outgoing request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text


def get_metadata(html, url: str):
    """Extract structured data from already-downloaded HTML.

    JSON-LD is tried first; if that yields nothing, Microdata is tried. The
    base URL is derived from the supplied ``html`` and ``url``, so no
    additional HTTP request is made here.

    Args:
        html: Markup of the page, as returned by ``get_html``.
        url: Address the markup was downloaded from; handed to
            ``get_base_url`` together with ``html``.

    Returns:
        The first item of the extracted list when something was found,
        otherwise the empty result returned by ``extruct`` (falsy).
    """
    base_url = get_base_url(html, url)

    metadata = []
    for syntax in ('json-ld', 'microdata'):
        metadata = extruct.extract(
            html,
            base_url=base_url,
            syntaxes=[syntax],
            uniform=True
        )[syntax]
        if metadata:
            break

    # NOTE(review): only the first extracted item is kept; pages whose first
    # item is not the recipe are not handled here.
    if metadata and isinstance(metadata, list):
        return metadata[0]
    return metadata

def get_recipe_df(metadata):
    """Build a one-row DataFrame for the recipe contained in ``metadata``.

    Args:
        metadata: A dict of structured data. If it has an ``'@graph'`` key,
            every entry of that list becomes one row; otherwise the dict
            itself becomes a single row (one column per key).

    Returns:
        A DataFrame holding the first row, in document order, whose
        ``recipeIngredient`` is not null. If no row has ingredients, the
        first row is returned.

    Raises:
        KeyError: If there is no ``recipeIngredient`` column at all.
    """
    if '@graph' in metadata:
        recipe_df = pd.DataFrame(metadata['@graph'])
    else:
        # Wrapping the dict in a list gives exactly one row and keeps list
        # values (e.g. the ingredient list) intact inside a single cell.
        recipe_df = pd.DataFrame([metadata])

    # Select by presence instead of sorting: ordering a column of lists picks
    # the lexicographically smallest list and fails on mixed str/list cells.
    has_ingredients = recipe_df['recipeIngredient'].notna()
    if has_ingredients.any():
        recipe_df = recipe_df[has_ingredients]
    return recipe_df.head(1)


# 2. Non-structured data

In [20]:
import scrape_schema_recipe

In [360]:
# Try the `scrape_schema_recipe` package on one recipe page.
url = 'https://www.byacb4you.com/number-cake-noisette-ganache-pralinoise.html'
recipe_list = scrape_schema_recipe.scrape_url(url, python_objects=True)
# Only the first entry is inspected; presumably this raises IndexError when
# nothing was found on the page (TODO confirm the return type).
recipe = recipe_list[0]

print(recipe['name'])
print(recipe['recipeIngredient'])


Number cake, biscuit noisette et ganache Pralinoise
['4  œufs', '160 g de poudre de noisette', '160 g de sucre glace', '40 g de beurre fondu', '4  blancs d’œuf', '25 g de sucre', '60 g de farine', '50 cl de crème fleurette', '360 g de Pralinoise (2 tablettes)', '40 g de chocolat noir', '100 g de mûres', 'Amandes enrobées de chocolat', 'Crêpes dentelle au chocolat', 'Quelques fleurs de Souci']
['20', '20 parts']


In [408]:
# URLs used for the "hard" test cases; the first one is fetched in the next cell.
# NOTE(review): presumably pages without usable recipe markup. Confirm.
hard_test_urls = [
    "https://www.undejeunerdesoleil.com/2018/05/tramezzini-thon-artichauts-venise.html",
    "http://www.chezmisa.com/burgers-de-boeuf-persilles-et-sauce-au-miel/",
    "https://cookingjulia.blogspot.com/2021/08/poulet-la-mexicaine.html",
    "https://wernerhappyeats.com/djeunerdner/2017/12/29/grilled-chicken-lunch-bento-yxbn2",
    "https://doriannn.blogspot.com/2021/08/knackinkorea-parce-que-decidement-je-ne.html",
    "https://madamcadamia.com/2021/09/02/crumble-aux-mures-et-noisettes/",
    "https://www.plusunemiettedanslassiette.fr/moules-marinara/"
]

In [409]:
# Download the first "hard" URL and pretty-print everything extruct extracts
# from it. No `syntaxes` filter is passed to extruct.extract here.
pp = pprint.PrettyPrinter(indent=2)
# NOTE(review): this request sends no custom headers and no timeout.
r = requests.get(hard_test_urls[0])
base_url = get_base_url(r.text, r.url)
data = extruct.extract(r.text, base_url=base_url)

pp.pprint(data)

{ 'dublincore': [ { 'elements': [ { 'URI': 'http://purl.org/dc/elements/1.1/title',
                                    'content': 'Tramezzini au thon et aux '
                                               'artichauts comme à Venise - Un '
                                               'déjeuner de soleil',
                                    'name': 'dc.title'},
                                  { 'URI': 'http://purl.org/dc/elements/1.1/description',
                                    'content': 'Recette des tramezzini au thon '
                                               'et aux artichauts comme en '
                                               'Italie (Venise). Un apéritif '
                                               '(antipasto) ou en-cas facile, '
                                               'gourmand et frais.',
                                    'name': 'dc.description'},
                                  { 'URI': 'http://purl.org/dc/elements/1.1/relation',
       