In [5]:
### Defining functions to scrape websites & food blogs
## 1. With application/ld+json
## 2. With schema tags itemprop
## 3. With unstructured format 




In [7]:
# Sample recipe pages used to exercise the scrapers, grouped by list name.

# Pages grouped under the JSON-LD (application/ld+json) case.
json_ld= [
    "https://www.marmiton.org/recettes/recette_gratin-dauphinois-tres-facile_58956.aspx",
    "https://clemfoodie.com/2021/08/22/crumble-aux-prunes-rouges-et-amandes/",
    "https://tangerinezest.com/pizza-tomate-burrata-basilic-et-mortadelle/",
    "https://recettesdejulie.fr/10351/crumble-mirabelles-avoine-noisettes/"
]
    
# Page grouped under the Microdata (schema.org itemprop) case.
microdata = [
    "https://mesbrouillonsdecuisine.fr/pancakes-sales-a-la-farine-de-pois-chiches-jeunes-pousses-depinards-et-tomates-sechees/"
]
    
# Harder pages; presumably the ones without usable structured recipe markup
# (the "unstructured format" case) -- TODO confirm per URL.
hard_test_urls = [
    "https://www.undejeunerdesoleil.com/2018/05/tramezzini-thon-artichauts-venise.html",
    "http://www.chezmisa.com/burgers-de-boeuf-persilles-et-sauce-au-miel/",
    "https://cookingjulia.blogspot.com/2021/08/poulet-la-mexicaine.html",
    "https://wernerhappyeats.com/djeunerdner/2017/12/29/grilled-chicken-lunch-bento-yxbn2",
    "https://doriannn.blogspot.com/2021/08/knackinkorea-parce-que-decidement-je-ne.html",
    "https://madamcadamia.com/2021/09/02/crumble-aux-mures-et-noisettes/",
    "https://www.plusunemiettedanslassiette.fr/moules-marinara/"
]

In [5]:
import pandas as pd
import numpy as np
import requests
import extruct
import pprint
from w3lib.html import get_base_url

In [None]:
# Manual smoke test: scrape the second "hard" URL and display the result.
# NOTE: scrape_recipe and the helpers it calls are defined in cells further
# down, so this cell only works after those definition cells have been run.
recipe = scrape_recipe(hard_test_urls[1])
recipe


# Scraping entry point

In [46]:

def scrape_recipe(url):
    """Scrape a recipe from ``url``, preferring structured metadata.

    The structured-data scraper is tried first; only when it yields ``None``
    is the unstructured (free-text) scraper used instead.

    Args:
        url: Address of the recipe page.

    Returns:
        Whatever the successful scraper returned, or the unstructured
        scraper's result (possibly ``None``) when structured scraping failed.
    """
    structured = scrape_structured_data(url)
    if structured is not None:
        return structured
    return scrape_unstructured_data(url)
    

# 0. Ingredients filter

In [113]:
import spacy
# Location of the saved spaCy pipeline loaded below.
# NOTE(review): this is an absolute, user-specific path; the notebook will not
# run on another machine until it is pointed at a local copy of the model.
saved_model_path = "/Users/vincentsalamand/Documents/datascience/models/recipe_cat_spacy_model_150921_full"
# `nlp` is the module-level pipeline that get_ingredients() reads `doc.cats` from.
nlp = spacy.load(saved_model_path)

def get_ingredients(text_list):
    """Keep only the texts whose highest-scoring category is "ingredient".

    Each text is run through the module-level ``nlp`` pipeline and the label
    with the largest score in ``doc.cats`` decides whether it is kept.

    Args:
        text_list: Iterable of strings to classify.

    Returns:
        List of the input texts (``doc.text``) labelled "ingredient", in
        their original order.
    """
    return [
        doc.text
        for doc in nlp.pipe(text_list)
        if max(doc.cats, key=doc.cats.get) == "ingredient"
    ]



# 1. Scraping JSON-LD or Microdata

In [102]:
"""Fetch structured JSON-LD OR MICRODATA data from a given URL."""
from typing import Optional, List
import requests
import extruct
from w3lib.html import get_base_url
from bs4 import BeautifulSoup

def scrape_structured_data(url: str) -> Optional[dict]:
    """Build a recipe dict from a page's JSON-LD or Microdata markup.

    Args:
        url: Address of the recipe page.

    Returns:
        ``{"recipe": {"name": ..., "yield": ..., "ingredients": [...]}}`` when
        structured metadata is present and exposes the expected fields,
        otherwise ``None``.
    """
    # get_html() performs the download; no separate request is needed here.
    html = get_html(url)
    metadata = get_metadata(html, url)
    if not metadata:
        return None
    try:
        recipe_df = get_recipe_df(metadata)
        return {"recipe": {
                    "name": recipe_df.name.values[0],
                    "yield": recipe_df.recipeYield.values[0],
                    "ingredients": get_ingredients(recipe_df.recipeIngredient.values[0])
                   }
               }
    except Exception:
        # Best effort: metadata that lacks the expected recipe fields (or is
        # otherwise malformed) is reported as "no structured recipe" so the
        # caller can fall back to another strategy.
        return None


def get_html(url: str, timeout: float = 30.0) -> str:
    """Download a page and return its body as text.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The response body decoded by ``requests`` (``response.text``).

    Raises:
        requests.exceptions.RequestException: on connection errors or when the
            timeout expires.
    """
    # NOTE(review): the Access-Control-* entries are CORS *response* headers and
    # probably have no effect when sent in a request; they are kept so the
    # outgoing request is unchanged. The User-Agent mimics a desktop browser.
    headers = {
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Methods': 'GET',
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Max-Age': '3600',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text


def get_metadata(html, url: str):
    """Extract structured metadata from ``html``, preferring JSON-LD.

    JSON-LD is tried first; Microdata is only extracted when the JSON-LD
    result is an empty list.

    Args:
        html: Page markup to parse.
        url: Address the markup was downloaded from; used as the fallback
            base URL when the markup declares none.

    Returns:
        The first extracted item when the result is a non-empty list
        (later items are ignored); otherwise the empty result unchanged.
    """
    # Derive the base URL from the markup already in hand instead of
    # downloading the page a second time.
    base_url = get_base_url(html, url)

    metadata = []
    for syntax in ('json-ld', 'microdata'):
        metadata = extruct.extract(
            html,
            base_url=base_url,
            syntaxes=[syntax],
            uniform=True
        )[syntax]
        if metadata != []:
            break

    if bool(metadata) and isinstance(metadata, list):
        metadata = metadata[0]
    return metadata

def get_recipe_df(metadata):
    """Return a one-row DataFrame for the first node that lists ingredients.

    Args:
        metadata: Structured-data dict. Either a container with an ``'@graph'``
            list of nodes, or a single flat node.

    Returns:
        DataFrame with at most one row: the first node (in document order)
        whose ``recipeIngredient`` is not null. The frame is empty when the
        key exists but every value is null.

    Raises:
        KeyError: if no node has a ``recipeIngredient`` key at all.
    """
    if '@graph' in metadata:
        # One row per graph node, one column per key seen in any node.
        recipe_df = pd.DataFrame(metadata['@graph'])
    else:
        # Wrap the node in a list so it always becomes exactly one row,
        # whatever the value types are (lists and dicts stay whole in their
        # cell). dtype=object keeps scalars as plain Python objects.
        recipe_df = pd.DataFrame([metadata], dtype=object)

    # Select by "has ingredients" rather than sorting: sorting a column of
    # lists compares them lexicographically and fails on mixed list/str cells.
    has_ingredients = recipe_df['recipeIngredient'].notna()
    return recipe_df[has_ingredients].head(1)


# 2. Non-structured data

## Using spaCy 

In [103]:
from trafilatura import fetch_url, extract
import trafilatura
from nltk.tokenize import sent_tokenize


def scrape_unstructured_data(url: str):
    """Build a recipe dict from a page's main text when no metadata is usable.

    The page is downloaded with trafilatura, its main text is split into
    fragments, and the fragments classified as ingredients are kept.

    Args:
        url: Address of the recipe page.

    Returns:
        ``{"recipe": {"name": ..., "yield": None, "ingredients": [...]}}``, or
        ``None`` when the download fails or the text cannot be split.
    """
    page = fetch_url(url)
    if page is None:
        return None

    # Main text of the page, without the comment section.
    main_text = extract(page, include_comments=False)
    fragments = sentence_parser(main_text)
    if fragments is None:
        return None

    fragments_df = pd.DataFrame(fragments, columns=['text']).dropna()
    title = get_title(page)
    ingredients = get_ingredients(fragments_df.text.to_list())
    return {"recipe": {
                "name": title,
                "yield": None,
                "ingredients": ingredients
               }
           }

def sentence_parser(result):
    """Split extracted page text into short fragments (sentences / list items).

    The text is first cut into French sentences. A sentence that looks like a
    bulleted list (more than two ``*`` once newlines and dash/bullet markers
    are normalised) is split on those markers; any other sentence is split on
    newlines, carriage returns and non-breaking spaces.

    Args:
        result: Text to split, or ``None``.

    Returns:
        List of non-empty fragments, or ``None`` when ``result`` is ``None``
        or tokenisation fails.
    """
    if result is None:
        return None
    try:
        sentences = sent_tokenize(result, language='french')
    except Exception:
        # Best effort: unusable input or a tokenizer failure means "no text".
        return None

    fragments = []
    for sentence in sentences:
        # Normalise every list marker to "* " once, then reuse the result for
        # both the bullet count and the split.
        bulleted = sentence
        for marker in ("\n", "– ", "- ", "• "):
            bulleted = bulleted.replace(marker, "* ")

        if bulleted.count('*') > 2:
            fragments.extend(bulleted.split("* "))
        else:
            plain = sentence
            for marker in ('\n', '\r', '\xa0'):
                plain = plain.replace(marker, '* ')
            fragments.extend(plain.split('* '))

    # Drop the empty strings produced by consecutive separators.
    return [fragment for fragment in fragments if fragment]
    

def get_title(downloaded):
    """Return the page title found by trafilatura, or ``None``.

    Args:
        downloaded: Page content as returned by ``fetch_url``.

    Returns:
        The ``'title'`` entry of trafilatura's bare extraction, or ``None``
        when extraction yields nothing.
    """
    extracted = trafilatura.bare_extraction(downloaded)
    # bare_extraction gives None when it cannot extract anything; report a
    # missing title instead of failing on the subscript.
    if extracted is None:
        return None
    return extracted['title']
