In [5]:
### Defining functions to scrape website & food blogs
## 1. With application/ld+json
## 2. With schema tags itemprop
## 3. With unstructured format 




In [186]:
# Sample URLs grouped by the kind of markup the page exposes.
# NOTE: these must be assignments (`=`); `name: [...]` is only a variable
# annotation in Python and leaves the name undefined.

# Pages exposing the recipe as application/ld+json
json_ld = [
    "https://www.marmiton.org/recettes/recette_gratin-dauphinois-tres-facile_58956.aspx",
    "https://clemfoodie.com/2021/08/22/crumble-aux-prunes-rouges-et-amandes/",
    "https://tangerinezest.com/pizza-tomate-burrata-basilic-et-mortadelle/",
    "https://recettesdejulie.fr/10351/crumble-mirabelles-avoine-noisettes/"
]

# Pages exposing the recipe through schema.org itemprop tags (Microdata)
microdata = [
    "https://mesbrouillonsdecuisine.fr/pancakes-sales-a-la-farine-de-pois-chiches-jeunes-pousses-depinards-et-tomates-sechees/"
]

# Harder test pages, kept separate from the two structured groups above
hard_test_urls = [
    "https://www.undejeunerdesoleil.com/2018/05/tramezzini-thon-artichauts-venise.html",
    "http://www.chezmisa.com/burgers-de-boeuf-persilles-et-sauce-au-miel/",
    "https://cookingjulia.blogspot.com/2021/08/poulet-la-mexicaine.html",
    "https://wernerhappyeats.com/djeunerdner/2017/12/29/grilled-chicken-lunch-bento-yxbn2",
    "https://doriannn.blogspot.com/2021/08/knackinkorea-parce-que-decidement-je-ne.html",
    "https://madamcadamia.com/2021/09/02/crumble-aux-mures-et-noisettes/",
    "https://www.plusunemiettedanslassiette.fr/moules-marinara/"
]

In [163]:
import pandas as pd
import numpy as np
import requests
import extruct
import pprint
from w3lib.html import get_base_url

In [440]:
# Smoke-test the scraper on a single recipe page.
# `scrape` is defined in a later cell of this notebook; run that cell first.
url = 'https://yuka.io/recettes/banana-bread/'
recipe = scrape(url)
# Each field under recipe['recipe'] is a pandas Series (see the printed output),
# which is why the commented-out line below unwraps it with `.item()`.
print(recipe)
#print(recipe['recipe']['yield'].item())

{'recipe': {'name': 0    Banana bread au beurre de cacahuètes
Name: name, dtype: object, 'yield': 0    1 cake 
Name: recipeYield, dtype: object, 'ingredients': 0    [3 bananes bien mûres , 3 œufs , 100g de beurr...
Name: recipeIngredient, dtype: object}}


# 1. Scraping JSON-LD or Microdata

In [393]:
"""Fetch structured JSON-LD OR MICRODATA data from a given URL."""
from typing import Optional, List
import requests
import extruct
from w3lib.html import get_base_url
from bs4 import BeautifulSoup


def scrape(url: str) -> "dict | set":
    """Scrape a recipe's structured data (JSON-LD or Microdata) from a URL.

    The page is downloaded once with ``get_html``, its structured metadata is
    extracted with ``get_metadata`` and turned into a one-row DataFrame by
    ``get_recipe_df``.

    Args:
        url: Address of the recipe page.

    Returns:
        ``{"recipe": {"name": ..., "yield": ..., "ingredients": ...}}`` where
        the values are the ``name``, ``recipeYield`` and ``recipeIngredient``
        columns of the DataFrame returned by ``get_recipe_df``; or the set
        ``{"no results"}`` when no structured metadata was found.
    """
    # The page used to be requested a second time here only to compute a base
    # URL that was never used; get_html() is now the single download.
    html = get_html(url)
    metadata = get_metadata(html, url)
    if not metadata:
        # Kept as a set literal so existing callers see the same value.
        return {"no results"}

    recipe_df = get_recipe_df(metadata)
    return {
        "recipe": {
            "name": recipe_df.name,
            "yield": recipe_df.recipeYield,
            "ingredients": recipe_df.recipeIngredient,
        }
    }


def get_html(url: str, timeout: float = 30.0) -> str:
    """Download a page and return its body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout the call could block forever.

    Returns:
        The response body decoded as text (``Response.text``). The HTTP status
        is not checked, so an error page is returned as-is.
    """
    # Only the User-Agent is sent: the former Access-Control-* entries are CORS
    # *response* headers and carry no meaning on an outgoing request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text


def _contains_recipe(item) -> bool:
    """Tell whether a structured-data item (or its ``@graph``) has ``recipeIngredient``."""
    if not isinstance(item, dict):
        return False
    if 'recipeIngredient' in item:
        return True
    graph = item.get('@graph')
    return isinstance(graph, list) and any(
        isinstance(node, dict) and 'recipeIngredient' in node for node in graph
    )


def get_metadata(html, url: str):
    """Extract the recipe's structured data from already-downloaded HTML.

    JSON-LD is tried first, then Microdata. The first item that carries a
    ``recipeIngredient`` key (directly or inside its ``@graph``) is returned.
    When no item does, the first item of the first non-empty syntax is
    returned, which is what this function always returned before.

    Args:
        html: Page source as text.
        url: Address the page was downloaded from; used to resolve the base URL.

    Returns:
        A dict describing one structured-data item, or an empty list when the
        page exposes neither JSON-LD nor Microdata.
    """
    # The base URL is derived from the HTML we already have instead of issuing
    # another (header-less, timeout-less) HTTP request for the same page.
    base_url = get_base_url(html, url)

    first_item = None
    for syntax in ('json-ld', 'microdata'):
        items = extruct.extract(
            html,
            base_url=base_url,
            syntaxes=[syntax],
            uniform=True
        )[syntax]
        if not isinstance(items, list) or not items:
            continue
        for item in items:
            if _contains_recipe(item):
                return item
        if first_item is None:
            # Remember the legacy answer in case no syntax yields a recipe.
            first_item = items[0]

    return first_item if first_item is not None else []

def get_recipe_df(metadata):
    """Build a one-row DataFrame holding the recipe found in ``metadata``.

    Args:
        metadata: A structured-data dict. When it has an ``'@graph'`` key, each
            graph node becomes one row; otherwise the dict itself is the only
            row.

    Returns:
        A DataFrame with at most one row: the first row whose
        ``recipeIngredient`` is not missing.

    Raises:
        KeyError: If no row provides a ``recipeIngredient`` field.
    """
    if '@graph' in metadata:
        recipe_df = pd.DataFrame(metadata['@graph'])
    else:
        # One dict -> one row. List-valued fields (ingredients, images, ...)
        # stay whole in a single cell; building the frame with
        # from_dict(orient='index').transpose() could instead spread them over
        # several rows depending on the order of the keys.
        recipe_df = pd.DataFrame([metadata])

    if 'recipeIngredient' not in recipe_df.columns:
        raise KeyError('recipeIngredient')

    # Select the first row that has ingredients directly; sorting a column of
    # lists just to push the missing values last is fragile.
    has_ingredients = recipe_df['recipeIngredient'].notna()
    return recipe_df[has_ingredients].head(1)


# 2. Non-structured data

In [725]:
# Read the labelled text dataset from disk
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
label_csv_path = r'/Users/vincentsalamand/Documents/datasets/label_recipe_text.csv'
label_data = pd.read_csv(label_csv_path)

# Report how many rows have at least one missing value, then preview the last rows
rows_with_missing = label_data.isnull().any(axis=1)
print(len(label_data[rows_with_missing]))
label_data.tail()


0


Unnamed: 0,text,type
104944,j’ai refait ma recette de brioche moelleuse dé...,other
104945,je n’ai pas pu m’empêcher de la photographier ...,other
104946,"pour cette recette, j’ai utilisé environ 1/3 d...",other
104947,"la brioche est moelleuse, parfaitement aromati...",other
104948,elle monte parfaitement pour être très légère ...,other


In [726]:
# 80/20 split: draw the training rows with a fixed seed, the remaining rows form the test set
train_fraction, split_seed = 0.8, 42
train = label_data.sample(frac=train_fraction, random_state=split_seed)
test = label_data.drop(index=train.index)


In [731]:
# Keep only the text and its label, then turn every row into a (text, type) tuple
train_subset = train[['text', 'type']]
test_subset = test[['text', 'type']]
train_tuples = list(map(tuple, train_subset.to_numpy()))
test_tuples = list(map(tuple, test_subset.to_numpy()))



In [734]:
import textblob

ModuleNotFoundError: No module named 'textblob'

In [733]:
# Create a Naive Bayes classifier, passing the training data into the constructor.
# Requires the `textblob` package; the recorded output of this cell shows
# ModuleNotFoundError, i.e. it was not installed in the kernel's environment.
from textblob.classifiers import NaiveBayesClassifier
cl = NaiveBayesClassifier(train_tuples)



ModuleNotFoundError: No module named 'textblob'