In [8]:
## Checking out website structure
# requirements
# !pip install requests
# !pip install openai
# !pip install beautifulsoup4

import requests
from bs4 import BeautifulSoup
import json



In [9]:
# Example 1: https://somuchfoodblog.com/red-wine-braised-beef/
website_request = requests.get(
    "https://somuchfoodblog.com/red-wine-braised-beef/",
    timeout=30,  # requests never times out by default; a stalled server would hang this cell
)
# Fail loudly on a 4xx/5xx answer instead of silently parsing an error page below
website_request.raise_for_status()

# convert the response into a parseable object
website_text = website_request.text
# Name the parser explicitly: without it BeautifulSoup picks whichever parser happens to be
# installed (and warns about it), so the parse tree could differ between environments.
soup = BeautifulSoup(website_text, "html.parser")

# Looking carefully at the raw HTML, the recipe is contained in the element with class
# "yoast-schema-graph"; the next cells extract it and explore its data model.
# print(website_text)  # uncomment to inspect the raw HTML

In [58]:
# Across the examples looked at so far there is only one Yoast graph per page,
# and the text inside that <script> element is a valid JSON document.
yoast_graph = soup.find("script", class_="yoast-schema-graph")
yoast_graph

<script class="yoast-schema-graph" type="application/ld+json">{"@context":"https://schema.org","@graph":[{"@type":"Article","@id":"https://somuchfoodblog.com/red-wine-braised-beef/#article","isPartOf":{"@id":"https://somuchfoodblog.com/red-wine-braised-beef/"},"author":{"name":"Jenny","@id":"https://somuchfoodblog.com/#/schema/person/a377debbb58942b818242c967f402b04"},"headline":"Red Wine Braised Beef","datePublished":"2021-10-28T16:06:10+00:00","dateModified":"2023-12-26T14:57:29+00:00","wordCount":1664,"commentCount":25,"publisher":{"@id":"https://somuchfoodblog.com/#/schema/person/a377debbb58942b818242c967f402b04"},"image":{"@id":"https://somuchfoodblog.com/red-wine-braised-beef/#primaryimage"},"thumbnailUrl":"https://somuchfoodblog.com/wp-content/uploads/2021/10/red-wine-braised-beef5-scaled.jpg","keywords":["beef","braising","one pot meals"],"articleSection":["Beef","Braising","Dinner Party","Fall","Gluten Free","Holiday","Main Dishes","One Pot Meals","Winter"],"inLanguage":"en-US

In [64]:
# Parse the script body as JSON and keep only the list stored under the '@graph' key
yoast_payload = json.loads(yoast_graph.text)
yoast_graph_json = yoast_payload['@graph']

# the graph holds several items; show the '@type' of each one, one per line
for graph_node in yoast_graph_json:
    print(graph_node['@type'])

In [69]:
# The last item of the graph is the recipe object; its keys are the fields we want to extract
list(yoast_graph_json[-1])

['@type',
 'name',
 'author',
 'description',
 'datePublished',
 'image',
 'recipeYield',
 'prepTime',
 'cookTime',
 'totalTime',
 'recipeIngredient',
 'recipeInstructions',
 'aggregateRating',
 'review',
 'recipeCategory',
 'keywords',
 'nutrition',
 '@id',
 'isPartOf',
 'mainEntityOfPage']

In [20]:
## Another example, from a different site
# https://www.food.com/recipe/gordon-ramsays-ultimate-burger-265782
website_request_gr = requests.get(
    "https://www.food.com/recipe/gordon-ramsays-ultimate-burger-265782",
    timeout=30,  # requests never times out by default; a stalled server would hang this cell
)
# Fail loudly on a 4xx/5xx answer so later cells never analyse an error page
website_request_gr.raise_for_status()
website_text_gr = website_request_gr.text


In [24]:
# This page does not share the object structure of the first example: there we could extract
# the Yoast graph directly, whereas here the recipe lives in a different object.
# The parser is named explicitly so the parse does not depend on which optional parsers
# are installed (and so BeautifulSoup does not warn about guessing one).
soup_gr = BeautifulSoup(website_text_gr, "html.parser")

In [34]:
# number of whitespace-separated tokens in the page text
# (a bare split() already discards every newline, so no extra filtering is needed)
len(soup_gr.get_text().split())

998

In [39]:
# character count of the text extracted from the second example page
len(soup_gr.get_text())

15137

In [18]:
# first example: raw HTML size, extracted-text size, and the fraction of the HTML that is text
raw_html_chars = len(website_request.text)
extracted_text_chars = len(soup.get_text())
print(raw_html_chars, extracted_text_chars, extracted_text_chars / raw_html_chars)

381797 18316 0.04797313755739307


In [9]:
# Since we know that page structures vary and writing an individual parser per site would be an intensive workload, why don't we give an LLM a shot at extracting the key content?

In [2]:
# NOTE(review): the other imports live in the first cell; this one is only needed from here on.
from openai import OpenAI
# Client object used by the chat-completion calls in the cells below.
# NOTE(review): no credentials are passed here — presumably the API key is picked up from the
# environment (e.g. OPENAI_API_KEY); confirm before running on a fresh machine.
client = OpenAI()

In [62]:
# flatten the page text: collapse every run of whitespace/newlines into a single space
sample_site_content = " ".join(soup_gr.get_text().split())


In [60]:
# size in characters of the raw HTML of the first example page, before any text extraction
len(website_request.text)

381797

In [45]:
# Ask the model to pull the recipe fields out of the flattened page text.
# response_format=json_object forces the reply to be a single JSON object, so a missing
# field is requested as JSON null: a Python "NoneType object" cannot be expressed in JSON
# and left the model without a valid way to report absent data.
completion = client.chat.completions.create(
  model="gpt-3.5-turbo",
  response_format={ "type": "json_object" },
  temperature=0,  # extraction, not creative writing: keep run-to-run variation as low as possible
  messages=[
    {"role": "system", "content": "You are a highly specialized text extraction service that will read semi-structured content out of websites and return the specific items you're asked to retrieve"},
    {"role": "user", "content": f"""Given the text from the website under the dashed line, return a single JSON object built as follows:
     extract the ordered list of steps for the recipe (return as a list of strings with json key "steps"),
     extract the ingredients for the entire recipe (return as a list of strings with json key "ingredients"; if the recipe is broken up into several parts, this list should be a superset of all ingredients needed),
     extract the total time to prepare the dish from prep to cook when present (return as integer of minutes with json key "time")

     If any of the items above is not present on the website, keep its json key and set its value to null

     -------------- Website content starts below this line -----------

     {sample_site_content}
    """}
  ]
)

In [57]:
# the reply arrives as a JSON string in the first choice's message; decode it into a dict
completion_reply_text = completion.choices[0].message.content
json.loads(completion_reply_text)

{'steps': ['Combine all ingredients.',
  'Form into patties.',
  'Cook as desired.'],
 'ingredients': ['1 lb ground sirloin',
  '1 onion, minced',
  '1 tablespoon ketchup',
  '1 tablespoon sun-dried tomato, cut thinly',
  '1/4 cup garlic, chopped',
  '1/2 teaspoon Worcestershire sauce',
  '1/2 teaspoon Tabasco sauce',
  '1/2 teaspoon honey mustard'],
 'time': 13}

In [64]:
# Apply the same whitespace-flattening preprocessing to the Yoast-based site

yoast_site_content = " ".join(soup.get_text().split())

In [68]:
# Same extraction on the blog-style page, which mixes the recipe with unrelated prose;
# the system prompt therefore tells the model to ignore text not related to the target data.
# response_format=json_object forces the reply to be a single JSON object, so a missing
# field is requested as JSON null: a Python "NoneType object" cannot be expressed in JSON
# and left the model without a valid way to report absent data.
harder_completion = client.chat.completions.create(
  model="gpt-3.5-turbo",
  response_format={ "type": "json_object" },
  temperature=0,  # extraction, not creative writing: keep run-to-run variation as low as possible
  messages=[
    {"role": "system", "content": "You are a highly specialized text extraction service that will read semi-structured content out of websites and return the specific items you're asked to retrieve; it is possible you will be given websites with prose unrelated to the recipe, but it is imperative you ignore prose not clearly related to the target data."},
    {"role": "user", "content": f"""Given the text from the website under the dashed line, return a single JSON object built as follows:
     extract the recipe name (return as a string with json key "recipe_name"),
     extract the ordered list of steps for the recipe (return as a list of strings with json key "recipe_steps"),
     extract the ingredients for the entire recipe (return as a list of strings with json key "recipe_ingredients"; if the recipe is broken up into several parts, this list should be a superset of all ingredients needed),
     extract the total time to prepare the dish from prep to cook when present (return as integer of minutes with json key "recipe_total_minutes")

     If any of the items above is not present on the website, keep its json key and set its value to null

     -------------- Website content starts below this line -----------

     {yoast_site_content}
    """}
  ]
)

In [69]:
# the reply arrives as a JSON string in the first choice's message; decode it into a dict
harder_reply_text = harder_completion.choices[0].message.content
json.loads(harder_reply_text)

{'recipe_name': 'Red Wine Braised Beef',
 'recipe_steps': ['Cube your beef. You want the pieces to be at least 3-4 inch pieces. Too small and the beef will fall apart into the sauce.',
  'Season everything. Pat your beef really dry (helps with browning). These are thick pieces of beef so season all sides really well with salt and pepper.',
  'Sear the beef. This is a crucial step! Sear the beef really well on all sides until deeply golden brown.',
  'Cook your veggies. After your beef is nice and browned, remove it from the pan and add in your onions, leeks, and garlic.',
  'Add your wine and stock. Once the veggies are softened, add the wine and beef stock and return the beef to the pan. Nestle the carrots in around the beef with the thyme sprigs and bay leaves.',
  'Bring everything to a simmer, cover, and transfer to the oven.',
  'Braise for about 3 hours, until the beef is fork-tender.',
  'Remove from the oven and skim the layer of fat off the top. Cover and let the beef rest in 