This file scrapes all recipes on the NYT Cooking website and saves the data into a CSV file. It iterates through the search results returned for an empty search query, so no search input is needed. Logging in to the website is not required.

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import requests

# Lists to store data in; the four recipe_* detail lists are filled in parallel,
# one entry per recipe, and recipe_links holds the recipe page URLs to visit
recipe_name = []
recipe_author = []
recipe_rating = []
recipe_review_count = []
recipe_links = []

# Number of pages to parse through; this is under the current state of the site, where there are 421 pages of search results
pages_url = [str(i) for i in range(1, 422)]

# Anchors whose href starts with /recipes are collected as recipe links;
# compiled once here instead of on every page
recipe_href = re.compile('^/recipes')

# For every page in the search interval
for page in pages_url:
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl
        response = requests.get('https://cooking.nytimes.com/search?q=&page=' + page, timeout=30)
    except requests.RequestException as error:
        # Skip a page that cannot be downloaded rather than aborting the crawl
        print('Skipping search page ' + page + ': ' + str(error))
        continue
    html = response.text
    # Name the parser explicitly so results do not depend on which parsers are
    # installed, and so this matches how the recipe pages are parsed
    soup = BeautifulSoup(html, 'html.parser')

    for link in soup.find_all('a', attrs={'href': recipe_href}):
        recipe_links.append('https://cooking.nytimes.com' + link.get('href'))

# Remove duplicate recipe links; dict.fromkeys keeps the first occurrence of each link in order
recipe_links = list(dict.fromkeys(recipe_links))

# Iterates through each recipe link to pull recipe title, author, rating, and number of reviews.
# Each recipe appends exactly one entry to each of the four lists, so the lists stay aligned.

# Pattern for obtaining the recipe rating and number of reviews: captures the digits that
# follow "= " (a raw string avoids the invalid "\=" escape of the non-raw form)
pattern = re.compile(r'=\s(\d+)')
# Used to locate the page text that mentions the average rating
rating_marker = re.compile("bootstrap.recipe.avg_rating")

for recipe in recipe_links:
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl
        response_recipe = requests.get(recipe, timeout=30)
        html_recipe = response_recipe.text
    except requests.RequestException as error:
        # A failed download is parsed as an empty page, so the placeholder
        # values below are recorded and data already collected is not lost
        print('Could not fetch ' + recipe + ': ' + str(error))
        html_recipe = ''
    soup_recipe = BeautifulSoup(html_recipe, 'html.parser')

    # Adds title of recipe; if recipe title isn't available, recipe title is set as 'None'
    title_tag = soup_recipe.title
    title_text = title_tag.string if title_tag is not None else None
    if title_text is None:
        recipe_name.append('None')
    else:
        # str() detaches the text from the parse tree, so the stored title does
        # not keep the whole parsed page alive in memory
        recipe_name.append(str(title_text))

    # Adds author of recipe; if recipe author isn't available, recipe author is set as 'None'
    author_tag = soup_recipe.find('span', {'class': 'byline-name', 'itemprop': 'author'})
    recipe_author.append(author_tag.text if author_tag is not None else 'None')

    # Adds average rating and number of reviews of recipe; if they aren't available, they are set as '0'.
    # The first match is stored as the rating and the second as the review count;
    # when no matching text is found, str(None) yields no matches.
    value = pattern.findall(str(soup_recipe.find(string=rating_marker)))
    recipe_rating.append(value[0] if len(value) > 0 else '0')
    recipe_review_count.append(value[1] if len(value) > 1 else '0')

# Assembles the scraped lists into a table, one column per field, in this order
recipe_columns = [
    ('Recipe Name', recipe_name),
    ('Recipe Author', recipe_author),
    ('Recipe Rating', recipe_rating),
    ('Recipe Review Count', recipe_review_count),
]
recipe_information = pd.DataFrame(dict(recipe_columns))

# Saves the table as recipe_information.csv using to_csv's default settings
recipe_information.to_csv('recipe_information.csv')