In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import pickle as pkl
import time
import random
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [None]:
# Chrome is started with the AdBlock extension loaded: selenium does not run
# smoothly on the recipe pages without it.
options = Options()
options.add_extension('/Applications/AdBlock.crx')

In [None]:
# Shared notebook state, filled in by the scraping cells:
#   urls    -- recipe-page links collected from the search results
#   recipes -- one dict of scraped features per recipe page
urls, recipes = [], []

# number of search-result pages to walk through
num_pages = 50

In [None]:
def get_urls(page, timeout=30):
    
    """
    Scrapes all URLs of a search page on allrecipes.com
    
    Parameters:
        page (str): a valid allrecipes.com search page url
        timeout (float): seconds to wait for the server before giving up.
            Without a timeout requests.get can block forever on a stalled
            connection, which would freeze the whole crawl.
    Returns:
        urls (list): a list of urls from the results of the search page
    Raises:
        requests.exceptions.Timeout: if the server does not answer within
            `timeout` seconds
    """
    
    text = requests.get(page, timeout=timeout).text
    soup = BeautifulSoup(text, 'lxml')
    
    # Only every second match is kept; presumably each result card carries two
    # anchors with this class that point at the same recipe -- TODO confirm.
    urls = []
    for link in soup.find_all(class_='fixed-recipe-card__title-link')[::2]:
        try:
            urls.append(link['href'])
        except KeyError:
            # the matched element has no href attribute, so there is no url to keep
            print('No')
    
    return urls

In [None]:
def get_ingredients(url):
    
    """
    Scrapes the ingredients of a recipe page in metric measurements on allrecipes.com. 
    This method uses both Selenium and BeautifulSoup since there are interactive elements 
    on the page which convert the measurements to metric. 

    A fresh Chrome session is started for every call and is always shut down
    again, even when the page cannot be loaded or one of the controls is missing.
    
    Parameters:
        url (str): a valid allrecipes.com recipe page url
    Returns:
        ingredients (list): a list of ingredients for a single recipe on allrecipes.com
    Raises:
        Whatever Selenium raises when the page cannot be loaded or one of the
        three controls cannot be found or clicked; the browser is closed first.
    """
    
    # XPaths of the three controls clicked in order. Presumably: open the
    # servings panel, pick the second (metric) option, confirm -- TODO confirm.
    metric_1 = '//a[@class="servings-adust-trigger"]'
    metric_2 = '//li[@class="adjust-servings__select"]'
    metric_3 = '//a[@id="btn-adjust"]'
    
    driver = webdriver.Chrome(executable_path='/Applications/chromedriver', options=options)
    try:
        driver.get(url)
        driver.find_element_by_xpath(metric_1).click()
        time.sleep(.1)
        driver.find_elements_by_xpath(metric_2)[1].click()
        time.sleep(.1)
        driver.find_element_by_xpath(metric_3).click()
        # fixed pause before reading the page, presumably so the converted
        # measurements have been rendered
        time.sleep(1)
        soup = BeautifulSoup(driver.page_source, 'lxml')
    finally:
        # quit() ends the whole session (all windows plus the chromedriver
        # process); close() would only close the current window and would be
        # skipped entirely if any step above raised.
        driver.quit()
    
    ingredients = []
    for line in soup.find_all('li', class_='checkList__line ng-scope'):
        # the ingredient text sits in the first child element; list items
        # without any child element are skipped
        child = line.findChild()
        if child is not None:
            ingredients.append(child.get_text().strip())
    
    return ingredients

In [None]:
def get_cookie_page(url, timeout=30):
    
    """
    Scrapes all features of a recipe page on allrecipes.com

    The static HTML is fetched with requests; the ingredient list comes from
    get_ingredients(url), which opens the same page in a Selenium-driven Chrome.
    
    Parameters:
        url (str): a valid allrecipes.com recipe page url
        timeout (float): seconds to wait for the server before giving up.
            Without a timeout requests.get can block forever on a stalled
            connection.
    Returns:
        (dict): name (str): name of the recipe
                rating (str): recipe rating
                num_ratings (str): number of ratings
                num_reviews (str): number of reviews
                made_it (str): number of people who made the recipe
                prep_time (str): preparation time
                servings (str): number of servings
                calories (str): number of calories
                num_photos (str): number of photos for the recipe
                oven_temp (str): first text on the page containing 'degrees'
                ingredients (list): list of ingredients

        Every field except name and ingredients falls back to a missing value
        when its element is not on the page: np.nan, or None for oven_temp.
    Raises:
        AttributeError: if the page has no recipe title element
        requests.exceptions.Timeout: if the server does not answer within
            `timeout` seconds
        Anything raised by get_ingredients(url) propagates unchanged.
    """
    
    text = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(text, 'lxml')
    
    def _or_nan(extract):
        # Run one extraction and return NaN when the element or attribute is
        # absent. Only the lookup errors a missing node produces are caught,
        # so KeyboardInterrupt and genuine bugs are no longer swallowed.
        try:
            return extract()
        except (AttributeError, KeyError, TypeError):
            return np.nan
    
    # Left unguarded on purpose: a page without a recipe title fails loudly
    # instead of yielding a row of missing values.
    name = soup.find(class_='recipe-summary__h1').get_text()
    
    rating = _or_nan(lambda: soup.find(class_='rating-stars')['data-ratingstars'])
    num_photos = _or_nan(lambda: soup.find(class_='picture-count-link').get_text())
    num_ratings = _or_nan(lambda: soup.find('h4', class_='helpful-header').get_text())
    made_it = _or_nan(lambda: soup.find(class_='made-it-count').next_element.get_text())
    prep_time = _or_nan(lambda: soup.find(class_='ready-in-time').get_text())
    servings = _or_nan(lambda: soup.find(id='metaRecipeServings')['content'])
    calories = _or_nan(lambda: soup.find(class_="nutrition-trigger").findChild()['aria-label'])
    num_reviews = _or_nan(lambda: soup.find(class_='recipe-reviews__header--count').get_text())
    
    # soup.find(text=...) returns a NavigableString, which keeps a reference to
    # the entire parse tree; convert it to a plain str so the stored record
    # (and the pickled DataFrame) does not drag the whole page along.
    degrees_text = soup.find(text=re.compile('degrees'))
    oven_temp = None if degrees_text is None else str(degrees_text)
    
    ingredients = get_ingredients(url)

    print(f'{name} scraped')
    return {'name': name,
            'rating': rating,
            'num_ratings': num_ratings,
            'num_reviews': num_reviews,
            'made_it': made_it,
            'prep_time': prep_time,
            'servings': servings,
            'calories': calories,
            'num_photos': num_photos,
            'oven_temp': oven_temp,
            'ingredients': ingredients}

In [None]:
# Walk search-result pages 1..num_pages and gather every recipe link found,
# pausing between 1 and 2 seconds after each request.
for page_number in range(1, num_pages + 1):
    page = f'https://www.allrecipes.com/search/results/?wt=chocolate%20chip%20cookies&sort=re&page={page_number}'
    urls += get_urls(page)
    time.sleep(random.random() + 1)

In [None]:
# Drop every search hit whose url does not contain 'cookie', so only the
# recipes which are actually cookies remain.
cookie_urls = [url for url in urls if 'cookie' in url]

In [None]:
# Scrape each cookie recipe page in turn, pausing between 1 and 2 seconds
# after every page.
for url in cookie_urls:
    recipe = get_cookie_page(url)
    recipes.append(recipe)
    time.sleep(random.random() + 1)

In [None]:
# one row per scraped recipe, one column per key of the recipe dicts
df = pd.DataFrame(data=recipes)

In [None]:
# preview the first five rows
df.head(5)

In [None]:
# Persist the scraped table as a pickle in the current working directory.
df.to_pickle('test_df.pkl')