In [1]:
import pandas as pd
import requests
!pip3 install bs4
from bs4 import BeautifulSoup



## In Class Assignment 1

**Goal:** Formalize a pipeline to scrape this site

https://www.allrecipes.com/search/results/?search=cheese
    
1. Write `extract_recipes(s_query)` which:
    * takes the search phrase (e.g. 'cheese') as input argument
    * builds the correct url that leads directly to the page that lists the recipes
    * uses `requests` to get the content of this page (i.e. the html text of the page)
    * stores that html text as a string (an intermediate value; the function itself returns the dataframe described below)
    * builds a BeautifulSoup object out of that text 
    * finds names of all recipes
        - to identify which tags / classes to `find_all()`, open the page in your browser and "inspect" 
        - start from the recipe object above, and call another `find_all()` to zoom into the recipe name itself
    * returns a dataframe with a single column "name"
        * the names of the recipes might be a bit mangled, containing "Save" and "1,243 Ratings" for now; that's ok
    * we'll want to add more features to this dataframe later, building it up as a list of dictionaries (one per row) allows us to extend to other features easily:
    
```python
row_list = list()
for recipe in recipe_list:
    # build a dictionary representing this recipe (row)
    d = {'name': name}
    row_list.append(d)
 
df = pd.DataFrame(row_list)
```

In [2]:
def extract_recipes(query):
    """ builds dataframe of recipe names from allrecipes search results
    
    Args:
        query (str): search phrase (e.g. 'cheese').  it is handed to
            `requests` as a query parameter, so spaces and special
            characters are url-encoded
    
    Returns:
        df_recipe (pd.DataFrame): each row is a recipe, with a single
            column 'name' (the column exists even if nothing is found)
    
    Raises:
        requests.HTTPError: the search page answered with an error status
    """
    # let requests url-encode the query instead of pasting it into the url
    response = requests.get('https://www.allrecipes.com/search',
                            params={'q': query},
                            timeout=30)
    response.raise_for_status()
    
    # name the parser explicitly so the result does not depend on which
    # optional parsers (lxml, html5lib) happen to be installed
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # keep only the cards that contain a star (rating) icon; cards without
    # one are treated as not being recipes
    recipe_list = [tag
                   for tag in soup.find_all('a', class_='mntl-card-list-items')
                   if tag.find('svg', class_='icon-star') is not None]
    
    # one dictionary per row, so more features can be added later
    row_list = [{'name': recipe.text.replace('\n', ' ')}
                for recipe in recipe_list]
    
    return pd.DataFrame(row_list, columns=['name'])

In [3]:
# demo: list the recipe names found for the search phrase 'cheese'
extract_recipes('cheese')

Unnamed: 0,name
0,Save Southern Pimento Chee...
1,Save Basic Cream Cheese Fr...
2,Save Grilled Cheese Sandwi...
3,Save Homemade Mac and Chee...
4,Save Best Cheese Ball ...
5,Save Simple Macaroni and C...
6,Save Baked Mac and Cheese ...
7,Save Absolutely the BEST R...
8,Save Baked Ham and Cheese ...
9,Save Jalapeño Popper Grill...


## In Class Assignment 2 - Getting Nutritional Information
Write an `extract_nutrition()` function, which accepts a url of a particular recipe (see ex directly above) and returns a dictionary of nutritional information:

```python
url = 'https://www.allrecipes.com/recipe/189930/southern-pimento-cheese/'
extract_nutrition(url)

```

yields:

```python
{'Calories': '208',
 'Fat': '20g',
 'Carbs': '2g',
 'Protein': '6g'}

```

Once complete, incorporate `extract_nutrition()` into `extract_recipes()` todo


In [4]:
def extract_nutrition(url):
    """ returns a dictionary of nutrition info for one recipe page
    
    Args:
        url (str): url of a single recipe page
    
    Returns:
        nutr_dict (dict): 
            keys are nutrient names (e.g. 'Calories')
            vals are str of quantity (e.g. '208' or '20g')
            empty if the page has no nutrition summary table
    
    Raises:
        requests.HTTPError: the recipe page answered with an error status
    """
    # build soup object from the recipe page
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    
    # name the parser explicitly so the result does not depend on which
    # optional parsers (lxml, html5lib) happen to be installed
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # extract nutrition info; a page without the summary table yields an
    # empty dict instead of an IndexError
    table = soup.find(class_='mntl-nutrition-facts-summary__table-body')
    if table is None:
        return dict()
    
    # the table text is split on whitespace and read as alternating
    # "<quantity> <name>" tokens (assumes single-word nutrient names)
    nutr_list = table.text.split()
    
    # make dictionary: odd positions are names, even positions are quantities
    nutr_dict = dict(zip(nutr_list[1::2], nutr_list[0::2]))
    
    return nutr_dict
    

In [5]:
# demo: look up the nutrition info of a single example recipe page
url = 'https://www.allrecipes.com/recipe/189930/southern-pimento-cheese/'
extract_nutrition(url)

{'Calories': '208', 'Fat': '20g', 'Carbs': '2g', 'Protein': '6g'}

In [6]:
# tqdm is a progress bar, not necessary, but fun to see once
# (scraping often takes a moment, nice to get some updates)
!pip3 install tqdm



In [7]:
from tqdm import tqdm 

def extract_recipes(s_query):
    """ builds dataframe of recipes (name, url, nutrition) from allrecipes
    
    Args:
        s_query (str): input query (i.e. "cheese").  it is handed to
            `requests` as a query parameter, so spaces and special
            characters are url-encoded
        
    Returns:
        df_recipe (pd.DataFrame): each row is a recipe.  columns are
            whatever `extract_nutrition()` returns for that recipe, plus
            'name' and 'url'
    
    Raises:
        requests.HTTPError: the search page answered with an error status
    """
    # let requests url-encode the query instead of pasting it into the url
    response = requests.get('https://www.allrecipes.com/search',
                            params={'q': s_query},
                            timeout=30)
    response.raise_for_status()
    
    # name the parser explicitly so the result does not depend on which
    # optional parsers (lxml, html5lib) happen to be installed
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # keep only the cards that contain a star (rating) icon; cards without
    # one are treated as not being recipes
    recipe_list = [tag
                   for tag in soup.find_all('a', class_='mntl-card-list-items')
                   if tag.find('svg', class_='icon-star') is not None]
            
    # extract features to build dataframe
    row_list = list()
    for recipe in tqdm(recipe_list, desc='querying nutrition per recipe'):
        # a card without a link cannot be looked up, skip it
        recipe_url = recipe.attrs.get('href')
        if recipe_url is None:
            continue
        
        # collapse runs of whitespace / newlines, then drop only the leading
        # "Save" label (a blanket replace would also hit names containing it)
        name = ' '.join(recipe.text.split())
        if name.startswith('Save '):
            name = name[len('Save '):]
        
        # lookup nutrition info (one extra request per recipe)
        d = extract_nutrition(recipe_url)
        d['name'] = name
        d['url'] = recipe_url
        
        row_list.append(d)
        
    return pd.DataFrame(row_list)
    

In [8]:
# demo: one request for the search page, then one more per recipe for its nutrition info
extract_recipes('cheese')

querying nutrition per recipe: 100%|████████████| 14/14 [00:06<00:00,  2.03it/s]


Unnamed: 0,Calories,Fat,Carbs,Protein,name,url
0,208,20g,2g,6g,Southern Pimento Cheese ...,https://www.allrecipes.com/recipe/189930/south...
1,292,14g,40g,2g,Basic Cream Cheese Frost...,https://www.allrecipes.com/recipe/8379/basic-c...
2,400,28g,26g,11g,Grilled Cheese Sandwich ...,https://www.allrecipes.com/recipe/23891/grille...
3,845,48g,65g,37g,Homemade Mac and Cheese ...,https://www.allrecipes.com/recipe/11679/homema...
4,413,39g,4g,15g,Best Cheese Ball ...,https://www.allrecipes.com/recipe/16600/herman...
5,630,34g,55g,27g,Simple Macaroni and Chee...,https://www.allrecipes.com/recipe/238691/simpl...
6,415,23g,30g,22g,Baked Mac and Cheese wit...,https://www.allrecipes.com/recipe/229815/baked...
7,93,9g,1g,3g,Absolutely the BEST Rich...,https://www.allrecipes.com/recipe/58745/absolu...
8,208,14g,11g,10g,Baked Ham and Cheese Sli...,https://www.allrecipes.com/recipe/216756/baked...
9,528,34g,41g,17g,Jalapeño Popper Grilled ...,https://www.allrecipes.com/recipe/217267/jalap...
