In [1]:
# Import libaries
import pandas as pd
import requests

from bs4 import BeautifulSoup

### Step 1: Create a soup object from the home page

In [2]:
soup = BeautifulSoup()

### Step 2: Scrape the home page soup for every restaurant

Note: Your best bet is to create a list of dictionaries, one for each restaurant. Each dictionary contains the restaurant's name and path from the `href`. The result of your scrape should look something like this:

```python
restaurants = [
    {'name': 'A&W Restaurants', 'href': 'restaurants/1.html'}, 
    {'name': "Applebee's", 'href': 'restaurants/2.html'},
    ...
]
```

In [3]:
url = 'https://pages.git.generalassemb.ly/rldaggie/for-scraping/'

In [4]:
rq = requests.get(url, timeout=5)

In [5]:
rq

<Response [200]>

In [6]:
# Scrape into memory, then convert to soup object for easier processing

In [7]:
soup = BeautifulSoup(rq.content, 'lxml')

In [8]:
soup.find('td')

<td>
<a href="restaurants/1.html">A&amp;W Restaurants</a> </td>

In [9]:
# https://stackoverflow.com/questions/43814754/python-beautifulsoup-how-to-get-href-attribute-of-a-element/43814994
[a['href'] for a in soup.find_all('a', href=True)]

['/',
 'restaurants/1.html',
 'restaurants/2.html',
 'restaurants/3.html',
 'restaurants/4.html',
 'restaurants/5.html',
 'restaurants/6.html',
 'restaurants/7.html',
 'restaurants/8.html',
 'restaurants/9.html',
 'restaurants/10.html',
 'restaurants/11.html',
 'restaurants/12.html',
 'restaurants/13.html',
 'restaurants/14.html',
 'restaurants/15.html',
 'restaurants/16.html',
 'restaurants/17.html',
 'restaurants/18.html',
 'restaurants/19.html',
 'restaurants/20.html',
 'restaurants/21.html',
 'restaurants/22.html',
 'restaurants/23.html',
 'restaurants/24.html',
 'restaurants/25.html',
 'restaurants/26.html',
 'restaurants/27.html',
 'restaurants/28.html',
 'restaurants/29.html',
 'restaurants/30.html',
 'restaurants/31.html',
 'restaurants/32.html',
 'restaurants/33.html',
 'restaurants/34.html',
 'restaurants/35.html',
 'restaurants/36.html',
 'restaurants/37.html',
 'restaurants/38.html',
 'restaurants/39.html',
 'restaurants/40.html',
 'restaurants/41.html',
 'restaurants/42.ht

In [10]:
{'name': i.text for i in soup.find_all('td')}

{'name': "\nWendy's "}

In [11]:
restaurants = [{'name': i.text.replace('\n',''), 'href': i.find('a').get('href')} for i in (soup.find_all('td'))]

In [12]:
restaurants[1]

{'name': "Applebee's ", 'href': 'restaurants/2.html'}

### Step 3: Using the `href`, scrape each restaurant's page and create a single list of food dictionaries.

Your list of foods should look something like this:
```python
foods = [
    {
        'calories': '0',
        'carbs': '0',
        'category': 'Drinks',
        'fat': '0',
        'name': 'A&W® Diet Root Beer',
        'restaurant': 'A&W Restaurants'
    },
    {
        'calories': '0',
        'carbs': '0',
        'category': 'Drinks',
        'fat': '0',
        'name': 'A&W® Diet Root Beer',
        'restaurant': 'A&W Restaurants'
    },
    ...
]
```

**Note**: Remove extra white space from each category

In [13]:
foods_html = [BeautifulSoup(requests.get(i, timeout=5).content, 'lxml') for i in [url+a['href'] for a in soup.find_all('a', href=True)]]

In [14]:
foods_html[1]

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Nutrition Information</title>
<link crossorigin="anonymous" href="https://stackpath.bootstrapcdn.com/bootstrap/4.4.1/css/bootstrap.min.css" integrity="sha384-Vkoo8x4CGsO3+Hhxv8T/Q5PaXtkKtu6ug5TOeNV6gBiFeWPGFN9MuhOf23Q9Ifjh" rel="stylesheet"/>
</head>
<body>
<header>
<section class="container">
<nav class="navbar navbar-expand-lg navbar-light bg-light" role="navigation">
<a class="navbar-brand" href="/">Nutrition Information</a> </nav>
</section>
</header>
<main class="container" role="main">
<br/>
<div class="alert alert-danger">
        NOTE: This data is super old and rife with errors. It's meant for scraping practice only.
      </div>
<h3 class="display-3">
A&amp;W Restaurants</h3>
<p class="lead">Data Source: http://www.awrestaurants.com/</p>
<table class="table">
<thead>
<tr>
<th>Name<

In [15]:
foods_html[1].find('h3').text.replace('\n','')

'A&W Restaurants'

In [16]:
[i.text for i in foods_html[1].find_all('th')]

['Name', 'Category', 'Calories', 'Fat', 'Carbs']

In [17]:
len([i.text for i in foods_html[1].find_all('td')])/5

122.0

In [18]:
foods_1 = list(zip([i.text for i in foods_html[1].find_all('th')]*122, [i.text for i in foods_html[1].find_all('td')]))

In [19]:
foods_1

[('Name', 'Original Bacon Double Cheeseburger'),
 ('Category', 'Burgers'),
 ('Calories', '760'),
 ('Fat', '45'),
 ('Carbs', '45'),
 ('Name', 'Coney (Chili) Dog'),
 ('Category', 'Entrees'),
 ('Calories', '340'),
 ('Fat', '20'),
 ('Carbs', '26'),
 ('Name', 'Chili Fries'),
 ('Category', 'French Fries'),
 ('Calories', '370'),
 ('Fat', '15'),
 ('Carbs', '49'),
 ('Name', 'Strawberry Milkshake (small)'),
 ('Category', 'Shakes'),
 ('Calories', '670'),
 ('Fat', '29'),
 ('Carbs', '90'),
 ('Name', 'A&W® Root Beer Freeze (large)'),
 ('Category', 'Shakes'),
 ('Calories', '820'),
 ('Fat', '18'),
 ('Carbs', '150'),
 ('Name', 'Caramel Sundae'),
 ('Category', 'Desserts'),
 ('Calories', '340'),
 ('Fat', '9'),
 ('Carbs', '57'),
 ('Name', 'Strawberry Banana Smoothee'),
 ('Category', 'Shakes'),
 ('Calories', '420'),
 ('Fat', '6'),
 ('Carbs', '86'),
 ('Name', 'Chocolate Fudge Blendrrr'),
 ('Category', 'Desserts'),
 ('Calories', '1010'),
 ('Fat', '59'),
 ('Carbs', '152'),
 ('Name', 'Strawberry Limeade'),
 ('

In [20]:
test_2 = [dict(foods_1[i:i+6]) for i in range(1,122,5)]

In [21]:
test_2

[{'Category': 'Entrees',
  'Calories': '760',
  'Fat': '45',
  'Carbs': '45',
  'Name': 'Coney (Chili) Dog'},
 {'Category': 'French Fries',
  'Calories': '340',
  'Fat': '20',
  'Carbs': '26',
  'Name': 'Chili Fries'},
 {'Category': 'Shakes',
  'Calories': '370',
  'Fat': '15',
  'Carbs': '49',
  'Name': 'Strawberry Milkshake (small)'},
 {'Category': 'Shakes',
  'Calories': '670',
  'Fat': '29',
  'Carbs': '90',
  'Name': 'A&W® Root Beer Freeze (large)'},
 {'Category': 'Desserts',
  'Calories': '820',
  'Fat': '18',
  'Carbs': '150',
  'Name': 'Caramel Sundae'},
 {'Category': 'Shakes',
  'Calories': '340',
  'Fat': '9',
  'Carbs': '57',
  'Name': 'Strawberry Banana Smoothee'},
 {'Category': 'Desserts',
  'Calories': '420',
  'Fat': '6',
  'Carbs': '86',
  'Name': 'Chocolate Fudge Blendrrr'},
 {'Category': 'Drinks',
  'Calories': '1010',
  'Fat': '59',
  'Carbs': '152',
  'Name': 'Strawberry Limeade'},
 {'Category': 'Drinks',
  'Calories': '420',
  'Fat': '0',
  'Carbs': '105',
  'Name'

In [22]:
# https://thispointer.com/python-how-to-add-append-key-value-pairs-in-dictionary-using-dict-update/
[i.update([('restaurant',str(foods_html[1].find('h3').text.replace('\n','')))]) for i in test_2];

In [23]:
test_2

[{'Category': 'Entrees',
  'Calories': '760',
  'Fat': '45',
  'Carbs': '45',
  'Name': 'Coney (Chili) Dog',
  'restaurant': 'A&W Restaurants'},
 {'Category': 'French Fries',
  'Calories': '340',
  'Fat': '20',
  'Carbs': '26',
  'Name': 'Chili Fries',
  'restaurant': 'A&W Restaurants'},
 {'Category': 'Shakes',
  'Calories': '370',
  'Fat': '15',
  'Carbs': '49',
  'Name': 'Strawberry Milkshake (small)',
  'restaurant': 'A&W Restaurants'},
 {'Category': 'Shakes',
  'Calories': '670',
  'Fat': '29',
  'Carbs': '90',
  'Name': 'A&W® Root Beer Freeze (large)',
  'restaurant': 'A&W Restaurants'},
 {'Category': 'Desserts',
  'Calories': '820',
  'Fat': '18',
  'Carbs': '150',
  'Name': 'Caramel Sundae',
  'restaurant': 'A&W Restaurants'},
 {'Category': 'Shakes',
  'Calories': '340',
  'Fat': '9',
  'Carbs': '57',
  'Name': 'Strawberry Banana Smoothee',
  'restaurant': 'A&W Restaurants'},
 {'Category': 'Desserts',
  'Calories': '420',
  'Fat': '6',
  'Carbs': '86',
  'Name': 'Chocolate Fudge

In [24]:
# Okay, let's put it all together

In [29]:
def food_dict(url):
    '''
    This is a scraper built for this website: https://pages.git.generalassemb.ly/rldaggie/for-scraping/
    '''
    # set up output object
    food_output = []
    
    # get list of html soup objects
    foods_html = [BeautifulSoup(requests.get(i, timeout=5).content, 'lxml') for i in [url+a['href'] for a in soup.find_all('a', href=True)]]
    
    # for each item in list of html soup objects
    for j in range(len(foods_html)):
            # dict name multiplier dependent on length of each document
            multiplier = (len([i.text for i in foods_html[j].find_all('td')]) + 1 )/ 5
            multiplier = int(multiplier)
            
            # zip together a repeated list of names with an assosciated value
            foods = list(zip([i.text for i in foods_html[j].find_all('th')] * multiplier, [i.text for i in foods_html[j].find_all('td')]))
            
            #divide into blocks of 5 to match pre defined data range for calories, fat, carbs, categeory, name
            foods = [dict(foods[i:i+5]) for i in range(1,len(foods),5)]
            
            if j < 1:
                pass
            else:
                # run dictionary update to update each entry with a resturant and value from html soup object k
                [i.update([('restaurant',str(foods_html[j].find('h3').text.replace('\n','')))]) for i in foods];
                
            food_output.append(foods)
    
    # ignore the first entry because they don't have important information
    return food_output[1:]


### Step 4: Create a pandas DataFrame from your list of foods

**Note**: Your DataFrame should have 5,131 rows

In [30]:
url = 'https://pages.git.generalassemb.ly/rldaggie/for-scraping/'
foods = food_dict(url)

In [31]:
# Now we have a list of dicts, let's make it into a list of dataframes and then concat them together

In [32]:
df_foods = pd.concat([pd.DataFrame(i) for i in foods])

In [33]:
df_foods = df_foods.reset_index(drop=True)

In [34]:
df_foods

Unnamed: 0,Category,Calories,Fat,Carbs,Name,restaurant
0,Burgers,760,45,45,Coney (Chili) Dog,A&W Restaurants
1,Entrees,340,20,26,Chili Fries,A&W Restaurants
2,French Fries,370,15,49,Strawberry Milkshake (small),A&W Restaurants
3,Shakes,670,29,90,A&W® Root Beer Freeze (large),A&W Restaurants
4,Shakes,820,18,150,Caramel Sundae,A&W Restaurants
...,...,...,...,...,...,...
5126,Shakes,200,5,32,Grilled Chicken Go Wrap,Wendy's
5127,Wraps,260,10,25,Asiago Ranch Chicken Club,Wendy's
5128,Sandwiches,670,32,57,Spicy Chicken Go Wrap,Wendy's
5129,Wraps,330,16,30,Large Strawberry Frosty™ Shake,Wendy's


In [199]:
df_foods = df_foods[['restaurant', 'Category', 'Name', 'Calories', 'Carbs', 'Fat']]

### Step 5: Export to csv

**Note:** Don't export the index column from your DataFrame

In [201]:
#df_foods.to_csv("./df_foods.csv", index=False)

### Step 6 Do the same thing as above, but use `pd.read_html()` to scrape the table from each page instead of BS4.

In [170]:
# I don't know how to get the href tags, and it seems that i still need to use bs4, so, i will use bs4 only to get href tags
# https://stackoverflow.com/questions/56757261/extract-href-using-pandas-read-html

In [None]:
# Get list of HREF urls

In [171]:
href = pd.DataFrame([url+i['href'] for i in soup.find_all('a', href=True)]).drop(0).reset_index(drop=True)

In [172]:
href.columns = ['links']

In [173]:
df_html = pd.read_html(url)[0].drop('Unnamed: 1', axis=1)

In [174]:
df_html = df_html.join(href)

In [None]:
# Combine into a dataframe

In [175]:
df_html

Unnamed: 0,Name,links
0,A&W Restaurants,https://pages.git.generalassemb.ly/rldaggie/fo...
1,Applebee's,https://pages.git.generalassemb.ly/rldaggie/fo...
2,Arby's,https://pages.git.generalassemb.ly/rldaggie/fo...
3,Atlanta Bread Company,https://pages.git.generalassemb.ly/rldaggie/fo...
4,Bojangle's Famous Chicken 'n Biscuits,https://pages.git.generalassemb.ly/rldaggie/fo...
5,Buffalo Wild Wings,https://pages.git.generalassemb.ly/rldaggie/fo...
6,Burger King,https://pages.git.generalassemb.ly/rldaggie/fo...
7,Captain D's,https://pages.git.generalassemb.ly/rldaggie/fo...
8,Carl's Jr.,https://pages.git.generalassemb.ly/rldaggie/fo...
9,Charley's Grilled Subs,https://pages.git.generalassemb.ly/rldaggie/fo...


In [None]:
# Testing

In [176]:
df_html.iloc[0]['Name']

'A&W Restaurants'

In [177]:
list_df_foods = [pd.read_html(i) for i in df_html['links']]

In [178]:
list_df_foods[0][0]

Unnamed: 0,Name,Category,Calories,Fat,Carbs
0,Original Bacon Double Cheeseburger,Burgers,760,45,45
1,Coney (Chili) Dog,Entrees,340,20,26
2,Chili Fries,French Fries,370,15,49
3,Strawberry Milkshake (small),Shakes,670,29,90
4,A&W® Root Beer Freeze (large),Shakes,820,18,150
...,...,...,...,...,...
117,Chocolate Fudge Blendrrr,Desserts,490,30,68
118,Strawberry Limeade,Drinks,230,0,58
119,Cherry Slushee,Drinks,570,0,141
120,Lime Slushee,Drinks,280,0,69


In [191]:
[j[0].insert(1, 'restaurant', df_html.iloc[i]['Name']) for i,j in enumerate(list_df_foods)]

ValueError: cannot insert restaurant, already exists

In [None]:
# added column to dataframe
# https://www.geeksforgeeks.org/adding-new-column-to-existing-dataframe-in-pandas/

In [193]:
list_df_foods

[[                                   Name       restaurant      Category  \
  0    Original Bacon Double Cheeseburger  A&W Restaurants       Burgers   
  1                     Coney (Chili) Dog  A&W Restaurants       Entrees   
  2                           Chili Fries  A&W Restaurants  French Fries   
  3          Strawberry Milkshake (small)  A&W Restaurants        Shakes   
  4         A&W® Root Beer Freeze (large)  A&W Restaurants        Shakes   
  ..                                  ...              ...           ...   
  117            Chocolate Fudge Blendrrr  A&W Restaurants      Desserts   
  118                  Strawberry Limeade  A&W Restaurants        Drinks   
  119                      Cherry Slushee  A&W Restaurants        Drinks   
  120                        Lime Slushee  A&W Restaurants        Drinks   
  121                      A&W® Root Beer  A&W Restaurants        Drinks   
  
       Calories  Fat  Carbs  
  0         760   45     45  
  1         340   20     

In [None]:
# Combine list of dataframes into dataframe

In [195]:
df_foods_2 = pd.concat([[i][0][0] for i in list_df_foods])

In [203]:
df_foods_2 = df_foods_2[['restaurant', 'Category', 'Name', 'Calories', 'Carbs', 'Fat']]

In [204]:
df_foods_2

Unnamed: 0,restaurant,Category,Name,Calories,Carbs,Fat
0,A&W Restaurants,Burgers,Original Bacon Double Cheeseburger,760,45,45
1,A&W Restaurants,Entrees,Coney (Chili) Dog,340,26,20
2,A&W Restaurants,French Fries,Chili Fries,370,49,15
3,A&W Restaurants,Shakes,Strawberry Milkshake (small),670,90,29
4,A&W Restaurants,Shakes,A&W® Root Beer Freeze (large),820,150,18
...,...,...,...,...,...,...
59,Wendy's,Shakes,Jr. Original Chocolate Frosty™,200,32,5
60,Wendy's,Wraps,Grilled Chicken Go Wrap,260,25,10
61,Wendy's,Sandwiches,Asiago Ranch Chicken Club,670,57,32
62,Wendy's,Wraps,Spicy Chicken Go Wrap,330,30,16
