In [1]:
# Import libraries
import time
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Target webpage: the home page whose table links to every restaurant page
url = 'https://pages.git.generalassemb.ly/rldaggie/for-scraping/'

In [3]:
# Establishing the connection to the web page.
# The timeout keeps this cell from hanging forever on an unresponsive server,
# and raise_for_status() stops the notebook on a 4xx/5xx response instead of
# letting later cells parse an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [4]:
# Show the HTTP status code of the home-page request (200 = OK)
print(response.status_code)

200


In [5]:
# Pull the response body out of the requests Response object as a Python str
# (response.text is the body already decoded to text).
html = response.text

### Step 1: Create a soup object from the home page

In [6]:
# Parse the home-page HTML with the lxml parser into a searchable soup object
soup = BeautifulSoup(html, 'lxml')

### Step 2: Scrape the home page soup for every restaurant

Note: Your best bet is to create a list of dictionaries, one for each restaurant. Each dictionary contains the restaurant's name and path from the `href`. The result of your scrape should look something like this:

```python
restaurants = [
    {'name': 'A&W Restaurants', 'href': 'restaurants/1.html'}, 
    {'name': "Applebee's", 'href': 'restaurants/2.html'},
    ...
]
```

In [7]:
# This code collects the restaurant names and urls from homepage

# List to store results: one {'name': ..., 'href': ...} dict per restaurant
restaurants = []

# Get all the <td>... elements
all_td = soup.find_all('td')
for element in all_td:
    # Take the first link in this cell that actually carries an href.
    # Cells without such a link are not restaurant entries and are skipped,
    # so only "full" rows (name + href) are ever stored -- and an <a> tag
    # without an href can no longer raise a KeyError.
    a_href = element.find('a', href=True)
    if a_href is None:
        continue

    restaurants.append({
        'name': a_href.text,     # element text
        'href': a_href['href'],  # href link, as written in the page
    })

# Preview the first few entries
restaurants[:3]

[{'name': 'A&W Restaurants', 'href': 'restaurants/1.html'},
 {'name': "Applebee's", 'href': 'restaurants/2.html'},
 {'name': "Arby's", 'href': 'restaurants/3.html'}]

### Step 3: Using the `href`, scrape each restaurant's page and create a single list of food dictionaries.

Your list of foods should look something like this:
```python
foods = [
    {
        'calories': '0',
        'carbs': '0',
        'category': 'Drinks',
        'fat': '0',
        'name': 'A&W® Diet Root Beer',
        'restaurant': 'A&W Restaurants'
    },
    {
        'calories': '0',
        'carbs': '0',
        'category': 'Drinks',
        'fat': '0',
        'name': 'A&W® Diet Root Beer',
        'restaurant': 'A&W Restaurants'
    },
    ...
]
```

**Note**: Remove extra white space from each category

In [8]:
# get the list of full urls to scrape through
def restaurant_fulllinks(restaurant_list=None,
                         base_url='https://pages.git.generalassemb.ly/rldaggie/for-scraping/'):
    """Return a copy of the restaurant list with each href turned into a full URL.

    Parameters
    ----------
    restaurant_list : list of dict, optional
        Dicts with a 'name' and an 'href' key. When omitted, the notebook-level
        ``restaurants`` list is used, which is what the original zero-argument
        call did.
    base_url : str, optional
        URL the hrefs are resolved against. Defaults to the scraping home page.
        It should end with '/' so relative paths are appended beneath it.

    Returns
    -------
    list of dict
        New ``{'name': ..., 'href': <absolute url>}`` dicts, in input order.
        The input dicts are not modified.
    """
    if restaurant_list is None:
        restaurant_list = restaurants

    # urljoin appends relative paths such as 'restaurants/1.html' to base_url
    # and also copes with hrefs that are already absolute.
    return [
        {'name': item['name'], 'href': urljoin(base_url, item['href'])}
        for item in restaurant_list
    ]

In [9]:
# function for getting a list of each key
def each_food(st_range, all_td, n_cols=5):
    """Return the text of every ``n_cols``-th cell, starting at ``st_range``.

    ``all_td`` is treated as a flat, row-major list of table cells, so stepping
    through it ``n_cols`` at a time walks down a single column.

    Parameters
    ----------
    st_range : int
        Index of the first cell to take (the column offset when starting from
        the first row).
    all_td : sequence
        Cell objects exposing a ``.text`` attribute.
    n_cols : int, optional
        Number of cells per table row (the stride). Defaults to 5.

    Returns
    -------
    list of str
        The ``.text`` of each selected cell; empty when ``st_range`` is at or
        past the end of ``all_td``.
    """
    return [all_td[i].text for i in range(st_range, len(all_td), n_cols)]

# for 'name', st_range==0
# for 'category', st_range==1
# for 'calories', st_range==2
# for 'fat', st_range==3
# for 'carbs', st_range==4

In [10]:
# This code collects the info from each restaurant page
def get_food(timeout=30):
    """Scrape every restaurant page and return one dict per food item.

    The pages to visit come from ``restaurant_fulllinks()``. On each page all
    ``<td>`` cells are collected and read five at a time, in the order
    name, category, calories, fat, carbs.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up (default 30).

    Returns
    -------
    list of dict
        Dicts with the keys 'name', 'category', 'calories', 'fat', 'carbs' and
        'restaurant'. Every value is a string taken from the page; 'category'
        has surrounding whitespace removed.

    Raises
    ------
    requests.exceptions.HTTPError
        If a restaurant page answers with a 4xx/5xx status.
    """
    fields = ('name', 'category', 'calories', 'fat', 'carbs')
    n_cols = len(fields)

    # List to store results
    foods = []

    # One session for the whole crawl so the connection to the host is reused
    with requests.Session() as session:
        # loop through each restaurant url
        for item in restaurant_fulllinks():
            response = session.get(item['href'], timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'lxml')

            # Get all the <td> elements of the page as one flat list
            all_td = soup.find_all('td')

            # Each food item occupies n_cols consecutive cells
            for start in range(0, len(all_td), n_cols):
                cells = all_td[start:start + n_cols]

                # only store "full" rows of data (skip a short trailing group)
                if len(cells) < n_cols:
                    continue

                result = {key: cell.text for key, cell in zip(fields, cells)}
                # The exercise asks for extra white space to be removed from
                # each category.
                result['category'] = result['category'].strip()
                result['restaurant'] = item['name']
                foods.append(result)

    return foods

In [11]:
%%time
import time
named_tuple = time.localtime() # get start_time
time_string = time.strftime("%d/%m/%Y, %H:%M", named_tuple) # format time print
print(f'start time: {time_string}')

# initiate function
foods = get_food()

start time: 24/07/2022, 15:03
CPU times: total: 6.59 s
Wall time: 1min 5s


In [12]:
# Peek at the first two scraped items to sanity-check their structure
foods[:2]

[{'name': 'Original Bacon Double Cheeseburger',
  'category': 'Burgers',
  'calories': '760',
  'fat': '45',
  'carbs': '45',
  'restaurant': 'A&W Restaurants'},
 {'name': 'Coney (Chili) Dog',
  'category': 'Entrees',
  'calories': '340',
  'fat': '20',
  'carbs': '26',
  'restaurant': 'A&W Restaurants'}]

In [13]:
# Number of scraped food items; the exercise expects 5,131
len(foods) # 5131 datapoints

5131

### Step 4: Create a pandas DataFrame from your list of foods

**Note**: Your DataFrame should have 5,131 rows

In [14]:
# One row per food dict; the dict keys become the columns.
# Values are the strings scraped from the page (no numeric conversion is done here).
df = pd.DataFrame(foods) # convert to pandas dataframe

In [15]:
# (rows, columns) -- the exercise expects 5,131 rows
df.shape

(5131, 6)

In [16]:
# Preview the first three rows of the DataFrame
df.head(3)

Unnamed: 0,name,category,calories,fat,carbs,restaurant
0,Original Bacon Double Cheeseburger,Burgers,760,45,45,A&W Restaurants
1,Coney (Chili) Dog,Entrees,340,20,26,A&W Restaurants
2,Chili Fries,French Fries,370,15,49,A&W Restaurants


### Step 5: Export to csv

**Note:** Don't export the index column from your DataFrame

In [17]:
# Write the data to foods.csv; index=False leaves the DataFrame index out of the file
df.to_csv('foods.csv',index=False) 