# Web Scraping With BeautifulSoup and Selenium

We'll begin by importing the necessary libraries and packages, in particular the web scraping tools BeautifulSoup and Selenium.

In [None]:
from bs4 import BeautifulSoup
import requests
import pickle
import time, os

import pandas as pd
import random

import os.path
from os import path

from IPython.core.display import display, HTML

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [None]:
# Path to the chromedriver executable that Selenium launches
chromedriver = "/Applications/chromedriver"
# Publish the same path in the environment under the "webdriver.chrome.driver" key
os.environ.update({"webdriver.chrome.driver": chromedriver})

In [None]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'}

url = 'https://www.totalwine.com/spirits/scotch/blended-scotch/johnnie-walker-blue-year-of-the-ox/p/234446750?s=1006&igrules=true'

# Plain HTTP request; `response` is kept so its status code can be inspected.
response = requests.get(url, headers=headers)
page = response.text

# Parse the HTML rendered by a real browser. The soup previously built from
# `page` was overwritten right away, so it is no longer created.
driver = webdriver.Chrome(chromedriver)
try:
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Close the browser once the page source has been captured
    driver.quit()

In [None]:
# HTTP status of the `requests` call above; 200 means success
response.status_code  #200 = success!

## Web scraping a single page

We'll first gather the attributes we wish to obtain for a single page from [Total Wine](https://www.totalwine.com/), and then we'll write a function that can be used to obtain this same information for each bottle of whiskey. We want to get the following information to put into a table (i.e. dataframe):
* Whiskey name
* Rating
* Rating source
* User rating
* Number of reviews
* Price
* Brand
* Country
* State
* Spirit type
* Taste

#### Whiskey Name

In [None]:
# The bottle name is the text of the first <h1> on the page
name = soup.h1.get_text()
name

#### Rating

In [None]:
# Rating badge; fall back to None when the badge element is absent
# (soup.find returns None, so `.text` raises AttributeError).
try:
    rating = soup.find('div', class_='redBadgeNumber__DZXSWqnj').text
except AttributeError:
    rating = None
rating

#### Rating Source

In [None]:
# Source of the rating badge; None when the element is absent
try:
    source = soup.find('div', class_='redBadgeSource__1hMXdJ5Z').text.strip()
except AttributeError:
    source = None
source

#### User Rating

In [None]:
# Average user rating; None when the rating container is absent
try:
    user_rating = soup.find(class_='bv_avgRating_component_container notranslate').text
except AttributeError:
    user_rating = None
user_rating

#### Number of reviews

In [None]:
# Raw review-count text; it is converted to an int by a helper later on
reviews = soup.find(class_='bv_numReviews_component_container').get_text()
reviews

#### Price

In [None]:
# Price text from the element with id 'edlpPrice'; None when it is absent
try:
    price = soup.find(id='edlpPrice').text
except AttributeError:
    price = None
price

#### Brand

In [None]:
# Brand: text of the first child of the first 'detailsTableText' element
brand = soup.find(class_='detailsTableText__1SvcRdYn').findChild().get_text()
brand

#### Country

In [None]:
# Country: text of the element that follows the 'COUNTRY' label
country = soup.find(text='COUNTRY').find_next().get_text()
country

#### State

In [None]:
# State: text of the element after the 'STATE' label; None when the label is missing
try:
    state = soup.find(text='STATE').findNext().text
except AttributeError:
    state = None
state

#### Spirit type

In [None]:
# Spirit type: text of the element that follows the 'SPIRITS TYPE' label
spirit_type = soup.find(text='SPIRITS TYPE').find_next().get_text()
spirit_type

#### Taste

In [None]:
# Taste: text of the element after the 'TASTE' label; None when the label is missing
try:
    taste = soup.find(text='TASTE').findNext().text
except AttributeError:
    taste = None
taste

Create helper functions to parse strings into appropriate data types.

In [None]:
def money_to_float(moneystring):
    '''
    Convert a price string such as '$1,234.50' to a float.
    Dollar signs and commas are removed before the conversion.
    '''
    for symbol in ('$', ','):
        moneystring = moneystring.replace(symbol, '')
    return float(moneystring)

def to_taste(taste_string):
    '''
    Split a comma-separated taste description into a list of descriptors.
    Whitespace around each descriptor is stripped and empty entries are
    dropped, so 'Rich, Smoky' gives ['Rich', 'Smoky'] and '' gives [].
    '''
    pieces = (piece.strip() for piece in taste_string.split(','))
    return [piece for piece in pieces if piece]

def name_to_title(whiskey_name):
    '''
    Return the whiskey name in title case (first letter of each word capitalised).
    '''
    return whiskey_name.title()

def format_number_of_reviews(num_reviews):
    '''
    Convert a review-count string such as '\\xa0(12)' to an int.
    Non-breaking or regular spaces, the surrounding parentheses and
    thousands separators are removed first, so '\\xa0(1,234)' gives 1234.
    Raises ValueError when no integer remains.
    '''
    cleaned = num_reviews.replace('\xa0', ' ').strip().strip('()')
    return int(cleaned.replace(',', ''))

Apply the conversions.

In [None]:
# `price` is None when no price element was found; only convert real strings
price = money_to_float(price) if price is not None else None
price

In [None]:
# `taste` is None when the page has no TASTE entry; only split real strings
taste = to_taste(taste) if taste is not None else None
taste

In [None]:
# Normalise the bottle name to title case
name = name_to_title(whiskey_name=name)
name

In [None]:
# Turn the raw review-count text into an integer
reviews = format_number_of_reviews(num_reviews=reviews)
reviews

Put the items from a single page in a dictionary.

In [None]:
# Column names, in the same order as the scraped values below
headers = [
    'whiskey_name', 'rating', 'rating_source', 'user_rating',
    'num_reviews', 'price', 'brand', 'country',
    'state', 'spirit_type', 'taste',
]

# Pair each column name with its scraped value
whiskey_dict = {
    column: value
    for column, value in zip(
        headers,
        (name, rating, source, user_rating,
         reviews, price, brand, country,
         state, spirit_type, taste),
    )
}

# One dictionary per bottle; only the single bottle scraped so far
whiskey_data = [whiskey_dict]
whiskey_data

## Web scraping multiple pages

First, gather page URLs for each page.

In [None]:
# Results-page URL with a placeholder for the page number
page_url_template = "https://www.totalwine.com/spirits/whiskey/c/9238919?viewall=true&page={}&pageSize=120&spiritsvolume=Standard%20Size%20750%20ml&aty=1,1,1,1"

# URLs for results pages 1 through 10
page_urls = [page_url_template.format(page_number) for page_number in range(1, 11)]

page_urls

Next, visit the first page to gather the links for each bottle on a single page.

In [None]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'}

# Request the first results page -- the same page the browser loads below
# (this previously requested page_urls[1], i.e. the second page).
response = requests.get(page_urls[0], headers=headers)
page = response.text
# Name the parser explicitly so the result does not depend on which parsers are installed
soup = BeautifulSoup(page, 'html.parser')

# The browser is left open: its page source is read afterwards
driver = webdriver.Chrome(chromedriver)
driver.get(page_urls[0])

In [None]:
# Preview the first 1000 characters of the page source held by the browser
driver.page_source[:1000]

In [None]:
# Parse the browser's current page source with the built-in HTML parser
soup = BeautifulSoup(markup=driver.page_source, features='html.parser')

In [None]:
totalwine_url = 'https://totalwine.com'

# get the anchor tags inside the results grid; href=True skips anchors
# without an href, which would otherwise break the concatenation below
anchors = soup.find(class_='grid__1eZnNfL-').find_all('a', href=True)

# build absolute URLs from the hrefs
hrefs = [totalwine_url + a['href'] for a in anchors]
hrefs

We can see that each bottle of whiskey appears three times: the same link twice, plus a third link to the same bottle with '&tab=3' appended. We need to remove the links with the appended characters and then remove the duplicates to get a list of unique links.

In [None]:
# remove links with the appended characters
[hrefs.remove(link) for link in hrefs if '&tab=3' in link]
hrefs

In [None]:
# remove duplicates
hrefs = list(set(hrefs))
hrefs

Now we put it all in a function so that for each results page we visit we can gather the links for each bottle.

In [None]:
def get_page_links(page_url):
    '''
    From a TotalWine results-page URL, load the page in Chrome via Selenium,
    parse it with BeautifulSoup, and return the list of bottle links found
    in the results grid.

    Links containing '&tab=3' are dropped and duplicates are removed, keeping
    the order in which the links first appear on the page.

    Raises AttributeError if the results grid is not found in the page.
    '''
    totalwine_url = 'https://totalwine.com'

    # Only the browser-rendered HTML is parsed. The earlier plain `requests`
    # fetch was discarded unused, so it is no longer made.
    driver = webdriver.Chrome(chromedriver)
    try:
        driver.get(page_url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Close the browser even on failure so windows don't pile up
        driver.quit()

    # get the anchor tags that carry an href
    anchors = soup.find(class_='grid__1eZnNfL-').find_all('a', href=True)

    # build absolute URLs, skipping the '&tab=3' variants
    # (filtering into a new list avoids mutating a list while iterating it)
    hrefs = [totalwine_url + a['href'] for a in anchors]
    hrefs = [link for link in hrefs if '&tab=3' not in link]

    # remove duplicates while preserving order
    return list(dict.fromkeys(hrefs))

Let's gather the links for each of the 120 bottles on each of the first 10 pages.

In [None]:
bottles_list = []
# Collect one list of bottle links per results page
for url in page_urls:
    page_links = get_page_links(url)
    bottles_list.append(page_links)

In [None]:
# Inspect the collected links
bottles_list

In [None]:
#flatten the list of links
bottles_list = [link for sublist in bottles_list for link in sublist]
len(bottles_list)

Pickle the list. We had to use Selenium since information on the Total Wine website is dynamic (i.e. it changes with new products, reviews, etc.), so it's a good idea to save the list we scraped so that we minimize duplicates when scraping pages for individual whiskeys.

In [None]:
# save the bottles list
filename = 'bottles.pkl'
outfile = open(filename,'wb')
pickle.dump(bottles_list,outfile)
outfile.close()

## Putting it all together

Lastly, we'll combine all the steps from the beginning where we scraped information from a single page into a function, so that we can loop through each of the pages in our `bottles_list` and scrape information on each bottle and store it in a list to be used later.

In [None]:
def get_whiskey_dict(link):
    '''
    From a TotalWine bottle link, load the page in Chrome via Selenium,
    parse it with BeautifulSoup, and collect
        - whiskey name
        - rating
        - rating source
        - user rating
        - number of reviews
        - price
        - brand
        - country
        - state
        - spirit type
        - taste
    Return information as a dictionary.

    Rating, rating source, user rating, number of reviews, price, state and
    taste are set to None when the element is missing or cannot be converted.
    Name, brand, country and spirit type are required: if one of them is
    missing an AttributeError propagates to the caller.
    '''

    columns = ['whiskey_name', 'rating', 'rating_source', 'user_rating',
               'num_reviews', 'price', 'brand', 'country',
               'state', 'spirit_type', 'taste']

    # Only the browser-rendered HTML is parsed. The earlier plain `requests`
    # fetch was discarded unused, so it is no longer made.
    driver = webdriver.Chrome(chromedriver)
    try:
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Close the browser even on failure so one window per bottle doesn't pile up
        driver.quit()

    # Get whiskey name (required)
    name = name_to_title(soup.find('h1').text)

    # Get rating
    try:
        rating = int(soup.find('div', class_='redBadgeNumber__DZXSWqnj').text)
    except (AttributeError, ValueError):
        rating = None

    # Get rating source
    try:
        source = soup.find('div', class_='redBadgeSource__1hMXdJ5Z').text.strip()
    except AttributeError:
        source = None

    # Get user rating
    try:
        user_rating = float(soup.find(class_='bv_avgRating_component_container notranslate').text)
    except (AttributeError, ValueError):
        user_rating = None

    # Get number of reviews
    try:
        reviews = soup.find(class_='bv_numReviews_component_container').text
        reviews = format_number_of_reviews(reviews)
    except (AttributeError, ValueError):
        reviews = None

    # Get price
    try:
        price = soup.find(id='edlpPrice').text
        price = money_to_float(price)
    except (AttributeError, ValueError):
        price = None

    # Get brand (required)
    brand = soup.find(class_='detailsTableText__1SvcRdYn').findChild().text

    # Get country (required)
    country = soup.find(text='COUNTRY').findNext().text

    # Get state
    try:
        state = soup.find(text='STATE').findNext().text
    except AttributeError:
        state = None

    # Get spirit type (required)
    spirit_type = soup.find(text='SPIRITS TYPE').findNext().text

    # Get taste
    try:
        taste = soup.find(text='TASTE').findNext().text
        taste = to_taste(taste)
    except AttributeError:
        taste = None

    # Create whiskey dictionary and return
    whiskey_dict = dict(zip(columns, [name, rating, source, user_rating,
                                      reviews, price, brand, country,
                                      state, spirit_type, taste]))

    return whiskey_dict

Load the `bottles_list` from earlier.

In [None]:
# load bottles_list
bottles_filename = 'bottles.pkl'
infile = open(bottles_filename,'rb')
new_bottles_list = pickle.load(infile)
infile.close()

new_bottles_list

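Loop through the bottle links, scrape each page, and save the results to a pickle file so that progress is kept if scraping is interrupted.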

In [None]:
filename = 'whiskeys.pkl'

# check if we already have the file in the event that scraping was blocked by the website
if path.exists(filename):
    with open(filename, 'rb') as infile:
        whiskey_list = pickle.load(infile)
else:
    whiskey_list = []

# loop through the list of all urls for whiskey bottles and scrape each page
# (i is the index to resume from; the loop stops before index 1200)
i = 1172
try:
    while i < 1200:
        link = new_bottles_list[i]
        whiskey_list.append(get_whiskey_dict(link))
        i += 1
except Exception as err:
    # Any scraping failure ends the run; report where it stopped
    print('Stopped at index {}: {!r}'.format(i, err))
finally:
    # Open the output only now, and write once: opening it in 'wb' mode before
    # the loop emptied the existing file while scraping was still in progress.
    with open(filename, 'wb') as outfile:
        pickle.dump(whiskey_list, outfile)

We can check that our pickle is writing new entries to the file each time our scraping process is ended by the Total Wine website.

In [None]:
# Reload the saved results and report how many bottles have been scraped so far
whiskey_filename = 'whiskeys.pkl'
with open(whiskey_filename, 'rb') as infile:
    new_whiskey_list = pickle.load(infile)

print(len(new_whiskey_list))