In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException
import time
import csv
import datetime

In [None]:
# To run Chrome in headless mode
# options = Options()
# options.headless = True
# options.add_argument("--window-size-1920,1200")

In [3]:
DRIVER_PATH = 'chromedriver'

In [None]:
cbb_driver = webdriver.Chrome(executable_path=DRIVER_PATH)

results = 1
pages = 0
cbb_driver.get(f'https://beerandbrewing.com/beer-reviews?q=&hPP={results}&idx=cbb_web_review_search&p={pages}')

In [None]:
cbb_main_reviews_window = cbb_driver.current_window_handle

In [4]:
# Define style families for search and categorization
style_families = ['Wheat', 'Sour', 'Belgian Ale', 'Pale Ale', 'English Bitter', 'Scottish Ale', 'Brown Ale', 'Porter', 'Stout', \
               'Pilsner', 'American Lager', 'European Lager', 'Bock', 'Alt', 'Barley Wine']

### This section is for scraping Craft Beer & Brewing.

In [None]:
# Functions to handle data processing for Craft Beer & Brewing

def get_item_text(item):
    return item.text or ''

def parse_review_meta(review_meta):
    meta_keys = ['Style', 'ABV', 'IBU']
    meta_items = []
    
    for i in review_meta:
        if get_item_text(i) is not None and get_item_text(i) != '':
            if 'Style' in get_item_text(i):
                item = get_item_text(i).split(': ')
            else:
                item = get_item_text(i).replace(':','').split()
            meta_items.append(item)

    for i in meta_items:
        if 'Style' not in i:
            first_pair = ' '.join(i[:2])
            second_pair = ' '.join(i[-2:])
        else:
            first_pair = None
            second_pair = None

    transformed_meta_items = [i for i in meta_items if 'Style' in i]
    if first_pair is not None:
        transformed_meta_items.append(first_pair.split())
    if second_pair is not None:
        transformed_meta_items.append(second_pair.split())
    
    meta_items_dict = {x: y.strip() for x, y in transformed_meta_items}
    return tuple([meta_items_dict.get(key, None) for key in meta_keys])
    
def parse_scores(scores):
    split_scores = scores.split()
    score = split_scores[0].split('/')[0]
    aroma = split_scores[2]
    appearance = split_scores[4]
    flavor = split_scores[6]
    mouthfeel = split_scores[8]
    return score, aroma, appearance, flavor, mouthfeel

def parse_reviews(reviews):
#     brewers = reviews[2].text.strip('\"')
#     panel = reviews[3].text.strip('\"')
#     editors = reviews[4].text.strip('\"')
#     total_review = brewers+' '+panel+' '+editors
#     return brewers, panel, editors, total_review
    total_review = []
    for review in reviews:
        review_text = get_item_text(review)
        if 'Print Shelf Talker' not in review_text and 'How We Review' not in review_text:
            total_review.append(review_text.strip('\"').strip('\n'))
    return ' '.join(total_review)

In [None]:
beer_styles = [get_item_text(i).split('\n') for i in cbb_driver.find_elements_by_class_name('ais-refinement-list')][0]

In [None]:
beer_hits = cbb_driver.find_elements_by_class_name('hit-content')
tabs = []
beers = []

for hits in beer_hits:
    for beer in hits.find_elements_by_tag_name('a'):
        beer.send_keys(Keys.CONTROL + Keys.RETURN)
        tabs.append(cbb_driver.window_handles[-1])

In [None]:
for tab in tabs:
    cbb_driver.switch_to.window(tab)

    review_meta = cbb_driver.find_element_by_class_name('review-meta-holder').find_elements_by_tag_name('p')
    style, abv, ibu = parse_review_meta(review_meta)
    
    scores = cbb_driver.find_element_by_class_name('main-score-overall-container').text
    score, aroma, appearance, flavor, mouthfeel = parse_scores(scores)
    
    beer = cbb_driver.find_element_by_id('article-body').find_element_by_tag_name('h1').text
    
    reviews = cbb_driver.find_element_by_id('article-body').find_elements_by_tag_name('p')
    total_review = parse_reviews(reviews)
    
    beer_data = {
                'beer': beer,
                'style': style,
                'abv': abv,
                'ibu': ibu,
                'total_score': score,
                 'aroma_score': aroma,
                 'appearance_score': appearance,
                 'flavor_score': flavor,
                 'mouthfeel_score': mouthfeel,
                'total_review': total_review
                }
    beers.append(beer_data)

In [None]:
## Write the CBB data to a CSV

today = datetime.datetime.today()
now = str(today).replace(' ','_').replace(':','')

# TODO: elegantly handle encoding
keys = beers[0].keys()
with open(f'beer_data_scrape_{now}.csv', 'w', newline='',encoding='UTF-8')  as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(beers)

In [None]:
cbb_driver.quit()

### This section is for scraping Untappd.

In [5]:
# Functions to handle data processing for UNTAPPD

# def start_search(style):
#     untappd_url = f'https://untappd.com/search?q={style}'
#     return untappd_url

def search_for_beers(style):
    untappd_driver.find_element_by_id('search-term').clear()
    untappd_driver.find_element_by_id('search-term').send_keys(style)
    search = untappd_driver.find_element_by_xpath('//*[@id="slide"]/div/div[1]/div/div/div[1]/form/span/input')
    search.click()

def get_beer_data():
    beer_elements = untappd_driver.find_element_by_class_name('content')
    beer = beer_elements.find_element_by_tag_name('h1').text
    score = beer_elements.find_element_by_class_name('num').text.replace('(','').replace(')','')
    bid = beer_elements.find_element_by_class_name('label').get_attribute('href').split('/')[-1]
    
    dict_data = {
                'bid': bid,
                'beer': beer,
                'score': score
                }
    
    p_tags = beer_elements.find_elements_by_tag_name('p')
    p_tags_list = []
    for tag in p_tags:
        tag_value = tag.text 
        tag_attribute = tag.get_attribute('class')
        p_tags_list.append({tag_attribute:tag_value})
    
    for dicts in p_tags_list:
        for k,v in dicts.items():
            if k:
                dict_data[k] = v
            
    try:
        untappd_driver.find_element_by_css_selector('div.beer-descrption-read-more').click()
        description = untappd_driver.find_element_by_class_name('beer-descrption-read-less').text
    except:
        description = ''
    
    dict_data['description'] = description
    
    return dict_data

In [27]:
# This lets me re-run the script after it blows up without having to quit the driver and re-login to Untappd.
untappd_driver.switch_to.window(untappd_main_window)

In [6]:
beer_data = []
search_more_exceptions = []
beer_page_link_errors = []
beer_content_errors = []

In [7]:
# Start driver
untappd_driver = webdriver.Chrome(executable_path=DRIVER_PATH)
# Start search
untappd_url = f'https://untappd.com/search?q='
untappd_driver.get(untappd_url)

In [8]:
login = input('Have you logged in? [Y/N] ')

# untappd_driver.get(untappd_url)

if login == 'Y':

    for style in style_families: # last 3 slice added to continue where left off
        
        time.sleep(5) # increased sleep to slow it down
        
        print(f'Searching for {style}.')
        
        untappd_driver.find_element_by_id('search-term').clear()
        untappd_driver.find_element_by_id('search-term').send_keys(style)
        search = untappd_driver.find_element_by_xpath('//*[@id="slide"]/div/div[1]/div/div/div[1]/form/span/input')
        search.click()
        
#         print('Clicked Search.')
        
        untappd_main_window = untappd_driver.current_window_handle

        # Click Show More beers 3 times; removed while testing
        for i in range(5): # increased click to get more beers
            
            time.sleep(5) # new sleep to slow it down
            
            try:
#                 print('Waiting for Search More to appear.')

                wait = WebDriverWait(untappd_driver,3)
                more_beer_results = untappd_driver.find_element_by_xpath('//*[@id="slide"]/div/div[1]/div/div/div[3]/a')
                wait.until(EC.visibility_of(more_beer_results))
                more_beer_results.click()

#                 print('Clicked Search More.')
            
            except NoSuchElementException as error:
#                 print('Could not find Search More.')
#                 print(error)
                search_more_exceptions.append(error)
                continue

        untappd_driver.switch_to.window(untappd_main_window)

        beer_details = untappd_driver.find_elements_by_class_name('beer-details')
        
        for detail in beer_details:
            
            time.sleep(5) # new sleep to slow it down
            
            try:
#                 print('Looking for beer page link.')
                
                wait = WebDriverWait(untappd_driver,90)
                wait.until(EC.visibility_of(detail.find_element_by_tag_name('a')))

                beer_page_link = detail.find_element_by_tag_name('a')
                beer_page_link.send_keys(Keys.CONTROL + Keys.RETURN)
                
                time.sleep(5) # new sleep to slow it down

                new_beer_tab = untappd_driver.window_handles[-1]
                untappd_driver.switch_to.window(new_beer_tab)
                
                time.sleep(10) # new sleep to slow it down

                try:
#                     print('Looking for beer content on beer page.')
                    
                    wait = WebDriverWait(untappd_driver,90)
                    wait.until(EC.visibility_of(untappd_driver.find_element_by_class_name('content')))
                        #(By.CLASS_NAME,'content')))
                        #untappd_driver.find_element_by_class_name('content'))

                    dict_data = get_beer_data()
                    dict_data['style_search'] = style
                    beer_data.append(dict_data)
                    
#                     print('Beer content collected.')

                except (NoSuchElementException, TimeoutException) as error:
#                     print('Could not find beer content on beer page.')
#                     print(error)
                    beer_content_errors.append(error)
                    untappd_driver.close()
                    
#                     print('Closed tab.')
                    
                    untappd_driver.switch_to.window(untappd_main_window)
                    
#                     print('Switched back to main window.')
                    continue
            
            except StaleElementReferenceException as error:
#                 print('Could not find beer page link.')
#                 print(error)
                beer_page_link_errors.append(error)
                continue

            untappd_driver.close()
#             print('Closed tab.')

            untappd_driver.switch_to.window(untappd_main_window)
#             print('Switched back to main window.')
            #time.sleep(3)

        print(f'Completed {style}.')

beer_fields = ['bid', 'beer', 'score', 'subsidiary', 'brewery', 'style', 'abv', 'ibu', 'raters', 'description', 'style_search']

## Write the Untappd data to a csv.

today = datetime.datetime.today()
now = str(today).replace(' ','_').replace(':','')

# TODO: elegantly handle encoding
keys = beer_fields
with open(f'untappd_beer_data_scrape_{now}.csv', 'w', newline='', encoding='UTF-8')  as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(beer_data)
        
untappd_driver.quit()

Have you logged in? [Y/N] Y
Searching for Wheat.
Completed Wheat.
Searching for Sour.
Completed Sour.
Searching for Belgian Ale.
Completed Belgian Ale.
Searching for Pale Ale.
Completed Pale Ale.
Searching for English Bitter.


NoSuchWindowException: Message: no such window: window was already closed
  (Session info: chrome=91.0.4472.77)


##### Troubleshooting playground

In [None]:
# NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":".content"}
#   (Session info: chrome=91.0.4472.77)

# TimeoutException: Message: timeout: Timed out receiving message from renderer: 300.000
#   (Session info: chrome=91.0.4472.77)



In [9]:
print(len(beer_data))
# beer_data[-1]
# print(beer_page_link.text)

553


In [10]:
beer_fields = []
for i in beer_data:
    if i.keys() not in beer_fields:
        beer_fields.append(i.keys())

beer_fields

[dict_keys(['bid', 'beer', 'score', 'brewery', 'style', 'abv', 'ibu', 'raters', 'description', 'style_search'])]

In [11]:
beer_fields = ['bid', 'beer', 'score', 'brewery', 'style', 'abv', 'ibu', 'raters', 'description', 'style_search']

In [12]:
## Write the Untappd data to a csv.

today = datetime.datetime.today()
now = str(today).replace(' ','_').replace(':','')

# TODO: elegantly handle encoding
keys = beer_fields
with open(f'untappd_beer_data_scrape_{now}.csv', 'w', newline='', encoding='UTF-8')  as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(beer_data)

In [13]:
untappd_driver.quit()