In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import csv
import datetime

In [None]:
# To run Chrome in headless mode
# options = Options()
# options.headless = True
# options.add_argument("--window-size-1920,1200")

In [2]:
DRIVER_PATH = 'chromedriver'

In [None]:
cbb_driver = webdriver.Chrome(executable_path=DRIVER_PATH)

results = 1
pages = 0
cbb_driver.get(f'https://beerandbrewing.com/beer-reviews?q=&hPP={results}&idx=cbb_web_review_search&p={pages}')

In [None]:
cbb_main_reviews_window = cbb_driver.current_window_handle

In [3]:
# Define style families for search and categorization
style_families = ['Wheat', 'Sour', 'Belgian Ale', 'Pale Ale', 'English Bitter', 'Scottish Ale', 'Brown Ale', 'Porter', 'Stout', \
               'Pilsner', 'American Lager', 'European Lager', 'Bock', 'Alt', 'Barley Wine']

In [None]:
# Functions to handle data processing for Craft Beer & Brewing

def get_item_text(item):
    return item.text or ''

def parse_review_meta(review_meta):
    meta_keys = ['Style', 'ABV', 'IBU']
    meta_items = []
    
    for i in review_meta:
        if get_item_text(i) is not None and get_item_text(i) != '':
            if 'Style' in get_item_text(i):
                item = get_item_text(i).split(': ')
            else:
                item = get_item_text(i).replace(':','').split()
            meta_items.append(item)

    for i in meta_items:
        if 'Style' not in i:
            first_pair = ' '.join(i[:2])
            second_pair = ' '.join(i[-2:])
        else:
            first_pair = None
            second_pair = None

    transformed_meta_items = [i for i in meta_items if 'Style' in i]
    if first_pair is not None:
        transformed_meta_items.append(first_pair.split())
    if second_pair is not None:
        transformed_meta_items.append(second_pair.split())
    
    meta_items_dict = {x: y.strip() for x, y in transformed_meta_items}
    return tuple([meta_items_dict.get(key, None) for key in meta_keys])
    
def parse_scores(scores):
    split_scores = scores.split()
    score = split_scores[0].split('/')[0]
    aroma = split_scores[2]
    appearance = split_scores[4]
    flavor = split_scores[6]
    mouthfeel = split_scores[8]
    return score, aroma, appearance, flavor, mouthfeel

def parse_reviews(reviews):
#     brewers = reviews[2].text.strip('\"')
#     panel = reviews[3].text.strip('\"')
#     editors = reviews[4].text.strip('\"')
#     total_review = brewers+' '+panel+' '+editors
#     return brewers, panel, editors, total_review
    total_review = []
    for review in reviews:
        review_text = get_item_text(review)
        if 'Print Shelf Talker' not in review_text and 'How We Review' not in review_text:
            total_review.append(review_text.strip('\"').strip('\n'))
    return ' '.join(total_review)

In [29]:
# Functions to handle data processing for UNTAPPD

# def start_search(style):
#     untappd_url = f'https://untappd.com/search?q={style}'
#     return untappd_url

def search_for_beers(style):
    untappd_driver.find_element_by_id('search-term').clear()
    untappd_driver.find_element_by_id('search-term').send_keys(style)
    search = untappd_driver.find_element_by_xpath('//*[@id="slide"]/div/div[1]/div/div/div[1]/form/span/input')
    search.click()

def get_beer_data():
    beer_elements = untappd_driver.find_element_by_class_name('content')
    beer = beer_elements.find_element_by_tag_name('h1').text
    score = beer_elements.find_element_by_class_name('num').text.replace('(','').replace(')','')
    bid = beer_elements.find_element_by_class_name('label').get_attribute('href').split('/')[-1]
    
    dict_data = {
                'bid': bid,
                'beer': beer,
                'score': score
                }
    
    p_tags = beer_elements.find_elements_by_tag_name('p')
    p_tags_list = []
    for tag in p_tags:
        tag_value = tag.text 
        tag_attribute = tag.get_attribute('class')
        p_tags_list.append({tag_attribute:tag_value})
    
    for dicts in p_tags_list:
        for k,v in dicts.items():
            if k:
                dict_data[k] = v
            
    try:
        untappd_driver.find_element_by_css_selector('div.beer-descrption-read-more').click()
        description = untappd_driver.find_element_by_class_name('beer-descrption-read-less').text
    except:
        description = ''
    
    dict_data['description'] = description
    
    return dict_data

In [None]:
beer_styles = [get_item_text(i).split('\n') for i in cbb_driver.find_elements_by_class_name('ais-refinement-list')][0]

In [None]:
beer_hits = cbb_driver.find_elements_by_class_name('hit-content')
tabs = []
beers = []

for hits in beer_hits:
    for beer in hits.find_elements_by_tag_name('a'):
        beer.send_keys(Keys.CONTROL + Keys.RETURN)
        tabs.append(cbb_driver.window_handles[-1])

In [None]:
for tab in tabs:
    cbb_driver.switch_to.window(tab)

    review_meta = cbb_driver.find_element_by_class_name('review-meta-holder').find_elements_by_tag_name('p')
    style, abv, ibu = parse_review_meta(review_meta)
    
    scores = cbb_driver.find_element_by_class_name('main-score-overall-container').text
    score, aroma, appearance, flavor, mouthfeel = parse_scores(scores)
    
    beer = cbb_driver.find_element_by_id('article-body').find_element_by_tag_name('h1').text
    
    reviews = cbb_driver.find_element_by_id('article-body').find_elements_by_tag_name('p')
#     brewers, panel, editors, 
    total_review = parse_reviews(reviews)
    
    beer_data = {
                'beer': beer,
                'style': style,
                'abv': abv,
                'ibu': ibu,
                'total_score': score,
                 'aroma_score': aroma,
                 'appearance_score': appearance,
                 'flavor_score': flavor,
                 'mouthfeel_score': mouthfeel,
#                  'brewers': brewers,
#                  'panel': panel,
#                  'editors': editors,
                'total_review': total_review
                }
    beers.append(beer_data)

In [None]:
# for style in beer_styles:
#     for beer in beers:
#         if style in beer['style']:
#             beer['style_category'] = style
# beers

In [None]:
# styles = [beer['style_category'] for beer in beers]
# print(list(set(styles)))

In [None]:
today = datetime.datetime.today()
now = str(today).replace(' ','_').replace(':','')

# TODO: elegantly handle encoding
keys = beers[0].keys()
with open(f'beer_data_scrape_{now}.csv', 'w', newline='',encoding='UTF-8')  as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(beers)

In [None]:
cbb_driver.quit()

In [24]:
untappd_driver = webdriver.Chrome(executable_path=DRIVER_PATH)
# all_beers ='https://untappd.com/beer/top_rated?country=united-states'
# search_ipa = 'https://untappd.com/search?q=ipa'
# untappd_driver.get(search_ipa)
# untappd_main_window = untappd_driver.current_window_handle

In [28]:
untappd_driver.find_element_by_id('search-term').clear() # clear current search value
untappd_driver.find_element_by_id('search-term').send_keys('lager') # input new search value
search = untappd_driver.find_element_by_xpath('//*[@id="slide"]/div/div[1]/div/div/div[1]/form/span/input') # find the search button
search.click()

In [None]:
def search_for_styles(style):
    untappd_url = f'https://untappd.com/search?q={style}'
    return untappd_url

In [10]:
untappd_driver.switch_to.window(untappd_main_window)

In [None]:
beer_data = []

# Start driver
untappd_driver = webdriver.Chrome(executable_path=DRIVER_PATH)
# Start search
untappd_url = f'https://untappd.com/search?q='
untappd_driver.get(untappd_url)
untappd_driver.implicitly_wait(15)

for style in style_families[2:5]: #slice added for limiting results while testing
    # function not working, returning search as a string that cannot be clicked
    # moving search.click() within the function did not work either, moved code within loop
#     search_for_styles(style) 
    untappd_driver.find_element_by_id('search-term').clear()
    untappd_driver.find_element_by_id('search-term').send_keys(style)
    search = untappd_driver.find_element_by_xpath('//*[@id="slide"]/div/div[1]/div/div/div[1]/form/span/input')
    search.click()
    untappd_main_window = untappd_driver.current_window_handle
    
    # Click Show More beers 3 times; removed while testing
#     for i in range(3):
#         more_beer_results = untappd_driver.find_element_by_xpath('//*[@id="slide"]/div/div[1]/div/div/div[3]/a')
#         more_beer_results.click()
        
#     untappd_driver.switch_to.window(untappd_main_window)

    beer_details = untappd_driver.find_elements_by_class_name('beer-details')
    for detail in beer_details:
        beer_page_link = detail.find_element_by_tag_name('a')
        beer_page_link.send_keys(Keys.CONTROL + Keys.RETURN)

        new_beer_tab = untappd_driver.window_handles[-1]
        untappd_driver.switch_to.window(new_beer_tab)
        
        untappd_driver.implicitly_wait(15)

        dict_data = get_beer_data()
        beer_data.append(dict_data)

        untappd_driver.close()

        untappd_driver.switch_to.window(untappd_main_window)
    
    print(f'Completed {style}. Collected {len(beer_data)} beers.')

untappd_driver.quit()

In [21]:
# untappd_driver.switch_to.window(untappd_main_window)
# beer_details = untappd_driver.find_elements_by_class_name('beer-details')
# beer_data = []
# for detail in beer_details:
#     beer_page_link = detail.find_element_by_tag_name('a')
#     beer_page_link.send_keys(Keys.CONTROL + Keys.RETURN)
    
#     new_beer_tab = untappd_driver.window_handles[-1]
#     untappd_driver.switch_to.window(new_beer_tab)
    
#     dict_data = get_beer_data()
#     beer_data.append(dict_data)
    
#     untappd_driver.close()
    
#     untappd_driver.switch_to.window(untappd_main_window)

# untappd_driver.quit()

In [34]:
beer_data

[{'bid': '3839',
  'beer': 'Belgian White',
  'score': '3.5',
  'brewery': 'Blue Moon Brewing Company',
  'style': 'Wheat Beer - Witbier',
  'abv': '5.4% ABV',
  'ibu': '9 IBU',
  'raters': '503,089 Ratings',
  'description': 'Blue Moon Belgian White, Belgian-style wheat ale, is a refreshing, medium-bodied, unfiltered Belgian-style wheat ale spiced with fresh coriander and orange peel for a uniquely complex taste and an uncommonly smooth finish. Show Less'},
 {'bid': '3916',
  'beer': 'Samuel Adams Summer Ale',
  'score': '3.4',
  'brewery': 'Boston Beer Company',
  'style': 'Wheat Beer - American Pale Wheat',
  'abv': '5.3% ABV',
  'ibu': '7 IBU',
  'raters': '310,701 Ratings',
  'description': 'Samuel Adams® Summer Ale is an American wheat ale. This hazy golden unfiltered brew combines crisp wheat with the bright citrus of lemon peel and the subtle spice of Grains of Paradise. This rare African pepper, closely related to cardamom, was first used as a brewing spice in the 13th Century

In [205]:
today = datetime.datetime.today()
now = str(today).replace(' ','_').replace(':','')

# TODO: elegantly handle encoding
keys = beer_data[10].keys()
with open(f'untappd_beer_data_scrape_{now}.csv', 'w', newline='', encoding='UTF-8')  as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(beer_data)

In [38]:
untappd_driver.quit()