# Coffeereview.com Roast Scraper

This notebook is for scraping coffee roast data from https://www.coffeereview.com


In [163]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from ipywidgets import IntProgress
from IPython.display import display

In [422]:
def scrape_coffee_reviews():
    """Request roast search page of coffeereviews.com, traverse all pages 
       and pull available roast data on that page including URL for detailed roast reviews.
       
       Returns a list of data dicts"""
    
    def scrape_page(page_number):
        
        page_data = [] 
        
        # This header is required. 
        headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}
        url = 'https://www.coffeereview.com/advanced-search/page/' + str(page_number) + '/'
    
        # Send a GET request to the URL
        response = requests.get(url, headers=headers)
 
        # Create a BeautifulSoup object
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the relevant HTML elements and extract the desired data
        main = soup.find('main')
        results = main.find_all('div', class_='entry-content')
        
        # Loop through list of entries and extract data 
        for result in results: 
            rating = result.find('span', class_='review-template-rating').text

            # Extract the coffee roaster name
            roaster = result.find('p', class_='review-roaster').a.text

            # Extract the coffee name
            name = result.find('h2', class_='review-title').a.text

            # Extract the review date
            review_date = result.find('div', class_='column col-3').strong.next_sibling.strip()

            # Extract the price
            price_element = soup.find('div', class_='column col-3').find('strong', string='Price:')
            price = price_element.next_sibling.strip() if price_element else None

            # Extract the review description
            description = result.find('div', class_='row row-2').p.string


            # Extract the URL for complete review   - TODO
            row_3 = result.find('div', class_='row row-3').find_all('div')
            try:
                complete_review_url = row_3[0].a['href']  
            except:
                complete_review_url = None

            # Extract the URL for the roaster's website
            try:
                roaster_website_url = row_3[1].a['href']
            except:
                roaster_website_url = None
    
            row_data = {
                'Rating': rating,
                'Roaster': roaster,
                'Coffee_Name': name,
                'Review_Date': review_date,
                'Price': price,
                'Review_Description': description,
                'Complete_Review_URL': complete_review_url,
                'Roaster_Website_URL': roaster_website_url
                }
        
            page_data.append(row_data)
        
        return page_data
    
    data = []
    
    for i in range(0, 121):
        page_data = scrape_page(i)
        data.extend(page_data)

    return data


In [421]:
def scrape_roast_data(url, session):
    
    """Get review page from given url. Scrape all extra data that was not available on the search page.
    """
    # This header is required 
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}
    response = session.get(url=url, headers=headers)
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    def scrape_feature(feature):
        if soup.find('td', string= feature + ':'):
            data = soup.find('td', string=feature + ':').find_next_sibling().text
        elif soup.find('h2', string=feature):
            data = soup.find('h2', string=feature).find_next_sibling().text
        else:
            data = None  
        return data
    
    
    feature_list = ['Roaster Location',
                   'Coffee Origin',
                   'Roast Level',
                   'Aroma',
                   'Acidity/Structure',
                   'Acidity',
                   'Body',
                   'Flavor',
                   'Aftertaste',
                   'Agtron',
                   'Blind Assessment',
                   'Notes',
                   'Bottom Line']

    data = {feature: scrape_feature(feature) for feature in feature_list}
    data['url'] = url
    
    return data

In [423]:
data = scrape_coffee_reviews()


In [424]:
df_temp = pd.DataFrame(data)

In [376]:
f = IntProgress(min=0, max=max_count) # instantiate the bar
display(f) # display the bar


roast_data = []
max_count = len(urls)

urls = df['Complete_Review_URL']
session = requests.Session()

for url in urls:
    roast = scrape_roast_data(url, session)
    roast_data.append(roast)
    f.value += 1

IntProgress(value=0, max=2409)

In [426]:
df_roast = pd.DataFrame(roast_data)
df_roast.head()

Unnamed: 0,Roaster Location,Coffee Origin,Roast Level,Aroma,Acidity/Structure,Acidity,Body,Flavor,Aftertaste,Agtron,Blind Assessment,Notes,Bottom Line,url
0,"Glendale, California","Tenejapa, Central Highlands, Chiapas, Mexico",Medium,8,8.0,,8\t\t\t\t\t\t,8,8,44/60,"Roasty-sweet, crisply chocolaty. Baking chocol...",Produced by smallholding members of the Kulati...,A deep-toned Chiapas cup with a throughline of...,https://www.coffeereview.com/review/mexico-chi...
1,"Madison, Wisconsin","Saraguro, Loja growing region, Ecuador",Medium-Light,9,9.0,,9\t\t\t\t\t\t,10,9,59/75,"High-toned, lush, complexly fruity. Lychee, st...","Produced by Juan Peña of Finca La Papaya, enti...",A fine example of what a Geisha natural from E...,https://www.coffeereview.com/review/la-papaya-...
2,"Hilo, Big Island of Hawai’i",Brazil; Thailand; Nicaragua; Ecuador,Medium,9,,,8\t\t\t\t\t\t,9,8,52/64,"Evaluated as espresso. Sweetly tart, richly br...","A blend of Arabica coffees from Brazil, Thaila...","A trifecta of tart fruit, deep chocolate and s...",https://www.coffeereview.com/review/espresso-c...
3,"Taitung, Taiwan","Tolima, Colombia",Light,9,9.0,,9\t\t\t\t\t\t,9,9,63/81,"Complex, multi-layered. Green oolong, rosehips...","Produced by Marta Bayter and her sons, Shady a...",An enticingly rich Colombia Geisha aromatic fi...,https://www.coffeereview.com/review/colombia-t...
4,"Changhua, Taiwan","Quindio Department, Pijao Village, Colombia",Medium-Light,9,9.0,,9\t\t\t\t\t\t,9,9,60/77,"Fruit-toned, tropical. Strawberry-guava, frees...","Produced at Finca Buenos Aires, entirely of th...","A powerfully aromatic, flavor-saturated, exper...",https://www.coffeereview.com/review/colombia-b...


In [428]:
df = df_temp.merge(df_roast_data, left_on="Complete_Review_URL", right_on="url")
df.head()

Unnamed: 0,Rating,Roaster,Coffee_Name,Review_Date,Price,Review_Description,Complete_Review_URL,Roaster_Website_URL,Roaster Location,Coffee Origin,...,Acidity/Structure,Acidity,Body,Flavor,Aftertaste,Agtron,Blind Assessment,Notes,Bottom Line,url
0,90,Regent Coffee,Mexico Chiapas Kulatik,Jun 2023,$19.50/12 ounces,"Roasty-sweet, crisply chocolaty. Baking chocol...",https://www.coffeereview.com/review/mexico-chi...,https://bit.ly/3vqsQg7,"Glendale, California","Tenejapa, Central Highlands, Chiapas, Mexico",...,8,,8\t\t\t\t\t\t,8,8,44/60,"Roasty-sweet, crisply chocolaty. Baking chocol...",Produced by smallholding members of the Kulati...,A deep-toned Chiapas cup with a throughline of...,https://www.coffeereview.com/review/mexico-chi...
1,90,Regent Coffee,Mexico Chiapas Kulatik,Jun 2023,$19.50/12 ounces,"Roasty-sweet, crisply chocolaty. Baking chocol...",https://www.coffeereview.com/review/mexico-chi...,https://bit.ly/3vqsQg7,"Glendale, California","Tenejapa, Central Highlands, Chiapas, Mexico",...,8,,8\t\t\t\t\t\t,8,8,44/60,"Roasty-sweet, crisply chocolaty. Baking chocol...",Produced by smallholding members of the Kulati...,A deep-toned Chiapas cup with a throughline of...,https://www.coffeereview.com/review/mexico-chi...
2,90,Regent Coffee,Mexico Chiapas Kulatik,Jun 2023,$19.50/12 ounces,"Roasty-sweet, crisply chocolaty. Baking chocol...",https://www.coffeereview.com/review/mexico-chi...,https://bit.ly/3vqsQg7,"Glendale, California","Tenejapa, Central Highlands, Chiapas, Mexico",...,8,,8\t\t\t\t\t\t,8,8,44/60,"Roasty-sweet, crisply chocolaty. Baking chocol...",Produced by smallholding members of the Kulati...,A deep-toned Chiapas cup with a throughline of...,https://www.coffeereview.com/review/mexico-chi...
3,90,Regent Coffee,Mexico Chiapas Kulatik,Jun 2023,$19.50/12 ounces,"Roasty-sweet, crisply chocolaty. Baking chocol...",https://www.coffeereview.com/review/mexico-chi...,https://bit.ly/3vqsQg7,"Glendale, California","Tenejapa, Central Highlands, Chiapas, Mexico",...,8,,8\t\t\t\t\t\t,8,8,44/60,"Roasty-sweet, crisply chocolaty. Baking chocol...",Produced by smallholding members of the Kulati...,A deep-toned Chiapas cup with a throughline of...,https://www.coffeereview.com/review/mexico-chi...
4,96,JBC Coffee Roasters,La Papaya Ecuador Geisha Natural,Jun 2023,$19.50/12 ounces,"High-toned, lush, complexly fruity. Lychee, st...",https://www.coffeereview.com/review/la-papaya-...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Saraguro, Loja growing region, Ecuador",...,9,,9\t\t\t\t\t\t,10,9,59/75,"High-toned, lush, complexly fruity. Lychee, st...","Produced by Juan Peña of Finca La Papaya, enti...",A fine example of what a Geisha natural from E...,https://www.coffeereview.com/review/la-papaya-...


In [430]:
df.to_csv('data/raw-coffee-reviews.csv', index=False)