# Coffeereview.com Roast Scraper

This notebook is for scraping coffee roast data from https://www.coffeereview.com


In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm



In [10]:
def scrape_roast_list(session: requests.Session) -> list[dict]:
    
    """
    Scrape coffee reviews from coffeereview.com review list and return a list of data dict.

    Args:
        session (requests.Session): A requests session object
    Returns:
        list[dict]: A list of data dicts for all coffee reviews
    """
    
    def scrape_page(page_number: int) -> list[dict]:
        """ Return a list of data dicts for a given page number

        Args:
            page_number (int): The page number to scrape

        Returns:
            list[dict]: A list of data dicts for a given page number
        """
        page_data = [] 
        
        # This header is required. 
        headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}
        url = 'https://www.coffeereview.com/advanced-search/page/' + str(page_number) + '/'
    
        # Send a GET request to the URL
        response = session.get(url, headers=headers)
 
        # Create a BeautifulSoup object
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the relevant HTML elements and extract the desired data
        main = soup.find('main')
        results = main.find_all('div', class_='entry-content')
        
        # Loop through list of entries and extract data 
        for result in results: 
            rating = result.find('span', class_='review-template-rating').text

            # Extract the coffee roaster name
            roaster = result.find('p', class_='review-roaster').a.text

            # Extract the coffee name
            name = result.find('h2', class_='review-title').a.text

            # Extract the review date
            review_date = result.find('div', class_='column col-3').strong.next_sibling.strip()

            # Extract the price
            price_element = soup.find('div', class_='column col-3').find('strong', string='Price:')
            price = price_element.next_sibling.strip() if price_element else None

            # Extract the review description
            description = result.find('div', class_='row row-2').p.string


            # Extract the URL for complete review   - TODO
            row_3 = result.find('div', class_='row row-3').find_all('div')
            try:
                complete_review_url = row_3[0].a['href']  
            except:
                complete_review_url = None

            # Extract the URL for the roaster's website
            try:
                roaster_website_url = row_3[1].a['href']
            except:
                roaster_website_url = None
    
            row_data = {
                'Rating': rating,
                'Roaster': roaster,
                'Coffee_Name': name,
                'Review_Date': review_date,
                'Review_Description': description,
                'Complete_Review_URL': complete_review_url,
                'Roaster_Website_URL': roaster_website_url
                }
        
            page_data.append(row_data)
        
        return page_data
    
    # Initialize list to hold all data
    data = []
    
    # Loop through all pages and scrape data
    for i in tqdm(range(0, 121)):
        page_data = scrape_page(i)
        data.extend(page_data)
        
    return data


In [11]:
def scrape_roast_page(url: str, session: requests.Session) -> dict:
    """Scrape the data from a single review page.

    Args:
        url (str): The URL of the review page.
        session (requests.Session): The session object to use for the request.

    Returns:
        dict: The scraped roast data.
    """
    
    # This header is required 
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}
    response = session.get(url=url, headers=headers)
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    def scrape_feature(feature):
        """Scrape a single feature from the review page.

        Args:
            feature (str): The feature to scrape.

        Returns:
            str: The scraped data.
        """
        if soup.find('td', string= feature + ':'):
            data = soup.find('td', string=feature + ':').find_next_sibling().text
        elif soup.find('h2', string=feature):
            data = soup.find('h2', string=feature).find_next_sibling().text
        else:
            data = None  
        return data
    
    
    feature_list = ['Roaster Location',
                   'Coffee Origin',
                   'Roast Level',
                   'Aroma',
                   'Acidity/Structure',
                   'Acidity',
                   'Body',
                   'Flavor',
                   'Aftertaste',
                   'Agtron',
                   'Blind Assessment',
                   'Notes',
                   'Bottom Line',
                   'Est. Price']

    data = {feature: scrape_feature(feature) for feature in feature_list}
    data['url'] = url
    
    return data

In [34]:
with requests.Session() as session:
    data = scrape_roast_list(session=session)

100%|██████████| 121/121 [05:18<00:00,  2.63s/it]


In [35]:
df_roasts = pd.DataFrame(data)
df_roasts.to_csv('data/raw-coffee-roasts.csv', index=False)

In [13]:
with open('data/raw-coffee-roasts.csv', 'r') as f:
    df_temp = pd.read_csv(f)


urls = list(df_temp['Complete_Review_URL'])

roast_data = []
max_count = len(urls)

# f = IntProgress(min=0, max=max_count) # instantiate the bar
# display(f) # display the bar

session = requests.Session()

for url in tqdm(urls):
    roast = scrape_roast_page(url, session)
    roast_data.append(roast)
    #f.value += 1

  2%|▏         | 40/2420 [00:12<11:56,  3.32it/s]


KeyboardInterrupt: 

In [15]:
df_roast = pd.DataFrame(roast_data)
df_roast.head()

Unnamed: 0,Roaster Location,Coffee Origin,Roast Level,Aroma,Acidity/Structure,Acidity,Body,Flavor,Aftertaste,Agtron,Blind Assessment,Notes,Bottom Line,Est. Price,url
0,"Glendale, California","Tenejapa, Central Highlands, Chiapas, Mexico",Medium,8,8.0,,8\t\t\t\t\t\t,8,8,44/60,"Roasty-sweet, crisply chocolaty. Baking chocol...",Produced by smallholding members of the Kulati...,A deep-toned Chiapas cup with a throughline of...,$19.50/12 ounces,https://www.coffeereview.com/review/mexico-chi...
1,"Madison, Wisconsin","Saraguro, Loja growing region, Ecuador",Medium-Light,9,9.0,,9\t\t\t\t\t\t,10,9,59/75,"High-toned, lush, complexly fruity. Lychee, st...","Produced by Juan Peña of Finca La Papaya, enti...",A fine example of what a Geisha natural from E...,$29.00/12 ounces,https://www.coffeereview.com/review/la-papaya-...
2,"Hilo, Big Island of Hawai’i",Brazil; Thailand; Nicaragua; Ecuador,Medium,9,,,8\t\t\t\t\t\t,9,8,52/64,"Evaluated as espresso. Sweetly tart, richly br...","A blend of Arabica coffees from Brazil, Thaila...","A trifecta of tart fruit, deep chocolate and s...",$19.00/12 ounces,https://www.coffeereview.com/review/espresso-c...
3,"Taitung, Taiwan","Tolima, Colombia",Light,9,9.0,,9\t\t\t\t\t\t,9,9,63/81,"Complex, multi-layered. Green oolong, rosehips...","Produced by Marta Bayter and her sons, Shady a...",An enticingly rich Colombia Geisha aromatic fi...,NT $550/200 grams,https://www.coffeereview.com/review/colombia-t...
4,"Changhua, Taiwan","Quindio Department, Pijao Village, Colombia",Medium-Light,9,9.0,,9\t\t\t\t\t\t,9,9,60/77,"Fruit-toned, tropical. Strawberry-guava, frees...","Produced at Finca Buenos Aires, entirely of th...","A powerfully aromatic, flavor-saturated, exper...",NT $520/200 grams,https://www.coffeereview.com/review/colombia-b...


In [18]:
df = df_temp.merge(df_roast, left_on="Complete_Review_URL", right_on="url")
df.sample(4)

Unnamed: 0,Rating,Roaster,Coffee_Name,Review_Date,Price,Review_Description,Complete_Review_URL,Roaster_Website_URL,Roaster Location,Coffee Origin,...,Acidity,Body,Flavor,Aftertaste,Agtron,Blind Assessment,Notes,Bottom Line,Est. Price,url
149,91,Kakalove Cafe,Ethiopia Washed Brew Bag,Apr 2023,NT $380/8 ounces,Evaluated in brew bag format with a steeping t...,https://www.coffeereview.com/review/ethiopia-w...,http://www.kakalovecafe.com.tw/,"Chia-Yi, Taiwan",Ethiopia,...,,8\t\t\t\t\t\t,9,8,0/0,Evaluated in brew bag format with a steeping t...,Produced by smallholding farmers from selectio...,"A balanced, delicately floral-toned washed Eth...",NT $30/1 single-serve brew bag,https://www.coffeereview.com/review/ethiopia-w...
1459,92,Kauai Coffee Company,Rum Barrel Aged Kauai Coffee,Dec 2020,NT $520/200 grams,"Deep-toned, sweetly fermenty. Oak barrel, choc...",https://www.coffeereview.com/review/rum-barrel...,,"Kalaheo, Kauai, Hawaii","Kalaheo, Kauai, Hawaii",...,,8\t\t\t\t\t\t,9,8,50/70,"Deep-toned, sweetly fermenty. Oak barrel, choc...","Kauai Coffee, Hawaii’s largest coffee producer...",A nicely balanced barrel-aged coffee from Kaua...,$50.00/10 ounces,https://www.coffeereview.com/review/rum-barrel...
2031,94,Mudhouse Coffee Roasters,Gesha Village 1931 Lot 27,Sep 2019,$19.95/8 ounces,"Richly sweet-savory, deeply floral-toned. Hone...",https://www.coffeereview.com/review/gesha-vill...,,"Charlottesville, Virginia","Bench-Maji Zone, southern Ethiopia",...,,9\t\t\t\t\t\t,9,8,60/78,"Richly sweet-savory, deeply floral-toned. Hone...",This coffee tied for the fourth-highest rating...,A pleasingly coherent coffee that is both swee...,$50.00/8 ounces,https://www.coffeereview.com/review/gesha-vill...
262,92,Mostra Coffee,Philippines Sitio Kisbong #3011 Experimental W...,Jan 2023,$25.00/12 ounces,"Delicately fruit-forward. Goji berry, lime, co...",https://www.coffeereview.com/review/philippine...,https://mostracoffee.com/collections/coffee,"San Diego, California","Atok Benguet, northern Philippines",...,,9\t\t\t\t\t\t,9,8,59/76,"Delicately fruit-forward. Goji berry, lime, co...",Produced by Kalsada Coffee from trees of the T...,"A gently fruity, experimentally processed Phil...",$17.50/6 ounces,https://www.coffeereview.com/review/philippine...


In [39]:
df.to_csv('data/raw-coffee-reviews.csv', index=False)