# Coffeereview.com Roast Scraper

This notebook is for scraping coffee roast data from https://www.coffeereview.com


In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm



In [10]:
def scrape_roast_list(session: requests.Session) -> list[dict]:
    
    """
    Scrape coffee reviews from coffeereview.com review list and return a list of data dict.

    Args:
        session (requests.Session): A requests session object
    Returns:
        list[dict]: A list of data dicts for all coffee reviews
    """
    
    def scrape_page(page_number: int) -> list[dict]:
        """ Return a list of data dicts for a given page number

        Args:
            page_number (int): The page number to scrape

        Returns:
            list[dict]: A list of data dicts for a given page number
        """
        page_data = [] 
        
        # This header is required. 
        headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}
        url = 'https://www.coffeereview.com/advanced-search/page/' + str(page_number) + '/'
    
        # Send a GET request to the URL
        response = session.get(url, headers=headers)
 
        # Create a BeautifulSoup object
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the relevant HTML elements and extract the desired data
        main = soup.find('main')
        results = main.find_all('div', class_='entry-content')
        
        # Loop through list of entries and extract data 
        for result in results: 
            rating = result.find('span', class_='review-template-rating').text

            # Extract the coffee roaster name
            roaster = result.find('p', class_='review-roaster').a.text

            # Extract the coffee name
            name = result.find('h2', class_='review-title').a.text

            # Extract the review date
            review_date = result.find('div', class_='column col-3').strong.next_sibling.strip()

            # Extract the price
            price_element = soup.find('div', class_='column col-3').find('strong', string='Price:')
            price = price_element.next_sibling.strip() if price_element else None

            # Extract the review description
            description = result.find('div', class_='row row-2').p.string


            # Extract the URL for complete review   - TODO
            row_3 = result.find('div', class_='row row-3').find_all('div')
            try:
                complete_review_url = row_3[0].a['href']  
            except:
                complete_review_url = None

            # Extract the URL for the roaster's website
            try:
                roaster_website_url = row_3[1].a['href']
            except:
                roaster_website_url = None
    
            row_data = {
                'Rating': rating,
                'Roaster': roaster,
                'Coffee_Name': name,
                'Review_Date': review_date,
                'Review_Description': description,
                'Complete_Review_URL': complete_review_url,
                'Roaster_Website_URL': roaster_website_url
                }
        
            page_data.append(row_data)
        
        return page_data
    
    # Initialize list to hold all data
    data = []
    
    # Loop through all pages and scrape data
    for i in tqdm(range(0, 121)):
        page_data = scrape_page(i)
        data.extend(page_data)
        
    return data


In [11]:
def scrape_roast_page(url: str, session: requests.Session) -> dict:
    """Scrape the data from a single review page.

    Args:
        url (str): The URL of the review page.
        session (requests.Session): The session object to use for the request.

    Returns:
        dict: The scraped roast data.
    """
    
    # This header is required 
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}
    response = session.get(url=url, headers=headers)
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    def scrape_feature(feature):
        """Scrape a single feature from the review page.

        Args:
            feature (str): The feature to scrape.

        Returns:
            str: The scraped data.
        """
        if soup.find('td', string= feature + ':'):
            data = soup.find('td', string=feature + ':').find_next_sibling().text
        elif soup.find('h2', string=feature):
            data = soup.find('h2', string=feature).find_next_sibling().text
        else:
            data = None  
        return data
    
    
    feature_list = ['Roaster Location',
                   'Coffee Origin',
                   'Roast Level',
                   'Aroma',
                   'Acidity/Structure',
                   'Acidity',
                   'Body',
                   'Flavor',
                   'Aftertaste',
                   'Agtron',
                   'Blind Assessment',
                   'Notes',
                   'Bottom Line',
                   'Est. Price']

    data = {feature: scrape_feature(feature) for feature in feature_list}
    data['url'] = url
    
    return data

In [34]:
with requests.Session() as session:
    data = scrape_roast_list(session=session)

100%|██████████| 121/121 [05:18<00:00,  2.63s/it]


In [35]:
df_roasts = pd.DataFrame(data)
df_roasts.to_csv('data/raw-coffee-roasts.csv', index=False)

In [14]:
with open('data/raw-coffee-roasts.csv', 'r') as f:
    df_temp = pd.read_csv(f)


urls = list(df_temp['Complete_Review_URL'])

roast_data = []
max_count = len(urls)

# f = IntProgress(min=0, max=max_count) # instantiate the bar
# display(f) # display the bar

session = requests.Session()

for url in tqdm(urls):
    roast = scrape_roast_page(url, session)
    roast_data.append(roast)
    #f.value += 1

100%|██████████| 2420/2420 [19:50<00:00,  2.03it/s]


In [16]:
df_roast = pd.DataFrame(roast_data)
df_roast.sample(10)

Unnamed: 0,Roaster Location,Coffee Origin,Roast Level,Aroma,Acidity/Structure,Acidity,Body,Flavor,Aftertaste,Agtron,Blind Assessment,Notes,Bottom Line,Est. Price,url
340,"Cleveland, Ohio","Matagalpa growing region, Nicaragua",Medium-Light,9,8,,8\t\t\t\t\t\t,9,8,60/78,"Crisply sweet-savory. White peach, hop flowers...",Produced by Benjamin Weiner at Finca Idealista...,A carbonic-macerated Yellow Pacamara grown in ...,$160.00/12 ounces,https://www.coffeereview.com/review/brix-break...
1393,"Osaka, Japan","Yunlin, Taiwan",Light,9,9,,9\t\t\t\t\t\t,9,8,79/92,"Delicate, sweetly tart, richly aromatic. Lemon...",Produced at Songyue Coffee Manor in the highla...,"A fine Taiwan-grown Geisha, processed by the n...","¥ 2,640/50 grams",https://www.coffeereview.com/review/taiwan-son...
1871,"Bellingham, Washington","Gakenke, Northern Province, Rwanda",Medium-Light,9,8,,9\t\t\t\t\t\t,9,8,56/76,"Crisply sweet, deep-toned. Cocoa nib, tangerin...",This coffee tied for the third-highest rating ...,A fine Rwanda cup with deep sweetness animated...,$14.00/12 ounces,https://www.coffeereview.com/review/rwanda-bum...
2059,"Durango, Colorado","Valle de Cauca, Colombia",Medium,9,8,,8\t\t\t\t\t\t,9,8,50/64,"Delicate, richly sweet, fruit-centered. Dried ...",Produced at Potosí Farm by Luis and Rigoberto ...,A clean natural-processed Colombia cup from an...,$19.95/8 ounces,https://www.coffeereview.com/review/colombia-l...
136,"Huntington Beach, California","San Martin Jilotepeque, Chimaltenango Departme...",Medium-Light,9,8,,9\t\t\t\t\t\t,9,8,57/72,"Bright, crisp, sweet-toned. Almond brittle, Pi...","Produced by Anthony Medina of Finca La Colina,...","A straight-ahead, comfortable Guatemala cup wi...",$19.00/12 ounces,https://www.coffeereview.com/review/guatemala-...
2066,"Boulder, Colorado","Boquete growing region, western Panama",Medium-Light,9,9,,9\t\t\t\t\t\t,9,9,51/71,Impeccably floral and cocoa-toned with a savor...,This coffee earned the third-highest rating in...,Extravagant florals and spice-toned cocoa note...,$60.00/8 ounces,https://www.coffeereview.com/review/panama-hac...
1574,"Durango, Colorado","Valle de Cauca, Colombia",Medium-Light,9,9,,9\t\t\t\t\t\t,9,8,54/70,"Fruit- and herb-toned. Lemon thyme, strawberry...",This exceptional coffee was selected as the No...,A berry-driven Colombia cup with background no...,$16.95/12 ounces,https://www.coffeereview.com/review/la-esperan...
2249,"Fort Bragg, California",Ethiopia; Sumatra,Medium-Light,9,8,,8\t\t\t\t\t\t,9,8,48/78,"Deep, chocolaty, cleanly fruit-toned. Dark cho...",The components of this blend are certified org...,"A balanced, richly sweet-tart Ethiopia-Sumatra...",$14.50/12 ounces,https://www.coffeereview.com/review/mocha-java...
2394,"Billings, Montana",South-central Kenya,Medium-Light,9,9,,8\t\t\t\t\t\t,9,8,53/79,"Deeply sweet, spice-toned. Red currant, tanger...",Produced by the Kabare Cooperative from trees ...,"A spice-driven, floral-toned Kenya cup with ba...",$17.50/12 ounces,https://www.coffeereview.com/review/kenya-kaba...
1184,"Manlius, New York","Aratoca, Santander Department, Colombia",Medium-Light,9,9,,9\t\t\t\t\t\t,9,9,58/78,"Resonantly bright and juicy, deeply balanced. ...",Produced by Mildred Muñoz of Finca Santa Maria...,"A tropical-inclined, natural-processed, women-...",$18.00/12 ounces,https://www.coffeereview.com/review/colombia-f...


In [18]:
df = df_temp.merge(df_roast, left_on="Complete_Review_URL", right_on="url")
df.sample(4)

Unnamed: 0,Rating,Roaster,Coffee_Name,Review_Date,Review_Description,Complete_Review_URL,Roaster_Website_URL,Roaster Location,Coffee Origin,Roast Level,...,Acidity,Body,Flavor,Aftertaste,Agtron,Blind Assessment,Notes,Bottom Line,Est. Price,url
2334,92,JBC Coffee Roasters,Gayo Aceh Sumatra,Dec 2018,"Richly sweet, spicy, tobacco-toned. Pipe tobac...",https://www.coffeereview.com/review/gayo-aceh-...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Aceh growing region, Aceh Province, northern S...",Medium-Light,...,,8\t\t\t\t\t\t,9,8,56/76,"Richly sweet, spicy, tobacco-toned. Pipe tobac...",JBC Coffee Roasters’ vision is simple: “let th...,Expressive and resonant: Spice-toned florals a...,$16.00/12 ounces,https://www.coffeereview.com/review/gayo-aceh-...
800,92,RamsHead Coffee Roasters,Guatemala Antigua Pastores,Apr 2022,"Sweetly nut-toned. Hazelnut butter, red apple,...",https://www.coffeereview.com/review/guatemala-...,https://bit.ly/2P8dTQf,"Bozeman, Montana","Antigua growing region, Guatemala",Medium-Light,...,,9\t\t\t\t\t\t,8,8,58/76,"Sweetly nut-toned. Hazelnut butter, red apple,...",Produced at Pastores Mill and processed by the...,"A friendly, balanced Guatemala cup with rich, ...",$16.95/12 ounces,https://www.coffeereview.com/review/guatemala-...
1418,90,Lina Premium Coffee,Estate Finca La Lina,Mar 2021,"Sweetly nut-toned. Roasted almond, date, bakin...",https://www.coffeereview.com/review/estate-fin...,,"Bogota, Colombia","Risaralda Department, Colombia",Medium-Light,...,,8\t\t\t\t\t\t,9,7,52/70,"Sweetly nut-toned. Roasted almond, date, bakin...",Produced at El Retiro Farm and processed by th...,"A friendly, easy-drinking Colombia cup with cr...",$15.75/12 ounces,https://www.coffeereview.com/review/estate-fin...
1036,96,SOT Coffee Roaster,SÖT Coffee Roaster Colombia Wush Wush Dynamic ...,Nov 2021,"Savory-sweet, intensely aromatic and flavor-sa...",https://www.coffeereview.com/review/sot-coffee...,https://www.sotcoffee.com/,"Osaka, Japan","San Augustín, Huila Department, Colombia",Light,...,,9\t\t\t\t\t\t,10,8,76/92,"Savory-sweet, intensely aromatic and flavor-sa...",This exceptional coffee was selected as the No...,"An exciting, intensely distinctive coffee for ...",¥1680/100 grams,https://www.coffeereview.com/review/sot-coffee...


In [19]:
df.to_csv('data/raw-roast-reviews.csv', index=False)