# Web scraping in coffee review (top-rated coffees)
---
- Obtain the top-rated roaster and their coffee bean origin
- All data is from [Coffee review](https://www.coffeereview.com/highest-rated-coffees/)
- May need to put your own header in the requests.get(url, header = # YOUR OWN HEADER)

In [109]:
# Import relevant libraries
from bs4 import BeautifulSoup as soup
import requests
import pandas as pd
import time

## Identifying all the coffee name to negative into the coffee review pages
---

In [130]:
# getting html from the page using requests, and then formatting using beautifulsoup
coffee_catalog_url = "https://www.coffeereview.com/highest-rated-coffees"
coffee_catalog_page = requests.get(coffee_catalog_url).text
coffee_list_soup = soup(coffee_catalog_page, "html.parser")

In [131]:
# obtaining the total pages number
total_pages = int(coffee_list_soup.find("li", class_= "pagination-omission").find_next_sibling("li").text[-2:])
total_pages

82

In [148]:
# count the page number
page_counter = 1 
# create a new list for store all the url
coffee_url_list = [] 

for page in range(total_pages):
    update_url = f"{coffee_catalog_url}/page/{page_counter}" # changing pages
    coffee_catalog_page = requests.get(update_url).text
    coffee_list_soup = soup(coffee_catalog_page, "html.parser")
    # locating each coffee name url
    coffee_in_page = coffee_list_soup.find_all("div", class_= "column col-1")

    page_counter += 1

    for item in range(len(coffee_in_page)):
        try:
            url = coffee_in_page[item].a["href"] # reviewed cofffee page url
            coffee_url_list.append(url) # putting into coffee url list
        except:
            pass

    time.sleep(2)

# identify how many coffee species were found
len(coffee_url_list)

1640

## Getting required information from the coffee review page
---

In [167]:
data = [] # create a empty list for storing all the data

# naming the header of dataframe
df_header = ["coffee name", "coffee rating", "roaster name", "roaster country/state", "roaster city", "coffee origin country",
          "coffee origin city", "roast level", "whole bean (agtron)", "after grinding (agtron)", "price", "pricing unit",
          "review date", "aroma", "acidity", "body", "flavor", "aftertaste", "blind ass (paragraph)", "species (paragraph)"]

In [168]:
print("-------------------------------------")
print("Start Web scraping....")
print("-------------------------------------")

# for loop: srcaping all the 1600 data from coffee review
for review_url in coffee_url_list:
    row = [] # create a new row list

    # obtain a review page and formatting into soup format
    coffee_review_response = requests.get(review_url).text
    coffee_review_soup = soup(coffee_review_response, "html.parser")

    # storing the data into different variables and putting into row list
    try:

        coffee_name = coffee_review_soup.find("h1", class_= "review-title").text
        row.append(coffee_name)

        rating = coffee_review_soup.find("span", class_= "review-template-rating").text
        row.append(rating)

        roaster_name = coffee_review_soup.find("p", class_= "review-roaster").text
        row.append(roaster_name)
    except:
        print(f"{coffee_name}: basic info Error")
        pass
    
    
    try:
        # extract the whole table that contains roaster and coffee beans information
        roaster_table = coffee_review_soup.find_all("div" , class_= "column col-1")[1]

        # organise the roaster table data and putting into row list
        roaster_country_state = roaster_table.find_all("td")[1].text.split(",")[-1].strip()
        row.append(roaster_country_state)
        roaster_city = roaster_table.find_all("td")[1].text.split(",")[:-1]
        row.append(roaster_city)
        coffee_origin_country = roaster_table.find_all("td")[3].text.split(",")[-1].strip()
        row.append(coffee_origin_country)
        coffee_origin_city = roaster_table.find_all("td")[3].text.split(",")[:-1]
        row.append(coffee_origin_city)
        roast_level = roaster_table.find_all("td")[5].text
        row.append(roast_level)
            # agtron data is present as xx/xx; before slash is the whole bean agtron and after slash is agtron after grinding 
        whole_bean_agtron = roaster_table.find_all("td")[7].text.split("/")[0] # select the number before slash
        row.append(whole_bean_agtron)
        after_grinding_agtron = roaster_table.find_all("td")[7].text.split("/")[1] # select the number after slash
        row.append(after_grinding_agtron)
        try:
            # seperate the price and its unit into two column
            price = roaster_table.find_all("td")[9].text.split("/")[0] # pricing
            row.append(price)
            pricing_unit = roaster_table.find_all("td")[9].text.split("/")[1] # pricing_unit
            row.append(pricing_unit)
        except:
            price = None
            row.append(price)
            pricing_unit = None
            row.append(pricing_unit)
    except:
        print(f"{coffee_name}: roaster table Error")
        pass


    try:
        # extract the whole table that contains coffee beans taste rating
        beans_table = coffee_review_soup.find_all("div", class_ = "column col-2")[1]

        # organise the beans table data and putting into row list
        review_date = beans_table.find_all("td")[1].text
        row.append(review_date)
        aroma = beans_table.find_all("td")[3].text
        row.append(aroma)
        acidity = beans_table.find_all("td")[5].text
        row.append(acidity)
        body = beans_table.find_all("td")[7].text.strip()
        row.append(body)
        flavor = beans_table.find_all("td")[9].text
        row.append(flavor)
        aftertaste = beans_table.find_all("td")[11].text
        row.append(aftertaste)
    except:
        print(f"{coffee_name}: beans table Error")
        pass
    

    try:
        # blind assessment (flavor of the coffee) in paragraph
        blind_assess = coffee_review_soup.find("div", class_= "review-template").find_all("p")[1].text
        row.append(blind_assess)
    except:
        print(f"{coffee_name}: blind assessment error")

    try:
        # coffee plant species used in paragraph
        species_paragraph = coffee_review_soup.find("div", class_= "review-template").find_all("p")[2].text
        row.append(species_paragraph)
    except:
        print(f"{coffee_name}: species Error")
        pass

    # putting the whole row into data list
    data.append(row)
    print(f"{coffee_name}: Done")
    time.sleep(2)

print("-------------------------------------")
print("Coffee Bean Data Retrieval Complete")
print("-------------------------------------")

-------------------------------------
Start Web scraping....
-------------------------------------
Colombia Pink Bourbon: Done
Kenya Kiambu Mandela Estate AA Washed Process: Done
Costa Rica Volcán Azul Geisha Yeast-Washed: Done
Ethiopia Yirgacheffe Adame G1 Natural: Done
Kenya Gichathaini: Done
Guatemala Retiro del Quisaya Natural Process: Done
Ethiopia Sidama Karamo Anaerobic Natural: Done
Colombia Geiner Montano: Done
Kabiufa Papua New Guinea: Done
Ka‘u Red Catuai Peaberry: Done
Ethiopia Kayon Mountain: Done
Panama Finca Las Nubes Geisha Microlot: Done
Guatemala El Milagro Bourbon Especial: Done
Costa Rica William Mora: Done
Ethiopia Duromina Agaro Gera: Done
Winter 2022 Allocation Colombia: Done
Laura’s Reserve SL34: Done
El Salvador 2022 COE#17 El Mirador Washed Gesha: Done
Panama ABU Washed Gesha Lot GW57: Done
Guatemala Natural El General Bistro Lot Espresso: Done
Ethiopia 2022 COE#25 Natural Tadesse 74112: Done
Ethiopia Washed Sidama Rumudamo: Done
Lemon Sugar Wash: Done
Kenya K

In [172]:
df = pd.DataFrame(data, columns = df_header)

In [173]:
df

Unnamed: 0,coffee name,coffee rating,roaster name,roaster country/state,roaster city,coffee origin country,coffee origin city,roast level,whole bean (agtron),after grinding (agtron),price,pricing unit,review date,aroma,acidity,body,flavor,aftertaste,blind ass (paragraph),species (paragraph)
0,Colombia Pink Bourbon,95,modcup,New Jersey,[Jersey City],Colombia,"[Piendamó, Cauca Department]",Light,64,82,$30.00,250 grams,March 2023,9,9,9,9,9,"Wildly tropical, fruity and deep. Passion frui...",Produced by Wilton Benitez entirely of the Pin...
1,Kenya Kiambu Mandela Estate AA Washed Process,94,Buon Caffe,Taiwan,[ Taipei],south-central Kenya,[Kiambu County],Medium-Light,60,77,NT $349,8 ounces,March 2023,9,9,9,9,8,"Complex, nuanced, multi-layered. Black currant...",Produced by the Kariruki family from trees of ...
2,Costa Rica Volcán Azul Geisha Yeast-Washed,94,Kafe Coffee Roastery,Taiwan,[Zhubei],Costa Rica,[West Valley],Light,62,80,NT $349,100 grams,March 2023,9,9,9,9,8,"Richly sweet-tart, fruit-toned. Concord grape,...",Produced by Alejo Castro of Volcán Azul entire...
3,Ethiopia Yirgacheffe Adame G1 Natural,94,Caoban Coffee,Taiwan,[Taipei],south-central Ethiopia,[Yirgacheffe growing region],Light,63,79,NT $550,8 ounces,March 2023,9,9,9,9,8,"Gently sweet-tart, floral-toned. Pomegranate, ...",Produced by members of the Adame Garbota Coope...
4,Kenya Gichathaini,94,Temple Coffee,California,[Sacramento],Kenya,"[Mathira West District, Nyeri growing region]",Light,64,82,$25.00,12 ounces,February 2023,9,9,9,9,8,"Richly sweet-savory, spice-toned. Dried fig, p...",Produced by smallholding members of the Gikand...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1635,Ethiopia Yirgacheffe,94,Alaska Coffee Roasting Company,Alaska,[ Fairbanks],,[],/,September 1999,8,7,6,8,"A light, bright breath of acidity shimmers ins...",Many outstanding Ethiopias are currently comin...,,,,,
1636,Elm City House Blend,NR,Elm City Roasters,Connecticut,[New Haven],Dark,[],38/43,October 1998,6,5,7,6,About half the cups of this low-toned but forc...,Not rated due to a taste defect called baggine...,,,,,
1637,Brazil Fazenda Vista Alegre,NR,City Bean Coffee,California,[West Hollywood],Medium-Dark,[],45/53,October 1998,6,5,6,5,Some cups display a muted but disturbingly har...,Not rated due to a taste defect in some of the...,,,,,
1638,Vienna Roast,NR,Alpen Sierra Coffee Roasters,California,[Lake Tahoe],Medium-Dark,[],38/44,October 1998,7,6,5,7,Some cups of the sample are marred by a slight...,Not rated due to a taste fault in some of the ...,,,,,


In [171]:
df.to_csv("output_data_csv/coffee_bean.csv", index= False)