#### To scrape the coffee data from the website
- Data Collection
- Data Organization
- Data Definition
- Data Cleaning

In [3]:
#  Data Collection
#  To scrape the coffee data from the website
# first : Need to get the 'Coffee Name'
# second : Use Coffee Name to get the Coffee Review Detail

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

# Collect one record per review listed on the advanced-search result pages.
# NOTE: despite its name, `df` is a plain list of dicts; it only becomes a
# DataFrame (`df_to_csv`) right before it is written to disk.
df = []
headers = {'User-Agent': 'Mozilla/5.0'}

#for page in range(1, 140): # total page = 139
for page in range(1, 3): #test

    url = f'https://www.coffeereview.com/advanced-search/page/{page}/?keyword=&search=Search+Now&locations=all&score_all=on&score_96_100=on&score_93_95=on&score_90_92=on&score_85_89=on&score_85=on#results'

    # Errors are handled per page so that one bad page does not discard the
    # remaining pages of a long run.
    try:
        # (connect, read) timeout: a stalled connection must not hang the run.
        response = requests.get(url, headers=headers, timeout=(10, 60))
        # Fail on 4xx/5xx instead of silently parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Each '.review-template' element is the container of one review.
        for review in soup.select('.review-template'):
            coffee_name = review.select_one('.entry-content h2 a')
            coffee_brand = review.select_one('.row-3 .col-2 a')

            # Skip reviews that lack either the title link or the roaster link.
            if coffee_name and coffee_brand:
                df.append({
                    'coffee_name': coffee_name.text,
                    'coffee_review_url': coffee_name['href'],
                    'coffee_brand_url': coffee_brand['href']
                })

    except Exception as e:
        print(f"Error: page {page}: {e}")

    # Politeness delay between page requests. Parsing the rows of a page makes
    # no request, so the delay is applied once per page, not once per review.
    time.sleep(random.uniform(2, 5))

print(df)
df_to_csv = pd.DataFrame(df)
df_to_csv.to_csv('coffee_brand.csv', index=False) # save the data to csv



KeyboardInterrupt: 

In [5]:
# Reload the brand table that the listing scrape wrote to disk.
df_coffee_brand = pd.read_csv('coffee_brand.csv')

# Derive 'coffee_web_name' from 'coffee_review_url': keep whatever follows the
# last 'review/' in the URL and drop trailing slashes.
df_coffee_brand['coffee_web_name'] = [
    review_url.split('review/')[-1].rstrip('/')
    for review_url in df_coffee_brand['coffee_review_url']
]

print(df_coffee_brand.head())

# Write the table back to the same file, now including the new column.
df_coffee_brand.to_csv('coffee_brand.csv', index=False)

                                 coffee_name  \
0             Kenya AA Aries Return Espresso   
1         Kenya All’s Well Peaberry Espresso   
2        Espresso No. 6 Finca San José Ocaña   
3  Guatemala Finca Santa Isabel SL28 COE #13   
4                              Kahiko Orange   

                                   coffee_review_url  \
0  https://www.coffeereview.com/review/kenya-aa-a...   
1  https://www.coffeereview.com/review/kenya-alls...   
2  https://www.coffeereview.com/review/espresso-n...   
3  https://www.coffeereview.com/review/guatemala-...   
4  https://www.coffeereview.com/review/kahiko-ora...   

                                  coffee_brand_url  \
0  https://www.facebook.com/4ArtsZeroDefectCoffees   
1  https://www.facebook.com/4ArtsZeroDefectCoffees   
2                             https://elgran.cafe/   
3                             https://elgran.cafe/   
4                            http://bit.ly/2oBDGmP   

                            coffee_web_name  
0  

In [6]:
# To get the detailed review data from each coffee review url

import requests
from bs4 import BeautifulSoup
import time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Module-level accumulator: scrape_detailed_review() appends one dict per
# scraped review here, and main() turns the list into a DataFrame.
coffee_review_data = []


def create_session_with_retries():
    """Build a ``requests.Session`` whose GET requests are retried automatically.

    Returns:
        requests.Session: A session with one retrying adapter mounted for both
        ``http://`` and ``https://`` URLs.
    """
    retry_policy = Retry(
        total=5,                                # at most 5 retries per request
        backoff_factor=1,                       # exponentially growing wait between retries
        status_forcelist=[500, 502, 503, 504],  # server-side statuses that trigger a retry
        allowed_methods=["GET"],                # only GET requests are retried
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    session = requests.Session()
    for scheme in ('http://', 'https://'):
        session.mount(scheme, retrying_adapter)

    return session



def scrape_detailed_review(review_url, session=None):
    """Scrape one coffee review page and append the result to ``coffee_review_data``.

    The roaster, title and score are extracted, together with every two-cell
    table row on the page (stored as ``{first cell: second cell}``). The
    combined record is appended to the module-level ``coffee_review_data`` list.

    Args:
        review_url (str): The URL of the coffee review page.
        session (requests.Session, optional): Session used to send the
            request. When omitted, a retrying session is created for this call
            and closed before returning. A caller-supplied session is left
            open so it can be reused across calls.

    Returns:
        None

    Note:
        Problems are reported with ``print`` rather than raised: request
        failures, non-200 responses and parsing errors all leave
        ``coffee_review_data`` unchanged.
    """
    owns_session = session is None
    if owns_session:
        session = create_session_with_retries()
    headers = {'User-Agent': 'Mozilla/5.0'}

    try:
        # (connect, read) timeout in seconds; a plain requests.get without a
        # timeout was observed to hang on this site.
        response = session.get(
            review_url,
            headers=headers,
            timeout=(30, 60)
        )

        if response.status_code != 200:
            # Report skipped pages so that missing rows can be explained later.
            print(f"Request Error: {review_url}: unexpected HTTP status {response.status_code}")
            return

        soup = BeautifulSoup(response.content, 'html.parser')

        try:
            time.sleep(random.uniform(1, 2))

            # -First: the roaster company and the review title
            roaster_element = soup.find('p', class_='review-roaster')
            if roaster_element:
                roaster = roaster_element.text.strip()
            else:
                roaster = 'No data'

            # The title is an h1/h2 carrying class 'review-title' or 'entry-title'.
            title_element = soup.find(['h1', 'h2'], class_=['review-title', 'entry-title'])
            if title_element:
                title = title_element.text.strip()
            else:
                title = 'No Title'

            # -Second: the score
            score_element = soup.find('span', class_='review-template-rating')
            if score_element:
                score = score_element.text.strip()
            else:
                score = 'No data'

            # -Third: the detail tables. Every row with exactly two cells is
            # read as "label: value"; the trailing ':' of the label is dropped.
            attributes = {}
            for table in soup.find_all('table'):
                for row in table.find_all('tr'):
                    cells = row.find_all('td')
                    if len(cells) == 2:
                        key = cells[0].text.strip().rstrip(':')
                        value = cells[1].text.strip()
                        attributes[key] = value

            # Progress output
            print(f"Roaster: {roaster}")
            print(f"Title: {title}")
            print(f"Score: {score}")
            print("-" * 40)

            # One flat record per review; table labels become extra keys.
            coffee_review_data.append({
                'title': title,
                'roaster': roaster,
                'score': score,
                **attributes
            })
        except AttributeError as e:
            print(f"Failed to get detail data from {review_url}: {e}")

    except Exception as e:
        print(f"Request Error: {review_url}: {e}")
    finally:
        # Only close a session that this call created.
        if owns_session:
            session.close()

def main():
    """Scrape every review listed in ``df_coffee_brand`` and save the results.

    Args:
        None

    Returns:
        None: The collected data is written to 'coffee_review_data.csv'.

    Process:
        1. Reads the review URLs from ``df_coffee_brand['coffee_review_url']``.
        2. Calls ``scrape_detailed_review`` for each URL.
        3. After every URL (whether or not the scrape added a record), rewrites
           'coffee_review_data.csv' from ``coffee_review_data`` so that an
           interrupted run keeps what was collected so far.
        4. Waits a random 5-10 seconds before the next URL.
    """
    for review_url in df_coffee_brand['coffee_review_url']:
        scrape_detailed_review(review_url)

        # Checkpoint: dump everything collected so far.
        pd.DataFrame(coffee_review_data).to_csv('coffee_review_data.csv', index=False)

        # Random pause between requests.
        time.sleep(random.uniform(5, 10))


if __name__ == "__main__":
    main()


Roaster: Simon Hsieh Aroma Roast Coffees
Title: Kenya AA Aries Return Espresso
Score: 96
----------------------------------------
Roaster: Simon Hsieh Aroma Roast Coffees
Title: Kenya All’s Well Peaberry Espresso
Score: 96
----------------------------------------
Roaster: El Gran Cafe
Title: Espresso No. 6 Finca San José Ocaña
Score: 96
----------------------------------------
Roaster: El Gran Cafe
Title: Guatemala Finca Santa Isabel SL28 COE #13
Score: 93
----------------------------------------
Roaster: Hula Daddy Kona Coffee
Title: Kahiko Orange
Score: 98
----------------------------------------
Roaster: Kakalove Cafe
Title: Kenya Washed Yara Estate PB TOP
Score: 97
----------------------------------------
Roaster: Hula Daddy Kona Coffee
Title: Kona Orange
Score: 96
----------------------------------------
Roaster: Utopian Coffee
Title: Colombia Wilton Benitez Sudan Rume
Score: 96
----------------------------------------
Roaster: Kakalove Cafe
Title: Ethiopia Washed Gute 74110 Lot J

In [None]:
#### After scraping, we have the coffee review data. Next, we need to merge the coffee brand data with the coffee review data.

In [3]:
# read the coffee review data
df_coffee_review_data = pd.read_csv('coffee_review_data.csv')

# read the coffee brand data
df_coffee_brand = pd.read_csv('coffee_brand.csv')

# Build a one-row-per-name lookup from the brand table:
# - the review titles were whitespace-stripped when scraped, the listing names
#   were not, so strip them here or the join keys may not match;
# - a name that occurs more than once would multiply rows in the left join, so
#   keep only its first occurrence.
# NOTE(review): when two different reviews share a name, the URLs of the first
# one are attached to both - confirm that this is acceptable.
brand_lookup = (
    df_coffee_brand[['coffee_name', 'coffee_review_url', 'coffee_brand_url']]
    .assign(coffee_name=lambda frame: frame['coffee_name'].str.strip())
    .drop_duplicates(subset='coffee_name', keep='first')
)

# merge two dataframe
df_merged = pd.merge(
    df_coffee_review_data,
    brand_lookup,
    left_on='title',  # the column in df_coffee_review_data
    right_on='coffee_name',  # the column in df_coffee_brand
    how='left',
    validate='many_to_one'  # raise if the lookup keys are not unique
)

# A many-to-one left join must not change the number of review rows.
assert len(df_merged) == len(df_coffee_review_data)

# print the merged dataframe
print(df_merged.head())



                                       title                          roaster  \
0             Kenya AA Aries Return Espresso  Simon Hsieh Aroma Roast Coffees   
1         Kenya All’s Well Peaberry Espresso  Simon Hsieh Aroma Roast Coffees   
2        Espresso No. 6 Finca San José Ocaña                     El Gran Cafe   
3  Guatemala Finca Santa Isabel SL28 COE #13                     El Gran Cafe   
4                              Kahiko Orange           Hula Daddy Kona Coffee   

   score              Roaster Location  \
0     96               Taoyuan, Taiwan   
1     96               Taoyuan, Taiwan   
2     96  Antigua Guatemala, Guatemala   
3     93  Antigua Guatemala, Guatemala   
4     98             Holualoa, Hawai’i   

                                       Coffee Origin   Roast Level Agtron  \
0          Nyeri growing region, south-central Kenya        Medium  44/60   
1          Nyeri growing region, south-central Kenya        Medium  46/72   
2  San Juan Sacatepéquez, Ant

In [None]:
#### In the new dataframe the two tables are already merged. Next, we have to find the geographic location of each `Coffee Origin`.
#### That is, we check which country each `Coffee Origin` belongs to; when the origin text does not name a country directly, we need to work out the country from the place name.


In [4]:
# Take a look at the distinct 'Coffee Origin' values and how often each occurs.
# value_counts() already lists every unique value, and a cell only displays its
# last expression, so a single call is enough.
df_merged['Coffee Origin'].value_counts()


Coffee Origin
Piendamo, Cauca Department, Colombia                                  3
Sidamo growing region, southern Ethiopia                              3
Nyeri growing region, south-central Kenya                             2
Holualoa, North Kona growing district, “Big Island” of Hawai’i        2
Alishan, Taiwan                                                       2
Yirgacheffe growing region, south-central Ethiopia                    2
Aceh, Sumatra, Indonesia                                              1
Guatemala; Colombia; Brazil; Myanmar                                  1
Taiwan                                                                1
Mexico, Papua New Guinea                                              1
Brazil; Guatemala; Ethiopia                                           1
Ethiopia                                                              1
Guatemala; Ethiopia                                                   1
Cauca Department, Colombia                        

In [None]:
# From the result,
# we can see that some coffees come from several locations.
# Also, the origin strings do not follow one consistent format,
# so we use keyword-based (fuzzy) matching and regular expressions to match each origin to a region.

# For example: 
# 1) Brazil; Guatemala; Ethiopia  (country ; country ; country)
# 2) Mexico, Papua New Guinea   (country , country)
# 3) Kiambu County, south-central Kenya        (county name, country)



# Step 1: Identify single-origin coffees
# Add a new column [is_blend]in df_merged to indicate whether it's single-origin or blend (0:single-origin, 1:blend)

# Step 2: Region Classification Logic:
# Create binary indicators (0 or 1) for these columns:
# - region_africa_arabia
# - region_caribbean
# - region_central_america
# - region_hawaii
# - region_asia_pacific

#Step 3: To classify the coffee origin
# For single-origin coffees (separated by ','): Direct classification
# For blended coffees (separated by ';'): 

# - First identify all origin regions (2 or 3)
# - Check each region against the 5 geographical areas
# - Mark '1' for matching regions

# Step 4: Display results


In [5]:
# import regular expressions package
import re

def create_region_patterns():
    """Build the lookup table used to assign coffee origins to regions.

    Args:
        None

    Returns:
        dict: Maps each region name ('africa_arabia', 'caribbean',
        'central_america', 'hawaii', 'asia_pacific') to a dict with:
            - 'countries': place names checked by direct substring match
            - 'keywords': lower-case keywords for the region
            - 'patterns': a one-element list holding the keywords joined
              with '|' into a single regular expression
    """

    def build_entry(countries, keywords):
        # Turn the keyword list into one alternation regex.
        return {
            'countries': countries,
            'keywords': keywords,
            'patterns': ['|'.join(keywords)],
        }

    return {
        'africa_arabia': build_entry(
            countries=['Ethiopia', 'Kenya', 'Tanzania', 'Rwanda', 'Uganda', 'Yemen', 'Burundi', 'Congo'],
            keywords=[
                'ethiopia', 'ethiopian', 'yirgacheffe', 'sidamo', 'guji',
                'kenya', 'kenyan', 'nyeri', 'kiambu',
                'tanzania', 'tanzanian',
                'rwanda', 'rwandan',
                'uganda', 'ugandan',
                'yemen', 'yemeni',
                'burundi', 'burundian',
                'congo', 'congolese',
            ],
        ),
        'caribbean': build_entry(
            countries=['Jamaica', 'Haiti', 'Dominican Republic', 'Cuba'],
            keywords=[
                'jamaica', 'jamaican', 'blue mountain',
                'haiti', 'haitian',
                'dominican', 'dominica',
                'cuba', 'cuban',
            ],
        ),
        'central_america': build_entry(
            countries=['Guatemala', 'Costa Rica', 'Honduras', 'El Salvador', 'Nicaragua', 'Panama', 'Mexico'],
            keywords=[
                'guatemala', 'guatemalan', 'antigua', 'huehuetenango', 'atitlan',
                'costa rica', 'costa rican', 'tarrazu',
                'honduras', 'honduran',
                'el salvador', 'salvadoran',
                'nicaragua', 'nicaraguan',
                'panama', 'panamanian',
                'mexico', 'mexican', 'chiapas', 'oaxaca',
            ],
        ),
        'hawaii': build_entry(
            countries=['Hawaii', 'Kona', 'Maui', 'Hawai'],
            keywords=['hawaii', 'hawaiian', 'kona', 'maui', 'big island'],
        ),
        'asia_pacific': build_entry(
            countries=['Indonesia', 'Papua New Guinea', 'Taiwan', 'Vietnam', 'Thailand', 'India', 'Myanmar'],
            keywords=[
                'indonesia', 'indonesian', 'sumatra', 'sulawesi', 'java',
                'papua', 'new guinea',
                'taiwan', 'taiwanese', 'alishan',
                'vietnam', 'vietnamese',
                'thailand', 'thai',
                'india', 'indian', 'malabar',
                'myanmar', 'burmese',
            ],
        ),
    }


def is_in_region(place, region_info):
    """Check whether a place belongs to a specific region.

    Args:
        place: Place text to test; a missing value (NaN/None) never matches.
        region_info (dict): Region entry providing 'countries' (names) and
            'patterns' (regular expressions).

    Returns:
        bool: True when the lower-cased place contains one of the country
        names or matches one of the patterns, otherwise False.
    """
    if pd.isna(place):
        return False

    text = str(place).lower()

    # a) Direct matching: a listed country name appears inside the text.
    for country in region_info['countries']:
        if country.lower() in text:
            return True

    # b) Fuzzy matching: any of the region's keyword patterns is found.
    for pattern in region_info['patterns']:
        if re.search(pattern, text, re.IGNORECASE):
            return True

    return False


def classify_region(origin):
    """Flag the geographical regions that a coffee origin string refers to.

    Args:
        origin (str): Coffee origin text, single origin or blend.
                     Format examples:
                     - Single origin: "Ethiopia, Yirgacheffe"
                     - Blend: "Ethiopia; Colombia"

    Returns:
        dict: 0/1 indicators, in this key order:
            'region_africa_arabia', 'region_caribbean',
            'region_central_america', 'region_hawaii',
            'region_asia_pacific', and 'is_blend' (1 if origin contains ';').
        A missing origin yields all zeros.
    """
    # (result key, key in the region table) pairs, in output order.
    flag_to_region = (
        ('region_africa_arabia', 'africa_arabia'),
        ('region_caribbean', 'caribbean'),
        ('region_central_america', 'central_america'),
        ('region_hawaii', 'hawaii'),
        ('region_asia_pacific', 'asia_pacific'),
    )

    result = {flag: 0 for flag, _ in flag_to_region}
    result['is_blend'] = 0

    if pd.isna(origin):
        return result

    regions = create_region_patterns()
    origin_text = str(origin)

    # A ';' marks a blend and separates its components; otherwise the parts of
    # a single origin are separated by ','.
    if ';' in origin_text:
        result['is_blend'] = 1
        separator = ';'
    else:
        separator = ','
    places = [part.strip() for part in origin_text.split(separator)]

    # A region is flagged as soon as any part of the origin belongs to it.
    for flag, region_name in flag_to_region:
        if any(is_in_region(place, regions[region_name]) for place in places):
            result[flag] = 1

    return result


# Apply classify_region to every origin; each element of `results` is a dict
# of 0/1 flags.
results = df_merged['Coffee Origin'].apply(classify_region)

# Spread the flags into one DataFrame column per key.
for key in ['region_africa_arabia', 'region_caribbean', 'region_central_america',
           'region_hawaii', 'region_asia_pacific', 'is_blend']:
    df_merged[key] = results.apply(lambda x: x[key])

# Print the results ("\n" gives a blank line before the heading; the previous
# "\R" was an invalid escape sequence that printed a literal backslash).
print("\nResult:")
print(df_merged[['Coffee Origin', 'is_blend', 'region_africa_arabia',
                'region_caribbean', 'region_central_america',
                'region_hawaii', 'region_asia_pacific']].head(10))

# Save the results to csv
df_merged.to_csv('coffee_data_with_regions.csv', index=False)


分類結果:
                                       Coffee Origin  is_blend  \
0          Nyeri growing region, south-central Kenya         0   
1          Nyeri growing region, south-central Kenya         0   
2  San Juan Sacatepéquez, Antigua Department, Gua...         0   
3  San Cristobal Verapaz, Alta Verapaz, Antigua D...         0   
4  Holualoa, North Kona growing district, “Big Is...         0   
5          Ruiru, Kiambu County, south-central Kenya         0   
6  Holualoa, North Kona growing district, “Big Is...         0   
7               Piendamo, Cauca Department, Colombia         0   
8           Sidamo growing region, southern Ethiopia         0   
9           Sidamo growing region, southern Ethiopia         0   

   region_africa_arabia  region_caribbean  region_central_america  \
0                     1                 0                       0   
1                     1                 0                       0   
2                     0                 0                  

In [None]:
#!pip install geopy

In [9]:
# Convert the price to USD
def convert_price_to_usd(price_str, twd_to_usd_rate=0.03):
    """Convert a price string to USD format.

    Prices that start with "NT" (New Taiwan dollar, e.g. "NT $950/8 ounces"
    or "NT$1,200/12 ounces") are converted with ``twd_to_usd_rate``. Every
    other price string is returned unchanged.

    Args:
        price_str (str): Price string such as "NT $950/8 ounces" or
            "$25.00/12 ounces".
        twd_to_usd_rate (float): USD value of one New Taiwan dollar.
            Defaults to 0.03.

    Returns:
        str: Price in USD format (e.g., "$28.50/8 ounces").
        None: If the input is NaN, or an NT price has no readable amount.
    """
    if pd.isna(price_str):
        return None

    # Convert to string and remove surrounding whitespace
    price_str = str(price_str).strip()

    # NOTE(review): anything that does not start with "NT" is treated as
    # already being in USD - confirm no other currencies occur in the data.
    if not price_str.upper().startswith('NT'):
        return price_str

    # "amount/unit": everything before the first '/' holds the amount.
    amount_part, separator, unit_part = price_str.partition('/')

    # Keep only digits and the decimal point, so "NT $950", "NT$950" and
    # "NT$1,200" are all read correctly.
    digits = ''.join(ch for ch in amount_part if ch in '0123456789.')
    try:
        amount = float(digits)
    except ValueError:
        return None

    # Two decimals, matching the format of the USD prices in the data.
    usd_price = f"${amount * twd_to_usd_rate:.2f}"
    if separator:
        return f"{usd_price}/{unit_part.strip()}"
    return usd_price

# read the data
data = pd.read_csv('coffee_data_with_regions.csv')

# convert the price to USD
data['Est. Price USD'] = data['Est. Price'].apply(convert_price_to_usd)

# print the results ("\n" gives a blank line before the heading; the previous
# "\C" was an invalid escape sequence that printed a literal backslash)
print("\nConvert Result:")
print(data[['Est. Price', 'Est. Price USD']].head(10))

# save to csv
data.to_csv('coffee_data_with_usd_prices.csv', index=False)


價格轉換結果:
         Est. Price    Est. Price USD
0  NT $950/8 ounces   $28.50/8 ounces
1  NT $900/8 ounces   $27.00/8 ounces
2  $25.00/12 ounces  $25.00/12 ounces
3  $47.00/12 ounces  $47.00/12 ounces
4   $99.95/8 ounces   $99.95/8 ounces
5  NT $300/4 ounces    $9.00/4 ounces
6   $79.95/8 ounces   $79.95/8 ounces
7   $29.00/8 ounces   $29.00/8 ounces
8  NT $480/8 ounces   $14.40/8 ounces
9  NT $480/8 ounces   $14.40/8 ounces


In [11]:
# Keep only the rows that have a 'score' and all four sensory ratings
# (Aroma, Body, Flavor, Aftertaste); rows missing any of them are dropped.
required_columns = ['score', 'Aroma', 'Body', 'Flavor', 'Aftertaste']
data = data.dropna(subset=required_columns)

# save the data to csv
data.to_csv('coffee_data_cleaned.csv', index=False)



In [17]:
# Convert 'Roaster Location' to geolocation (latitude and longitude)
df_cleaned = pd.read_csv('coffee_data_cleaned.csv')

import geopy
from geopy.geocoders import Nominatim
# NOTE(review): Nominatim's usage policy asks for an application-specific user
# agent - consider replacing this generic value.
geolocator = Nominatim(user_agent="geoapiExercises")

# Many reviews share a roaster location, so geocode each distinct location
# once and reuse the answer instead of sending one request per row.
location_coordinates = {}
for place in df_cleaned['Roaster Location'].dropna().unique():
    try:
        # Pause before every request to stay within the service's rate limit.
        time.sleep(random.uniform(1, 2))
        location = geolocator.geocode(place)
    except Exception as e:
        print(f"Error: {e}")
        continue

    # geocode() returns None when the service finds no match.
    if location is None:
        print(f"Error: no geocoding result for {place!r}")
        continue

    location_coordinates[place] = (location.latitude, location.longitude)

# Rows whose location is missing or could not be geocoded get NaN.
df_cleaned['Latitude'] = df_cleaned['Roaster Location'].map(
    {place: coords[0] for place, coords in location_coordinates.items()}
)
df_cleaned['Longitude'] = df_cleaned['Roaster Location'].map(
    {place: coords[1] for place, coords in location_coordinates.items()}
)

print(df_cleaned.head())

# save the data to csv
df_cleaned.to_csv('coffee_data_with_location.csv', index=False)

                                       title                          roaster  \
0             Kenya AA Aries Return Espresso  Simon Hsieh Aroma Roast Coffees   
1         Kenya All’s Well Peaberry Espresso  Simon Hsieh Aroma Roast Coffees   
2        Espresso No. 6 Finca San José Ocaña                     El Gran Cafe   
3  Guatemala Finca Santa Isabel SL28 COE #13                     El Gran Cafe   
4                              Kahiko Orange           Hula Daddy Kona Coffee   

   score              Roaster Location  \
0     96               Taoyuan, Taiwan   
1     96               Taoyuan, Taiwan   
2     96  Antigua Guatemala, Guatemala   
3     93  Antigua Guatemala, Guatemala   
4     98             Holualoa, Hawai’i   

                                       Coffee Origin   Roast Level Agtron  \
0          Nyeri growing region, south-central Kenya        Medium  44/60   
1          Nyeri growing region, south-central Kenya        Medium  46/72   
2  San Juan Sacatepéquez, Ant

In [None]:
# based on the location, to get the weather data
# Not able to get the weather data based on the geolocation data yet

