# Coffee Review Cleaning and EDA

In [27]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

import re

%matplotlib inline
%config InlineBackend.figure_format = 'svg'
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 100)
plt.style.use("ggplot")
mpl.rcParams['figure.figsize'] = (12, 6)

df = pd.read_json('data/raw-roast-reviews.json').replace('NA', np.nan)
df.sample(5)

Unnamed: 0,Roaster Location:,Coffee Origin:,Roast Level:,Agtron:,Est. Price:,Review Date:,Aroma:,Acidity/Structure:,Body:,Flavor:,Aftertaste:,rating,roaster,name,url,With Milk:,Acidity:
6491,"Topeka, Kansas",Not disclosed.,Medium,51/65,,July 2006,8,,7\t\t\t\t\t\t,8,7.0,89,PT's Coffee Roasting Co.,Kansas Sunrise Blend,https://www.coffeereview.com/review/kansas-sunrise-blend/,,8.0
2268,"Boulder, Colorado",Central America; South America; Indonesia,Medium-Light,52/70,$14.00/12 ounces,June 2019,8,,9\t\t\t\t\t\t,9,8.0,93,Dragonfly Coffee Roasters,Crema de Dolce Espresso,https://www.coffeereview.com/review/crema-de-dolce-espresso/,9.0,
6879,"North Hollywood, California","Chanchamayo growing region, south-central Peru.",Medium-Dark,34/49,,March 2004,8,,7\t\t\t\t\t\t,7,,85,Supreme Bean Coffee Roasters,Organic Peru “La Florida”,https://www.coffeereview.com/review/organic-peru-la-florida-2/,,7.0
1642,"Minneapolis, Minnesota","Suchitan, Guatemala",Light,64/82,$19.95/4 ounces,November 2020,9,8.0,8\t\t\t\t\t\t,9,8.0,92,Paradise Roasters,Coffea Diversa Yellow Dalle Honey,https://www.coffeereview.com/review/coffea-diversa-yellow-dalle-honey/,,
5005,"Madison, Wisconsin",Not disclosed.,Medium,47/59,$12.95/12 ounces,November 2011,8,,9\t\t\t\t\t\t,9,9.0,93,JBC Coffee Roasters,Twisted,https://www.coffeereview.com/review/twisted-2/,8.0,


In [28]:
def split_price_currency(df):
     # Regular expression pattern to extract the price value from the 'price' column
    currency_pattern = r'\$?(\d+\.\d+|\d+)'
    
    # Extract the 'price' column and split it into 'price' and 'currency' columns
    price = df['price']
    df['price'] = price.apply(lambda x: re.search(currency_pattern, str(x)).group(1) if re.search(currency_pattern, str(x)) else None)
    df['currency'] = price.apply(lambda x: re.sub(currency_pattern, '', str(x)) if re.search(currency_pattern, str(x)) else str(x)).str.strip()
    return df
    
def currency_clean(df):
    
    # Mapping of currency symbols to ISO 4217 codes
    curr_map = {'#': 'GBP', '£':'GBP', '¥': 'CNY', '':'USD', 'pesos':'MXN', 'RMB':'CNY'}

    # Standardize currency values by replacing symbols and mapping to ISO codes
    df.loc[(df['currency'] == '$') | (df['currency'].str.lower().str.contains('us', na=False)), 'currency'] = 'USD'
    df.loc[df['currency'].str.lower().str.contains('nt', na=False), 'currency'] = 'TWD'
    df['currency'] = (df['currency']
                      .str.replace('$', '')
                      .replace(curr_map)
                      .str.strip())
    return df
    
def create_quantity_and_units(df):
    """Separate Amount column into a quantity and the units for that quantity. Then 
       converts all quantities to grams."""

    # Regular expression patterns to extract quantity and unit from the 'amount' column
    amount_pat = r'([\d.]+)\s*([a-zA-Z]+)'
    currency_pattern = r'(\d+\.\d+)'

    def convert_to_grams(df):
        # Conversion factors for pounds to grams and ounces to grams
        POUND_TO_GRAM = 453.59
        OUNCE_TO_GRAM = 28.3495

        # Convert quantities to grams based on unit
        df.loc[df['unit'] == 'ounces', 'quantity'] = df['quantity'] * OUNCE_TO_GRAM
        df.loc[df['unit'] == 'pounds', 'quantity'] = df['quantity'] * POUND_TO_GRAM
        df['unit'] = 'grams'
        
        return df
                        
    return (df
            .assign(quantity = df['amount'].str.extract(amount_pat)[0],
                     unit = df['amount'].str.extract(amount_pat)[1],
                    )
            .drop('amount', axis=1)
            .astype({'quantity': 'float'})
            .pipe(convert_to_grams)
            .dropna(subset=['quantity']))

def create_price_usd(df):
    """Create a column converting all prices to USD using recent exchange rates."""
    # Exchange rate mapping for various currencies to USD
    exchange_rate_map = {'TWD' : 0.03, 
                         'CNY': 0.03, 
                         'GBP': 1.27, 
                         'AED':0.27, 
                         'KRW': 0.000752,
                         'HKD':0.13, 
                         'CAD': 0.74,
                         'MXN': 0.06,
                         'IDR': 0.000065,
                         'AUD': 0.65,
                         'USD': 1.0}
    df['price_usd'] = np.round(df['price'] * df['currency'].map(exchange_rate_map), 2)
    return df
    
def tweak_coffee(df):
    return (df
        # Rename columns, remove spaces, and convert to lowercase
        .rename(columns = lambda c: c.replace(' ', '_'))
        .rename(columns = lambda c: c.replace(':', ''))
        .rename(columns = lambda c: c.lower().strip())
        .rename(columns = {'name': 'roast_name', 'est._price': 'price'})
        .drop_duplicates(subset=['roaster', 'roast_name'])
        # Convert 'review_date' to datetime, handle missing 'acidity/structure' values
        # split agtron measurements, split price and amount
        .assign(review_date = lambda df_: pd.to_datetime(df_['review_date'], format='mixed'),
                acidity = lambda df_: df_['acidity/structure'].fillna(df_['acidity']),
                agtron_external = lambda df_: df_['agtron'].str.split('/', expand=True)[0],
                agtron_ground = lambda df_: df_['agtron'].str.split('/', expand=True)[1],
                amount = lambda df_: df_['price'].str.split('/', expand=True)[1],
                price = lambda df_: df_['price'].str.replace(',', '').str.split('/', expand=True)[0],
                )
        .applymap(lambda x: x.strip() if isinstance(x, str) else x)
        .dropna(subset=['coffee_origin'])
        # Perform price and currency-related processing
        .pipe(split_price_currency)
        .pipe(currency_clean)
        # Extract quantity and units, convert quantities to grams
        .pipe(create_quantity_and_units)
        .drop(['acidity/structure', 'agtron', 'with_milk'], axis=1)
        .replace('NA', np.nan)
        .dropna(subset=['acidity', 'roast_level', 'aroma', 'price','agtron_external', 'agtron_ground',
                        'aftertaste'])
        .astype({'roaster': 'category', 'rating': 'int8', 'roast_level':'category',
                 'aftertaste': 'int8', 'aroma': 'int8', 'body': 'int8',
                 'flavor': 'int8', 'currency':'category', 'acidity': 'int8',
                 'agtron_external': 'int16', 'agtron_ground': 'int16', 'price':'float',
                 'unit': 'category'})
        # Agtron values must fall between 0 and 120 
        .query('agtron_external >= 0 and agtron_external < 120')
        .query('agtron_ground >=0 and agtron_ground < 120')
        # Convert prices to USD and calculate $USD/gram
        .pipe(create_price_usd)
        .assign(price_per_g_usd = lambda df_: np.round(df_['price_usd']/df_['quantity'], 2))
        .reset_index(drop=True)
    )
        
roasts = tweak_coffee(df)
roasts.sample(10)


Unnamed: 0,roaster_location,coffee_origin,roast_level,price,review_date,aroma,body,flavor,aftertaste,rating,roaster,roast_name,url,acidity,agtron_external,agtron_ground,currency,quantity,unit,price_usd,price_per_g_usd
2366,"Spokane, Washington","Sidama (also Sidamo) growing region, southern Ethiopia",Light,25.0,2016-12-01,9,9,9,8,93,Roast House,Gigisa Ethiopia,https://www.coffeereview.com/review/gigisa-ethiopia/,8,60,82,USD,340.0,grams,25.0,0.07
3371,"Ramsey, Minnesota","Chiapas State, Mexico",Medium-Light,14.95,2013-05-01,9,8,8,8,90,Paradise Roasters,Fair Trade Organic Mexico Chiapas,https://www.coffeereview.com/review/fair-trade-organic-mexico-chiapas/,7,61,70,USD,340.194,grams,14.95,0.04
3009,"Kansas City, Missouri","Gedeo Zone, Yirgacheffe growing region, southern Ethiopia.",Medium-Light,16.0,2014-09-01,9,8,10,8,93,Parisi Artisan Coffee,Ethiopia Yirgacheffe Gutiti,https://www.coffeereview.com/review/ethiopia-yirgacheffe-gutiti/,8,55,69,USD,340.194,grams,16.0,0.05
1887,"Portland, Oregon","Santa Bárbara, Honduras",Medium-Light,15.0,2019-01-01,9,8,9,8,93,Coava Coffee Roasters,Porfirio Castellanos Honduras,https://www.coffeereview.com/review/porfirio-castellanos-honduras/,9,56,80,USD,250.0,grams,15.0,0.06
3450,"Los Angeles, California","Nueva Segovia, Nicaragua.",Medium-Light,13.95,2013-02-01,9,9,9,8,93,Klatch Coffee,Nicaragua Adeprofoca Maragogype,https://www.coffeereview.com/review/nicaragua-adeprofoca-maragogype/,8,56,73,USD,340.194,grams,13.95,0.04
1738,"Madison, Wisconsin","Murang'a County, central Kenya",Medium-Light,18.7,2019-08-01,8,9,9,8,93,JBC Coffee Roasters,Murang’a Kenya,https://www.coffeereview.com/review/muranga-kenya/,9,54,73,USD,340.194,grams,18.7,0.05
1688,"San Diego, California","Tarrazu, Costa Rica",Light,18.0,2019-11-01,9,9,9,8,93,Manzanita Roasting Company,Costa Rica Anaerobic,https://www.coffeereview.com/review/costa-rica-anaerobic/,8,64,88,USD,340.194,grams,18.0,0.05
597,"Madison, Wisconsin","Nyeri County, Kenya",Medium-Light,23.0,2022-06-01,9,9,9,9,95,JBC Coffee Roasters,Karatina Kenya,https://www.coffeereview.com/review/karatina-kenya-2/,9,58,76,USD,340.194,grams,23.0,0.07
1602,"Thermopolis, Wyoming","Karatu, Tanzania",Medium-Light,12.0,2020-02-01,9,8,9,8,93,Jackrabbit Java,Tanzania AA Karatu Estate,https://www.coffeereview.com/review/tanzania-aa-karatu-estate/,9,56,72,USD,340.194,grams,12.0,0.04
945,"Fitchburg, Wisconsin","Minas Gerais State, Brazil",Medium-Light,13.01,2021-09-01,9,8,9,8,92,True Coffee Roasters,Brazil Peixoto Peaberry,https://www.coffeereview.com/review/brazil-peixoto-peaberry/,8,56,74,USD,358.33768,grams,13.01,0.04


In [94]:
# Cleaning up roaster location data

us_states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 
             'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 
             'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 
             'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 
             'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 
             'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 
             'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 
             'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 
             'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 
             'West Virginia', 'Wisconsin','Wyoming']

canadian_provinces = ["Alberta","British Columbia", "Manitoba","New Brunswick", 
                      "Newfoundland and Labrador", "Nova Scotia", "Ontario",
                      "Prince Edward Island", "Quebec", "Saskatchewan",
                      "Northwest Territories","Nunavut","Yukon"]

us_states = [s.lower() for s in us_states]
canadian_provinces = [p.lower() for p in canadian_provinces]   

# Process roaster location by splitting city, state, and country into their own columns.
# Fix typos and standardize.
locations = (roasts['roaster_location']
             .str.strip()
             .str.lower()
             .str.replace("’", "")
             .str.replace("'", "")
             .str.replace(".","")
             .str.split(',', expand=True)
             .apply(lambda row: row.str.strip())
             .replace({'californiaa': 'california', 'calfornia': 'california',
                       'washingto': 'washington', 'virginia and floyd': 'virginia'})
            )

# Cleaning up typos and other small idiosyncracies
locations.loc[locations[0] == "branford connecticut", 1] = "connecticut"
locations.loc[locations[0] == "branford connecticut", 0] = "branford"
locations.loc[locations[0] == "los angeles", 1] = "california"
locations[2] = locations[2].fillna(locations[1])
locations.loc[locations[1].isin(us_states),2 ] = 'usa'
locations.loc[locations[1].isin(canadian_provinces), 2] = 'canada'
locations[1] = locations[1].where(~(locations[1] == locations[2]), np.nan)
locations.loc[locations[2].isin(['hawaii', 'big island of hawaii', 'kona']), 1] = 'hawaii'
locations.loc[locations[2].isin(['hawaii', 'big island of hawaii', 'kona']), 2] = 'usa'
locations.loc[locations[2] == 'dc', 1] = 'dc'
locations.loc[locations[2] == 'dc', 2] = 'usa'
locations.loc[locations[0] == 'peoria illinois', 1] = 'illinois'
locations.loc[locations[0] == 'peoria illinois', 0] = 'peoria'
locations.loc[locations[2] == 'british colombia', 1] = 'british columbia'
locations.loc[locations[2] == 'british colombia', 2] = 'canada'

locations = locations.rename(columns = {0:'roaster_city', 1:'roaster_state', 2:'roaster_country'})

In [30]:
# Cleaning up coffee origin location. 
origin = (roasts["coffee_origin"]
 .str.split(',').str[-1]
 .str.lower()
 .str.strip()
 .str.replace(r'[-"‘\'.\“\”\’]', '', regex=True)
 .str.replace('southern', '')
 .str.replace('south-central', '')
 .str.replace('western', '')
 .str.replace('southwest', '')
 .str.replace('central', '')
 .str.replace('northern', '')
 .str.replace('eastern', '')
 .str.replace('southern', '')
 .str.replace('north', '')
 .str.replace('west', '')
 .str.replace('east', '')
 .str.replace('far', '')
 .str.replace('papua', '')
 .str.replace('the', '')
 .apply(lambda s: s.replace('south', '') if 'africa' not in s else s)
 .str.strip()
 .rename('origin')
)

origin.loc[origin.str.contains(';')] = 'multiple'
origin.loc[origin.str.contains('hawaii')] = 'hawaii'
origin.loc[origin.str.contains('ethiopia')] = 'ethiopia'
origin.loc[origin.str.contains('congo')] = 'democratic republic of the congo'
origin.loc[origin.str.contains('apaneca')] = 'el salvador'
origin.loc[origin.str.contains('sumatra')] = 'sumatra'
origin.loc[origin.str.contains('colombia')] = 'colombia'
origin.loc[origin.str.contains('latin america')] = 'latin america'
origin.loc[origin.str.contains('ecuador')] = 'ecuador'
origin.loc[origin.str.contains('kenya')] = 'kenya'
origin.loc[origin.str.contains('new guinea')] = 'new guinea'
origin.loc[origin.str.contains('and')] = 'multiple'
origin = origin.replace({'america':'usa', '': np.nan, 'gedeo zone':'ethiopia',
                         'coastal  california':'usa', 'not disclosed':np.nan,
                        'minas gerais state':'brazil', 'guatemalae':'guatemala',})

origin

0                 multiple
1                  ecuador
2                   panama
3               costa rica
4                 honduras
               ...        
4179                hawaii
4180    dominican republic
4181                hawaii
4182                hawaii
4183                   usa
Name: origin, Length: 4184, dtype: object

In [31]:
# Combine cleaned roaster location and coffee origin columns with main dataframe
df = (pd
      .concat([roasts, origin, locations], axis=1)
      .drop(['roaster_location', 'coffee_origin'], axis=1)
      .astype({'origin':'category', 'roaster_city':'category',
               'roaster_state':'category', 'roaster_country':'category'})
     )


df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4184 entries, 0 to 4183
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   roast_level      4184 non-null   category      
 1   price            4184 non-null   float64       
 2   review_date      4184 non-null   datetime64[ns]
 3   aroma            4184 non-null   int8          
 4   body             4184 non-null   int8          
 5   flavor           4184 non-null   int8          
 6   aftertaste       4184 non-null   int8          
 7   rating           4184 non-null   int8          
 8   roaster          4184 non-null   category      
 9   roast_name       4184 non-null   object        
 10  url              4184 non-null   object        
 11  acidity          4184 non-null   int8          
 12  agtron_external  4184 non-null   int16         
 13  agtron_ground    4184 non-null   int16         
 14  currency         4184 non-null   categor

In [32]:
df.to_csv('data/clean-roast-reviews.csv')

## EDA

### Descriptive Statistics:
Calculate **basic statistics** like **mean, median, and standard deviation** for the 'Rating,' 'Aroma,' 'Acidity,' 'Body,' 'Flavor,' 'Aftertaste,' and 'Price' columns to get an overall understanding of the dataset. Find out how often different roasters and locations appear in the dataset. 

### Distributions:
Visualize the **distributions** of 'Rating,' 'Aroma,' 'Acidity,' 'Body,' 'Flavor,' and 'Aftertaste' using to understand the spread of values.

### Correlations:
Investigate the **correlations** between different attributes such as 'Rating,' 'Aroma,' 'Acidity,' 'Body,' 'Flavor,' and 'Aftertaste.' Identify which attributes tend to go together or have an impact on the overall rating.

### Top Roasters and Coffees:
Identify the top-rated roasters and coffee names based on the 'Rating' column. 

### Quantity Analysis:
Investigate the 'Quantity' and 'Unit' columns to understand the different packaging sizes and units in which coffee is sold. Analyze how these factors relate to pricing and consumer preferences.

### Roaster Performance:
Evaluate roasters' performance based on their ratings and the origin of the coffee beans. Are there specific regions or origins associated with higher ratings for particular roasters?

## Deeper Analysis:

### Geospatial Analysis:
Analyze the 'Roaster_Location' and 'Origin' columns to understand where the roasters are located and where the coffee beans are sourced from. You can use geospatial tools to create maps or investigate the relationship between origin and rating.

### Currency Analysis:
Analyze the 'Currency' column to understand the currencies used for pricing. You can convert prices to a common currency (e.g., USD) for comparison.

### Price Analysis:
Analyze the relationship between 'Price' and 'Rating.' Do higher-priced coffees tend to have higher ratings? You can also look for outliers in pricing.Investigate the relationship between pricing ('Price' and 'Currency') and sensory attributes ('Aroma,' 'Acidity,' 'Body,' 'Flavor,' 'Aftertaste'). Are there pricing strategies associated with higher ratings?

### Text Analysis:
Perform natural language processing (NLP) on the 'Review_Description,' 'Blind_Assessment,' and 'Notes' columns to extract insights about the sensory descriptions, flavor profiles, and unique characteristics of the coffees.


In [34]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4184 entries, 0 to 4183
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   roast_level      4184 non-null   category      
 1   price            4184 non-null   float64       
 2   review_date      4184 non-null   datetime64[ns]
 3   aroma            4184 non-null   int8          
 4   body             4184 non-null   int8          
 5   flavor           4184 non-null   int8          
 6   aftertaste       4184 non-null   int8          
 7   rating           4184 non-null   int8          
 8   roaster          4184 non-null   category      
 9   roast_name       4184 non-null   object        
 10  url              4184 non-null   object        
 11  acidity          4184 non-null   int8          
 12  agtron_external  4184 non-null   int16         
 13  agtron_ground    4184 non-null   int16         
 14  currency         4184 non-null   categor

Unnamed: 0,price,review_date,aroma,body,flavor,aftertaste,rating,acidity,agtron_external,agtron_ground,quantity,price_usd,price_per_g_usd
count,4184.0,4184,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4171.0,4171.0
mean,324.741656,2017-09-22 15:14:27.304015104,8.71152,8.459369,8.869264,8.056405,92.456979,8.35588,54.445268,73.193834,289.228528,20.826389,0.144853
min,3.3,2009-02-01 00:00:00,4.0,6.0,1.0,3.0,67.0,3.0,0.0,18.0,1.0,1.8,0.01
25%,15.5,2014-03-01 00:00:00,8.0,8.0,9.0,8.0,92.0,8.0,52.0,69.0,226.796,14.0,0.04
50%,19.0,2018-02-01 00:00:00,9.0,8.0,9.0,8.0,93.0,8.0,56.0,76.0,340.194,17.5,0.06
75%,30.6375,2021-06-01 00:00:00,9.0,9.0,9.0,8.0,94.0,9.0,60.0,79.0,340.194,21.66,0.09
max,120000.0,2023-09-01 00:00:00,10.0,10.0,10.0,10.0,98.0,10.0,86.0,105.0,2721.54,280.0,7.5
std,3927.17374,,0.574543,0.563681,0.562491,0.611213,2.263444,0.652162,10.196603,9.548344,120.303908,15.311203,0.340774


['taiwan', 'usa', 'guatemala', 'china', 'big island of hawaii', ..., 'netherlands', 'sweden', 'italy', 'germany', 'france']
Length: 33
Categories (32, object): ['australia', 'big island of hawaii', 'canada', 'china', ..., 'uganda', 'united arab emirates', 'united kingdom', 'usa']