# Coffee Review Cleaning and EDA

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Clean
- convert review date to a datetime dtype
- merge acidity/structure and acidity into one column
- split agtron into 2 columns
- split origin and roaster locations into columns for state and country
- convert price to float
- maybe: roaster into category, location into category
- convert aroma, acidity, body, flavor, aftertaste, and agtron values to int
- drop NA

### EDA ideas
- visualize histograms
- visualize correlations
- word clouds
- tokenize descriptions
- plot locations, roasters, roast level, agtron vs other characteristics
- cluster analysis
- standardize ratings?? 



In [2]:
# Read in raw coffee review data
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv('data/raw-roast-reviews.csv')
# NOTE(review): a bare expression is only rendered when it is the last statement
# of a cell; the saved output below this cell shows the cleaned sample instead,
# so this preview appears not to be displayed - confirm the cell boundaries.
df.head()

def _clean_cell(value):
    """Normalise one cell: strip whitespace from strings and map a bare 'NA' to NaN.

    Non-string values (numbers, timestamps, NaN, None) are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    stripped = value.strip()
    return np.nan if stripped == 'NA' else stripped


def _add_agtron_columns(frame):
    """Split 'Agtron' on '/' once and append the first two parts as new columns.

    The parts are cleaned again so that padding around the slash or an 'NA'
    half (e.g. '58 / NA') ends up as a clean string or NaN.
    """
    # reindex guarantees both columns exist even when no value contains a '/'.
    parts = frame['Agtron'].str.split('/', expand=True).reindex(columns=[0, 1])
    return frame.assign(Agtron_External=parts[0].map(_clean_cell),
                        Agtron_Ground=parts[1].map(_clean_cell))


def tweak_coffee(df):
    """Clean the raw coffee-review frame and return a new, typed DataFrame.

    Steps, in order:
      1. Strip whitespace from every string cell and turn 'NA' strings into NaN.
      2. Replace spaces in column names with underscores; rename
         'Coffee_Origin' -> 'Origin' and 'Est._Price' -> 'Price'.
      3. Split 'Agtron' on '/' into 'Agtron_External' and 'Agtron_Ground'.
      4. Parse 'Review_Date' ('%b %Y'), merge 'Acidity/Structure' with 'Acidity'
         (the former wins, the latter fills its gaps) and remove commas from 'Price'.
      5. Drop the source columns 'Acidity/Structure' and 'Agtron'.
      6. Drop rows missing any of Acidity, Roast_Level, Aroma, Price,
         Bottom_Line, Agtron_External or Agtron_Ground.
      7. Cast score columns to int64 and Roaster, Roast_Level and
         Roaster_Location to category.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw review data. It must contain the columns referenced above
        (with their original, space-separated names). It is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame; it keeps the original index labels of surviving rows.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If a date does not match '%b %Y' or a score cannot be cast to int64.
    """
    return (df
            # Normalise cells *before* deriving columns: otherwise a padded 'NA'
            # in 'Acidity/Structure' is not seen as missing by fillna and the row
            # is lost later. A per-column Series.map is used instead of
            # DataFrame.applymap, which is deprecated since pandas 2.1.
            .apply(lambda col: col.map(_clean_cell))
            .rename(columns=lambda c: c.replace(' ', '_'))
            .rename(columns={'Coffee_Origin': 'Origin',
                             'Est._Price': 'Price'})
            .pipe(_add_agtron_columns)
            .assign(Review_Date=lambda df_: pd.to_datetime(df_['Review_Date'], format='%b %Y'),
                    Acidity=lambda df_: df_['Acidity/Structure'].fillna(df_['Acidity']),
                    # Thousands separators only; the price text itself stays a string.
                    Price=lambda df_: df_['Price'].str.replace(',', '', regex=False),
                    )
            .drop(['Acidity/Structure', 'Agtron'], axis=1)
            .dropna(subset=['Acidity', 'Roast_Level', 'Aroma', 'Price', 'Bottom_Line',
                            'Agtron_External', 'Agtron_Ground'])
            .astype({'Roaster': 'category', 'Rating': 'int64', 'Roast_Level': 'category',
                     'Roaster_Location': 'category',
                     'Agtron_External': 'int64', 'Agtron_Ground': 'int64',
                     'Aftertaste': 'int64', 'Aroma': 'int64', 'Body': 'int64',
                     'Flavor': 'int64', 'Acidity': 'int64', 'Review_Date': 'datetime64[ns]'
                     })
            )

# Run the cleaning pipeline on the raw frame; `coffee` is the cleaned frame used by the cells below.
coffee = tweak_coffee(df)
# Spot-check five random rows. No random_state is passed, so the rows shown differ on every run.
coffee.sample(5)


Unnamed: 0,Rating,Roaster,Coffee_Name,Review_Date,Review_Description,Complete_Review_URL,Roaster_Website_URL,Roaster_Location,Origin,Roast_Level,...,Body,Flavor,Aftertaste,Blind_Assessment,Notes,Bottom_Line,Price,url,Agtron_External,Agtron_Ground
27,95,Speckled Ax,Sumatra Lintong Kardon,2023-08-01,"Richly earthy, deeply spice-toned. Graphite, p...",https://www.coffeereview.com/review/sumatra-li...,,"Portland, Maine","Lintong growing region, North Sumatra Province...",Medium-Light,...,9,9,9,"Richly earthy, deeply spice-toned. Graphite, p...",Produced by 200 widows with support and traini...,A classic Sumatra cup that highlights the orig...,$21.00/12 ounces,https://www.coffeereview.com/review/sumatra-li...,56,74
1262,94,JBC Coffee Roasters,Las Flores Colombia,2021-06-01,"Sweet, bright, elegant. Dark chocolate, hazeln...",https://www.coffeereview.com/review/las-flores...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","San Isidro, Acevedo, Huila, Colombia",Medium-Light,...,9,9,8,"Sweet, bright, elegant. Dark chocolate, hazeln...",Produced by Jhoan Manuel Vergara Ayure of Finc...,"A deep, chocolaty Colombia cup with ballast fr...",$20.00/12 ounces,https://www.coffeereview.com/review/las-flores...,58,78
1958,92,JBC Coffee Roasters,San Sebastian Colombia,2020-02-01,"Soft, deep, crisply chocolaty. Baking chocolat...",https://www.coffeereview.com/review/san-sebast...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Huila Department, southwestern Colombia",Medium-Light,...,9,9,8,"Soft, deep, crisply chocolaty. Baking chocolat...",Produced by small-holding farmers with the sup...,A big-bodied Colombia driven by crisp chocolat...,$17.75/12 ounces,https://www.coffeereview.com/review/san-sebast...,59,76
1175,93,Peach Coffee Roasters,Sumatra Lintong,2021-08-01,"Sweetly savory, deep-toned. Pipe tobacco, sand...",https://www.coffeereview.com/review/sumatra-li...,,"Johns Creek, Georgia","Lintong growing region, North Sumatra Province...",Medium-Light,...,9,9,8,"Sweetly savory, deep-toned. Pipe tobacco, sand...",Coffees from the northern part of the Indonesi...,"A classic Sumatra profile: rich, deep, sweetly...",$18.99/12 ounces,https://www.coffeereview.com/review/sumatra-li...,54,72
111,92,Mostra Coffee,Philippine Rosemarie Rupecio,2023-07-01,"Sweetly nut-toned, richly bittersweet. Almond ...",https://www.coffeereview.com/review/philippine...,https://mostracoffee.com/collections/coffee,"San Diego, California","Sitio Pluto, Balutakay, Managa, Davao del Sur,...",Medium,...,8,9,8,"Sweetly nut-toned, richly bittersweet. Almond ...",Produced by Rosemarie Rupecio of the BACOFA Co...,"A honey-processed Philippine coffee, deeply sw...",$34.99/12 ounces,https://www.coffeereview.com/review/philippine...,53,65


In [48]:
# Summarise the cleaned frame: index range, per-column non-null counts and dtypes.
coffee.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2095 entries, 0 to 2453
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Rating               2095 non-null   int64         
 1   Roaster              2095 non-null   category      
 2   Coffee_Name          2095 non-null   object        
 3   Review_Date          2095 non-null   datetime64[ns]
 4   Review_Description   2095 non-null   object        
 5   Complete_Review_URL  2095 non-null   object        
 6   Roaster_Website_URL  1265 non-null   object        
 7   Roaster_Location     2095 non-null   category      
 8   Origin               2095 non-null   object        
 9   Roast_Level          2095 non-null   category      
 10  Aroma                2095 non-null   int64         
 11  Acidity              2095 non-null   int64         
 12  Body                 2095 non-null   int64         
 13  Flavor               2095 non-null   i

In [10]:
# Price strings have the form "<currency><price>/<amount> <unit>", e.g. "$22.00/8 ounces".
# pattern_1: currency token (letters and/or currency symbols), then the numeric price.
pattern_1 = r'([A-Za-z¥£$]+)\s*([\d,.]+)'
# pattern_2: package amount (digits, optional comma group), then the unit word.
# NOTE(review): a decimal amount such as "3.5 ounces" would match as amount "5";
# check whether the data contains such values before relying on `amount`.
pattern_2 = r'(\d+(?:,\d+)?)\s*(\w+)'

# Currency tokens seen in the data:
# NT$ - taiwan
# £ - british pound 
# ¥ -  Chinese Yuan
# pesos - mexican
# CAD - canadian
# Variant spellings that are folded into a single label.
mapping_dict = {'$NT$': 'NT$', '$NT':'NT$'}


def _split_price(price):
    """Parse price strings into currency / price / amount / unit columns.

    Parameters
    ----------
    price : pandas.Series
        Strings such as "$22.00/8 ounces" or "NT $1200/227 grams".

    Returns
    -------
    pandas.DataFrame
        Columns 'currency' (str), 'price' (float), 'amount' (int64) and
        'unit' (str). Rows where any of the four parts could not be extracted
        are dropped; the remaining rows keep their original index labels.
    """
    halves = price.str.split('/', n=1, expand=True)
    money = (halves[0]
             # "NT $1200" has a space inside the currency token; without this step
             # pattern_1 only captures "$" and the price is misfiled as dollars.
             .str.replace(r'NT\s+\$', 'NT$', regex=True)
             .str.extract(pattern_1))
    package = halves[1].str.extract(pattern_2)
    return (pd.DataFrame({'currency': money[0].replace(mapping_dict),
                          'price': money[1].str.replace(',', '', regex=False).str.strip(),
                          'amount': package[0],
                          'unit': package[1]})
            .dropna(axis=0)
            .astype({'amount': 'int64', 'price': 'float'})
            )


price_unit = _split_price(coffee['Price'])

# Not rendered unless it is the last expression of a cell; move it to its own
# cell to inspect the currency distribution.
price_unit['currency'].value_counts()

price_unit

Unnamed: 0,currency,price,amount,unit
0,$,22.00,8,ounces
1,$,22.00,8,ounces
2,$,22.00,8,ounces
3,$,22.00,8,ounces
8,$,18.00,12,ounces
...,...,...,...,...
2445,$,13.34,4,ounces
2446,$,13.34,4,ounces
2448,$,18.00,12,ounces
2452,$,14.99,12,ounces


In [24]:
# Preview the first rows of the raw (uncleaned) frame, with its original column names.
df.head()

Unnamed: 0,Rating,Roaster,Coffee_Name,Review_Date,Review_Description,Complete_Review_URL,Roaster_Website_URL,Roaster Location,Coffee Origin,Roast Level,...,Acidity,Body,Flavor,Aftertaste,Agtron,Blind Assessment,Notes,Bottom Line,Est. Price,url
0,94,JBC Coffee Roasters,Wilton Benitez Sidra,Aug 2023,"Delicately fruity, richly floral-toned. Froot ...",https://www.coffeereview.com/review/wilton-ben...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Piendamó, Cauca Department, Colombia",Medium-Light,...,,9.0,9.0,8.0,58/74,"Delicately fruity, richly floral-toned. Froot ...","Produced by Wilton Benitez, entirely of the Si...","A complex, deep-toned, very fruit-forward anae...",$22.00/8 ounces,https://www.coffeereview.com/review/wilton-ben...
1,94,JBC Coffee Roasters,Wilton Benitez Sidra,Aug 2023,"Delicately fruity, richly floral-toned. Froot ...",https://www.coffeereview.com/review/wilton-ben...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Piendamó, Cauca Department, Colombia",Medium-Light,...,,9.0,9.0,8.0,58/74,"Delicately fruity, richly floral-toned. Froot ...","Produced by Wilton Benitez, entirely of the Si...","A complex, deep-toned, very fruit-forward anae...",$22.00/8 ounces,https://www.coffeereview.com/review/wilton-ben...
2,94,JBC Coffee Roasters,Wilton Benitez Sidra,Aug 2023,"Delicately fruity, richly floral-toned. Froot ...",https://www.coffeereview.com/review/wilton-ben...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Piendamó, Cauca Department, Colombia",Medium-Light,...,,9.0,9.0,8.0,58/74,"Delicately fruity, richly floral-toned. Froot ...","Produced by Wilton Benitez, entirely of the Si...","A complex, deep-toned, very fruit-forward anae...",$22.00/8 ounces,https://www.coffeereview.com/review/wilton-ben...
3,94,JBC Coffee Roasters,Wilton Benitez Sidra,Aug 2023,"Delicately fruity, richly floral-toned. Froot ...",https://www.coffeereview.com/review/wilton-ben...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Piendamó, Cauca Department, Colombia",Medium-Light,...,,9.0,9.0,8.0,58/74,"Delicately fruity, richly floral-toned. Froot ...","Produced by Wilton Benitez, entirely of the Si...","A complex, deep-toned, very fruit-forward anae...",$22.00/8 ounces,https://www.coffeereview.com/review/wilton-ben...
4,93,JBC Coffee Roasters,Twisted V.8 Espresso,Aug 2023,"Evaluated as espresso. Chocolaty, fruity, nutt...",https://www.coffeereview.com/review/twisted-v-...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin",South and Central America; Africa,Medium-Light,...,,8.0,9.0,8.0,54/72,"Evaluated as espresso. Chocolaty, fruity, nutt...",A blend of coffees from undisclosed South and ...,A very chocolaty espresso blend with lift from...,$18.00/12 ounces,https://www.coffeereview.com/review/twisted-v-...


In [27]:
# Look up the raw record(s) for one review page by its URL.
df.loc[df['url'].eq('https://www.coffeereview.com/review/guatemala-acatenango-gesha-7/')]

Unnamed: 0,Rating,Roaster,Coffee_Name,Review_Date,Review_Description,Complete_Review_URL,Roaster_Website_URL,Roaster Location,Coffee Origin,Roast Level,...,Acidity,Body,Flavor,Aftertaste,Agtron,Blind Assessment,Notes,Bottom Line,Est. Price,url
254,94,Lin Jen-Wei’s Black Jar Coffee,Guatemala Acatenango Gesha,Mar 2023,"Floral, balanced, deep-toned. Star jasmine, Bi...",https://www.coffeereview.com/review/guatemala-...,,"Taichung, Taiwan","Acatenango growing region, Guatemala",Light,...,,9.0,9.0,8.0,67/86,"Floral, balanced, deep-toned. Star jasmine, Bi...",Produced entirely of the Gesha (also Geisha) v...,A lovely Guatemala Gesha cup with bright stone...,"NT $1,200/227 grams",https://www.coffeereview.com/review/guatemala-...
