# Coffee Review Cleaning and EDA

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Clean
- review date to date dtype
- acidity/structure & acidity into one column
- agtron into 2 columns
- split origin and roaster locations into columns for states and country
- price into float
- maybe: roaster into cat, location into cat
- aroma, acidity, body, flavor, aftertaste, agtron's into int
- drop NA

### EDA ideas
- visualize histograms
- visualize correlations
- word clouds
- tokenize descriptions
- plot locations, roasters, roast level, agtron vs other characteristics
- cluster analysis
- standardize ratings?? 



In [47]:
# Load the raw coffee-review export into `df`; every later cell that needs the
# untouched data reads from this frame.
df = pd.read_csv(filepath_or_buffer='data/raw-roast-reviews.csv')
# Quick look at the first rows (only rendered when this is the last expression
# evaluated in the cell).
df.head(n=5)

def tweak_coffee(df):
    """Return a cleaned, typed copy of the raw coffee-review frame.

    Steps, in order:
      1. Replace spaces in column names with underscores, then rename
         ``Coffee_Origin`` -> ``Origin`` and ``Est._Price`` -> ``Price``.
      2. Parse ``Review_Date`` (format ``'%b %Y'``, e.g. ``'Aug 2023'``).
      3. Fill gaps in ``Acidity/Structure`` from ``Acidity`` and store the
         result as ``Acidity``.
      4. Split ``Agtron`` on ``'/'``: first piece -> ``Agtron_External``,
         second piece -> ``Agtron_Ground``.
      5. Remove commas from ``Price`` (the column stays a string).
      6. Strip surrounding whitespace from every string cell and turn the
         literal ``'NA'`` into NaN.
      7. Drop ``Acidity/Structure`` and ``Agtron``, then drop rows missing
         any of Acidity, Roast_Level, Aroma, Price, Bottom_Line,
         Agtron_External, Agtron_Ground.
      8. Cast score columns to ``int64`` and Roaster / Roast_Level /
         Roaster_Location to ``category``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw reviews as loaded from the CSV. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame; the original index labels of surviving rows are kept.
    """

    def _clean_cell(value):
        # Non-string cells (numbers, timestamps, NaN) pass through untouched.
        if not isinstance(value, str):
            return value
        value = value.strip()
        return np.nan if value == 'NA' else value

    return (df
            .rename(columns=lambda c: c.replace(' ', '_'))
            .rename(columns={'Coffee_Origin': 'Origin',
                             'Est._Price': 'Price'})
            .assign(
                # Read from the frame flowing through the chain (df_), not the
                # outer `df`, so this step sees the renamed columns.
                Review_Date=lambda df_: pd.to_datetime(df_['Review_Date'], format='%b %Y'),
                Acidity=lambda df_: df_['Acidity/Structure'].fillna(df_['Acidity']),
                # .str.get yields NaN when a piece is absent, so a frame in
                # which no Agtron value contains '/' no longer raises KeyError;
                # such rows are removed by the dropna below.
                Agtron_External=lambda df_: df_['Agtron'].str.split('/').str.get(0),
                Agtron_Ground=lambda df_: df_['Agtron'].str.split('/').str.get(1),
                Price=lambda df_: df_['Price'].str.replace(',', '', regex=False),
            )
            # Element-wise cleanup via Series.map on each column; this avoids
            # DataFrame.applymap, which is deprecated in recent pandas.
            .apply(lambda col: col.map(_clean_cell))
            .drop(['Acidity/Structure', 'Agtron'], axis=1)
            .dropna(subset=['Acidity', 'Roast_Level', 'Aroma', 'Price', 'Bottom_Line',
                            'Agtron_External', 'Agtron_Ground'])
            # NOTE: Rating, Body, Flavor and Aftertaste are not in the dropna
            # subset, so a missing value there makes the int64 cast fail.
            .astype({'Roaster': 'category', 'Rating': 'int64', 'Roast_Level': 'category',
                     'Roaster_Location': 'category',
                     'Agtron_External': 'int64', 'Agtron_Ground': 'int64',
                     'Aftertaste': 'int64', 'Aroma': 'int64', 'Body': 'int64',
                     'Flavor': 'int64', 'Acidity': 'int64', 'Review_Date': 'datetime64[ns]'
                     })
            )

# Build the cleaned frame used by the rest of the notebook.
coffee = tweak_coffee(df)
# Seed the preview so the same five rows appear on every Restart & Run All.
coffee.sample(5, random_state=42)


Unnamed: 0,Rating,Roaster,Coffee_Name,Review_Date,Review_Description,Complete_Review_URL,Roaster_Website_URL,Roaster_Location,Origin,Roast_Level,...,Body,Flavor,Aftertaste,Blind_Assessment,Notes,Bottom_Line,Price,url,Agtron_External,Agtron_Ground
1087,95,Euphora Coffee,Yemen Sheba Auction Lot 1: Wadi Al Mahjr,2021-10-01,"Focused, gently bright, complex. Dried goji be...",https://www.coffeereview.com/review/yemen-sheb...,https://www.euphoracoffeestudio.com/categories...,"Taipei, Taiwan","Al Mahjr, Al Hayma District, Yemen",Medium-Light,...,9,9,9,"Focused, gently bright, complex. Dried goji be...",This exceptional coffee was selected as the No...,"Kenya-like in its pungent sweet-tart balance, ...",NT $500/4 ounces,https://www.coffeereview.com/review/yemen-sheb...,56,78
100,90,Lee Dong Coffee,Ethiopian Yirgacheffe 74110 Solar,2023-07-01,"Crisply sweet, balanced. Baking chocolate, plu...",https://www.coffeereview.com/review/ethiopian-...,https://www.facebook.com/LiDongcoffee/,"Taichung, Taiwan","Yirgacheffe growing region, south-central Ethi...",Light,...,8,9,7,"Crisply sweet, balanced. Baking chocolate, plu...","Produced by smallholding farmers, entirely of ...",A balanced washed-process Ethiopia cup with pl...,NT $240/227 grams,https://www.coffeereview.com/review/ethiopian-...,64,81
1938,94,Kona Farm Direct,Kona Laurina Pointu,2020-02-01,"Intricate, pungent, deeply floral. Peach, pear...",https://www.coffeereview.com/review/kona-lauri...,,"Holualoa, Hawaii","Kona, Hawaii",Medium-Light,...,9,9,8,"Intricate, pungent, deeply floral. Peach, pear...","Bourbon Pointu, botanical name var. Laurina, i...",An unusual and compelling coffee: brisk but ve...,$50.00/ 7 ounces,https://www.coffeereview.com/review/kona-lauri...,56,70
1600,93,Bird Rock Coffee Roasters,Tigesit Waqa,2020-11-01,"Richly fruit-forward, chocolate-toned. Dried b...",https://www.coffeereview.com/review/tigesit-waqa/,http://bit.ly/2CcjgEd,"San Diego, California","Yirgacheffe growing region, south-Central Ethi...",Medium-Light,...,9,9,8,"Richly fruit-forward, chocolate-toned. Dried b...",Southern Ethiopia coffees like this one are la...,"Concentrated, berry-driven natural Ethiopia wi...",$20.00/12 ounces,https://www.coffeereview.com/review/tigesit-waqa/,56,74
2213,93,Origin Coffee Roasters,Maui Kupa’a Orange Bourbon,2019-05-01,"Bright, crisply sweet-tart. Peach, honeysuckle...",https://www.coffeereview.com/review/maui-kupaa...,,"Kihei, Maui, Hawaii","Kula, Island of Maui, Hawai'i",Medium-Light,...,8,9,8,"Bright, crisply sweet-tart. Peach, honeysuckle...",This exceptional coffee was selected as the No...,A window into the great potential of higher-gr...,$40.00/8 ounces,https://www.coffeereview.com/review/maui-kupaa...,58,82


In [48]:
# Inspect column dtypes and non-null counts of the cleaned frame.
coffee.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2095 entries, 0 to 2453
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Rating               2095 non-null   int64         
 1   Roaster              2095 non-null   category      
 2   Coffee_Name          2095 non-null   object        
 3   Review_Date          2095 non-null   datetime64[ns]
 4   Review_Description   2095 non-null   object        
 5   Complete_Review_URL  2095 non-null   object        
 6   Roaster_Website_URL  1265 non-null   object        
 7   Roaster_Location     2095 non-null   category      
 8   Origin               2095 non-null   object        
 9   Roast_Level          2095 non-null   category      
 10  Aroma                2095 non-null   int64         
 11  Acidity              2095 non-null   int64         
 12  Body                 2095 non-null   int64         
 13  Flavor               2095 non-null   i

In [44]:
# List the distinct Body scores present in the cleaned frame.
coffee['Body'].unique()

array([ 9.,  8., 10.,  7.])

In [58]:
# Price strings look like "<currency><number>/<amount> <unit>", e.g.
# "$22.00/8 ounces", "NT $500/4 ounces", "¥ 1,580/100 grams".

# pattern_1: currency marker, then the numeric price. The marker may be several
# letter/symbol runs separated by spaces ("NT $"); without that, "NT $500" was
# captured as plain "$" and became indistinguishable from US dollars.
pattern_1 = r'([A-Za-z¥£$]+(?:\s+[A-Za-z¥£$]+)*)\s*(\d[\d,]*(?:\.\d+)?|\.\d+)'
# pattern_2: package amount, then the unit word. Decimal amounts ("8.8 ounces")
# are kept whole instead of being cut at the decimal point.
pattern_2 = r'(\d+(?:[.,]\d+)*)\s*(\w+)'

price_unit = (coffee['Price']
              # Left of the first '/' is the money part, right is the quantity.
              .str.split('/', n=1, expand=True)
              # Run each regex once and label its two capture groups.
              .pipe(lambda parts: pd.concat(
                  [parts[0].str.extract(pattern_1).set_axis(['currency', 'price'], axis=1),
                   parts[1].str.extract(pattern_2).set_axis(['amount', 'unit'], axis=1)],
                  axis=1))
              .assign(
                  # Collapse inner spaces so "NT $" and "NT$" are one label.
                  currency=lambda df_: df_['currency'].str.replace(r'\s+', '', regex=True),
                  # Drop thousands separators before converting to float.
                  price=lambda df_: (df_['price']
                                     .str.replace(',', '', regex=False)
                                     .astype('float')),
              )
             )

# Currency markers that appear in the Price strings:
#   NT$   - New Taiwan dollar
#   £     - British pound
#   ¥     - originally noted as Chinese yuan; the ¥ rows displayed in this
#           notebook are all from roasters located in Japan, so this looks
#           like Japanese yen -- TODO confirm
#   pesos - Mexican peso
#   CAD   - Canadian dollar

# Counts before normalization (only rendered if evaluated last in the cell).
price_unit['currency'].value_counts()

# Fold the spelling variants of the Taiwan dollar into a single label.
mapping_dict = dict.fromkeys(['$NT$', '$NT'], 'NT$')
price_unit = price_unit.assign(currency=price_unit['currency'].replace(mapping_dict))

# Counts after normalization.
price_unit['currency'].value_counts()

# Show the reviews whose price is quoted with the ¥ sign.
coffee.loc[coffee['Price'].str.contains('¥')]

Unnamed: 0,Rating,Roaster,Coffee_Name,Review_Date,Review_Description,Complete_Review_URL,Roaster_Website_URL,Roaster_Location,Origin,Roast_Level,...,Body,Flavor,Aftertaste,Blind_Assessment,Notes,Bottom_Line,Price,url,Agtron_External,Agtron_Ground
1036,96,SOT Coffee Roaster,SÖT Coffee Roaster Colombia Wush Wush Dynamic ...,2021-11-01,"Savory-sweet, intensely aromatic and flavor-sa...",https://www.coffeereview.com/review/sot-coffee...,https://www.sotcoffee.com/,"Osaka, Japan","San Augustín, Huila Department, Colombia",Light,...,9,10,8,"Savory-sweet, intensely aromatic and flavor-sa...",This exceptional coffee was selected as the No...,"An exciting, intensely distinctive coffee for ...",¥1680/100 grams,https://www.coffeereview.com/review/sot-coffee...,76,92
1228,94,SOT Coffee Roaster,Brazil Gauriroba Natural,2021-07-01,"Citrusy-bright, juicy-sweet, spice-toned. Pink...",https://www.coffeereview.com/review/brazil-gau...,https://www.sotcoffee.com/,"Osaka, Japan","Minas Gerais State, southeastern Brazil",Light,...,9,9,8,"Citrusy-bright, juicy-sweet, spice-toned. Pink...",Produced by Homero Aguiar Paiva at Fazenda Gau...,"An intriguingly complex Brazil natural cup, re...",¥2980/100 grams,https://www.coffeereview.com/review/brazil-gau...,82,104
1277,93,Mamechamame Coffee,Rwanda Simbi,2021-06-01,"Balanced, elegant, gently stated. Cocoa nib, p...",https://www.coffeereview.com/review/rwanda-simbi/,,"Tochigi, Japan","Huye District, Rwanda",Light,...,9,9,8,"Balanced, elegant, gently stated. Cocoa nib, p...",This coffee tied for the second-highest rating...,"A confidently structured, nuanced Rwanda cup, ...","¥1,050/100 grams",https://www.coffeereview.com/review/rwanda-simbi/,74,94
1333,92,Sunny's Coffee,Ethiopia Guji Uraga Yabitu,2021-05-01,"Citrusy and sweetly herbaceous, cocoa-toned. C...",https://www.coffeereview.com/review/ethiopia-g...,,"Tochigi, Japan","Guji Zone, Oromia Region, southern Ethiopia",Light,...,8,9,8,"Citrusy and sweetly herbaceous, cocoa-toned. C...",Guji is nestled next to Ethiopia's better-know...,"A quietly complex washed Guji coffee: crisp, s...",¥1280/100 grams,https://www.coffeereview.com/review/ethiopia-g...,63,81
1334,91,Sunny's Coffee,Ethiopia Guji Hambela Dabaye,2021-05-01,"Gently fruit-toned, sweetly tart. Plum, cocoa ...",https://www.coffeereview.com/review/ethiopia-g...,,"Tochigi, Japan","Guji Zone, Oromia Region, southern Ethiopia",Light,...,8,9,8,"Gently fruit-toned, sweetly tart. Plum, cocoa ...",Guji is nestled next to Ethiopia's better-know...,"A friendly, accessible washed Ethiopia cup ani...",¥1280/100 grams,https://www.coffeereview.com/review/ethiopia-g...,66,84
1335,90,Sunny's Coffee,Ethiopia Dry-Process Guji Hambela,2021-05-01,"Balanced, delicate, floral-toned. Dogwood flow...",https://www.coffeereview.com/review/ethiopia-d...,,"Tochigi, Japan","Guji Zone, Oromia Region, southern Ethiopia",Light,...,8,8,8,"Balanced, delicate, floral-toned. Dogwood flow...",Guji is nestled next to Ethiopia's better-know...,A natural-processed Guji characterized by plea...,¥1280/100 grams,https://www.coffeereview.com/review/ethiopia-d...,62,82
1336,90,Sunny's Coffee,Ethiopia Dry-Process Benti Nenka,2021-05-01,"Delicate, crisply sweet. Prairie flowers, fres...",https://www.coffeereview.com/review/ethiopia-d...,,"Tochigi, Japan","Yirgacheffe growing region, southern Ethiopia",Light,...,8,9,7,"Delicate, crisply sweet. Prairie flowers, fres...",The Yirgacheffe region produces distinctive co...,A quietly pleasing natural-processed Yirgachef...,¥1280/100 grams,https://www.coffeereview.com/review/ethiopia-d...,64,82
1432,96,SOT Coffee Roaster,Colombia Cauca El Paraiso,2021-03-01,"Lyrically sweet, juicy, elegantly bright. Stra...",https://www.coffeereview.com/review/colombia-c...,https://www.sotcoffee.com/,"Osaka, Japan","Cauca, Colombia",Light,...,9,10,9,"Lyrically sweet, juicy, elegantly bright. Stra...","SOT Coffee Roaster, based in Osaka, Japan, is ...",An exquisitely refined ultra-light-roasted cof...,"¥ 1,580/100 grams",https://www.coffeereview.com/review/colombia-c...,85,105
1433,94,SOT Coffee Roaster,Taiwan Songyue Geisha,2021-03-01,"Delicate, sweetly tart, richly aromatic. Lemon...",https://www.coffeereview.com/review/taiwan-son...,https://www.sotcoffee.com/,"Osaka, Japan","Yunlin, Taiwan",Light,...,9,9,8,"Delicate, sweetly tart, richly aromatic. Lemon...",Produced at Songyue Coffee Manor in the highla...,"A fine Taiwan-grown Geisha, processed by the n...","¥ 2,640/50 grams",https://www.coffeereview.com/review/taiwan-son...,79,92
1527,93,Sunny's Coffee,Guatemala Acatenango,2021-01-01,"Crisply sweet, gently resonant. Freesia, apric...",https://www.coffeereview.com/review/guatemala-...,,"Tochigi, Japan","Acatenango, Guatemala",Medium-Light,...,9,9,8,"Crisply sweet, gently resonant. Freesia, apric...",Comprised entirely of the Geisha variety of Ar...,"A satisfying, richly sweet Guatemala Geisha cu...","¥ 2,420/200 grams",https://www.coffeereview.com/review/guatemala-...,58,76


0       9.0
1       9.0
2       9.0
3       9.0
4       9.0
       ... 
2455    7.0
2456    8.0
2457    3.0
2458    8.0
2459    2.0
Name: Aroma, Length: 2460, dtype: float64

In [24]:
# Peek at the first five rows of the raw (uncleaned) frame.
df.head(n=5)

Unnamed: 0,Rating,Roaster,Coffee_Name,Review_Date,Review_Description,Complete_Review_URL,Roaster_Website_URL,Roaster Location,Coffee Origin,Roast Level,...,Acidity,Body,Flavor,Aftertaste,Agtron,Blind Assessment,Notes,Bottom Line,Est. Price,url
0,94,JBC Coffee Roasters,Wilton Benitez Sidra,Aug 2023,"Delicately fruity, richly floral-toned. Froot ...",https://www.coffeereview.com/review/wilton-ben...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Piendamó, Cauca Department, Colombia",Medium-Light,...,,9.0,9.0,8.0,58/74,"Delicately fruity, richly floral-toned. Froot ...","Produced by Wilton Benitez, entirely of the Si...","A complex, deep-toned, very fruit-forward anae...",$22.00/8 ounces,https://www.coffeereview.com/review/wilton-ben...
1,94,JBC Coffee Roasters,Wilton Benitez Sidra,Aug 2023,"Delicately fruity, richly floral-toned. Froot ...",https://www.coffeereview.com/review/wilton-ben...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Piendamó, Cauca Department, Colombia",Medium-Light,...,,9.0,9.0,8.0,58/74,"Delicately fruity, richly floral-toned. Froot ...","Produced by Wilton Benitez, entirely of the Si...","A complex, deep-toned, very fruit-forward anae...",$22.00/8 ounces,https://www.coffeereview.com/review/wilton-ben...
2,94,JBC Coffee Roasters,Wilton Benitez Sidra,Aug 2023,"Delicately fruity, richly floral-toned. Froot ...",https://www.coffeereview.com/review/wilton-ben...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Piendamó, Cauca Department, Colombia",Medium-Light,...,,9.0,9.0,8.0,58/74,"Delicately fruity, richly floral-toned. Froot ...","Produced by Wilton Benitez, entirely of the Si...","A complex, deep-toned, very fruit-forward anae...",$22.00/8 ounces,https://www.coffeereview.com/review/wilton-ben...
3,94,JBC Coffee Roasters,Wilton Benitez Sidra,Aug 2023,"Delicately fruity, richly floral-toned. Froot ...",https://www.coffeereview.com/review/wilton-ben...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Piendamó, Cauca Department, Colombia",Medium-Light,...,,9.0,9.0,8.0,58/74,"Delicately fruity, richly floral-toned. Froot ...","Produced by Wilton Benitez, entirely of the Si...","A complex, deep-toned, very fruit-forward anae...",$22.00/8 ounces,https://www.coffeereview.com/review/wilton-ben...
4,93,JBC Coffee Roasters,Twisted V.8 Espresso,Aug 2023,"Evaluated as espresso. Chocolaty, fruity, nutt...",https://www.coffeereview.com/review/twisted-v-...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin",South and Central America; Africa,Medium-Light,...,,8.0,9.0,8.0,54/72,"Evaluated as espresso. Chocolaty, fruity, nutt...",A blend of coffees from undisclosed South and ...,A very chocolaty espresso blend with lift from...,$18.00/12 ounces,https://www.coffeereview.com/review/twisted-v-...


In [27]:
# Look up a single raw review by its exact URL.
df.loc[df['url'].eq('https://www.coffeereview.com/review/guatemala-acatenango-gesha-7/')]

Unnamed: 0,Rating,Roaster,Coffee_Name,Review_Date,Review_Description,Complete_Review_URL,Roaster_Website_URL,Roaster Location,Coffee Origin,Roast Level,...,Acidity,Body,Flavor,Aftertaste,Agtron,Blind Assessment,Notes,Bottom Line,Est. Price,url
254,94,Lin Jen-Wei’s Black Jar Coffee,Guatemala Acatenango Gesha,Mar 2023,"Floral, balanced, deep-toned. Star jasmine, Bi...",https://www.coffeereview.com/review/guatemala-...,,"Taichung, Taiwan","Acatenango growing region, Guatemala",Light,...,,9.0,9.0,8.0,67/86,"Floral, balanced, deep-toned. Star jasmine, Bi...",Produced entirely of the Gesha (also Geisha) v...,A lovely Guatemala Gesha cup with bright stone...,"NT $1,200/227 grams",https://www.coffeereview.com/review/guatemala-...
