# Coffee Review Cleaning and EDA

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Clean
- acidity/structure & acidity into one column
- agtron into 2 columns
- split origin and roaster locations into columns for states and country
- fix dtypes
- drop rows with NA

### EDA ideas
- visualize histograms
- visualize correlations
- word clouds
- tokenize descriptions
- plot locations, roasters, roast level, agtron vs other characterisitcs
- cluster analysis
- standardize ratings?? 



In [2]:
# Read in raw coffee review data
df = pd.read_csv('data/raw-roast-reviews.csv')
df.head()

def tweak_coffee(df):
    return (df
            .rename(columns = lambda c: c.replace(' ', '_'))
            .rename(columns = {'Coffee_Origin': 'Origin',
                               'Est._Price': 'Price'})
            .assign(Review_Date = pd.to_datetime(df['Review_Date'], format='%b %Y'),
                    Acidity = lambda df_: df_['Acidity/Structure'].fillna(df_['Acidity']),
                    Agtron_External = lambda df_: df_['Agtron'].str.split('/', expand=True)[0],
                    Agtron_Ground = lambda df_: df_['Agtron'].str.split('/', expand=True)[1],
                    Amount = lambda df_: df_['Price'].str.split('/', expand=True)[1],
                    Price = lambda df_: df_['Price'].str.replace(',', '').str.split('/', expand=True)[0],
                     )
            .applymap(lambda x: x.strip() if isinstance(x, str) else x)
            .applymap(lambda x: np.nan if x == 'NA' else x)
            .drop(['Acidity/Structure', 'Agtron'], axis=1)
            .dropna(subset=['Acidity', 'Roast_Level', 'Aroma', 'Price', 'Bottom_Line', 
                            'Agtron_External', 'Agtron_Ground'])
            .astype({'Roaster': 'category', 'Rating': 'int64', 'Roast_Level':'category',
                     'Roaster_Location':'category',
                     'Agtron_External':'int64', 'Agtron_Ground':'int64',
                     'Aftertaste': 'int64', 'Aroma': 'int64', 'Body': 'int64',
                     'Flavor': 'int64', 'Acidity': 'int64', 'Review_Date': 'datetime64[ns]'
                     })
            
           )

coffee = tweak_coffee(df)
coffee.sample(5)


Unnamed: 0,Rating,Roaster,Coffee_Name,Review_Date,Review_Description,Complete_Review_URL,Roaster_Website_URL,Roaster_Location,Origin,Roast_Level,...,Flavor,Aftertaste,Blind_Assessment,Notes,Bottom_Line,Price,url,Agtron_External,Agtron_Ground,Amount
915,95,JBC Coffee Roasters,Kabiufa Papua New Guinea,2022-01-01,"Rich, deep-toned, complex. Toffee, pomegranate...",https://www.coffeereview.com/review/kabiufa-pa...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Kabiufa, Eastern Highlands, Papua New Guinea",Light,...,9,9,"Rich, deep-toned, complex. Toffee, pomegranate...",This exceptional coffee was selected as the No...,An exciting PNG cup with a grand aromatic rang...,$18.00,https://www.coffeereview.com/review/kabiufa-pa...,60,82,12 ounces
1492,92,RamsHead Coffee Roasters,Colombia La Virgen Microlot,2021-02-01,"Deep-toned, cocoa-driven. Cocoa powder, golden...",https://www.coffeereview.com/review/colombia-l...,https://bit.ly/2P8dTQf,"Bozeman, Montana","Liboriana Valley, Antioquia Department, Colombia",Medium-Light,...,9,8,"Deep-toned, cocoa-driven. Cocoa powder, golden...",Produced at La Virgen Farms entirely of the Ca...,A straight-ahead washed Colombia cup with swee...,$15.95,https://www.coffeereview.com/review/colombia-l...,53,70,12 ounces
1465,94,Prairie Lily Coffee,Panama Pacamara Natural,2021-02-01,"Rich-toned, chocolaty, fruit-centered. Chocola...",https://www.coffeereview.com/review/panama-pac...,,"Lloydminster, Saskatchewan, Canada","Volcan growing region, western Panama",Medium-Light,...,9,8,"Rich-toned, chocolaty, fruit-centered. Chocola...",Produced at the Janson Family Farm entirely of...,"A deep, brooding, balanced Panama Pacamara cup...",CAD $50,https://www.coffeereview.com/review/panama-pac...,52,70,200 grams
1035,94,Oceana Coffee,Ethiopia Durato Bombe Natural,2021-11-01,"Richly fruit-forward, chocolaty. Dried persimm...",https://www.coffeereview.com/review/ethiopia-d...,,"Tequesta, Florida","Bombe mountains, Sidama Region, southern Ethiopia",Light,...,9,8,"Richly fruit-forward, chocolaty. Dried persimm...",Coffees from the Sidama region like this one a...,"A high-toned, deeply sweet, richly tart and ch...",$22.00,https://www.coffeereview.com/review/ethiopia-d...,64,80,12 ounces
504,93,Amavida Coffee Roasters,Burundi Natural Anaerobic Kibingo,2022-10-01,"High-toned, sweetly tart. Black cherry, dark c...",https://www.coffeereview.com/review/burundi-na...,https://bit.ly/2PGHVY6,"Santa Rosa Beach, Florida","Kayanza, Burundi",Medium-Light,...,9,8,"High-toned, sweetly tart. Black cherry, dark c...",Produced by smallholding farmers entirely of t...,A sweet-tart-savory anaerobic natural Burundi ...,$23.00,https://www.coffeereview.com/review/burundi-na...,62,78,12 ounces


In [3]:
coffee.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2095 entries, 0 to 2453
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Rating               2095 non-null   int64         
 1   Roaster              2095 non-null   category      
 2   Coffee_Name          2095 non-null   object        
 3   Review_Date          2095 non-null   datetime64[ns]
 4   Review_Description   2095 non-null   object        
 5   Complete_Review_URL  2095 non-null   object        
 6   Roaster_Website_URL  1265 non-null   object        
 7   Roaster_Location     2095 non-null   category      
 8   Origin               2095 non-null   object        
 9   Roast_Level          2095 non-null   category      
 10  Aroma                2095 non-null   int64         
 11  Acidity              2095 non-null   int64         
 12  Body                 2095 non-null   int64         
 13  Flavor               2095 non-null   i

In [72]:
import re

pat = r'([\d.]+)\s*([a-zA-Z]+)'
(coffee
     .assign(quantity = coffee['Amount'].str.extract(pat)[0],
             unit = coffee['Amount'].str.extract(pat)[1]
            )
     .dropna(axis=0)
     .drop('Amount', axis=1)
)


Unnamed: 0,Rating,Roaster,Coffee_Name,Review_Date,Review_Description,Complete_Review_URL,Roaster_Website_URL,Roaster_Location,Origin,Roast_Level,...,Aftertaste,Blind_Assessment,Notes,Bottom_Line,Price,url,Agtron_External,Agtron_Ground,quantity,unit
0,94,JBC Coffee Roasters,Wilton Benitez Sidra,2023-08-01,"Delicately fruity, richly floral-toned. Froot ...",https://www.coffeereview.com/review/wilton-ben...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Piendamó, Cauca Department, Colombia",Medium-Light,...,8,"Delicately fruity, richly floral-toned. Froot ...","Produced by Wilton Benitez, entirely of the Si...","A complex, deep-toned, very fruit-forward anae...",$22.00,https://www.coffeereview.com/review/wilton-ben...,58,74,8,ounces
1,94,JBC Coffee Roasters,Wilton Benitez Sidra,2023-08-01,"Delicately fruity, richly floral-toned. Froot ...",https://www.coffeereview.com/review/wilton-ben...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Piendamó, Cauca Department, Colombia",Medium-Light,...,8,"Delicately fruity, richly floral-toned. Froot ...","Produced by Wilton Benitez, entirely of the Si...","A complex, deep-toned, very fruit-forward anae...",$22.00,https://www.coffeereview.com/review/wilton-ben...,58,74,8,ounces
2,94,JBC Coffee Roasters,Wilton Benitez Sidra,2023-08-01,"Delicately fruity, richly floral-toned. Froot ...",https://www.coffeereview.com/review/wilton-ben...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Piendamó, Cauca Department, Colombia",Medium-Light,...,8,"Delicately fruity, richly floral-toned. Froot ...","Produced by Wilton Benitez, entirely of the Si...","A complex, deep-toned, very fruit-forward anae...",$22.00,https://www.coffeereview.com/review/wilton-ben...,58,74,8,ounces
3,94,JBC Coffee Roasters,Wilton Benitez Sidra,2023-08-01,"Delicately fruity, richly floral-toned. Froot ...",https://www.coffeereview.com/review/wilton-ben...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Piendamó, Cauca Department, Colombia",Medium-Light,...,8,"Delicately fruity, richly floral-toned. Froot ...","Produced by Wilton Benitez, entirely of the Si...","A complex, deep-toned, very fruit-forward anae...",$22.00,https://www.coffeereview.com/review/wilton-ben...,58,74,8,ounces
8,92,JBC Coffee Roasters,Piura Peru,2023-08-01,Sweetly chocolaty and nut-toned. Baking chocol...,https://www.coffeereview.com/review/piura-peru/,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","El Faique, Piura Department, Peru",Medium-Light,...,8,Sweetly chocolaty and nut-toned. Baking chocol...,Produced by smallholding farmers who work dire...,"A confident, deep-toned Peru with a chocolate-...",$18.00,https://www.coffeereview.com/review/piura-peru/,56,74,12,ounces
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2442,95,Lexington Coffee Roasters,Tarime Tanzania,2018-09-01,"Deeply rich, sweet-savory, nuanced and layered...",https://www.coffeereview.com/review/tarime-tan...,http://www.lexingtoncoffee.com,"Lexington, Virginia","Tarime District, Mara Region, Tanzania",Medium-Light,...,9,"Deeply rich, sweet-savory, nuanced and layered...",This exceptional coffee was selected as the No...,"A bright, engaging, complex Tanzania coffee. I...",$14.75,https://www.coffeereview.com/review/tarime-tan...,58,82,12,ounces
2443,93,Lexington Coffee Roasters,Guachepelin Costa Rica,2018-09-01,"Delicate, crisply sweet-tart. Lime zest, baker...",https://www.coffeereview.com/review/guachepeli...,http://www.lexingtoncoffee.com,"Lexington, Virginia","Sabanilla de Alajuela, Central Valley, Costa Rica",Light,...,8,"Delicate, crisply sweet-tart. Lime zest, baker...",Certified organic. Produced at Finca Gachepeli...,A crisply elegant expression of red honey-proc...,$19.95,https://www.coffeereview.com/review/guachepeli...,60,84,12,ounces
2444,96,Barrington Coffee Roasting,Berlina Geisha,2018-09-01,"Elegant, intensely sweet-tart, resonantly flor...",https://www.coffeereview.com/review/berlina-ge...,https://barringtoncoffee.com,"Lee, Massachusetts","Horqueta, Chiriquí, Boquete, Panama",Medium-Light,...,9,"Elegant, intensely sweet-tart, resonantly flor...",This exceptional coffee was selected as the No...,"A soaring Gesha cup: balanced, both delicate a...",$13.34,https://www.coffeereview.com/review/berlina-ge...,56,84,4,ounces
2445,95,Barrington Coffee Roasting,Gatugi Triple,2018-09-01,"Bright, crisply sweet-savory. Black currant, h...",https://www.coffeereview.com/review/gatugi-tri...,https://barringtoncoffee.com,"Lee, Massachusetts","Nyeri growing region, south-central Kenya",Light,...,9,"Bright, crisply sweet-savory. Black currant, h...",This coffee is available exclusively as a part...,A confident Kenya with a classic cup profile: ...,$13.34,https://www.coffeereview.com/review/gatugi-tri...,58,86,4,ounces
