# Coffee Review Cleaning and EDA

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

### Clean
- acidity/structure & acidity into one column
- agtron into 2 columns
- split origin and roaster locations into columns for states and country
- fix dtypes
- drop rows with NA

### EDA ideas
- visualize histograms
- visualize correlations
- word clouds
- tokenize descriptions
- plot locations, roasters, roast level, agtron vs other characterisitcs
- cluster analysis
- standardize ratings?? 



In [219]:
# Read in raw coffee review data
df = pd.read_csv('data/raw-roast-reviews.csv')
df.head()

def split_price_currency(df):
    price = df['Price']
    df['Price'] = price.apply(lambda x: re.search(r'\$?(\d+\.\d+|\d+)', str(x)).group(1) if re.search(r'\$?(\d+\.\d+|\d+)', str(x)) else None)
    df['Currency'] = price.apply(lambda x: re.sub(r'(\d+\.\d+|\d+)', '', str(x)) if re.search(r'\$?(\d+\.\d+|\d+)', str(x)) else str(x))
    return df

def tweak_coffee(df):
    return (df
            # Remove spaces from column names
            .rename(columns = lambda c: c.replace(' ', '_'))
            .rename(columns = {'Coffee_Origin': 'Origin',
                               'Est._Price': 'Price'})
            # Clean up columns. Create datetime column from Review_Date. Split Agtron into two columns.
            # Split Price into a column for cost and a column for amount.
            .assign(Review_Date = pd.to_datetime(df['Review_Date'], format='%b %Y'),
                    Acidity = lambda df_: df_['Acidity/Structure'].fillna(df_['Acidity']),
                    Agtron_External = lambda df_: df_['Agtron'].str.split('/', expand=True)[0],
                    Agtron_Ground = lambda df_: df_['Agtron'].str.split('/', expand=True)[1],
                    Amount = lambda df_: df_['Price'].str.split('/', expand=True)[1],
                    Price = lambda df_: df_['Price'].str.replace(',', '').str.split('/', expand=True)[0],
                     )
            # Remove whitespace and tranform "NA" to NaN
            .applymap(lambda x: x.strip() if isinstance(x, str) else x)
            .applymap(lambda x: np.nan if x == 'NA' else x)
            .pipe(split_price_currency)
            # Drop old columns and rows with missing values
            .drop(['Acidity/Structure', 'Agtron'], axis=1)
            .dropna(subset=['Acidity', 'Roast_Level', 'Aroma', 'Price', 'Bottom_Line', 
                            'Agtron_External', 'Agtron_Ground'])
            # Transform data types
            .astype({'Roaster': 'category', 'Rating': 'int64', 'Roast_Level':'category',
                     'Roaster_Location':'category',
                     'Agtron_External':'int64', 'Agtron_Ground':'int64',
                     'Aftertaste': 'int64', 'Aroma': 'int64', 'Body': 'int64',
                     'Flavor': 'int64', 'Acidity': 'int64'
                     })
            
           )

coffee = tweak_coffee(df)
coffee.sample(5)


Unnamed: 0,Rating,Roaster,Coffee_Name,Review_Date,Review_Description,Complete_Review_URL,Roaster_Website_URL,Roaster_Location,Origin,Roast_Level,...,Aftertaste,Blind_Assessment,Notes,Bottom_Line,Price,url,Agtron_External,Agtron_Ground,Amount,Currency
2248,95,Kakalove Cafe,Ethiopia Natural Guji Hambela G1 Bishan Fugu,2019-04-01,"Brightly sweet, deeply pungent. Fine musk, bla...",https://www.coffeereview.com/review/ethiopia-n...,http://www.kakalovecafe.com.tw/,"Chia-Yi, Taiwan","Guji Zone, Oromia Region, southern Ethiopia",Medium-Light,...,9,"Brightly sweet, deeply pungent. Fine musk, bla...",Southern Ethiopia coffees like this one are la...,"A rich-toned, resonant, cleanly fruity natural...",650.0,https://www.coffeereview.com/review/ethiopia-n...,51,73,16 ounces,NT $
433,94,Genesis Coffee Lab,Double-Anaerobic Fermentation Tamiru Tadesse,2022-11-01,"Richly floral-toned, crisply sweet. Honeysuckl...",https://www.coffeereview.com/review/double-ana...,,"Big Lake, Alaska","Sidamo growing region, south-central Ethiopia",Light,...,8,"Richly floral-toned, crisply sweet. Honeysuckl...",Produced by Tamiru Tadesse of Alo Coffee. Sout...,A floral-driven anaerobic Ethiopia cup free of...,40.0,https://www.coffeereview.com/review/double-ana...,62,80,10 ounces,$
164,93,Revel Coffee,Kenya Kagumo AB,2023-06-01,"Richly sweet-tart. Black currant, hazelnut, co...",https://www.coffeereview.com/review/kenya-kagu...,https://revelcoffee.com/,"Billings, Montana","Nyeri growing region, south-central Kenya",Medium-Light,...,8,"Richly sweet-tart. Black currant, hazelnut, co...",Produced by smallholding members of the Kagumo...,A classic Kenya cup with particularly inviting...,24.25,https://www.coffeereview.com/review/kenya-kagu...,58,75,12 ounces,$
683,92,Jaunt Coffee Roasters,Burundi Kabarore Commune,2022-06-01,"Delicate, balanced, sweet-toned. Baking chocol...",https://www.coffeereview.com/review/burundi-ka...,https://jauntcoffee.com/,"San Diego, California","Kayanza Province, Burundi",Medium-Light,...,8,"Delicate, balanced, sweet-toned. Baking chocol...",Produced by smallholding farmers of the Kaboar...,"A friendly, accessible washed Burundi cup; cri...",20.0,https://www.coffeereview.com/review/burundi-ka...,56,72,12 ounces,$
1433,94,SOT Coffee Roaster,Taiwan Songyue Geisha,2021-03-01,"Delicate, sweetly tart, richly aromatic. Lemon...",https://www.coffeereview.com/review/taiwan-son...,https://www.sotcoffee.com/,"Osaka, Japan","Yunlin, Taiwan",Light,...,8,"Delicate, sweetly tart, richly aromatic. Lemon...",Produced at Songyue Coffee Manor in the highla...,"A fine Taiwan-grown Geisha, processed by the n...",2640.0,https://www.coffeereview.com/review/taiwan-son...,79,92,50 grams,¥


In [15]:
coffee.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2095 entries, 0 to 2453
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Rating               2095 non-null   int64         
 1   Roaster              2095 non-null   category      
 2   Coffee_Name          2095 non-null   object        
 3   Review_Date          2095 non-null   datetime64[ns]
 4   Review_Description   2095 non-null   object        
 5   Complete_Review_URL  2095 non-null   object        
 6   Roaster_Website_URL  1265 non-null   object        
 7   Roaster_Location     2095 non-null   category      
 8   Origin               2095 non-null   object        
 9   Roast_Level          2095 non-null   category      
 10  Aroma                2095 non-null   int64         
 11  Acidity              2095 non-null   int64         
 12  Body                 2095 non-null   int64         
 13  Flavor               2095 non-null   i

In [168]:
def convert_to_grams(df):
    # Define conversion factors
    pound_to_gram = 453.59
    ounce_to_gram = 28.3495
    
    df.loc[df['Unit'] == 'ounces', 'Quantity'] = df['Quantity'] * ounce_to_gram
    df.loc[df['Unit'] == 'pounds', 'Quantity'] = df['Quantity'] * pound_to_gram
    
    df['unit'] = 'grams'
    return df
 

amount_pat = r'([\d.]+)\s*([a-zA-Z]+)'
curr_pat = r'(\d+\.\d+)'



(coffee
     .assign(Quantity = coffee['Amount'].str.extract(amount_pat)[0],
             Unit = coffee['Amount'].str.extract(amount_pat)[1],
             Price_ = coffee['Price'].str.extract(curr_pat),
            )
     .dropna(axis=0)
     .drop('Amount', axis=1)
     .astype({'Quantity': 'float'})
     .pipe(convert_to_grams)
)

#curr_map = {'NT $': 'NT$', 'Nt $': 'NT$',}

Unnamed: 0,Rating,Roaster,Coffee_Name,Review_Date,Review_Description,Complete_Review_URL,Roaster_Website_URL,Roaster_Location,Origin,Roast_Level,...,Notes,Bottom_Line,Price,url,Agtron_External,Agtron_Ground,Quantity,Unit,Price_,unit
0,94,JBC Coffee Roasters,Wilton Benitez Sidra,2023-08-01,"Delicately fruity, richly floral-toned. Froot ...",https://www.coffeereview.com/review/wilton-ben...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Piendamó, Cauca Department, Colombia",Medium-Light,...,"Produced by Wilton Benitez, entirely of the Si...","A complex, deep-toned, very fruit-forward anae...",$22.00,https://www.coffeereview.com/review/wilton-ben...,58,74,226.796,ounces,22.00,grams
1,94,JBC Coffee Roasters,Wilton Benitez Sidra,2023-08-01,"Delicately fruity, richly floral-toned. Froot ...",https://www.coffeereview.com/review/wilton-ben...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Piendamó, Cauca Department, Colombia",Medium-Light,...,"Produced by Wilton Benitez, entirely of the Si...","A complex, deep-toned, very fruit-forward anae...",$22.00,https://www.coffeereview.com/review/wilton-ben...,58,74,226.796,ounces,22.00,grams
2,94,JBC Coffee Roasters,Wilton Benitez Sidra,2023-08-01,"Delicately fruity, richly floral-toned. Froot ...",https://www.coffeereview.com/review/wilton-ben...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Piendamó, Cauca Department, Colombia",Medium-Light,...,"Produced by Wilton Benitez, entirely of the Si...","A complex, deep-toned, very fruit-forward anae...",$22.00,https://www.coffeereview.com/review/wilton-ben...,58,74,226.796,ounces,22.00,grams
3,94,JBC Coffee Roasters,Wilton Benitez Sidra,2023-08-01,"Delicately fruity, richly floral-toned. Froot ...",https://www.coffeereview.com/review/wilton-ben...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Piendamó, Cauca Department, Colombia",Medium-Light,...,"Produced by Wilton Benitez, entirely of the Si...","A complex, deep-toned, very fruit-forward anae...",$22.00,https://www.coffeereview.com/review/wilton-ben...,58,74,226.796,ounces,22.00,grams
8,92,JBC Coffee Roasters,Piura Peru,2023-08-01,Sweetly chocolaty and nut-toned. Baking chocol...,https://www.coffeereview.com/review/piura-peru/,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","El Faique, Piura Department, Peru",Medium-Light,...,Produced by smallholding farmers who work dire...,"A confident, deep-toned Peru with a chocolate-...",$18.00,https://www.coffeereview.com/review/piura-peru/,56,74,340.194,ounces,18.00,grams
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2442,95,Lexington Coffee Roasters,Tarime Tanzania,2018-09-01,"Deeply rich, sweet-savory, nuanced and layered...",https://www.coffeereview.com/review/tarime-tan...,http://www.lexingtoncoffee.com,"Lexington, Virginia","Tarime District, Mara Region, Tanzania",Medium-Light,...,This exceptional coffee was selected as the No...,"A bright, engaging, complex Tanzania coffee. I...",$14.75,https://www.coffeereview.com/review/tarime-tan...,58,82,340.194,ounces,14.75,grams
2443,93,Lexington Coffee Roasters,Guachepelin Costa Rica,2018-09-01,"Delicate, crisply sweet-tart. Lime zest, baker...",https://www.coffeereview.com/review/guachepeli...,http://www.lexingtoncoffee.com,"Lexington, Virginia","Sabanilla de Alajuela, Central Valley, Costa Rica",Light,...,Certified organic. Produced at Finca Gachepeli...,A crisply elegant expression of red honey-proc...,$19.95,https://www.coffeereview.com/review/guachepeli...,60,84,340.194,ounces,19.95,grams
2444,96,Barrington Coffee Roasting,Berlina Geisha,2018-09-01,"Elegant, intensely sweet-tart, resonantly flor...",https://www.coffeereview.com/review/berlina-ge...,https://barringtoncoffee.com,"Lee, Massachusetts","Horqueta, Chiriquí, Boquete, Panama",Medium-Light,...,This exceptional coffee was selected as the No...,"A soaring Gesha cup: balanced, both delicate a...",$13.34,https://www.coffeereview.com/review/berlina-ge...,56,84,113.398,ounces,13.34,grams
2445,95,Barrington Coffee Roasting,Gatugi Triple,2018-09-01,"Bright, crisply sweet-savory. Black currant, h...",https://www.coffeereview.com/review/gatugi-tri...,https://barringtoncoffee.com,"Lee, Massachusetts","Nyeri growing region, south-central Kenya",Light,...,This coffee is available exclusively as a part...,A confident Kenya with a classic cup profile: ...,$13.34,https://www.coffeereview.com/review/gatugi-tri...,58,86,113.398,ounces,13.34,grams


In [170]:
price = 

In [173]:
price.unique()

array(['$22.00', '$18.00', '$20.00', '$35.00', '$21.00', 'NT $1200',
       '$19.45', 'NT $420', 'NT $500', '$25.00', '$23.00', '$26.00',
       'Puerto Rico', '$27.00', '$19.50', 'NT $450', '$15.95', 'NT $520',
       'RMB 399', '$60.00', '$19.00', 'NT $330', 'Nt $325', 'NT $310',
       '#23.90', 'NT $325', 'NT $240', '$30.00', 'NT $550', '$48.00',
       '$80.00', '$34.99', 'NT $300', 'NT $375', '$32.00', 'NT $120',
       '$39.95', '$17.50', '$24.00', 'NT $265', '$29.00', 'NT $280',
       'AED $99.75', '$17.00', 'NT $680', '$69.95', 'NT $600', 'NT $800',
       'NT $1300', '$21.25', '$15.75', '$19.95', '$21.50', '$23.50',
       '$24.25', '$42.00', 'NT $275', 'NT $399', 'NT $750', 'NT $130',
       'NT $380', '$29.50', 'NT $700', 'NT $440', '$16.95', '$16.49',
       '$40.00', '$15.99', '$26.50', 'NT $460', 'NT $290', 'NT $400',
       'NT $180', '$18.50', '$20.50', '$15.25', '$31.50', 'NT $650',
       'NT $1250', 'NT $320', '$17.95', 'NT $2000', '$38.45', '$23.95',
       '$34.5

Unnamed: 0,Numerical_Values,Non_Numerical
0,22.00,$
1,22.00,$
2,22.00,$
3,22.00,$
8,18.00,$
...,...,...
2445,13.34,$
2446,13.34,$
2448,18.00,$
2452,14.99,$
