# Coffee Review Cleaning and EDA

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Read in raw coffee review data
df = pd.read_csv('data/raw-roast-reviews.csv')
df.head()

Unnamed: 0,Rating,Roaster,Coffee_Name,Review_Date,Review_Description,Complete_Review_URL,Roaster_Website_URL,Roaster Location,Coffee Origin,Roast Level,...,Acidity,Body,Flavor,Aftertaste,Agtron,Blind Assessment,Notes,Bottom Line,Est. Price,url
0,94,JBC Coffee Roasters,Wilton Benitez Sidra,Aug 2023,"Delicately fruity, richly floral-toned. Froot ...",https://www.coffeereview.com/review/wilton-ben...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Piendamó, Cauca Department, Colombia",Medium-Light,...,,9.0,9.0,8.0,58/74,"Delicately fruity, richly floral-toned. Froot ...","Produced by Wilton Benitez, entirely of the Si...","A complex, deep-toned, very fruit-forward anae...",$22.00/8 ounces,https://www.coffeereview.com/review/wilton-ben...
1,94,JBC Coffee Roasters,Wilton Benitez Sidra,Aug 2023,"Delicately fruity, richly floral-toned. Froot ...",https://www.coffeereview.com/review/wilton-ben...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Piendamó, Cauca Department, Colombia",Medium-Light,...,,9.0,9.0,8.0,58/74,"Delicately fruity, richly floral-toned. Froot ...","Produced by Wilton Benitez, entirely of the Si...","A complex, deep-toned, very fruit-forward anae...",$22.00/8 ounces,https://www.coffeereview.com/review/wilton-ben...
2,94,JBC Coffee Roasters,Wilton Benitez Sidra,Aug 2023,"Delicately fruity, richly floral-toned. Froot ...",https://www.coffeereview.com/review/wilton-ben...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Piendamó, Cauca Department, Colombia",Medium-Light,...,,9.0,9.0,8.0,58/74,"Delicately fruity, richly floral-toned. Froot ...","Produced by Wilton Benitez, entirely of the Si...","A complex, deep-toned, very fruit-forward anae...",$22.00/8 ounces,https://www.coffeereview.com/review/wilton-ben...
3,94,JBC Coffee Roasters,Wilton Benitez Sidra,Aug 2023,"Delicately fruity, richly floral-toned. Froot ...",https://www.coffeereview.com/review/wilton-ben...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Piendamó, Cauca Department, Colombia",Medium-Light,...,,9.0,9.0,8.0,58/74,"Delicately fruity, richly floral-toned. Froot ...","Produced by Wilton Benitez, entirely of the Si...","A complex, deep-toned, very fruit-forward anae...",$22.00/8 ounces,https://www.coffeereview.com/review/wilton-ben...
4,93,JBC Coffee Roasters,Twisted V.8 Espresso,Aug 2023,"Evaluated as espresso. Chocolaty, fruity, nutt...",https://www.coffeereview.com/review/twisted-v-...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin",South and Central America; Africa,Medium-Light,...,,8.0,9.0,8.0,54/72,"Evaluated as espresso. Chocolaty, fruity, nutt...",A blend of coffees from undisclosed South and ...,A very chocolaty espresso blend with lift from...,$18.00/12 ounces,https://www.coffeereview.com/review/twisted-v-...


### Clean
- review date to date dtype
- acidity/structure & acidity into one column
- agtron into 2 columns
- split origin and roasdter locations into columns for states and country
- price into float
- maybe: roaster into cat, location into cat
- aroma, acidity, body, flavor, aftertaste, agtron's into int
- drop NA

### EDA ideas
- visualize histograms
- visualize correlations
- word clouds
- tokenize descriptions
- plot locations, roasters, roast level, agtron vs other characterisitcs
- cluster analysis
- standardize ratings?? 



In [7]:
def tweak_coffee(df):
    return (df
            .rename(columns = lambda c: c.replace(' ', '_'))
            .rename(columns = {'Roaster Location': 'Location', 'Coffee Origin': 'Origin',
                               'Est. Price': 'Price'})
            .assign(Review_Date = pd.to_datetime(df['Review_Date'], format='%b %Y'),
                    Acidity = lambda df_: df_['Acidity/Structure'].fillna(df_['Acidity']),
                    Agtron_External = lambda df_: df_['Agtron'].str.split('/', expand=True)[0],
                    Agtron_Ground = lambda df_: df_['Agtron'].str.split('/', expand=True)[1]
                     )
            .drop(['Acidity/Structure', 'Agtron'], axis=1)
            .dropna(subset=['Acidity', 'Roast_Level', 'Aroma', 'Bottom_Line', 
                            'Agtron_External', 'Agtron_Ground'])
            .astype({'Roaster': 'category', 'Rating': 'int64', 'Roast_Level':'category',
                     'Roaster_Location':'category'})
            
           )

coffee = tweak_coffee(df)
coffee.sample(5)

KeyError: ['Price']

In [251]:
coffee.Agtron_External.isna().value_counts()

Agtron_External
False    2053
Name: count, dtype: int64

In [237]:
columns = ['Rating', 'Aroma', 'Body', 'Flavor', 'Aftertaste', 'Agtron_External','Agtron_Ground']
coffee[columns].corr()

ValueError: could not convert string to float: 'NA'

In [152]:
pattern_1 = r'([A-Za-z¥£$]+)\s*([\d,.]+)'
pattern_2 = r'(\d+(?:,\d+)?)\s*(\w+)'

price_unit = (df['Price']
              .str.split('/', n=1, expand=True)
              .assign(currency = lambda df_: df_[0].str.extract(pattern_1)[0],
                      price = lambda df_: df_[0].str.extract(pattern_1)[1].str.replace(',', '')
                                                  .str.strip().astype('float'),
                      amount = lambda df_: df_[1].str.extract(pattern_2)[0],
                      unit = lambda df_: df_[1].str.extract(pattern_2)[1]
                       )
              .drop([0, 1], axis=1)
    
             )

# NT - taiwan
# NT$ - taiwana
# £ - british pound 
# ¥ -  Chinese Yuan
# pesos - mexican
# CAD - canadian
price_unit['currency'].unique()


array(['$', nan, '£', 'NT$', '$NT$', 'RM', '¥', '$NT', 'NT'], dtype=object)