# Coffee Review Cleaning and EDA

In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [150]:
# Read in raw coffee review data
df = pd.read_csv('data/raw-coffee-reviews.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2449 entries, 0 to 2448
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Rating               2449 non-null   int64  
 1   Roaster              2449 non-null   object 
 2   Coffee_Name          2449 non-null   object 
 3   Review_Date          2449 non-null   object 
 4   Review_Description   2449 non-null   object 
 5   Complete_Review_URL  2449 non-null   object 
 6   Roaster_Website_URL  1365 non-null   object 
 7   Roaster Location     2449 non-null   object 
 8   Coffee Origin        2449 non-null   object 
 9   Roast Level          2376 non-null   object 
 10  Aroma                2424 non-null   float64
 11  Acidity/Structure    2108 non-null   float64
 12  Acidity              3 non-null      float64
 13  Body                 2448 non-null   float64
 14  Flavor               2448 non-null   float64
 15  Aftertaste           2448 non-null   f

### Clean
- review date to date dtype
- acidity/structure & acidity into one column
- agtron into 2 columns
- split origin and roasdter locations into columns for states and country
- price into float
- maybe: roaster into cat, location into cat
- aroma, acidity, body, flavor, aftertaste, agtron's into int
- drop NA

### EDA ideas
- visualize histograms
- visualize correlations
- word clouds
- tokenize descriptions
- plot locations, roasters, roast level, agtron vs other characterisitcs
- cluster analysis
- standardize ratings?? 



In [247]:
def tweak_coffee(df):
    return (df
            .rename(columns = lambda c: c.replace(' ', '_'))
            .assign(Review_Date = pd.to_datetime(df['Review_Date'], format='%b %Y'),
                    Acidity = lambda df_: df_['Acidity/Structure'].fillna(df_['Acidity']),
                    Agtron_External = lambda df_: df_['Agtron'].str.split('/', expand=True)[0],
                    Agtron_Ground = lambda df_: df_['Agtron'].str.split('/', expand=True)[1]
                     )
            .drop(['Acidity/Structure', 'Agtron'], axis=1)
            .dropna(subset=['Acidity', 'Roast_Level', 'Aroma', 'Price', 'Bottom_Line', 
                            'Agtron_External', 'Agtron_Ground'])
            .astype({'Roaster': 'category', 'Rating': 'int64', 'Roast_Level':'category',
                     'Roaster_Location':'category'})
            
           )

coffee = tweak_coffee(df)
coffee.sample(5)

Unnamed: 0,Rating,Roaster,Coffee_Name,Review_Date,Review_Description,Complete_Review_URL,Roaster_Website_URL,Roaster_Location,Coffee_Origin,Roast_Level,...,Body,Flavor,Aftertaste,Blind_Assessment,Notes,Bottom_Line,url,Price,Agtron_External,Agtron_Ground
2134,93,Origin Coffee Roasters,Maui Kupa’a Orange Bourbon,2019-05-01,"Bright, crisply sweet-tart. Peach, honeysuckle...",https://www.coffeereview.com/review/maui-kupaa...,,"Kihei, Maui, Hawaii","Kula, Island of Maui, Hawai'i",Medium-Light,...,8.0,9.0,8.0,"Bright, crisply sweet-tart. Peach, honeysuckle...",This exceptional coffee was selected as the No...,A window into the great potential of higher-gr...,https://www.coffeereview.com/review/maui-kupaa...,$40.00/8 ounces,58,82
1622,92,Plat Coffee Roastery,Honduras Finca Don Eusebio,2020-09-01,"Delicately sweet, floral-toned. Honeysuckle, r...",https://www.coffeereview.com/review/honduras-f...,,"Hong Kong, China","Chaguites, Honduras",Light,...,8.0,9.0,8.0,"Delicately sweet, floral-toned. Honeysuckle, r...",This coffee tied for the third-highest rating ...,"A sweetly and gently pungent, quietly rich, ba...",https://www.coffeereview.com/review/honduras-f...,$25.00/6 ounces,62,80
2362,91,JBC Coffee Roasters,ASOPEP Colombia,2018-09-01,"Cocoa-toned, deeply sweet-savory. Dark chocola...",https://www.coffeereview.com/review/asopep-col...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Planadas, Tolima Department, Colombia",Medium-Light,...,8.0,9.0,8.0,"Cocoa-toned, deeply sweet-savory. Dark chocola...",This coffee is certified organically grown and...,"A sweet, chocolaty FTO Colombia cup with umami...",https://www.coffeereview.com/review/asopep-col...,$15.75/12 ounces,54,78
1135,94,Kakalove Cafe,Ethiopia Yirgacheffe Botabaa Washed G1 21/01,2021-08-01,"Sweetly tart, decidedly tropical. Ginger bloss...",https://www.coffeereview.com/review/ethiopia-y...,http://www.kakalovecafe.com.tw/,"Chia-Yi, Taiwan","ochere Woreda, Gedeo Zone, Ethiopia",Medium-Light,...,9.0,9.0,8.0,"Sweetly tart, decidedly tropical. Ginger bloss...",Southern Ethiopia produces distinctive coffees...,"A high-toned, juicy-tart, downright heady wash...",https://www.coffeereview.com/review/ethiopia-y...,NT $290/8 ounces,60,80
765,94,Nostalgia Coffee Roasters,Red Sunset by Andres Cardona,2022-03-01,"Elegant, gently fruit-forward. Black cherry, l...",https://www.coffeereview.com/review/red-sunset...,http://bit.ly/2WmKMJ9,"San Diego, California","Santa Bárbara, Antioquia Department, Colombia",Medium-Light,...,9.0,9.0,8.0,"Elegant, gently fruit-forward. Black cherry, l...",Produced by Andres Cardona at Villa Lucía Farm...,"A pretty, delicate, confident honey-processed ...",https://www.coffeereview.com/review/red-sunset...,$24.00/12 ounces,55,73


In [251]:
coffee.Agtron_External.isna().value_counts()

Agtron_External
False    2053
Name: count, dtype: int64

In [237]:
columns = ['Rating', 'Aroma', 'Body', 'Flavor', 'Aftertaste', 'Agtron_External','Agtron_Ground']
coffee[columns].corr()

ValueError: could not convert string to float: 'NA'

In [152]:
pattern_1 = r'([A-Za-z¥£$]+)\s*([\d,.]+)'
pattern_2 = r'(\d+(?:,\d+)?)\s*(\w+)'

price_unit = (df['Price']
              .str.split('/', n=1, expand=True)
              .assign(currency = lambda df_: df_[0].str.extract(pattern_1)[0],
                      price = lambda df_: df_[0].str.extract(pattern_1)[1].str.replace(',', '')
                                                  .str.strip().astype('float'),
                      amount = lambda df_: df_[1].str.extract(pattern_2)[0],
                      unit = lambda df_: df_[1].str.extract(pattern_2)[1]
                       )
              .drop([0, 1], axis=1)
    
             )

# NT - taiwan
# NT$ - taiwana
# £ - british pound 
# ¥ -  Chinese Yuan
# pesos - mexican
# CAD - canadian
price_unit['currency'].unique()


array(['$', nan, '£', 'NT$', '$NT$', 'RM', '¥', '$NT', 'NT'], dtype=object)