# Coffee Review Cleaning and EDA

In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

### Clean
- merge acidity/structure & acidity into one column
- split agtron into 2 columns (external and ground readings)
- split origin and roaster locations into columns for states and country
- fix dtypes
- drop rows with NA

### EDA ideas
- visualize histograms
- visualize correlations
- word clouds
- tokenize descriptions
- plot locations, roasters, roast level, agtron vs other characteristics
- cluster analysis
- standardize ratings?? 



In [58]:
# Load the raw scraped reviews; the path is relative to the notebook's working directory.
df = pd.read_csv('data/raw-roast-reviews.csv')
# NOTE(review): this preview is not rendered, because it is not the last expression in the cell.
df.head()

def split_price_currency(df):
    """Split the raw ``Price`` text into a numeric string and a currency marker.

    The first number found in each price (``22.00`` in ``'$22.00'``) becomes
    the new ``Price`` value; whatever is left once every number is removed
    (``'$'``, ``'NT $'``, ``'USD'`` ...) becomes ``Currency``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Price`` column of strings and/or missing values.
        The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with ``Price`` replaced and ``Currency`` added.
        ``Price`` is ``None`` where no number was found. ``Currency`` is
        missing where the original price was missing (it is no longer the
        literal string ``'nan'``), and is the stripped original text where
        the price held text but no number.
    """
    first_number = re.compile(r'\$?(\d+\.\d+|\d+)')
    any_number = re.compile(r'(\d+\.\d+|\d+)')

    def _split(raw):
        # A missing price has no currency either; do not stringify NaN.
        if pd.isna(raw):
            return None, None
        text = str(raw)
        found = first_number.search(text)
        if found is None:
            return None, text.strip()
        return found.group(1), any_number.sub('', text).strip()

    pairs = [_split(value) for value in df['Price']]
    # dtype=object keeps the columns string-capable even when every value is missing.
    df['Price'] = pd.Series([amount for amount, _ in pairs], index=df.index, dtype=object)
    df['Currency'] = pd.Series([marker for _, marker in pairs], index=df.index, dtype=object)
    return df

def currency_clean(df):
    """Normalise the free-text ``Currency`` markers to currency codes.

    Rules, applied in this order:

    1. A bare ``'$'`` or any marker containing ``'us'`` (case-insensitive)
       becomes ``'USD'``.
    2. Any remaining marker containing ``'nt'`` becomes ``'TWD'``.
    3. Literal ``'$'`` characters are removed and surrounding whitespace is
       stripped.
    4. The cleaned marker is looked up in ``curr_map``; an empty marker maps
       to ``'USD'``.

    Markers that match none of the rules are kept as cleaned text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Currency`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with ``Currency`` normalised.
    """
    curr_map = {'#': 'GBP', '£': 'GBP', '¥': 'CNY', '': 'USD', 'pesos': 'MXN', 'RMB': 'CNY'}

    is_usd = (df['Currency'] == '$') | df['Currency'].str.lower().str.contains('us', na=False, regex=False)
    df.loc[is_usd, 'Currency'] = 'USD'

    # Evaluated after the USD pass so a value already set to 'USD' is not re-examined as raw text.
    is_twd = df['Currency'].str.lower().str.contains('nt', na=False, regex=False)
    df.loc[is_twd, 'Currency'] = 'TWD'

    df['Currency'] = (
        df['Currency']
        # regex=False: '$' must be the literal dollar sign, not the end-of-string anchor.
        .str.replace('$', '', regex=False)
        # Strip before the lookup: curr_map matches whole values exactly, so
        # '$ pesos' -> ' pesos' would otherwise never map to 'MXN'.
        .str.strip()
        .replace(curr_map)
    )

    return df
    
# Pattern for "<number><optional space><letters>" amounts such as "12 ounces".
# NOTE(review): not referenced by tweak_coffee below; create_quantity_and_units
# defines its own identical local copy.
amount_pat = r'([\d.]+)\s*([a-zA-Z]+)'
def tweak_coffee(df):
    """Clean the raw coffee-review frame and return a typed copy.

    Steps, in order:

    1. Replace spaces in column names with underscores and shorten
       ``Coffee_Origin`` to ``Origin`` and ``Est._Price`` to ``Price``.
    2. Drop duplicate reviews (same ``Roaster`` and ``Coffee_Name``).
    3. Strip whitespace from every string cell and turn ``'NA'`` into NaN.
       This runs before the derived columns are built so that a padded
       ``'NA'`` in ``Acidity/Structure`` cannot block the fallback to
       ``Acidity``.
    4. Parse ``Review_Date`` (``'%b %Y'``), merge the two acidity columns,
       split ``Agtron`` into external/ground readings and split ``Price``
       into price text and ``Amount``. The string clean-up is then repeated
       for the freshly split pieces.
    5. Split price and currency and normalise the currency codes.
    6. Drop the superseded columns and rows missing key fields, cast dtypes
       and reset the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw reviews as read from the CSV.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame with a fresh ``RangeIndex``.
    """

    def _clean_cell(value):
        # Only strings are touched; numbers, timestamps and NaN pass through unchanged.
        if isinstance(value, str):
            value = value.strip()
            return np.nan if value == 'NA' else value
        return value

    def _normalize_strings(frame):
        # DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
        # check on the class so a column named 'map' cannot confuse the lookup.
        elementwise = frame.map if hasattr(pd.DataFrame, 'map') else frame.applymap
        return elementwise(_clean_cell)

    def _slash_part(series, position):
        # .str.get yields NaN when the piece is absent, whereas
        # split(expand=True)[position] raises KeyError if no row contains '/'.
        return series.str.split('/').str.get(position)

    return (df
            # Remove spaces from column names
            .rename(columns=lambda c: c.replace(' ', '_'))
            .rename(columns={'Coffee_Origin': 'Origin',
                             'Est._Price': 'Price'})
            # Remove duplicates based on roaster and name of roast
            .drop_duplicates(subset=['Roaster', 'Coffee_Name'])
            # Remove whitespace and transform "NA" to NaN before deriving columns
            .pipe(_normalize_strings)
            # Clean up columns. Create datetime column from Review_Date. Split Agtron into two columns.
            # Split Price into a column for cost and a column for amount and unit.
            # Every column is derived from the chained frame (df_), never the outer df,
            # so nothing depends on index alignment with the raw input.
            .assign(Review_Date=lambda df_: pd.to_datetime(df_['Review_Date'], format='%b %Y'),
                    Acidity=lambda df_: df_['Acidity/Structure'].fillna(df_['Acidity']),
                    Agtron_External=lambda df_: _slash_part(df_['Agtron'], 0),
                    Agtron_Ground=lambda df_: _slash_part(df_['Agtron'], 1),
                    # Amount must be taken before Price is overwritten on the next line.
                    Amount=lambda df_: _slash_part(df_['Price'], 1),
                    Price=lambda df_: _slash_part(df_['Price'].str.replace(',', '', regex=False), 0),
                    )
            # Clean the freshly split pieces as well (e.g. 'NA' halves of Agtron)
            .pipe(_normalize_strings)
            # Split price value and currency
            .pipe(split_price_currency)
            .pipe(currency_clean)
            # Drop old columns and rows with missing values
            .drop(['Acidity/Structure', 'Agtron'], axis=1)
            .dropna(subset=['Acidity', 'Roast_Level', 'Aroma', 'Price', 'Bottom_Line',
                            'Agtron_External', 'Agtron_Ground'])
            # Transform data types
            .astype({'Roaster': 'category', 'Rating': 'int64', 'Roast_Level': 'category',
                     'Agtron_External': 'int64', 'Agtron_Ground': 'int64',
                     'Aftertaste': 'int64', 'Aroma': 'int64', 'Body': 'int64',
                     'Flavor': 'int64', 'Acidity': 'int64',
                     'Price': 'float', 'Currency': 'category'}
                    )
            .reset_index(drop=True)
            )

# Run the cleaning pipeline on the raw frame, then summarise the resulting columns and dtypes.
coffee = tweak_coffee(df)
coffee.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1960 entries, 0 to 1959
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Rating               1960 non-null   int64         
 1   Roaster              1960 non-null   category      
 2   Coffee_Name          1960 non-null   object        
 3   Review_Date          1960 non-null   datetime64[ns]
 4   Review_Description   1960 non-null   object        
 5   Complete_Review_URL  1960 non-null   object        
 6   Roaster_Website_URL  1157 non-null   object        
 7   Roaster_Location     1960 non-null   object        
 8   Origin               1960 non-null   object        
 9   Roast_Level          1960 non-null   category      
 10  Aroma                1960 non-null   int64         
 11  Acidity              1960 non-null   int64         
 12  Body                 1960 non-null   int64         
 13  Flavor               1960 non-nul

In [59]:
def create_quantity_and_units(df):
    """Replace the free-text ``Amount`` column with ``Quantity`` and ``Unit``.

    ``Amount`` values such as ``'12 ounces'`` are parsed into a number and a
    unit word. Recognised weight units (ounce/oz, pound/lb, gram/g,
    kilogram/kg, singular or plural, any letter case) are converted to grams
    and labelled ``'grams'``. Unrecognised units (for example ``'capsules'``)
    keep their original quantity and unit label instead of being relabelled
    as grams without conversion.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Amount`` column. Not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``Amount``, with a float ``Quantity`` and a
        ``Unit`` column. Rows whose quantity could not be parsed are dropped;
        the surviving rows keep their original index labels.
    """
    amount_pat = r'([\d.]+)\s*([a-zA-Z]+)'

    # Conversion factors (values kept from the original analysis)
    pound_to_gram = 453.59
    ounce_to_gram = 28.3495
    grams_per_unit = {
        'ounce': ounce_to_gram, 'ounces': ounce_to_gram, 'oz': ounce_to_gram,
        'pound': pound_to_gram, 'pounds': pound_to_gram,
        'lb': pound_to_gram, 'lbs': pound_to_gram,
        'gram': 1.0, 'grams': 1.0, 'g': 1.0,
        'kilogram': 1000.0, 'kilograms': 1000.0, 'kg': 1000.0,
    }

    def convert_to_grams(frame):
        # Rows whose unit has no known factor are left untouched.
        factor = frame['Unit'].str.lower().map(grams_per_unit)
        known = factor.notna()
        return frame.assign(
            Quantity=frame['Quantity'].where(~known, frame['Quantity'] * factor),
            Unit=frame['Unit'].where(~known, 'grams'),
        )

    # Run the regex once; column 0 is the number, column 1 the unit word.
    parts = df['Amount'].str.extract(amount_pat)

    return (
            df
            # errors='coerce': a malformed number such as '1.5.' becomes NaN
            # and is dropped below instead of aborting the whole run.
            .assign(Quantity=pd.to_numeric(parts[0], errors='coerce'),
                    Unit=parts[1],
                    )
            .drop('Amount', axis=1)
            .pipe(convert_to_grams)
            .dropna(subset=['Quantity'])
        )



# NOTE(review): rebinding `coffee` makes this cell non-idempotent. The function drops
# 'Amount', so running the cell a second time raises KeyError on df['Amount'].
coffee = create_quantity_and_units(coffee)


In [60]:
# Display the cleaned frame (the rendered output elides the middle rows and columns).
coffee

Unnamed: 0,Rating,Roaster,Coffee_Name,Review_Date,Review_Description,Complete_Review_URL,Roaster_Website_URL,Roaster_Location,Origin,Roast_Level,...,Blind_Assessment,Notes,Bottom_Line,Price,url,Agtron_External,Agtron_Ground,Currency,Quantity,Unit
0,94,JBC Coffee Roasters,Wilton Benitez Sidra,2023-08-01,"Delicately fruity, richly floral-toned. Froot ...",https://www.coffeereview.com/review/wilton-ben...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Piendamó, Cauca Department, Colombia",Medium-Light,...,"Delicately fruity, richly floral-toned. Froot ...","Produced by Wilton Benitez, entirely of the Si...","A complex, deep-toned, very fruit-forward anae...",22.00,https://www.coffeereview.com/review/wilton-ben...,58,74,USD,226.796,grams
1,92,JBC Coffee Roasters,Piura Peru,2023-08-01,Sweetly chocolaty and nut-toned. Baking chocol...,https://www.coffeereview.com/review/piura-peru/,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","El Faique, Piura Department, Peru",Medium-Light,...,Sweetly chocolaty and nut-toned. Baking chocol...,Produced by smallholding farmers who work dire...,"A confident, deep-toned Peru with a chocolate-...",18.00,https://www.coffeereview.com/review/piura-peru/,56,74,USD,340.194,grams
2,95,Euphora Coffee,Panama Hacienda La Esmeralda Geisha Natural,2023-08-01,"Richly fruity, deeply chocolaty. Dark chocolat...",https://www.coffeereview.com/review/panama-hac...,https://www.euphoracoffeestudio.com/categories...,"Taipei, Taiwan","Boquete growing region, western Panama",Medium-Light,...,"Richly fruity, deeply chocolaty. Dark chocolat...","Produced at Hacienda La Esmeralda, entirely of...","Fruit, chocolate and floral notes converge in ...",22.00,https://www.coffeereview.com/review/panama-hac...,60,77,USD,113.398,grams
3,95,Euphora Coffee,Panama Hacienda La Esmeralda Geisha Washed,2023-08-01,"High-toned, juicy-sweet. Lemongrass, cocoa nib...",https://www.coffeereview.com/review/panama-hac...,https://www.euphoracoffeestudio.com/categories...,"Taipei, Taiwan","Boquete growing region, western Panama",Light,...,"High-toned, juicy-sweet. Lemongrass, cocoa nib...","Produced at Hacienda La Esmeralda, entirely of...","Elegantly spice-toned, richly floral, complex ...",20.00,https://www.coffeereview.com/review/panama-hac...,64,78,USD,113.398,grams
4,96,Rusty's Hawaiian,Grand Champion Red Bourbon Natural,2023-08-01,"Delicately fruit-forward, richly aromatic. Lyc...",https://www.coffeereview.com/review/grand-cham...,,"Pahala, Hawaii","Ka'u growing district, Big Island of Hawai’i",Medium-Light,...,"Delicately fruit-forward, richly aromatic. Lyc...",This 100% Ka’u coffee was developed by Lorie O...,"This decadently sweet, elegantly tart, complex...",35.00,https://www.coffeereview.com/review/grand-cham...,52,72,USD,113.398,grams
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1955,95,Barrington Coffee Roasting,Gatugi Triple,2018-09-01,"Bright, crisply sweet-savory. Black currant, h...",https://www.coffeereview.com/review/gatugi-tri...,https://barringtoncoffee.com,"Lee, Massachusetts","Nyeri growing region, south-central Kenya",Light,...,"Bright, crisply sweet-savory. Black currant, h...",This coffee is available exclusively as a part...,A confident Kenya with a classic cup profile: ...,13.34,https://www.coffeereview.com/review/gatugi-tri...,58,86,USD,113.398,grams
1956,92,Barrington Coffee Roasting,Sulawesi Toarco,2018-09-01,"Sweet-toned, richly savory. Baker’s chocolate,...",https://www.coffeereview.com/review/sulawesi-t...,https://barringtoncoffee.com,"Lee, Massachusetts","Toraja growing region, south-central Sulawesi,...",Medium-Light,...,"Sweet-toned, richly savory. Baker’s chocolate,...",This coffee is available exclusively as a part...,A chocolaty coffee animated by rich nut and fl...,13.34,https://www.coffeereview.com/review/sulawesi-t...,56,80,USD,113.398,grams
1957,94,Dragonfly Coffee Roasters,Ethiopia Yirgacheffe Natural,2018-09-01,"Sweet, deeply and complexly fruit-toned. Very ...",https://www.coffeereview.com/review/ethiopia-y...,,"Boulder, Colorado","Yirgacheffe growing region, southern Ethiopia",Medium-Light,...,"Sweet, deeply and complexly fruit-toned. Very ...",Yirgacheffe coffees like this one are largely ...,A fruit-and-chocolate-saturated natural-proces...,18.00,https://www.coffeereview.com/review/ethiopia-y...,55,80,USD,340.194,grams
1958,92,Good Coffee Club,Brazil Conquista,2018-09-01,"Sweetly pungent, chocolate-toned. Baker’s choc...",https://www.coffeereview.com/review/brazil-con...,,"Charlotte, North Carolina","Bahia, Brazil",Medium-Light,...,"Sweetly pungent, chocolate-toned. Baker’s choc...",Produced at Conquista Farm from trees of the R...,A chocolaty Brazil cup with sweet nut notes th...,14.99,https://www.coffeereview.com/review/brazil-con...,48,74,USD,340.194,grams


In [69]:
# Persist the cleaned data. to_csv writes the DataFrame index as an unnamed first
# column by default. NOTE(review): the raw file is read from 'data/' but this file is
# written to the working directory; confirm that is intended.
coffee.to_csv('roast-reviews-clean.csv')

In [70]:
exchange_to_usd = {}