# Coffee Review Cleaning and EDA

In [154]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Show at most 24 columns when rendering a DataFrame; wider frames are elided with '...'.
pd.set_option('display.max_columns', 24)

### Clean
- merge the `Acidity/Structure` and `Acidity` columns into a single `Acidity` column
- split `Agtron` into two columns (external / ground readings)
- split origin and roaster locations into columns for states and country
- fix dtypes
- drop rows with NA

### EDA ideas
- visualize histograms
- visualize correlations
- word clouds
- tokenize descriptions
- plot locations, roasters, roast level, agtron vs other characteristics
- cluster analysis
- standardize ratings?? 



In [160]:
# Load the raw reviews; `df` is the input later passed to tweak_coffee().
df = pd.read_csv('data/raw-roast-reviews.csv')
# Preview the first rows.
# NOTE(review): Jupyter only renders the last expression of a cell, so this
# preview is displayed only if it is the final line of its cell.
df.head()

def split_price_currency(df):
    """Split the ``Price`` column into a numeric string and a currency label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Price`` column holding values such as ``'$18.00'`` or
        ``'NT $250'``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) in which

        * ``Price`` holds the first number found in the original text, still
          as a string (``None`` when there is no number), and
        * ``Currency`` holds the original text with every number removed and
          surrounding whitespace stripped. When the text contains no number
          the whole stripped text is kept. Missing prices give a missing
          currency rather than the literal string ``'nan'``.
    """
    number_pat = re.compile(r'(\d+\.\d+|\d+)')

    def split_one(raw):
        # Keep missing values missing in both output columns.
        if pd.isna(raw):
            return None, np.nan
        text = str(raw)
        found = number_pat.search(text)
        if found is None:
            # No amount at all: nothing to extract, keep the text as the label.
            return None, text.strip()
        return found.group(1), number_pat.sub('', text).strip()

    # One regex search per cell; both output columns are derived from it.
    pairs = [split_one(raw) for raw in df['Price']]
    # dtype=object keeps the string accessor usable even for empty/all-missing input.
    amounts = pd.Series([amount for amount, _ in pairs], index=df.index, dtype=object)
    labels = pd.Series([label for _, label in pairs], index=df.index, dtype=object)
    return df.assign(Price=amounts, Currency=labels)

def currency_clean(df):
    """Normalise the free-text ``Currency`` column to ISO 4217 codes.

    Steps, in order:

    1. A bare ``'$'`` or any label containing ``'us'`` (case-insensitive)
       becomes ``'USD'``.
    2. Any label containing ``'nt'`` (case-insensitive) becomes ``'TWD'``.
    3. Remaining literal ``'$'`` characters are removed and whitespace is
       stripped, so e.g. ``'CAD $'`` becomes ``'CAD'``.
    4. Known symbols/aliases are mapped to codes; an empty label is treated
       as ``'USD'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Currency`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned ``Currency`` column; the input is not
        modified. Missing labels stay missing.
    """
    curr_map = {'#': 'GBP', '£': 'GBP', '¥': 'CNY', '': 'USD', 'pesos': 'MXN', 'RMB': 'CNY'}

    currency = df['Currency'].copy()

    # NOTE(review): these are substring tests, so any label that merely
    # contains 'us' / 'nt' is caught as well - confirm against the raw data.
    is_usd = (currency == '$') | currency.str.lower().str.contains('us', na=False)
    currency = currency.mask(is_usd, 'USD')
    is_twd = currency.str.lower().str.contains('nt', na=False)
    currency = currency.mask(is_twd, 'TWD')

    currency = (currency
                # '$' must be matched literally; as a regex it is the end-of-string anchor.
                .str.replace('$', '', regex=False)
                # Strip BEFORE mapping so that e.g. 'RMB $' -> 'RMB ' -> 'RMB' -> 'CNY'.
                .str.strip()
                .replace(curr_map)
                )

    return df.assign(Currency=currency)

def create_quantity_and_units(df):
    """Split ``Amount`` into ``Quantity`` and ``Unit`` and convert weights to grams.

    ``Amount`` is parsed as "<number> <unit word>" (e.g. ``'12 ounces'``).
    Recognised weight units (grams, kilograms, ounces, pounds, including
    singular forms and common abbreviations, case-insensitive) are converted
    to grams and labelled ``'grams'``. Rows with an unrecognised unit keep
    their original quantity and unit label instead of being mislabelled as
    grams.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Amount`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``Amount`` and with float ``Quantity`` and ``Unit``
        columns appended. Rows whose quantity could not be parsed are dropped.
    """
    amount_pat = r'([\d.]+)\s*([a-zA-Z]+)'

    # Conversion factors to grams.
    pound_to_gram = 453.59
    ounce_to_gram = 28.3495
    grams_per_unit = {
        'g': 1.0, 'gram': 1.0, 'grams': 1.0,
        'kg': 1000.0, 'kilogram': 1000.0, 'kilograms': 1000.0,
        'oz': ounce_to_gram, 'ounce': ounce_to_gram, 'ounces': ounce_to_gram,
        'lb': pound_to_gram, 'lbs': pound_to_gram,
        'pound': pound_to_gram, 'pounds': pound_to_gram,
    }

    # Run the regex once and reuse both capture groups.
    parts = df['Amount'].str.extract(amount_pat)
    # Malformed numbers (e.g. a lone '.') become NaN and are dropped below
    # instead of aborting the whole pipeline.
    quantity = pd.to_numeric(parts[0], errors='coerce').astype(float)
    unit = parts[1]

    factor = unit.str.lower().map(grams_per_unit)
    recognised = factor.notna()

    return (df
            .drop('Amount', axis=1)
            .assign(Quantity=quantity.where(~recognised, quantity * factor),
                    Unit=unit.where(~recognised, 'grams'),
                    )
            .dropna(subset=['Quantity'])
            )

def create_price_USD(df, exchange_rates=None):
    """Add a ``Price_USD`` column converting every price to US dollars.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``Price`` column and a ``Currency`` column of
        ISO 4217 codes (plain strings or categorical).
    exchange_rates : dict, optional
        Mapping ``currency code -> USD per one unit of that currency``.
        Defaults to the approximate rates hard-coded below.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``Price_USD`` appended; the input is not modified.
        Rows whose currency has no rate get ``NaN``.
    """
    default_rates = {'TWD': 0.03,
                     'CNY': 0.14,  # was 0.03, a copy of the TWD rate
                     'GBP': 1.27,
                     'AED': 0.27,
                     'KRW': 0.000752,
                     'HKD': 0.13,
                     'CAD': 0.74,
                     'MXN': 0.06,
                     'IDR': 0.000065,
                     'AUD': 0.65,
                     'USD': 1.0}
    rates_by_code = default_rates if exchange_rates is None else exchange_rates

    # Map on plain objects: mapping a categorical column returns another
    # categorical whenever the mapping is one-to-one, and that cannot be
    # multiplied with the prices.
    rate = df['Currency'].astype(object).map(rates_by_code).astype(float)
    return df.assign(Price_USD=df['Price'] * rate)

def tweak_coffee(df):
    """Run the full cleaning pipeline on the raw coffee-review frame.

    The pipeline, in order:

    1. Replace spaces in column names with underscores and shorten
       ``Coffee_Origin`` -> ``Origin`` and ``Est._Price`` -> ``Price``.
    2. Drop duplicate reviews (same ``Roaster`` and ``Coffee_Name``).
    3. Parse ``Review_Date``, merge ``Acidity/Structure`` into ``Acidity``,
       split ``Agtron`` into ``Agtron_External`` / ``Agtron_Ground`` and split
       ``Price`` ("<price>/<amount>") into ``Price`` and ``Amount``.
    4. Strip whitespace from every string cell and turn the literal ``'NA'``
       into NaN.
    5. Split price and currency, normalise the currency codes.
    6. Drop the superseded columns and rows missing key fields.
    7. Convert ``Amount`` to ``Quantity`` (grams) and ``Unit``.
    8. Cast dtypes and add ``Price_USD``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw reviews as loaded from the CSV.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame with a fresh ``RangeIndex``. ``df`` itself is only read.
    """

    def strip_and_nullify(value):
        """Trim a string cell and map the literal 'NA' to NaN; pass other values through."""
        if isinstance(value, str):
            value = value.strip()
            if value == 'NA':
                return np.nan
        return value

    def clean_cells(frame):
        """Apply strip_and_nullify to every cell in a single pass."""
        # DataFrame.applymap was deprecated in favour of DataFrame.map in
        # pandas 2.1; use whichever the installed version provides.
        method = 'map' if hasattr(pd.DataFrame, 'map') else 'applymap'
        return getattr(frame, method)(strip_and_nullify)

    return (df
            # Remove spaces from column names
            .rename(columns=lambda c: c.replace(' ', '_'))
            .rename(columns={'Coffee_Origin': 'Origin',
                             'Est._Price': 'Price'})
            # Remove duplicates based on roaster and name of roast
            .drop_duplicates(subset=['Roaster', 'Coffee_Name'])
            # Every derived column is computed from the frame flowing through
            # the chain (df_), not from the outer raw frame, so it sees the
            # renamed columns and the de-duplicated rows.
            # Amount must be derived before Price is overwritten: assign
            # evaluates its keyword arguments in order.
            .assign(Review_Date=lambda df_: pd.to_datetime(df_['Review_Date'], format='%b %Y'),
                    Acidity=lambda df_: df_['Acidity/Structure'].fillna(df_['Acidity']),
                    # .str[i] yields NaN when a part is missing instead of
                    # raising KeyError when no value contains '/'.
                    Agtron_External=lambda df_: df_['Agtron'].str.split('/').str[0],
                    Agtron_Ground=lambda df_: df_['Agtron'].str.split('/').str[1],
                    Amount=lambda df_: df_['Price'].str.split('/').str[1],
                    Price=lambda df_: (df_['Price']
                                       .str.replace(',', '', regex=False)
                                       .str.split('/').str[0]),
                    )
            # Remove whitespace and transform "NA" to NaN
            .pipe(clean_cells)
            # Split price value and currency
            .pipe(split_price_currency)
            # Clean up currency and standardize
            .pipe(currency_clean)
            # Drop old columns and rows with missing values
            .drop(['Acidity/Structure', 'Agtron'], axis=1)
            .dropna(subset=['Acidity', 'Roast_Level', 'Aroma', 'Price', 'Bottom_Line',
                            'Agtron_External', 'Agtron_Ground'])
            # Separate unit of measurement from quantity
            .pipe(create_quantity_and_units)
            # Transform data types
            .astype({'Roaster': 'category', 'Rating': 'int64', 'Roast_Level': 'category',
                     'Aftertaste': 'int64', 'Aroma': 'int64', 'Body': 'int64',
                     'Flavor': 'int64', 'Currency': 'category', 'Acidity': 'int64',
                     'Agtron_External': 'int64', 'Agtron_Ground': 'int64', 'Price': 'float',
                     'Unit': 'category'}
                    )
            # Create a column of prices in USD
            .pipe(create_price_USD)
            .reset_index(drop=True)
            )

# Run the whole cleaning pipeline on the raw frame.
coffee = tweak_coffee(df)
# Persist the cleaned data in the current working directory.
# to_csv writes the index by default, so the RangeIndex is saved as an extra
# unnamed first column.
coffee.to_csv('coffee-reviews-clean.csv')
# Preview the cleaned result.
coffee.head()

Unnamed: 0,Rating,Roaster,Coffee_Name,Review_Date,Review_Description,Complete_Review_URL,Roaster_Website_URL,Roaster_Location,Origin,Roast_Level,Aroma,Acidity,...,Aftertaste,Blind_Assessment,Notes,Bottom_Line,Price,url,Agtron_External,Agtron_Ground,Currency,Quantity,Unit,Price_USD
0,94,JBC Coffee Roasters,Wilton Benitez Sidra,2023-08-01,"Delicately fruity, richly floral-toned. Froot ...",https://www.coffeereview.com/review/wilton-ben...,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","Piendamó, Cauca Department, Colombia",Medium-Light,9,9,...,8,"Delicately fruity, richly floral-toned. Froot ...","Produced by Wilton Benitez, entirely of the Si...","A complex, deep-toned, very fruit-forward anae...",22.0,https://www.coffeereview.com/review/wilton-ben...,58,74,USD,226.796,grams,22.0
1,92,JBC Coffee Roasters,Piura Peru,2023-08-01,Sweetly chocolaty and nut-toned. Baking chocol...,https://www.coffeereview.com/review/piura-peru/,https://www.jbccoffeeroasters.com/product-cate...,"Madison, Wisconsin","El Faique, Piura Department, Peru",Medium-Light,8,8,...,8,Sweetly chocolaty and nut-toned. Baking chocol...,Produced by smallholding farmers who work dire...,"A confident, deep-toned Peru with a chocolate-...",18.0,https://www.coffeereview.com/review/piura-peru/,56,74,USD,340.194,grams,18.0
2,95,Euphora Coffee,Panama Hacienda La Esmeralda Geisha Natural,2023-08-01,"Richly fruity, deeply chocolaty. Dark chocolat...",https://www.coffeereview.com/review/panama-hac...,https://www.euphoracoffeestudio.com/categories...,"Taipei, Taiwan","Boquete growing region, western Panama",Medium-Light,9,9,...,8,"Richly fruity, deeply chocolaty. Dark chocolat...","Produced at Hacienda La Esmeralda, entirely of...","Fruit, chocolate and floral notes converge in ...",22.0,https://www.coffeereview.com/review/panama-hac...,60,77,USD,113.398,grams,22.0
3,95,Euphora Coffee,Panama Hacienda La Esmeralda Geisha Washed,2023-08-01,"High-toned, juicy-sweet. Lemongrass, cocoa nib...",https://www.coffeereview.com/review/panama-hac...,https://www.euphoracoffeestudio.com/categories...,"Taipei, Taiwan","Boquete growing region, western Panama",Light,9,9,...,9,"High-toned, juicy-sweet. Lemongrass, cocoa nib...","Produced at Hacienda La Esmeralda, entirely of...","Elegantly spice-toned, richly floral, complex ...",20.0,https://www.coffeereview.com/review/panama-hac...,64,78,USD,113.398,grams,20.0
4,96,Rusty's Hawaiian,Grand Champion Red Bourbon Natural,2023-08-01,"Delicately fruit-forward, richly aromatic. Lyc...",https://www.coffeereview.com/review/grand-cham...,,"Pahala, Hawaii","Ka'u growing district, Big Island of Hawai’i",Medium-Light,9,9,...,9,"Delicately fruit-forward, richly aromatic. Lyc...",This 100% Ka’u coffee was developed by Lorie O...,"This decadently sweet, elegantly tart, complex...",35.0,https://www.coffeereview.com/review/grand-cham...,52,72,USD,113.398,grams,35.0


In [161]:
# Check dtypes and non-null counts of the cleaned frame.
coffee.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1957 entries, 0 to 1956
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Rating               1957 non-null   int64         
 1   Roaster              1957 non-null   category      
 2   Coffee_Name          1957 non-null   object        
 3   Review_Date          1957 non-null   datetime64[ns]
 4   Review_Description   1957 non-null   object        
 5   Complete_Review_URL  1957 non-null   object        
 6   Roaster_Website_URL  1156 non-null   object        
 7   Roaster_Location     1957 non-null   object        
 8   Origin               1957 non-null   object        
 9   Roast_Level          1957 non-null   category      
 10  Aroma                1957 non-null   int64         
 11  Acidity              1957 non-null   int64         
 12  Body                 1957 non-null   int64         
 13  Flavor               1957 non-nul

In [164]:
# First look at splitting "City, Region" locations into their parts.
# NOTE(review): splitting on ',' alone leaves a leading space on every part
# after the first (visible in the output); strip the parts before using them.
coffee['Roaster_Location'].str.split(',')

0              [Madison,  Wisconsin]
1              [Madison,  Wisconsin]
2                  [Taipei,  Taiwan]
3                  [Taipei,  Taiwan]
4                  [Pahala,  Hawaii]
                    ...             
1952           [Lee,  Massachusetts]
1953           [Lee,  Massachusetts]
1954            [Boulder,  Colorado]
1955    [Charlotte,  North Carolina]
1956              [Shanghai,  China]
Name: Roaster_Location, Length: 1957, dtype: object