# Coffee Review Cleaning and EDA

In [100]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import re
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 100)

df = pd.read_json('data/raw-roast-reviews.json').replace('NA', np.nan)
df.sample(5)

Unnamed: 0,Roaster Location:,Coffee Origin:,Roast Level:,Agtron:,Est. Price:,Review Date:,Aroma:,Acidity/Structure:,Body:,Flavor:,Aftertaste:,rating,roaster,name,url,With Milk:,Acidity:
5061,"San Francisco, California","Nyeri County, Central Highlands, Kenya.",Medium-Light,53/72,$21.00/12 ounces,September 2011,9,,8\t\t\t\t\t\t,9,8.0,93,Four Barrel Coffee,Kenya Tekangu Ngunguru,https://www.coffeereview.com/review/kenya-tekangu-ngunguru/,,9.0
4245,"Lee, Massachusetts","Silla Del Pando, Volcan, Panama.",Light,66/88,$72.45/12 ounces,December 2013,10,,9\t\t\t\t\t\t,10,8.0,96,Barrington Coffee Roasting Co.,Panama Gesha Perci Red,https://www.coffeereview.com/review/panama-gesha-perci-red/,,9.0
6094,"Waterbury, Vermont","Ethiopia, Indonesia",Medium-Dark,38/48,,April 2008,8,,8\t\t\t\t\t\t,7,8.0,88,Green Mountain Coffee,Organic Fair-Trade Mocha Java,https://www.coffeereview.com/review/organic-fair-trade-mocha-java/,,7.0
386,"Savannah, Georgia","Sidamo growing region, Ethiopia",Light,61/82,$27.00/10 ounces,December 2022,9,9.0,9\t\t\t\t\t\t,9,8.0,94,PERC Coffee Roasters,Ethiopia Hayissa Olocho Lot 6,https://www.coffeereview.com/review/ethiopia-hayissa-olocho-lot-6/,,
3755,"Sacramento, California","Santa Clara, Panama.",Light,63/81,$17.50/12 ounces,May 2015,10,,9\t\t\t\t\t\t,9,8.0,95,Temple Coffee and Tea,Panama Finca Hartmann,https://www.coffeereview.com/review/panama-finca-hartmann/,,9.0


In [89]:
def split_price_currency(df):
    """Split Price column into a column of the value and the currency"""
    currency_pattern = r'\$?(\d+\.\d+|\d+)'
    price = df['price']
    df['price'] = price.apply(lambda x: re.search(currency_pattern, str(x)).group(1) if re.search(currency_pattern, str(x)) else None)
    df['currency'] = price.apply(lambda x: re.sub(currency_pattern, '', str(x)) if re.search(currency_pattern, str(x)) else str(x)).str.strip()
    return df
    
def currency_clean(df):
    """Transform currency symbols to ISO 4217 codes, and clean up"""
    
    curr_map = {'#': 'GBP', '£':'GBP', '¥': 'CNY', '':'USD', 'pesos':'MXN', 'RMB':'CNY'}
    
    df.loc[(df['currency'] == '$') | (df['currency'].str.lower().str.contains('us', na=False)), 'currency'] = 'USD'
    df.loc[df['currency'].str.lower().str.contains('nt', na=False), 'currency'] = 'TWD'
    df['currency'] = (df['currency']
                      .str.replace('$', '')
                      .replace(curr_map)
                      .str.strip())
    return df
    
def create_quantity_and_units(df):
    """Separate Amount column into a quantity and the units for that quantity. Then 
       converts all quantities to grams."""
    
    amount_pat = r'([\d.]+)\s*([a-zA-Z]+)'
    currency_pattern = r'(\d+\.\d+)'

    def convert_to_grams(df):
        POUND_TO_GRAM = 453.59
        OUNCE_TO_GRAM = 28.3495
        
        df.loc[df['unit'] == 'ounces', 'quantity'] = df['quantity'] * OUNCE_TO_GRAM
        df.loc[df['unit'] == 'pounds', 'quantity'] = df['quantity'] * POUND_TO_GRAM
        df['unit'] = 'grams'
        
        return df
                        
    return (df
            .assign(quantity = df['amount'].str.extract(amount_pat)[0],
                     unit = df['amount'].str.extract(amount_pat)[1],
                    )
            .drop('amount', axis=1)
            .astype({'quantity': 'float'})
            .pipe(convert_to_grams)
            .dropna(subset=['quantity']))

def create_price_usd(df):
    """Create a column converting all prices to USD using recent exchange rates."""
    exchange_rate_map = {'TWD' : 0.03, 
                         'CNY': 0.03, 
                         'GBP': 1.27, 
                         'AED':0.27, 
                         'KRW': 0.000752,
                         'HKD':0.13, 
                         'CAD': 0.74,
                         'MXN': 0.06,
                         'IDR': 0.000065,
                         'AUD': 0.65,
                         'USD': 1.0}
    df['price_usd'] = np.round(df['price'] * df['currency'].map(exchange_rate_map), 2)
    return df
    
def tweak_coffee(df):
    return (df
        .rename(columns = lambda c: c.replace(' ', '_'))
        .rename(columns = lambda c: c.replace(':', ''))
        .rename(columns = lambda c: c.lower().strip())
        .rename(columns = {'name': 'roast_name', 'est._price': 'price'})
        .drop_duplicates(subset=['roaster', 'roast_name'])
        .replace('NA', np.nan)
        .assign(review_date = lambda df_: pd.to_datetime(df_['review_date'], format='mixed'),
                acidity = lambda df_: df_['acidity/structure'].fillna(df_['acidity']),
                agtron_external = lambda df_: df_['agtron'].str.split('/', expand=True)[0],
                agtron_ground = lambda df_: df_['agtron'].str.split('/', expand=True)[1],
                amount = lambda df_: df_['price'].str.split('/', expand=True)[1],
                price = lambda df_: df_['price'].str.replace(',', '').str.split('/', expand=True)[0],
                )
        .applymap(lambda x: x.strip() if isinstance(x, str) else x)
        .dropna(subset=['coffee_origin'])
        .pipe(split_price_currency)
        .pipe(currency_clean)
        .pipe(create_quantity_and_units)
        .drop(['acidity/structure', 'agtron', 'with_milk'], axis=1)
        .replace('NA', np.nan)
        .dropna(subset=['acidity', 'roast_level', 'aroma', 'price','agtron_external', 'agtron_ground',
                        'aftertaste'])
        .astype({'roaster': 'category', 'rating': 'int8', 'roast_level':'category',
                 'aftertaste': 'int8', 'aroma': 'int8', 'body': 'int8',
                 'flavor': 'int8', 'currency':'category', 'acidity': 'int8',
                 'agtron_external': 'int16', 'agtron_ground': 'int16', 'price':'float',
                 'unit': 'category'})
        .query('agtron_external >= 0 and agtron_external < 120')
        .query('agtron_ground >=0 and agtron_ground < 120')
        .pipe(create_price_usd)
        .assign(price_per_g_usd = lambda df_: np.round(df_['price_usd']/df_['quantity'], 2))
        .reset_index(drop=True)
    )
        
roasts = tweak_coffee(df)
roasts.sample(10)


Unnamed: 0,roaster_location,coffee_origin,roast_level,price,review_date,aroma,body,flavor,aftertaste,rating,roaster,roast_name,url,acidity,agtron_external,agtron_ground,currency,quantity,unit,price_usd,price_per_g_usd
421,"Taipei City, Taiwan","Nyeri growing region, south-central Kenya",Medium-Light,500.0,2022-10-01,9,9,9,9,95,Cozy House Coffee,Kenya Nyeri Asali Peaberry,https://www.coffeereview.com/review/kenya-nyeri-asali-peaberry/,9,60,77,TWD,227.0,grams,15.0,0.07
1556,"Kailua-Kona, Hawaii","Ka’u, Big Island of Hawai’i",Medium-Light,33.0,2020-04-01,9,8,9,8,92,Kona Roasted,Rus & Alla Ka’u,https://www.coffeereview.com/review/rus-alla-kau/,8,58,78,USD,340.194,grams,33.0,0.1
2220,"Chino, California","Antigua Valley, Guatemala",Medium,14.0,2017-09-01,8,8,9,7,90,Espresso Republic,Static San Augustin Guatemala,https://www.coffeereview.com/review/static-san-augustin-guatemala/,8,50,56,USD,340.194,grams,14.0,0.04
2897,"Chia-Yi, Taiwan","Yirgacheffe growing region, south-central Ethiopia.",Light,20.0,2015-02-01,9,8,9,8,93,Kakalove Cafe,Ethiopia Natural Yirgacheffe Worka Coop Alemu,https://www.coffeereview.com/review/ethiopia-washed-yirgacheffe-worka-coop-alemu/,9,62,80,USD,453.592,grams,20.0,0.04
16,"Antigua, Guatemala","La Democracia, Huehuetenango, Guatemala",Medium-Light,30.0,2023-08-01,9,8,9,8,92,El Gran Cafe,Finca Huixoc Geisha,https://www.coffeereview.com/review/finca-huixoc-geisha/,8,60,74,USD,340.194,grams,30.0,0.09
1524,"Floyd, Virginia","idamo (also Sidama) growing region, southern Ethiopia",Medium-Light,20.99,2020-05-01,9,9,10,9,96,Red Rooster Coffee Roaster,Ethiopia Shantawene Washed,https://www.coffeereview.com/review/ethiopia-shantawene-washed/,9,60,78,USD,340.194,grams,20.99,0.06
1083,"Tochigi, Japan","Guji Zone, Oromia Region, southern Ethiopia",Light,1280.0,2021-05-01,8,8,9,8,91,Sunny's Coffee,Ethiopia Guji Hambela Dabaye,https://www.coffeereview.com/review/ethiopia-guji-hambela-dabaye/,8,66,84,CNY,100.0,grams,38.4,0.38
3856,"London, Ontario, Canada",Nicaragua.,Medium,14.0,2011-06-01,8,8,8,8,89,Las Chicas Del Café,1971 Nicaragua Viennese Roast,https://www.coffeereview.com/review/1971-nicaragua-viennese-roast/,7,49,60,CAD,453.592,grams,10.36,0.02
387,"Yilan, Taiwan","Alishan growing region, Taiwan",Medium-Light,1500.0,2022-11-01,8,9,9,7,91,Grass Coffee Roaster,Alishan Natural,https://www.coffeereview.com/review/alishan-natural/,8,58,74,TWD,250.0,grams,45.0,0.18
3054,"Lexington, Virginia","Bungoma region, western Kenya.",Medium-Light,17.75,2014-07-01,9,9,9,9,95,Lexington Coffee Roasters,Kenya Kikai,https://www.coffeereview.com/review/kenya-kikai-2/,9,57,77,USD,340.194,grams,17.75,0.05


In [99]:
us_states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 
             'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 
             'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 
             'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 
             'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 
             'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 
             'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 
             'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 
             'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 
             'West Virginia', 'Wisconsin','Wyoming']

canadian_provinces = ["Alberta","British Columbia", "Manitoba","New Brunswick", 
                      "Newfoundland and Labrador", "Nova Scotia", "Ontario",
                      "Prince Edward Island", "Quebec", "Saskatchewan",
                      "Northwest Territories","Nunavut","Yukon"]

us_states = [s.lower() for s in us_states]
canadian_provinces = [p.lower() for p in canadian_provinces]   

# Process roaster location by splitting city, state, and country into their own columns.
# Fix typos and standardize.
locations = (roasts['roaster_location']
             .str.strip()
             .str.lower()
             .str.replace("’", "")
             .str.replace("'", "")
             .str.split(',', expand=True)
             .apply(lambda row: row.str.strip())
             .replace({'californiaa': 'california', 'calfornia': 'california',
                       'washingto': 'washington', 'virginia and floyd': 'virginia'})
            )

# Cleaning up typos and other small idiosyncracies
locations.loc[locations[0] == "branford connecticut", 1] = "connecticut"
locations.loc[locations[0] == "los angeles", 1] = "california"
locations.loc[locations[1] == 'd.c.', [1, 2]] = ['district of columbia', 'usa']
locations[2] = locations[2].fillna(locations[1])
#locations.loc[locations[2].str.contains('hawaii'), 1] = 'hawaii'
locations.loc[locations[1].isin(us_states),2 ]= 'usa'
locations.loc[locations[1].isin(canadian_provinces), 2] = 'canada'
locations[1] = locations[1].where(~(locations[1] == locations[2]), np.nan)
locations = locations.rename({0:'roaster_city', 1:'roaster_state', 2:'roaster_country'}, axis='columns')
locations.roaster_city.unique()
#roasts = pd.concat([roasts, locations], axis=1)
roasts.loc[locations[locations['roaster_city'] == 'minnesota'].index]

Unnamed: 0,roaster_location,coffee_origin,roast_level,price,review_date,aroma,body,flavor,aftertaste,rating,roaster,roast_name,url,acidity,agtron_external,agtron_ground,currency,quantity,unit,price_usd,price_per_g_usd,roaster_city,roaster_state,roaster_country
348,"Minnesota, Minnesota","Bombe Mountains, Sidama Region, Ethiopia",Light,25.0,2022-11-01,9,9,9,9,95,Paradise Roasters,Ethiopia Hamasho Washed,https://www.coffeereview.com/review/ethiopia-hamasho-washed/,9,62,81,USD,340.194,grams,25.0,0.07,minnesota,minnesota,usa
1628,"Minnesota, Minnesota",Madagascar,Medium-Light,19.95,2020-01-01,9,8,9,8,92,Paradise Roasters,Madagascar Yellow Bourbon Santatra Coop,https://www.coffeereview.com/review/madagascar-yellow-bourbon-santatra-coop/,8,58,78,USD,226.796,grams,19.95,0.09,minnesota,minnesota,usa


In [None]:
# Cleaning up coffee origin location. 
origin = (coffee["Origin"]
 .str.split(',').str[-1]
 .str.lower()
 .str.strip()
 .str.replace(r'[-"‘\'.\“\”\’]', '', regex=True)
 .str.replace('southern', '')
 .str.replace('south-central', '')
 .str.replace('western', '')
 .str.replace('southwest', '')
 .str.replace('central', '')
 .str.replace('northern', '')
 .str.replace('eastern', '')
 .str.replace('southern', '')
 .str.replace('south', '')
 .str.replace('north', '')
 .str.replace('west', '')
 .str.replace('east', '')
 .str.replace('far', '')
 .str.replace('papua', '')
 .str.replace('the', '')
 .str.strip()
)

origin.loc[origin.str.contains(';')] = 'multiple'
origin.loc[origin.str.contains('hawaii')] = 'hawaii'
origin.loc[origin.str.contains('ethiopia')] = 'ethiopia'
origin.loc[origin.str.contains('congo')] = 'democratic republic of the congo'
origin.loc[origin.str.contains('apaneca')] = 'el salvador'
origin.loc[origin.str.contains('sumatra')] = 'sumatra'
origin.loc[origin.str.contains('colombia')] = 'colombia'
origin = origin.replace({'america':'usa', '': np.nan, 'gedeo zone':'ethiopia',
                         'coastal california':'usa'})

coffee['Origin'] = origin
coffee.head()

Summary Statistics:

Calculate basic statistics like mean, median, and standard deviation for the 'Rating,' 'Aroma,' 'Acidity,' 'Body,' 'Flavor,' 'Aftertaste,' and 'Price' columns to get an overall understanding of the dataset.
Correlation Analysis:

Investigate the correlations between different attributes such as 'Rating,' 'Aroma,' 'Acidity,' 'Body,' 'Flavor,' and 'Aftertaste.' This can help identify which attributes tend to go together or have an impact on the overall rating.
Distribution Analysis:

Visualize the distributions of 'Rating,' 'Aroma,' 'Acidity,' 'Body,' 'Flavor,' and 'Aftertaste' using histograms or box plots to understand the spread of values.
Top Roasters and Coffees:

Identify the top-rated roasters and coffee names based on the 'Rating' column. This can help consumers find highly-rated options.
Time-Series Analysis:

If you have data for multiple years, analyze trends in coffee ratings over time. Are there any patterns or changes in preferences?
Geospatial Analysis:

Analyze the 'Roaster_Location' and 'Origin' columns to understand where the roasters are located and where the coffee beans are sourced from. You can use geospatial tools to create maps or investigate the relationship between origin and rating.
Currency Analysis:

Analyze the 'Currency' column to understand the currencies used for pricing. You can convert prices to a common currency (e.g., USD) for comparison.
Price Analysis:

Analyze the relationship between 'Price' and 'Rating.' Do higher-priced coffees tend to have higher ratings? You can also look for outliers in pricing.
Text Analysis:

Perform natural language processing (NLP) on the 'Review_Description,' 'Blind_Assessment,' and 'Notes' columns to extract insights about the sensory descriptions, flavor profiles, and unique characteristics of the coffees.
Website Analysis:

Analyze the 'Roaster_Website_URL' to explore which roasters have a strong online presence and whether this correlates with higher ratings or prices.
Quantity Analysis:

Investigate the 'Quantity' and 'Unit' columns to understand the different packaging sizes and units in which coffee is sold. Analyze how these factors relate to pricing and consumer preferences.
External Metrics:

Explore the 'Agtron_External' and 'Agtron_Ground' columns, which may contain external metrics related to coffee quality. Analyze how these metrics correlate with sensory ratings.
Market Basket Analysis:

Explore which attributes (e.g., 'Aroma,' 'Acidity,' 'Body,' 'Flavor') tend to co-occur in coffee reviews. This can help identify flavor profiles that are popular among consumers.
Pricing Strategy:

Investigate the relationship between pricing ('Price' and 'Currency') and sensory attributes ('Aroma,' 'Acidity,' 'Body,' 'Flavor,' 'Aftertaste'). Are there pricing strategies associated with higher ratings?
Consumer Segmentation:

Segment consumers based on their preferences and ratings. Are there distinct groups of consumers with similar taste profiles?
Roaster Performance:

Evaluate roasters' performance based on their ratings and the origin of the coffee beans. Are there specific regions or origins associated with higher ratings for particular roasters?
URL Analysis:

Analyze the 'Complete_Review_URL' and 'Roaster_Website_URL' columns for insights into the sources of reviews and roaster websites' availability and quality.
Remember that the choice of analysis depends on your specific objectives and questions you want to answer. Combining several of these analyses can provide a comprehensive understanding of the dataset and valuable insights for both consumers and coffee businesses.

### EDA ideas
- visualize histograms
- visualize correlations
- word clouds
- tokenize descriptions
- plot locations, roasters, roast level, agtron vs other characterisitcs
- cluster analysis
- standardize ratings?? 



In [None]:
num_vars = (coffee
                .select_dtypes(include='number')
                .drop(['Price', 'Price_USD', 'Quantity'], axis='columns')
            )

num_vars.describe()


In [None]:
sns.heatmap(num_vars.corr(), cmap='vlag', center=0, annot=True)

In [None]:
fig, axes = plt.subplots(len(num_vars.columns), 2, figsize=(8, 24), sharex=False)

for i, col in enumerate(num_vars.columns):
    
    data = num_vars[col]
    ax1 = axes[i, 0]
    ax1.hist(data, bins=15)
    ax1.set_title("Histogram of {}".format(col))
    
    ax2 = axes[i, 1]
    ax2.boxplot(data, vert=False)
    ax2.set_title("Boxplot of {}".format(col))
plt.tight_layout()
plt.show()

In [None]:
log_price_per_g = np.log(coffee['Price_USD_Per_G'])

In [None]:
log_price_per_g.hist(bins=40)

In [None]:
log_price_per_g[log_price_per_g < -1].hist(bins=20)