# Coffee Review Cleaning and EDA

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
pd.set_option('display.max_columns', 50)


df = pd.read_json('data/raw-roast-reviews.json').replace('NA', np.nan)
df.sample(5)

Unnamed: 0,Roaster Location:,Coffee Origin:,Roast Level:,Agtron:,Est. Price:,Review Date:,Aroma:,Acidity/Structure:,Body:,Flavor:,Aftertaste:,rating,roaster,name,url,With Milk:,Acidity:
108,"Taoyuan, Taiwan","Bench-Maji Zone, Southern Ethiopia",Medium-Light,57/75,NT $375/8 ounces,July 2023,9,8.0,9\t\t\t\t\t\t,9,8.0,93,VERYTIME,Ethiopia Lucy Geisha,https://www.coffeereview.com/review/ethiopia-l...,,
5243,"Kaliningrad, Russia","Brazil; Sumatra and Flores, Indonesia.",Dark,33/39,,March 2011,8,,7\t\t\t\t\t\t,8,8.0,90,Soyuz Coffee Roasting,Paretto Divino,https://www.coffeereview.com/review/paretto-di...,9.0,
4236,"Bishop, California","Nyeri growing region, south-central Kenya.",Medium,50/62,$18.95/12 ounces,December 2013,9,,8\t\t\t\t\t\t,9,8.0,93,Mt. Whitney Coffee Roasters,Kenya Nyeri Tegu,https://www.coffeereview.com/review/kenya-nyer...,,9.0
5271,"Portland, Oregon","Huila Department, Colombia",Medium-Light,58/70,$10.50/250 grams (8.8 oz.),January 2011,8,,8\t\t\t\t\t\t,9,8.0,92,Coava Coffee Roasters,Monserrate Competition Blend,https://www.coffeereview.com/review/monserrate...,,9.0
1523,"Lloydminster, Saskatchewan, Canada","Volcan growing region, western Panama",Medium-Light,52/70,CAD $50/200 grams,February 2021,9,9.0,9\t\t\t\t\t\t,9,8.0,94,Prairie Lily Coffee,Panama Pacamara Natural,https://www.coffeereview.com/review/panama-pac...,,


In [20]:
def tweak_coffee(df):
    return (df
        .rename(columns = lambda c: c.replace(' ', '_'))
        .rename(columns = lambda c: c.replace(':', ''))
        .rename(columns = lambda c: c.lower().strip())
        .rename(columns = {'name': 'roast_name', 'est._price': 'price'})
        .drop_duplicates(subset=['roaster', 'roast_name'])
        .replace('NA', np.nan)
        .assign(Review_Date = pd.to_datetime(df['Review_Date'], format='%b %Y'),
                Acidity = lambda df_: df_['Acidity/Structure'].fillna(df_['Acidity']),
                Agtron_External = lambda df_: df_['Agtron'].str.split('/', expand=True)[0],
                Agtron_Ground = lambda df_: df_['Agtron'].str.split('/', expand=True)[1],
                Amount = lambda df_: df_['Price'].str.split('/', expand=True)[1],
                Price = lambda df_: df_['Price'].str.replace(',', '').str.split('/', expand=True)[0],
                )
    )
        
roasts = tweak_coffee(df)

roasts.sample(5)


Unnamed: 0,roaster_location,coffee_origin,roast_level,agtron,price,review_date,aroma,acidity/structure,body,flavor,aftertaste,rating,roaster,roast_name,url,with_milk,acidity
7464,"Shaker Heights, Ohio",,Medium-Dark,40/49,,January 1999,7,,6\t\t\t\t\t\t,6,,78,Kokopelli Coffee and Tea,Espresso,https://www.coffeereview.com/review/espresso/,,6.0
1099,"Chia-Yi, Taiwan","La Libertad, Huehuetenango Department, Guatemala",Light,62/80,NT $600/4 ounces,October 2021,9,9.0,9\t\t\t\t\t\t,9,9.0,95,Kakalove Cafe,Guatemala El Injerto Natural Legendary Gesha 7,https://www.coffeereview.com/review/guatemala-...,,
7398,"San Francisco, California",,Very Dark,17/18,,June 1999,7,,5\t\t\t\t\t\t,6,,80,Jeremiah's Pick Gourmet Coffee,Fogbuster Blend,https://www.coffeereview.com/review/fogbuster-...,,5.0
2210,"Madison, Wisconsin","Yirgacheffe growing region, southern Ethiopia",Medium-Light,52/70,$17.10/12 ounces,September 2019,9,,9\t\t\t\t\t\t,9,8.0,94,JBC Coffee Roasters,Kebele Village Espresso,https://www.coffeereview.com/review/18743/,9.0,
6560,"Portland, Oregon","Narino Department, southern Colombia",Medium,50/61,,February 2006,9,,6\t\t\t\t\t\t,8,8.0,91,Stumptown Coffee Roasters,Colombia La Balvanera,https://www.coffeereview.com/review/colombia-l...,,8.0


In [None]:
def split_price_currency(df):
    """Split Price column into a column of the value and the currency"""
    currency_pattern = r'\$?(\d+\.\d+|\d+)'
    price = df['Price']
    df['Price'] = price.apply(lambda x: re.search(currency_pattern, str(x)).group(1) if re.search(currency_pattern, str(x)) else None)
    df['Currency'] = price.apply(lambda x: re.sub(currency_pattern, '', str(x)) if re.search(currency_pattern, str(x)) else str(x)).str.strip()
    return df

def currency_clean(df):
    """Transform currency symbols to ISO 4217 codes, and clean up"""
    
    curr_map = {'#': 'GBP', '£':'GBP', '¥': 'CNY', '':'USD', 'pesos':'MXN', 'RMB':'CNY'}
    
    df.loc[(df['Currency'] == '$') | (df['Currency'].str.lower().str.contains('us', na=False)), 'Currency'] = 'USD'
    df.loc[df['Currency'].str.lower().str.contains('nt', na=False), 'Currency'] = 'TWD'
    df['Currency'] = (df['Currency']
                      .str.replace('$', '')
                      .replace(curr_map)
                      .str.strip()
                     )
    return df

def create_quantity_and_units(df):
    """Separate Amount column into a quantity and the units for that quantity. Then 
       converts all quantities to grams."""
    
    amount_pat = r'([\d.]+)\s*([a-zA-Z]+)'
    currency_pattern = r'(\d+\.\d+)'

    def convert_to_grams(df):
        POUND_TO_GRAM = 453.59
        OUNCE_TO_GRAM = 28.3495
        
        df.loc[df['Unit'] == 'ounces', 'Quantity'] = df['Quantity'] * OUNCE_TO_GRAM
        df.loc[df['Unit'] == 'pounds', 'Quantity'] = df['Quantity'] * POUND_TO_GRAM
        df['Unit'] = 'grams'
        
        return df
                        
    return (df
            .assign(Quantity = df['Amount'].str.extract(amount_pat)[0],
                     Unit = df['Amount'].str.extract(amount_pat)[1],
                    )
            .drop('Amount', axis=1)
            .astype({'Quantity': 'float'})
            .pipe(convert_to_grams)
            .dropna(subset=['Quantity'])
            )

def create_price_USD(df):
    """Create a column converting all prices to USD using recent exchange rates."""
    exchange_rate_map = {'TWD' : 0.03, 
                         'CNY': 0.03, 
                         'GBP': 1.27, 
                         'AED':0.27, 
                         'KRW': 0.000752,
                         'HKD':0.13, 
                         'CAD': 0.74,
                         'MXN': 0.06,
                         'IDR': 0.000065,
                         'AUD': 0.65,
                         'USD': 1.0}
    df['Price_USD'] = np.round(df['Price'] * df['Currency'].map(exchange_rate_map), 2)
    return df

def tweak_coffee(df):
    return (df
            .rename(columns = lambda c: c.replace(' ', '_'))
            .rename(columns = lambda c: c.replace(':', '').capitalize())
            .rename(columns = {'Coffee_Origin': 'Origin', 'Est._Price': 'Price', 'name':'Name'})
            .drop_duplicates(subset=['Roaster', 'Name'])
            .assign(Review_Date = pd.to_datetime(df['Review_Date'], format='%b %Y'),
                    Acidity = lambda df_: df_['Acidity/Structure'].fillna(df_['Acidity']),
                    Agtron_External = lambda df_: df_['Agtron'].str.split('/', expand=True)[0],
                    Agtron_Ground = lambda df_: df_['Agtron'].str.split('/', expand=True)[1],
                    Amount = lambda df_: df_['Price'].str.split('/', expand=True)[1],
                    Price = lambda df_: df_['Price'].str.replace(',', '').str.split('/', expand=True)[0],
                     )
            # Remove whitespace and tranform "NA" to NaN
            .applymap(lambda x: x.strip() if isinstance(x, str) else x)
            .applymap(lambda x: np.nan if x == 'NA' else x)
            .pipe(split_price_currency)
            .pipe(currency_clean)
            .drop(['Acidity/Structure', 'Agtron',], axis=1)
            .dropna(subset=['Acidity', 'Roast_Level', 'Aroma', 'Price', 'Bottom_Line', 
                            'Agtron_External', 'Agtron_Ground',])
            .pipe(create_quantity_and_units)
            .astype({'Roaster': 'category', 'Rating': 'int64', 'Roast_Level':'category',
                     'Aftertaste': 'int64', 'Aroma': 'int64', 'Body': 'int64',
                     'Flavor': 'int64', 'Currency':'category', 'Acidity': 'int64',
                     'Agtron_External': 'int64', 'Agtron_Ground':'int64', 'Price':'float',
                    'Unit': 'category'}
                   )
            .pipe(create_price_USD)
            # Agtron values must fall between 0 and 100. These values were incorrect from the source.
            .query('Agtron_External <= 100 and Agtron_Ground <= 100')
            .assign(Price_USD_Per_G = lambda df_: np.round(df_['Price_USD']/df_['Quantity'], 2))
            .reset_index(drop=True)
           ) 

coffee = tweak_coffee(df)
coffee.to_csv('data/coffee-reviews-clean.csv')
coffee.head()

In [None]:
us_states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 
             'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 
             'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 
             'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 
             'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 
             'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 
             'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 
             'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 
             'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 
             'West Virginia', 'Wisconsin','Wyoming']

canadian_provinces = ["Alberta","British Columbia", "Manitoba","New Brunswick", 
                      "Newfoundland and Labrador", "Nova Scotia", "Ontario",
                      "Prince Edward Island", "Quebec", "Saskatchewan",
                      "Northwest Territories","Nunavut","Yukon"]

us_states = [s.lower() for s in us_states]
canadian_provinces = [p.lower() for p in canadian_provinces]   

# Process roaster location by splitting city, state, and country into their own columns.
# Fix typos and standardize.
locations = (coffee['Roaster_Location']
             .str.strip()
             .str.lower()
             .str.replace("’", "")
             .str.replace("'", "")
             .str.split(',', expand=True)
             .apply(lambda row: row.str.strip())
             .replace({'californiaa': 'california', 'calfornia': 'california',
                       'washingto': 'washington', 'virginia and floyd': 'virginia'})
            )

# Cleaning up typos and other small idiosyncracies
locations.loc[locations[0] == "branford connecticut", 1] = "connecticut"
locations.loc[locations[0] == "los angeles", 1] = "california"
locations.loc[locations[1] == 'd.c.', [1, 2]] = ['district of columbia', 'usa']
locations[2] = locations[2].fillna(locations[1])
locations.loc[locations[2].str.contains('hawaii'), 1] = 'hawaii'
locations.loc[locations[1].isin(us_states),2 ]= 'usa'
locations.loc[locations[1].isin(canadian_provinces), 2] = 'canada'
locations[1] = locations[1].where(~(locations[1] == locations[2]), np.nan)
locations = locations.rename({0:'roaster_city', 1:'roaster_state', 2:'roaster_country'}, axis='columns')

coffee = pd.concat([coffee, locations], axis=1)
coffee.head()

In [None]:
# Cleaning up coffee origin location. 
origin = (coffee["Origin"]
 .str.split(',').str[-1]
 .str.lower()
 .str.strip()
 .str.replace(r'[-"‘\'.\“\”\’]', '', regex=True)
 .str.replace('southern', '')
 .str.replace('south-central', '')
 .str.replace('western', '')
 .str.replace('southwest', '')
 .str.replace('central', '')
 .str.replace('northern', '')
 .str.replace('eastern', '')
 .str.replace('southern', '')
 .str.replace('south', '')
 .str.replace('north', '')
 .str.replace('west', '')
 .str.replace('east', '')
 .str.replace('far', '')
 .str.replace('papua', '')
 .str.replace('the', '')
 .str.strip()
)

origin.loc[origin.str.contains(';')] = 'multiple'
origin.loc[origin.str.contains('hawaii')] = 'hawaii'
origin.loc[origin.str.contains('ethiopia')] = 'ethiopia'
origin.loc[origin.str.contains('congo')] = 'democratic republic of the congo'
origin.loc[origin.str.contains('apaneca')] = 'el salvador'
origin.loc[origin.str.contains('sumatra')] = 'sumatra'
origin.loc[origin.str.contains('colombia')] = 'colombia'
origin = origin.replace({'america':'usa', '': np.nan, 'gedeo zone':'ethiopia',
                         'coastal california':'usa'})

coffee['Origin'] = origin
coffee.head()

Summary Statistics:

Calculate basic statistics like mean, median, and standard deviation for the 'Rating,' 'Aroma,' 'Acidity,' 'Body,' 'Flavor,' 'Aftertaste,' and 'Price' columns to get an overall understanding of the dataset.
Correlation Analysis:

Investigate the correlations between different attributes such as 'Rating,' 'Aroma,' 'Acidity,' 'Body,' 'Flavor,' and 'Aftertaste.' This can help identify which attributes tend to go together or have an impact on the overall rating.
Distribution Analysis:

Visualize the distributions of 'Rating,' 'Aroma,' 'Acidity,' 'Body,' 'Flavor,' and 'Aftertaste' using histograms or box plots to understand the spread of values.
Top Roasters and Coffees:

Identify the top-rated roasters and coffee names based on the 'Rating' column. This can help consumers find highly-rated options.
Time-Series Analysis:

If you have data for multiple years, analyze trends in coffee ratings over time. Are there any patterns or changes in preferences?
Geospatial Analysis:

Analyze the 'Roaster_Location' and 'Origin' columns to understand where the roasters are located and where the coffee beans are sourced from. You can use geospatial tools to create maps or investigate the relationship between origin and rating.
Currency Analysis:

Analyze the 'Currency' column to understand the currencies used for pricing. You can convert prices to a common currency (e.g., USD) for comparison.
Price Analysis:

Analyze the relationship between 'Price' and 'Rating.' Do higher-priced coffees tend to have higher ratings? You can also look for outliers in pricing.
Text Analysis:

Perform natural language processing (NLP) on the 'Review_Description,' 'Blind_Assessment,' and 'Notes' columns to extract insights about the sensory descriptions, flavor profiles, and unique characteristics of the coffees.
Website Analysis:

Analyze the 'Roaster_Website_URL' to explore which roasters have a strong online presence and whether this correlates with higher ratings or prices.
Quantity Analysis:

Investigate the 'Quantity' and 'Unit' columns to understand the different packaging sizes and units in which coffee is sold. Analyze how these factors relate to pricing and consumer preferences.
External Metrics:

Explore the 'Agtron_External' and 'Agtron_Ground' columns, which may contain external metrics related to coffee quality. Analyze how these metrics correlate with sensory ratings.
Market Basket Analysis:

Explore which attributes (e.g., 'Aroma,' 'Acidity,' 'Body,' 'Flavor') tend to co-occur in coffee reviews. This can help identify flavor profiles that are popular among consumers.
Pricing Strategy:

Investigate the relationship between pricing ('Price' and 'Currency') and sensory attributes ('Aroma,' 'Acidity,' 'Body,' 'Flavor,' 'Aftertaste'). Are there pricing strategies associated with higher ratings?
Consumer Segmentation:

Segment consumers based on their preferences and ratings. Are there distinct groups of consumers with similar taste profiles?
Roaster Performance:

Evaluate roasters' performance based on their ratings and the origin of the coffee beans. Are there specific regions or origins associated with higher ratings for particular roasters?
URL Analysis:

Analyze the 'Complete_Review_URL' and 'Roaster_Website_URL' columns for insights into the sources of reviews and roaster websites' availability and quality.
Remember that the choice of analysis depends on your specific objectives and questions you want to answer. Combining several of these analyses can provide a comprehensive understanding of the dataset and valuable insights for both consumers and coffee businesses.

### EDA ideas
- visualize histograms
- visualize correlations
- word clouds
- tokenize descriptions
- plot locations, roasters, roast level, agtron vs other characterisitcs
- cluster analysis
- standardize ratings?? 



In [None]:
num_vars = (coffee
                .select_dtypes(include='number')
                .drop(['Price', 'Price_USD', 'Quantity'], axis='columns')
            )

num_vars.describe()


In [None]:
sns.heatmap(num_vars.corr(), cmap='vlag', center=0, annot=True)

In [None]:
fig, axes = plt.subplots(len(num_vars.columns), 2, figsize=(8, 24), sharex=False)

for i, col in enumerate(num_vars.columns):
    
    data = num_vars[col]
    ax1 = axes[i, 0]
    ax1.hist(data, bins=15)
    ax1.set_title("Histogram of {}".format(col))
    
    ax2 = axes[i, 1]
    ax2.boxplot(data, vert=False)
    ax2.set_title("Boxplot of {}".format(col))
plt.tight_layout()
plt.show()

In [None]:
log_price_per_g = np.log(coffee['Price_USD_Per_G'])

In [None]:
log_price_per_g.hist(bins=40)

In [None]:
log_price_per_g[log_price_per_g < -1].hist(bins=20)