In [269]:
import os
import re

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [270]:

# Read in data processed in OpenRefine
FILENAME = "raw-roast-reviews-openrefine.csv"
FILEPATH = os.path.join("data", FILENAME)  # path is relative to the notebook's working directory
df_raw = pd.read_csv(FILEPATH)
df_raw.info()  # print column names, non-null counts and dtypes of the raw frame

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 25 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   roaster_location                                  7556 non-null   object 
 1   country                                           7553 non-null   object 
 2   coordinate location                               7726 non-null   object 
 3   located in the administrative territorial entity  8099 non-null   object 
 4   roaster_location_identifier                       7555 non-null   object 
 5   coffee_origin                                     7061 non-null   object 
 6   origin_country_cluster                            7628 non-null   object 
 7   roast_level                                       7167 non-null   object 
 8   agtron                                            7560 non-null   object 
 9   est_price          

In [271]:
def tweak_df(df):
    """Clean the raw review table and return a new, typed DataFrame.

    The input frame is not modified. Processing steps, in order:

    1. Treat the literal string 'na' as a missing value.
    2. Drop rows that lack an origin cluster, roaster location, roast level,
       body score or aroma score.
    3. Derive ``origin_country`` (last comma-separated piece of
       ``origin_country_cluster``) and ``origin_region`` (everything before it).
    4. Parse ``review_date`` using the "%B %Y" format.
    5. Fill missing ``acidity`` values from ``acidity/structure``.
    6. Split ``agtron`` on '/' into ``agtron_external`` (first piece) and
       ``agtron_ground`` (second piece), blanking placeholder tokens.
    7. Drop the columns that are no longer needed, drop rows whose acidity is
       still missing, rename the roaster columns and cast the dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw review data as read from the OpenRefine export.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    required_cols = ['origin_country_cluster', 'roaster_location', 'roast_level', 'body', 'aroma']
    reviews = df.replace('na', np.nan).dropna(subset=required_cols)

    # Both derived pairs come from a single split of their source column.
    cluster_parts = reviews['origin_country_cluster'].str.split(',')
    agtron_parts = reviews['agtron'].str.split('/')

    # Insertion order matters: brand-new columns are appended in this order.
    derived = {
        'origin_country': cluster_parts.str[-1].str.strip(),
        'origin_region': cluster_parts.str[:-1].str.join(',').str.strip(),
        'review_date': pd.to_datetime(reviews['review_date'], format="%B %Y"),
        'acidity': reviews['acidity'].fillna(reviews['acidity/structure']),
        'agtron_external': (agtron_parts.str[0].str.strip()
                            .replace(['', 'NA', 'g', '0'], np.nan)),
        'agtron_ground': (agtron_parts.str[1].str.strip()
                          .replace(['', 'NA', 'wb', '0'], np.nan)),
    }
    reviews = reviews.assign(**derived)

    reviews = reviews.drop(columns=['acidity/structure', 'with_milk', 'bottom_line', 'agtron'])

    # Rows where acidity is still NaN are espresso reviews; leave them out.
    reviews = reviews.dropna(subset=['acidity'])

    new_names = {
        'country': 'roaster_country',
        'roaster_location': 'roaster_city',
        'located in the administrative territorial entity': 'roaster_region',
        'coordinate location': 'roaster_coordinates',
    }
    reviews = reviews.rename(columns=new_names)

    # Nullable Int64 keeps missing values in the integer-valued columns.
    target_dtypes = {
        'agtron_external': 'Int64',
        'agtron_ground': 'Int64',
        'roast_level': 'category',
        'origin_country': 'category',
        'roaster_country': 'category',
        'acidity': 'float',
        'aftertaste': 'Int64',
        'aroma': 'float',
        'body': 'float',
        'rating': 'Int64',
        'flavor': 'float',
    }
    return reviews.astype(target_dtypes)

# Build the cleaned frame; tweak_df returns a new frame, so df_raw stays as loaded.
df = tweak_df(df_raw)

In [272]:
# Cleaning up price and units
# Take the part of est_price after the first '/' or ';' (the package amount) and
# normalize its unit spelling towards 'ounces' / 'grams'. The replacements run
# strictly top to bottom and later rules act on the output of earlier ones, so
# their order must not be changed casually.
quantity = (df['est_price']
 .str.split('[/;]', n=1, expand=True)
 # keep only rows where both a price part and an amount part are present
 .dropna()
 .rename(columns={0: 'price', 1: 'amount'})
 .loc[:, 'amount']
 # NOTE(review): no explicit `regex=` on the next call. pandas >= 2.0 defaults to a
 # literal match (so only the exact text 'oz.*' would be replaced), while older
 # versions treated the pattern as a regular expression. Confirm the intended
 # behavior and pin `regex=` explicitly.
 .str.replace('oz.*', 'ounces')
 # misspellings / singular / hyphenated forms of "ounces"; also drops anything after "ounces"
 .str.replace('ouncues|onces|ounce$|-ounce|ounces.*$', 'ounces', regex=True)
 # literal "oz." (regex=False, so the dot is not a wildcard)
 .str.replace(r'oz.', 'ounces', regex=False)
 # the following rules spell out the various "g" abbreviations as "grams"
 .str.replace(r'g ', 'grams', regex=False)
 .str.replace(r'-gram', ' grams', regex=True)
 .str.replace(r'g. ', 'grams', regex=False)
 .str.replace(r' g.', ' grams', regex=False)
 .str.replace(r'g$', ' grams', regex=True)
 # remove parenthesised remarks
 .str.replace(r'\((.*?)\)', '', regex=True)
 # remove semicolons, stray opening parentheses, and everything from a '$' onwards
 .str.replace(r';|\(|\$.*$', '', regex=True)
 .str.replace('tin', '', regex=False)
 # collapse a double space into a single space
 .str.replace('  ', ' ', regex=True)
 # hand-written fixes for two specific entries
 .str.replace('8 18 grams pouches', '152 grams', regex=False)
 .str.replace('350 grams 12.3 ounces', '350 grams', regex=False)
 .str.strip()
 # amounts that mention one of these packaging words (case-insensitive) are set to NaN
 .mask(lambda x: x.str.contains('capsule|packet|pods|vue|k-cups|sticks|tubes|cups|boxed|discs|can', case=False), np.nan)
 )

In [273]:
# Split each cleaned amount string into a `quantity` column (cast to float) and a
# `unit` column.
quantity_unit = (
    quantity
    .str.split(' ', expand=True)
    .rename(columns={0: 'quantity', 1: 'unit'})
    # An amount that is just the word 'pound' (no number) means one pound:
    # set quantity to 1 and unit to 'pounds'. The plural spelling is used so the
    # unit matches the 'pounds' key of the grams_conversions table.
    # `unit` is assigned first, so it still sees the original 'pound' marker in
    # `quantity` before that marker is replaced by 1.
    .assign(unit=lambda df_: df_['unit'].mask(df_['quantity'] == 'pound', 'pounds'),
            quantity=lambda df_: df_['quantity'].mask(df_['quantity'] == 'pound', 1))
    .astype({'quantity': 'float'})
)

# NA values in quantity_unit arise because the original est_price column contained
# values such as cans, boxes, packets, etc., which we dropped for the purpose of this analysis.

# Number of grams in one unit of each weight unit, keyed by the unit's plural name.
grams_conversions = {'ounces': 28.3495, 'pounds': 453.592, 'grams': 1, 'kilograms': 1000}



In [289]:
# The price part of est_price: everything before the first '/' or ';',
# with commas removed. Shared by the two extractions below.
_price_text = (
    df['est_price']
    .str.split('[/;]', n=1, expand=True)
    .loc[:, 0]
    .str.replace(',', '', regex=False)
)

# First number (decimal or integer) found in the price text; still a string here.
price = _price_text.str.extract(r'(\d+\.\d+|\d+)').loc[:, 0]

# What remains once every number is removed: the currency marker, with all
# spaces stripped out.
currency = (
    _price_text
    .str.replace(r'(\d+\.\d+|\d+)', '', regex=True)
    .str.strip()
    .str.replace(' ', '')
)



In [290]:
# Inspect the distinct currency markers extracted from est_price.
currency.unique()

array(['CNY', '$', 'NT$', 'GBP', 'HKD$', 'PuertoRico', '#', 'Nt$', 'RMB',
       'AED$', 'KRW$', '£', '$NT$', 'CAD$', 'NA(availableinstoreonly)',
       nan, '¥', 'pesos', 'USD$', 'US$', '$CAD', 'IDR$', 'AUD$',
       'Seewebsiteformoreinformation', '$NT', '', 'NTD$', 'NT', 'KRW',
       'RMB$', '$NTD', 'CNY$', 'Seereviewnote', 'THB$', 'TWD$', 'HK$',
       'E', 'GTQ', 'Notdisclosed.', 'NotAvailable', 'Price:$', 'THB'],
      dtype=object)

In [291]:
# Lookup table from a raw currency marker (as extracted from est_price) to a
# three-letter currency code. Markers that are not listed here, such as '#',
# 'PuertoRico' or the free-text entries, have no mapping.
# NOTE(review): '¥' is mapped to JPY, but the symbol is also used for the Chinese
# yuan; confirm against the roaster country of the affected rows.
currency_map = {
    'CNY': 'CNY',
    '$': 'USD',
    'NT$': 'TWD',
    'GBP': 'GBP',
    'HKD$': 'HKD',
    'Nt$': 'TWD',
    'RMB': 'CNY',
    'AED$': 'AED',
    'KRW$': 'KRW',
    '£': 'GBP',
    '$NT$': 'TWD',
    'CAD$': 'CAD',
    '¥': 'JPY',
    'pesos': 'MXN',
    'USD$': 'USD',
    'US$': 'USD',
    '$CAD': 'CAD',
    'IDR$': 'IDR',
    'AUD$': 'AUD',
    '$NT': 'TWD',
    'NTD$': 'TWD',
    'NT': 'TWD',
    'KRW': 'KRW',
    'RMB$': 'CNY',
    '$NTD': 'TWD',
    'CNY$': 'CNY',
    'THB$': 'THB',
    'TWD$': 'TWD',
    'HK$': 'HKD',
    'E': 'EUR',
    'GTQ': 'GTQ',
    'Price:$': 'USD',
    'THB': 'THB',
}


SyntaxError: ':' expected after dictionary key (4284738981.py, line 7)

In [304]:
# Look at the reviews behind the 'E' currency marker. This is a plain substring
# match on est_price, so prices containing 'AED' are returned as well.
df[df.est_price.str.contains('E', na=False)]

Unnamed: 0,roaster_city,roaster_country,roaster_coordinates,roaster_region,roaster_location_identifier,coffee_origin,origin_country_cluster,roast_level,est_price,review_date,...,roaster,name,blind_assessment,notes,url,acidity,origin_country,origin_region,agtron_external,agtron_ground
407,Dubai,United Arab Emirates,"25.269722222222,55.309444444444",Emirate of Dubai,Q612,"Odo Shakiso District, Guji Zone, southern Ethi...","Guji growing region, Oromia Region, Ethiopia",Medium-Light,AED $99.75/250 grams,2023-06-01,...,Jebena Coffees,Ethiopia Kayon Mountain,"Crisp, richly sweet. Cocoa nib, dried nectarin...",Produced by the Hassen family at their estate ...,https://www.coffeereview.com/review/ethiopia-k...,9.0,Ethiopia,"Guji growing region, Oromia Region",59,76
2673,Dubai,United Arab Emirates,"25.269722222222,55.309444444444",Emirate of Dubai,Q612,"Guji Zone, southern Ethiopia","Guji growing region, Oromia Region, Ethiopia",Medium-Light,AED $103.95/250 grams,2019-09-01,...,Jebena Coffees,Ethiopia Guji Girma Natural,"Crisp, richly sweet. Cocoa nib, apricot, almon...",Southern Ethiopia coffees like this one are la...,https://www.coffeereview.com/review/ethiopia-g...,8.0,Ethiopia,"Guji growing region, Oromia Region",58,76
2680,Dubai,United Arab Emirates,"25.269722222222,55.309444444444",Emirate of Dubai,Q612,"Limu growing region, southern Ethiopia","Limu growing region, Oromia Region, Ethiopia",Medium-Light,AED $99.75/250 grams,2019-09-01,...,Jebena Coffees,Ethiopia Limu Lema Edeto,"Berry-toned, chocolaty. Chocolate fudge, dried...",Limu (also spelled Limmu) is a growing region ...,https://www.coffeereview.com/review/ethiopia-l...,8.0,Ethiopia,"Limu growing region, Oromia Region",58,76
3054,Dubai,United Arab Emirates,"25.269722222222,55.309444444444",Emirate of Dubai,Q612,"Sidama growing region, southern Ethiopia","Sidama growing region, Oromia Region, Ethiopia",Medium-Light,AED $103.95/250 grams,2018-10-01,...,Jebena Coffees,Ethiopia Sidama Sasaba Natural,"Crisply sweet, rich-toned. Dried apricot, haze...",Sidama coffees like this one are produced from...,https://www.coffeereview.com/review/ethiopia-s...,8.0,Ethiopia,"Sidama growing region, Oromia Region",55,79
3525,Dubai,United Arab Emirates,"25.269722222222,55.309444444444",Emirate of Dubai,Q612,"LImmu Woreda, Oromia Zone, southern Ethiopia","Limu growing region, Oromia Region, Ethiopia",Medium-Light,AED $95.00/250 grams,2017-09-01,...,Jebena Coffees,Ethiopia Limmu Organic Kebena Kossa,"Sweetly tart, fruit-toned. Dried strawberry, s...",Certified organic. Limmu (also Limu) is a grow...,https://www.coffeereview.com/review/ethiopia-l...,8.0,Ethiopia,"Limu growing region, Oromia Region",58,78
5025,Utrecht,Netherlands,"52.090833333333336,5.121666666666667",Utrecht,Q803,"Northern Sumatra, Indonesia","Sumatra, Indonesia",Medium-Light,E 50.00/250 grams,2013-12-01,...,The Bantuan Coffee Foundation,Arabica Wild Kopi Luwak,"Richly pungent. Sandalwood, apricot, dark choc...","A version of the famous Kopi Luwak, the world'...",https://www.coffeereview.com/review/arabica-wi...,8.0,Indonesia,Sumatra,55,70
