In [83]:
import os
import re

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [6]:

# Read in data processed in OpenRefine
FILENAME = "raw-roast-reviews-openrefine.csv"
FILEPATH = os.path.join("data", FILENAME)

df_raw = pd.read_csv(filepath_or_buffer=FILEPATH)

# Summarise columns, non-null counts and dtypes of the raw frame.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 25 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   roaster_location                                  7556 non-null   object 
 1   country                                           7553 non-null   object 
 2   coordinate location                               7726 non-null   object 
 3   located in the administrative territorial entity  8099 non-null   object 
 4   roaster_location_identifier                       7555 non-null   object 
 5   coffee_origin                                     7061 non-null   object 
 6   origin_country_cluster                            7628 non-null   object 
 7   roast_level                                       7167 non-null   object 
 8   agtron                                            7560 non-null   object 
 9   est_price          

In [30]:
def tweak_df(df):
    """Return a cleaned copy of the raw roast-review frame.

    Steps, in order:
      1. Turn the literal string 'na' into NaN everywhere.
      2. Drop rows missing origin cluster, roaster location, roast level,
         body or aroma.
      3. Derive ``origin_country`` (last comma-separated piece of
         ``origin_country_cluster``) and ``origin_region`` (everything
         before it), parse ``review_date`` ("%B %Y"), back-fill ``acidity``
         from ``acidity/structure``, and split ``agtron`` on '/' into
         ``agtron_external`` / ``agtron_ground``.
      4. Drop the now-redundant source columns and any row whose acidity
         is still missing.
      5. Rename the roaster columns and cast to the final dtypes.

    The input frame is not modified.
    """
    must_have = ['origin_country_cluster', 'roaster_location', 'roast_level', 'body', 'aroma']
    obsolete = ['acidity/structure', 'with_milk', 'bottom_line', 'agtron']
    renamed = {
        'country': 'roaster_country',
        'roaster_location': 'roaster_city',
        'located in the administrative territorial entity': 'roaster_region',
        'coordinate location': 'roaster_coordinates',
    }
    final_dtypes = {
        'agtron_external': 'Int64',
        'agtron_ground': 'Int64',
        'roast_level': 'category',
        'origin_country': 'category',
        'roaster_country': 'category',
        'acidity': 'float',
        'aftertaste': 'Int64',
        'aroma': 'float',
        'body': 'float',
        'rating': 'Int64',
        'flavor': 'float',
    }

    def _agtron_piece(frame, position, placeholders):
        # One side of the "a/b" agtron string, with placeholder tokens nulled.
        piece = frame['agtron'].str.split('/').str[position].str.strip()
        return piece.replace(placeholders, np.nan)

    present = df.replace('na', np.nan).dropna(subset=must_have)
    cluster_pieces = present['origin_country_cluster'].str.split(',')

    enriched = present.assign(
        origin_country=cluster_pieces.str[-1].str.strip(),
        origin_region=cluster_pieces.str[:-1].str.join(',').str.strip(),
        review_date=pd.to_datetime(present['review_date'], format="%B %Y"),
        acidity=present['acidity'].fillna(present['acidity/structure']),
        agtron_external=_agtron_piece(present, 0, ['', 'NA', 'g', '0']),
        agtron_ground=_agtron_piece(present, 1, ['', 'NA', 'wb', '0']),
    )

    # Rows where acidity is still NaN are espresso reviews; drop them.
    trimmed = enriched.drop(columns=obsolete).dropna(subset=['acidity'])

    return trimmed.rename(columns=renamed).astype(final_dtypes)

# Build the cleaned frame from the raw OpenRefine export; df_raw is left untouched.
df = df_raw.pipe(tweak_df)

In [126]:
# Cleaning up price and units
#
# `est_price` holds strings such as "$14.00/350 grams (12.3 ounces)".
# Everything after the first '/' or ';' is treated as the package amount and
# its unit spelling is normalised to "<number> ounces" / "<number> grams".
#
# Every pattern below states `regex=` explicitly: pandas 2.0 changed the
# default of `Series.str.replace` from regex=True to regex=False, so a bare
# 'oz.*' would be matched literally and silently do nothing.
quantity = (
    df['est_price']
    # Split once into price / amount; rows without a separator get a missing
    # amount and are removed by dropna().
    .str.split('[/;]', n=1, expand=True)
    .dropna()
    .rename(columns={0: 'price', 1: 'amount'})
    .loc[:, 'amount']
    # --- ounces: "oz" and anything following it, then misspellings/variants ---
    .str.replace('oz.*', 'ounces', regex=True)
    .str.replace('ouncues|onces|ounce$|-ounce|ounces.*$', 'ounces', regex=True)
    .str.replace(r'oz.', 'ounces', regex=False)
    # --- grams: the various "g" abbreviations ---
    .str.replace(r'g ', 'grams', regex=False)
    .str.replace(r'-gram', ' grams', regex=True)
    .str.replace(r'g. ', 'grams', regex=False)
    .str.replace(r' g.', ' grams', regex=False)
    .str.replace(r'g$', ' grams', regex=True)
    # --- strip parenthesised notes, stray ';' / '(' and any trailing "$..." ---
    .str.replace(r'\((.*?)\)', '', regex=True)
    .str.replace(r';|\(|\$.*$', '', regex=True)
    .str.replace('tin', '', regex=False)
    .str.replace('  ', ' ', regex=True)
    # --- one-off manual corrections for specific leftover strings ---
    .str.replace('8 18 grams pouches', '152 grams', regex=False)
    .str.replace('350 grams 12.3 ounces', '350 grams', regex=False)
    .str.strip()
    # Entries that mention one of these packaging keywords are set to NaN
    # (presumably products not sold by weight -- confirm against the data).
    .mask(lambda x: x.str.contains('capsule|packet|pods|vue|k-cups|sticks|tubes|cups|boxed|discs|can', case=False))
)

Unnamed: 0,0,1
0,250,grams
1,12,ounces
2,12,ounces
3,8,ounces
4,12,ounces
...,...,...
6456,12,ounces
6468,16,ounces
6470,12,ounces
6471,16,ounces


# Sanity check: list any entries still equal to the un-normalised
# '350 grams 12.3 ounces' string after the clean-up chain.
quantity.loc[quantity == '350 grams 12.3 ounces']

In [88]:
# Inspect the full cleaned record with index label 5934 (all columns).
df.loc[5934, :]

roaster_city                                                            San Jose
roaster_country                                         United States of America
roaster_coordinates                             37.304166666667,-121.87277777778
roaster_region                                                Santa Clara County
roaster_location_identifier                                               Q16553
coffee_origin                      Yirgacheffe growing region, southern Ethiopia
origin_country_cluster          Yirgacheffe growing region, Gedeo Zone, Ethiopia
roast_level                                                         Medium-Light
est_price                                         $14.00/350 grams (12.3 ounces)
review_date                                                  2011-07-01 00:00:00
aroma                                                                        9.0
body                                                                         7.0
flavor                      