In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [151]:
# Load the review export that was pre-processed in OpenRefine.
# The path is relative to the notebook's working directory.
FILENAME = "raw-roast-reviews-openrefine.csv"
FILEPATH = os.path.join("data", FILENAME)
df_raw = pd.read_csv(filepath_or_buffer=FILEPATH)

# Column overview of the raw frame: non-null counts and dtypes.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 25 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   roaster_location                                  7556 non-null   object 
 1   country                                           7553 non-null   object 
 2   coordinate location                               7726 non-null   object 
 3   located in the administrative territorial entity  8099 non-null   object 
 4   roaster_location_identifier                       7555 non-null   object 
 5   coffee_origin                                     7061 non-null   object 
 6   origin_country_cluster                            7628 non-null   object 
 7   roast_level                                       7167 non-null   object 
 8   agtron                                            7560 non-null   object 
 9   est_price          

In [194]:

def tweak_df(df):
    """Return a cleaned, typed copy of the raw review frame.

    The input frame is not modified. Processing order:

    1. Treat the literal string 'na' as missing and drop rows that lack any
       of: origin_country_cluster, roaster_location, roast_level, body, aroma.
    2. Derive ``origin_country`` (last comma-separated token of
       ``origin_country_cluster``) and ``origin_region`` (everything before
       that token; an empty string when there is only one token).
    3. Parse ``review_date`` with the "%B %Y" format and fill missing
       ``acidity`` values from ``acidity/structure``.
    4. Split ``agtron`` on '/' into ``agtron_external`` (first part) and
       ``agtron_ground`` (second part), turning known placeholder tokens
       into NaN.
    5. Drop the columns that are no longer needed, drop rows whose
       ``acidity`` is still missing, rename the roaster columns and cast
       to the final dtypes.
    """
    required_cols = ['origin_country_cluster', 'roaster_location', 'roast_level', 'body', 'aroma']
    obsolete_cols = ['acidity/structure', 'with_milk', 'bottom_line', 'agtron']
    new_names = {
        'country': 'roaster_country',
        'roaster_location': 'roaster_city',
        'located in the administrative territorial entity': 'roaster_region',
        'coordinate location': 'roaster_coordinates',
    }
    final_dtypes = {
        'agtron_external': 'Int64',
        'agtron_ground': 'Int64',
        'roast_level': 'category',
        'origin_country': 'category',
        'roaster_country': 'category',
        'acidity': 'float',
        'aftertaste': 'Int64',
        'aroma': 'float',
        'body': 'float',
        'rating': 'Int64',
        'flavor': 'float',
    }

    # Stage 1: normalise the 'na' marker and keep only rows with the required fields.
    complete = df.replace('na', np.nan).dropna(subset=required_cols)

    # Stage 2: tokenise the two composite text columns once each.
    cluster_tokens = complete['origin_country_cluster'].str.split(',')
    agtron_tokens = complete['agtron'].str.split('/')

    # Stage 3: add / overwrite columns (keyword order fixes the order of the new columns).
    enriched = complete.assign(
        origin_country=cluster_tokens.str[-1].str.strip(),
        origin_region=cluster_tokens.str[:-1].str.join(',').str.strip(),
        review_date=pd.to_datetime(complete['review_date'], format="%B %Y"),
        acidity=complete['acidity'].fillna(complete['acidity/structure']),
        agtron_external=agtron_tokens.str[0].str.strip().replace(['', 'NA', 'g', '0'], np.nan),
        agtron_ground=agtron_tokens.str[1].str.strip().replace(['', 'NA', 'wb', '0'], np.nan),
    )

    # Stage 4: prune columns, then drop rows where acidity is still NaN
    # (these are espresso reviews).
    scored = enriched.drop(columns=obsolete_cols).dropna(subset=['acidity'])

    # Stage 5: final column names and dtypes.
    return scored.rename(columns=new_names).astype(final_dtypes)

# Run the cleaning pipeline on the raw frame and preview the first rows.
df = df_raw.pipe(tweak_df)
df.head(5)


Unnamed: 0,roaster_city,roaster_country,roaster_coordinates,roaster_region,roaster_location_identifier,coffee_origin,origin_country_cluster,roast_level,est_price,review_date,...,roaster,name,blind_assessment,notes,url,acidity,origin_country,origin_region,agtron_external,agtron_ground
0,Hong Kong,People's Republic of China,"22.278333333333,114.15861111111",People's Republic of China,Q8646,multiple,multiple,Medium-Light,CNY 160/250 grams,2023-11-01,...,Nodi Coffee,Windsor Blend,"Gently fruit-toned, delicately floral. Apricot...","A blend of two coffees, one Ethiopia and one R...",https://www.coffeereview.com/review/windsor-bl...,8.0,multiple,,60,78
1,Madison,United States of America,"43.07472222222222,-89.38416666666667",Dane County,Q43788,"Nyeri growing region, south-central Kenya","Nyeri growing region, Kenya",Medium-Light,$22.00/12 ounces,2023-12-01,...,JBC Coffee Roasters,Nyeri Hill Kenya,"Richly sweet-tart. Blackberry compote, cocoa n...","Produced at Nyeri Hill Estate, from trees of t...",https://www.coffeereview.com/review/nyeri-hill...,9.0,Kenya,Nyeri growing region,56,74
2,Plymouth,United States of America,"41.95861111111111,-70.66777777777777",Plymouth County,Q326295,"Monte Verde, Santa Ana Department, central El ...","Apaneca-Ilamatepec growing region, El Salvador",Medium-Light,$16.00/12 ounces,2023-12-01,...,Speedwell Coffee,El Salvador Los Cipreses,"Richly sweet, floral-toned. Nutella, magnolia,...",Produced by Rene Contreras on the family farm ...,https://www.coffeereview.com/review/el-salvado...,9.0,El Salvador,Apaneca-Ilamatepec growing region,58,74
3,Madison,United States of America,"43.07472222222222,-89.38416666666667",Dane County,Q43788,"Tarrazu, Costa Rica","Tarrazú growing region, Costa Rica",Medium-Light,$18.00/8 ounces,2023-12-01,...,JBC Coffee Roasters,Cordillera de Fuego Costa Rica,"Richly sweet, deeply spice-toned. Cinnamon, cl...","Produced by Luis Eduardo Campos, from trees of...",https://www.coffeereview.com/review/cordillera...,9.0,Costa Rica,Tarrazú growing region,59,77
4,Peoria,United States of America,"40.6925,-89.59",Peoria County,Q233129,"Los Robles de Naranjo, West Valley, Costa Rica","West Valley growing region, Costa Rica",Medium-Light,$22.00/12 ounces,2023-12-01,...,Intuition Coffee,Costa Rica Finca Tono Natural,"Gently fruity, richly nut-toned. Hazelnut butt...","Produced by Felipe and Erasmo Aguilera, entire...",https://www.coffeereview.com/review/costa-rica...,8.0,Costa Rica,West Valley growing region,57,76
