In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

mpl.rcParams['figure.dpi']= 300

In [2]:
data_dir = Path('../../data')
file_path = data_dir / 'interim' / '05052024_roast_review_merged_openrefine.csv'
df = pd.read_csv(file_path)

df.sample(3)

Unnamed: 0,rating,roaster,title,blind_assessment,bottom_line,roaster_location,coordinate location,og_roaster_location,roaster_location_identifier,territorial_entity_1,...,review_date,aroma,body,flavor,aftertaste,with_milk,url,acidity/structure,acidity,notes
5671,91,Klatch Coffee,Ethiopian Worka,"Co-cupper Miguel Meza (91) calls this coffee ""...",,San Dimas,"34.1066756,-117.8067257","San Dimas, California",Q923845,Los Angeles County,...,November 2007,9.0,7.0,9.0,8.0,,https://www.coffeereview.com/review/ethiopian-...,,8.0,"A dry-processed or ""natural"" coffee, meaning t..."
3297,94,JBC Coffee Roasters,Adame Gorbota Ethiopia,"Delicate, rich, sweetly floral. Tangerine blos...",,Madison,"43.07472222222222,-89.38416666666667","Madison, Wisconsin",Q43788,Dane County,...,February 2016,9.0,8.0,9.0,9.0,,https://www.coffeereview.com/review/adame-gorb...,,9.0,Yirgacheffe is a coffee region in southern Eth...
268,94,Big Shoulders Coffee,Colombia Wilton Benitez Thermal Shock Caturra,"Complex, fruit-toned, rich. Juicy Fruit gum, g...",A medley of both tropical and candy-like fruit...,Chicago,"41.85003,-87.65005","Chicago, Illinois",Q1297,Cook County,...,November 2023,9.0,9.0,9.0,8.0,,https://www.coffeereview.com/review/colombia-w...,9.0,,Produced by Wilton Benitez of Granja Paraiso 9...


In [3]:
def tweak_coffee(df: pd.DataFrame) -> pd.DataFrame:
    return(
        df
        .dropna(subset=['review_date', 'est_price', 'coffee_origin', 'aroma', 'roast_level', 'aftertaste',])
        .assign(review_date = lambda df_: pd.to_datetime(df_['review_date'], format="%B %Y"),
                acidity = lambda df_: df_['acidity'].fillna(df_['acidity/structure']),
                agtron_external = lambda df_: (df_['agtron']
                                                       .str.split('/')
                                                       .str[0].str.strip()
                                                       .replace(['', 'NA', 'g', '0'], np.nan)
                                                       ),
                agtron_ground = lambda df_: (df_['agtron']
                                                     .str.split('/')
                                                     .str[1].str.strip()
                                                     .replace(['', 'NA', 'wb', '0'], np.nan)
                                                     )
        )
        .dropna(subset=['agtron_external', 'agtron_ground', 'acidity'])
        .astype({'agtron_external': 'float', 'agtron_ground': 'float', 'acidity': 'float'})
        .replace('', np.nan)
    )


In [4]:
coffee_df = tweak_coffee(df)
coffee_df.sample(3)

coffee_df.review_date.drop_duplicates().to_csv(data_dir / 'processed' / 'review_dates.csv', index=False)

## Quantity and Price Data Cleaning

In [5]:
def price_quantity_split(df: pd.DataFrame) -> pd.DataFrame:
    price_quantity = (
        df
        .est_price.str.split("/", n=1, expand=True)
        .rename(columns={0: 'price', 1: 'quantity'})
        .assign(quantity = lambda df_: (df_['quantity']
                                        .str.replace(r"\(.*?\)", "", regex=True)
                                        .str.replace(r";.*", "", regex=True)
                                        .str.replace(".", "").str.strip()
                                        .str.replace(r".g$", " grams", regex=True)
                                        .str.replace(r"\sg$", "grams", regex=True)
                                        .str.replace(r"pound$", "1 pounds", regex=True)
                                        .str.replace(r"oz|onces|ounce$|ounces\*", "ounces", regex=True)
                                        .str.replace("online", "")
                                        .str.strip()
                                        ),
                price = lambda df_: df_['price']
            )
        .dropna()
        .loc[lambda df_: ~df_['quantity'].str.contains('can|box|pouch|packet|tin'), :]
        .assign(quantity_value = lambda df_: (df_['quantity'].str.extract(r'(\d+)').astype(float)),
                quantity_unit = lambda df_: df_['quantity'].str.replace(r"(\d+)", "", regex=True).str.strip(),
                price_value = lambda df_: df_['price'].str.extract(r'(\d+\.\d+|\d+)').astype(float),
                price_currency = lambda df_: df_['price'].str.replace(",", "").str.replace(r'(\d+\.\d+|\d+)', '', regex=True).str.strip()
                )
        .drop(columns=['price', 'quantity'])
    )
    return df.merge(price_quantity, left_index=True, right_index=True)

In [6]:
coffee_df = coffee_df.pipe(price_quantity_split)
coffee_df[['est_price', 'quantity_value', 'quantity_unit', 'price_value', 'price_currency']].sample(3)

Unnamed: 0,est_price,quantity_value,quantity_unit,price_value,price_currency
904,$16.99/12 ounces,12.0,ounces,16.99,$
101,NT 350/227 grams,227.0,grams,350.0,NT
1801,US $20.00/12 ounces,12.0,ounces,20.0,US $


### Cleaning Currencies



In [7]:
coffee_df.price_currency.value_counts()

price_currency
$           3458
NT $         578
CAD $         94
NT$           43
NTD $         21
HKD $         16
¥             11
RMB $          9
AUD $          9
KRW            9
NT             8
KRW $          7
TWD $          7
£              6
HK $           5
AED $          5
IDR $          3
$ NTD          3
THB $          3
THB            2
US $           2
USD $          2
#              2
RMB            2
GBP            2
$NT            1
               1
pesos          1
$NT$           1
Nt $           1
CNY $          1
KRW$           1
E              1
GTQ            1
Price: $       1
$ CAD          1
Name: count, dtype: int64

In [8]:
def clean_currency(df: pd.DataFrame) -> pd.DataFrame:
    price_currency = (
        df.price_currency
        .str.strip()
        .str.upper()
        .str.replace(r'^\$$', 'USD', regex=True)
        .str.replace('PRICE: $', 'USD')
        .str.replace('$', '')
        .str.replace('#', 'GBP')
        .str.replace('¥', 'CNY')
        .str.replace('£', 'GBP')
        .str.replace('POUND', 'GBP')
        .str.replace('PESOS', 'MXN')
        .str.replace('RMB', 'CNY')
        .str.strip()
        .mask(lambda s: s == "US", "USD")
        .mask(lambda s: s == ' ', "USD")
        .mask(lambda s: s == 'E', 'EUR')
        .mask(lambda s: s == 'NTD', 'TWD')
        .mask(lambda s: s == 'NT', 'TWD')
        .mask(lambda s: s == '', 'USD')
        .mask(lambda s: s == 'HK', 'HKD')
        .str.strip()
    )
    return df.assign(price_currency=price_currency)

In [9]:
coffee_df = coffee_df.pipe(clean_currency)
coffee_df.loc[:, ["est_price", "price_currency"]].groupby('price_currency').sample(3, replace=True)

Unnamed: 0,est_price,price_currency
2230,AED $103.95/250 grams,AED
2215,AED $99.75/250 grams,AED
473,AED $99.75/250 grams,AED
3182,AUD $23.00/400 grams,AUD
3181,AUD $24.00/300 grams,AUD
3180,AUD $18.00/250 grams,AUD
4414,CAD $15.60/12 ounces,CAD
4655,CAD $12.99/400 grams,CAD
1574,CAD $21.00/12 ounces,CAD
3545,RMB $350/75 grams,CNY


### Converting quantities to grams

In [10]:
to_grams_conversion = {"ounces": 28.3495231, "pounds":453.59237, "kilogram": 1000, "grams": 1}

coffee_df['quantity_in_grams'] = ( 
    np.round(coffee_df['quantity_value'] * 
    coffee_df['quantity_unit'].map(to_grams_conversion), 2)
)

coffee_df.loc[:, ['quantity_unit', 'quantity_value', "quantity_in_grams"]].groupby('quantity_unit').sample(3, replace=True)

Unnamed: 0,quantity_unit,quantity_value,quantity_in_grams
2099,grams,200.0,200.0
3061,grams,250.0,250.0
79,grams,200.0,200.0
4130,kilogram,1.0,1000.0
4130,kilogram,1.0,1000.0
4130,kilogram,1.0,1000.0
3391,ounces,12.0,340.19
649,ounces,12.0,340.19
4565,ounces,12.0,340.19
2312,pounds,6.0,2721.55


### Converting prices to 2024 USD
1. Convert price to USD using historical exchange rates
2. Adjust price to 2023 USD using BLS consumer price index

In [20]:
def convert_currency_to_usd(row):
    date = row['review_date'].strftime('%Y-%m-%d')
    currency = row['price_currency']
    price = row['price_value']
    if currency == 'USD':
        return price
    else:
        return np.round(price / exchange_rates[date][currency], 2)
    
with open(data_dir / 'external' / 'openex_exchange_rates.json') as f:
    exchange_rates = json.load(f)

coffee_df["price_value_usd_hist"] = coffee_df.apply(convert_currency_to_usd, axis=1)

coffee_df.loc[:, ["price_currency", "price_value", "price_value_usd_hist"]].groupby("price_currency").sample(3, replace=True)

Unnamed: 0,price_currency,price_value,price_value_usd_hist
2490,AED,103.95,28.3
2215,AED,99.75,27.16
2490,AED,103.95,28.3
3159,AUD,15.0,11.26
3185,AUD,16.0,11.6
2176,AUD,18.0,12.07
2843,CAD,30.0,24.2
4142,CAD,16.2,16.06
3550,CAD,11.0,8.8
1622,CNY,2.0,0.31


In [14]:
def clean_transform_cpi(df: pd.DataFrame) -> pd.DataFrame:
    month_map = {'Jan': 1, 'Feb': 2, 'Mar': 3,
             'Apr': 4, 'May': 5, 'Jun': 6,
             'Jul': 7, 'Aug': 8, 'Sep': 9,
             'Oct': 10, 'Nov': 11, 'Dec': 12
             }
    return (df
            .melt(id_vars='Year', var_name='Month', value_name='cpi')
            .sort_values(['Year', 'Month'])
            .assign(Month = lambda x: x['Month'].map(month_map),
                date = lambda x: pd.to_datetime(x[['Year', 'Month']].assign(day=1)))
            .dropna()
            .drop(columns=['Year', 'Month'])
            .rename(columns={'cpi': 'consumer_price_index'})
            .sort_values('date')
            .reset_index(drop=True)
            )
    
    
cpi = pd.read_csv(data_dir / 'external' / 'consumer_price_index.csv').drop(columns=['HALF1', 'HALF2'])
cpi = clean_transform_cpi(cpi)
cpi.head()

Unnamed: 0,consumer_price_index,date
0,9.8,1913-01-01
1,9.8,1913-02-01
2,9.8,1913-03-01
3,9.8,1913-04-01
4,9.7,1913-05-01


In [23]:
cpi_jan_2024 = cpi[cpi.date == '2024-01-01']['consumer_price_index'].values[0]

(
    coffee_df
    .merge(cpi, left_on="review_date", right_on="date")
    .drop(columns=['date'])
    .assign(price_usd_2024 = lambda df_: np.round(df_['price_value_usd_hist'] * cpi_jan_2024 / df_['consumer_price_index'], 2),
            price_usd_2024_per_grams = lambda df_: np.round(df_['price_usd_2024'] / df_['quantity_in_grams'], 2)
    )
).sample(5)



Unnamed: 0,rating,roaster,title,blind_assessment,bottom_line,roaster_location,coordinate location,og_roaster_location,roaster_location_identifier,territorial_entity_1,...,agtron_ground,quantity_value,quantity_unit,price_value,price_currency,quantity_in_grams,price_value_usd_hist,consumer_price_index,price_usd_2024,price_usd_2024_per_grams
3763,89,Panther Coffee,Brasil Fazenda Sertao,"Very sweet, delicately lush. Apricot, almond, ...",,Miami,"25.783333333333,-80.216666666667","Miami, Florida",Q8652,Miami-Dade County,...,70.0,8.0,ounces,10.0,USD,226.8,10.0,230.085,13.4,0.06
718,92,Magnolia Coffee,Papua New Guinea Timuza Organic,"Balanced, classic, sweetly bright. Blood orang...",This Papua New Guinea is a classic coffee in t...,Charlotte,"35.226944444444,-80.843333333333","Charlotte, North Carolina",Q16565,Mecklenburg County,...,88.0,12.0,ounces,17.0,USD,340.19,17.0,296.171,17.7,0.05
389,94,Equator Coffees & Teas,Ecuador Finca Lugmapata,"Floral-driven, deep-toned. Lavender, dark choc...","A confident, energetic washed Ecuador cup char...",San Rafael,"37.973333333333336,-122.53083333333333","San Rafael, California",Q631915,Marin County,...,79.0,6.0,ounces,22.0,USD,170.1,22.0,303.363,22.37,0.13
2493,93,Red Rooster Coffee Roaster,Milepost Peru Rayos del Sol,"Delicate, richly sweet. Lilac, roasted cacao n...",For those who enjoy the virtues of decidedly l...,Floyd,"36.9122,-80.3183","Floyd, Virginia",Q1376660,Floyd County,...,82.0,12.0,ounces,14.99,USD,340.19,14.99,244.524,18.91,0.06
2238,95,Dragonfly Coffee Roasters,Lotus by Ninety Plus,"Wildly, disconcertingly original. Sweetly tart...","For well-heeled coffee adventurers, a coffee s...",Boulder,"40.019444444444,-105.29277777778","Boulder, Colorado",Q192517,Boulder County,...,72.0,8.0,ounces,145.0,USD,226.8,145.0,250.546,178.49,0.79


In [16]:
(
    df
    .assign(price_USD_2024 = lambda df_: np.round(df_['price_usd'] * cpi_jan_2024 / df_['cpi'], 2))
    .assign(price_USD_2024_per_100g = lambda df_: np.round(df_['price_USD_2024'] / df_['quantity_grams'] * 100, 2))

)

KeyError: 'price_usd'