In [40]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

mpl.rcParams['figure.dpi']= 300


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
data_dir = Path('../../data')
df = pd.read_csv(data_dir / 'interim' / '05052024-roast-review-merged-openrefine.csv')

df.sample(3)

Unnamed: 0,rating,roaster,title,blind_assessment,bottom_line,roaster_location,coordinate location,og_roaster_location,roaster_location_identifier,territorial_entity_1,...,review_date,aroma,body,flavor,aftertaste,with_milk,url,acidity/structure,acidity,notes
1715,96,Big Shoulders Coffee,Panama Hacienda La Esmeralda Gesha,"Complex, floral- and citrus-toned. Lilac, coco...",A classic washed Geisha from the celebrated Ha...,Chicago,"41.85003,-87.65005","Chicago, Illinois",Q1297,Cook County,...,December 2020,9.0,9.0,10.0,9.0,,https://www.coffeereview.com/review/panama-hac...,9.0,,Coffee from trees of the botanical variety Gei...
2888,93,Kakalove Cafe,Ethiopia Natural Guji Kayon Mountain,"High-toned, delicately sweet. Jasmine, blackbe...","An intense, delicately rich dried-in-the-fruit...",Chiayi City,"23.48,120.44972222222","Chia-Yi, Taiwan",Q249995,Taiwan Province,...,June 2017,9.0,9.0,9.0,8.0,,https://www.coffeereview.com/review/ethiopia-n...,,8.0,Southern Ethiopia coffees like this one are pr...
945,95,PT's Coffee Roasting Co.,Panama Hacienda La Esmeralda Porton 5N49 Gesha...,"Juicy, balanced, bright. Bing cherry, wisteria...","A dynamic, exhilarating Panama Gesha driven by...",Topeka,"39.04833,-95.67804","Topeka, Kansas",Q41057,Shawnee County,...,June 2022,9.0,9.0,10.0,8.0,,https://www.coffeereview.com/review/panama-hac...,9.0,,Produced at El Velo Farm by Hacienda La Esmera...


In [42]:
def tweak_df(df: pd.DataFrame) -> pd.DataFrame:
    return(
        df
        .dropna(subset=['review_date', 'est_price', 'coffee_origin', 'aroma', 'roast_level', 'aftertaste', 'notes'])
        .assign(review_date = lambda df_: pd.to_datetime(df_['review_date'], format="%B %Y"),
                acidity = lambda df_: df_['acidity'].fillna(df_['acidity/structure']),
                agtron_external = lambda df_: (df_['agtron']
                                                       .str.split('/')
                                                       .str[0].str.strip()
                                                       .replace(['', 'NA', 'g', '0'], np.nan)
                                                       ),
                agtron_ground = lambda df_: (df_['agtron']
                                                     .str.split('/')
                                                     .str[1].str.strip()
                                                     .replace(['', 'NA', 'wb', '0'], np.nan)
                                                     )
        )
        .dropna(subset=['agtron_external', 'agtron_ground', 'acidity'])
        .astype({'agtron_external': 'float', 'agtron_ground': 'float', 'acidity': 'float'})
        .replace('', np.nan)
    )

coffee_df = tweak_df(df)
coffee_df.sample(3)

Unnamed: 0,rating,roaster,title,blind_assessment,bottom_line,roaster_location,coordinate location,og_roaster_location,roaster_location_identifier,territorial_entity_1,...,body,flavor,aftertaste,with_milk,url,acidity/structure,acidity,notes,agtron_external,agtron_ground
2926,92,Big Island Coffee Roasters,Puna Theobroma Coffee,"Bittersweet, powerfully cocoa- and floral-tone...",A beverage of contrasts. Lush but crisp in its...,Mountain View,"19.539722222222,-155.14138888889","Mountain View, Hawaii",Q2333799,Hawaii County,...,8.0,9.0,9.0,,https://www.coffeereview.com/review/puna-theob...,,7.0,A wet-processed coffee from the Puna District ...,44.0,58.0
2041,92,Big Shoulders Coffee,Burundi Kibingo,"Spice-toned, richly sweet. Baking spices, cara...","A rich, deeply sweet Burundi cup that evokes g...",Chicago,"41.85003,-87.65005","Chicago, Illinois",Q1297,Cook County,...,8.0,9.0,8.0,,https://www.coffeereview.com/review/burundi-ki...,8.0,8.0,Roasted on Tuesdays. Produced entirely of the ...,62.0,82.0
2088,92,Equator Coffees & Teas,Ethiopia FTO,"Crisp, delicately sweet-toned. Bergamot, Fig N...","A friendly, straightforward washed Ethiopia cu...",San Rafael,"37.973333333333336,-122.53083333333333","San Rafael, California",Q631915,Marin County,...,8.0,9.0,8.0,,https://www.coffeereview.com/review/ethiopia-fto/,8.0,8.0,This coffee tied for the second-highest rating...,54.0,72.0


## Quantity and Price Data Cleaning

In [82]:

price_quantity = (
    coffee_df
    .est_price.str.split("/", n=1, expand=True)
    .rename(columns={0: 'price', 1: 'quantity'})
    .assign(quantity = lambda df_: (df_['quantity']
                                    .str.replace(r"\(.*?\)", "", regex=True)
                                    .str.replace(r";.*", "", regex=True)
                                    .str.replace(".", "").str.strip()
                                    .str.replace(r".g$", " grams", regex=True)
                                    .str.replace(r"\sg$", "grams", regex=True)
                                    .str.replace(r"pound$", "1 pounds", regex=True)
                                    .str.replace(r"oz|onces|ounce$|ounces\*", "ounces", regex=True)
                                    .str.replace("online", "")
                                    .str.strip()
                                    ),
            price = lambda df_: df_['price']
           )
    .dropna()
    .loc[lambda df_: ~df_['quantity'].str.contains('can|box|pouch|packet|tin'), :]
    .assign(quantity_value = lambda df_: (df_['quantity'].str.extract(r'(\d+)').astype(float)),
            quantity_unit = lambda df_: df_['quantity'].str.replace(r"(\d+)", "", regex=True).str.strip(),
            price_value = lambda df_: df_['price'].str.extract(r'(\d+\.\d+|\d+)').astype(float),
            price_currency = lambda df_: df_['price'].str.replace(",", "").str.replace(r'(\d+\.\d+|\d+)', '', regex=True).str.strip()
            )
    .drop(columns=['price', 'quantity'])
)

price_quantity.sample(20)

Unnamed: 0,quantity_value,quantity_unit,price_value,price_currency
4455,16.0,ounces,19.99,$
2157,12.0,ounces,20.95,$
1707,12.0,ounces,23.0,$
1429,227.0,grams,450.0,NT $
2656,225.0,grams,500.0,NT $
4070,12.0,ounces,15.5,$
266,200.0,grams,350.0,NT $
1007,12.0,ounces,18.0,$
2002,12.0,ounces,18.75,$
1481,12.0,ounces,14.0,$


In [45]:
coffee_df_merged = coffee_df.merge(price_quantity, left_index=True, right_index=True)
coffee_df_merged.sample(3)

Unnamed: 0,rating,roaster,title,blind_assessment,bottom_line,roaster_location,coordinate location,og_roaster_location,roaster_location_identifier,territorial_entity_1,...,url,acidity/structure,acidity,notes,agtron_external,agtron_ground,quantity_value,quantity_unit,price_value,price_currency
3551,91,Java Blend Coffee Roasters,Sulawesi Tana Toraja,"Deep, round, gently bright. Aprium, malt, lily...",,Halifax,"44.647777777778,-63.571388888889","Halifax, Nova Scotia, Canada",Q2141,Nova Scotia,...,https://www.coffeereview.com/review/sulawesi-t...,,8.0,This coffee originates from the Toarco (TOraja...,51.0,66.0,16.0,ounces,15.0,$
245,90,VERYTIME,Ethiopia Maria Gesha,"Crisply sweet, cocoa-toned. Baking chocolate, ...","A friendly, accessible Ethiopia with crisp cho...",Taoyuan,"24.991278,121.314328","Taoyuan, Taiwan",Q115256,Taiwan,...,https://www.coffeereview.com/review/ethiopia-m...,8.0,8.0,The celebrated Geisha (also spelled Gesha) var...,64.0,82.0,8.0,ounces,350.0,NT $
1462,94,GK Coffee,Kenya AA Top Gathambi Station,"Deep-toned, richly sweet-savory. Tamarind, hop...","A classic Kenya cup, equally sweet and savory,...",Yilan County,"24.75,121.75","Yilan, Taiwan",Q237258,Taiwan Province,...,https://www.coffeereview.com/review/kenya-aa-t...,9.0,9.0,Produced by smallholding farmers near the Gath...,58.0,76.0,200.0,grams,550.0,NT $


### Cleaning Currencies



In [47]:
coffee_df_merged.price_currency.value_counts()

price_currency
$           3457
NT $         578
CAD $         94
NT$           43
NTD $         21
HKD $         16
¥             11
RMB $          9
AUD $          9
KRW            9
NT             8
KRW $          7
TWD $          7
£              6
HK $           5
AED $          5
IDR $          3
$ NTD          3
THB $          3
THB            2
US $           2
USD $          2
#              2
RMB            2
GBP            2
$NT            1
               1
pesos          1
$NT$           1
Nt $           1
CNY $          1
KRW$           1
E              1
GTQ            1
Price: $       1
$ CAD          1
Name: count, dtype: int64

In [104]:
(
    coffee_df_merged.price_currency
    .str.strip()
    .str.upper()
    .str.replace(r'^\$$', 'USD', regex=True)
    .str.replace('PRICE: $', 'USD')
    .str.replace('$', '')
    .str.replace('#', 'GBP')
    .str.replace('¥', 'CNY')
    .str.replace('£', 'GBP')
    .str.replace('POUND', 'GBP')
    .str.replace(r'NT|NTD', 'TWD', regex=True)
    .mask(lambda s: s == "US", "USD")
    .mask(lambda s: s == ' ', "USD")
    .mask(lambda s: s == 'E', 'EUR')
    .mask(lambda s: s == 'pesos', 'MXN')
    .mask(lambda s: s == 'NTD', 'TWD')
    .str.strip()
).value_counts()

price_currency
USD      3460
TWD       639
CAD        95
TWDD       24
KRW        17
HKD        16
CNY        12
RMB        11
GBP        10
AUD         9
HK          5
AED         5
THB         5
IDR         3
US          2
EUR         1
PESOS       1
            1
GTQ         1
Name: count, dtype: int64

In [89]:
coffee_df_merged[coffee_df_merged.price_currency == 'pesos']

Unnamed: 0,rating,roaster,title,blind_assessment,bottom_line,roaster_location,coordinate location,og_roaster_location,roaster_location_identifier,territorial_entity_1,...,url,acidity/structure,acidity,notes,agtron_external,agtron_ground,quantity_value,quantity_unit,price_value,price_currency
1510,93,Ilustre Specialty Coffees,Pink Bourbon Natural Anaerobic Colombia,"Floral, tropical, tangy. Star jasmine, lychee,...",A uniquely composed Colombia Pink Bourbon proc...,Tijuana,"32.53333333333333,-117.03333333333333","Tijuana, Mexico",Q2360428,Baja California,...,https://www.coffeereview.com/review/pink-bourb...,9.0,9.0,Produced entirely from trees of the admired Pi...,62.0,78.0,200.0,grams,500.0,pesos


In [None]:
with open('data/exchange_rates.json', 'r') as f:
    exchange_rates = json.load(f)
    
df = (df
      .dropna(subset=['review_date', 'currency'])
      .astype({'review_date': 'str'})
      .assign(exchange_rate = lambda df_: df_.apply(lambda x: exchange_rates[x['review_date']][x['currency']], axis=1))
      .assign(price_usd = lambda df_: df_['nominal_price'] / df_['exchange_rate'])
      .assign(review_date = lambda df_: pd.to_datetime(df_['review_date']))
      .round({'price_usd': 2})
)

In [None]:
# Read in CPI data
def cpi_date(filepath: str) -> pd.DataFrame:
    try:
        df = pd.read_csv(filepath).drop(columns=['HALF1', 'HALF2'])
    except Exception as e:
        print(e)
    return (df
          .melt(id_vars='Year', var_name='Month', value_name='cpi')
          .assign(date=lambda df_: pd.to_datetime(df_['Year'].astype(str) + '-' + df_['Month'], format='mixed'))
          .dropna(subset=['cpi'])
          .sort_values('date')
          .drop(columns=['Year', 'Month'])
    )

cpi = cpi_date('data/cpi.csv')
cpi_jan_2024 = cpi.loc[cpi['date'] == '2024-01-01', 'cpi'].values[0]

In [None]:
(
    df
    .join(cpi.set_index('date'), on='review_date')
    # Past dollars in terms of recent dollars = Dollar amount × Ending-period CPI / Beginning-period CPI.
    .assign(price_USD_2024 = lambda df_: np.round(df_['price_usd'] * cpi_jan_2024 / df_['cpi'], 2))
    .assign(price_USD_2024_per_100g = lambda df_: np.round(df_['price_USD_2024'] / df_['quantity_grams'] * 100, 2))

)