In [14]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

mpl.rcParams['figure.dpi']= 300

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
data_dir = Path('../../data')
df = pd.read_csv(data_dir / 'interim' / '05052024-roast-review-merged-openrefine.csv')

df.sample(3)

Unnamed: 0,rating,roaster,title,blind_assessment,bottom_line,roaster_location,coordinate location,og_roaster_location,roaster_location_identifier,territorial_entity_1,...,review_date,aroma,body,flavor,aftertaste,with_milk,url,acidity/structure,acidity,notes
316,90,Coffee By Design,1994 Blend,"Richly fruit-toned, wood-framed. Mulberry, ced...",Pleasing aromatic cedar notes frame this fruit...,Portland,"43.66,-70.255","Portland, Maine",Q49201,Cumberland County,...,September 2023,8.0,8.0,8.0,8.0,,https://www.coffeereview.com/review/1994-blend/,8.0,,"A blend of coffees from Ethiopia, both washed ..."
3206,92,Black Oak Coffee Roasters,Boa Vista Brazil,"Very sweet, delicately rich. Lychee, hazelnut,...","A profoundly sweet cup (lychee, honeysuckle) w...",Ukiah,"39.148888888889,-123.20861111111","Ukiah, California",Q837105,Mendocino County,...,May 2016,9.0,8.0,9.0,8.0,,https://www.coffeereview.com/review/boa-vista-...,,8.0,"Produced from trees of the Bourbon, Catuai and..."
2096,94,Lexington Coffee Roasters,Giusto Organic Espresso Blend,"Evaluated as espresso. Richly sweet-tart, citr...","A certified organic espresso blend, classic in...",Lexington,"37.7840208,-79.4428157","Lexington, Virginia",Q501761,Virginia,...,January 2020,9.0,9.0,9.0,8.0,9.0,https://www.coffeereview.com/review/giusto-org...,,,"Certified organic. Founded in 1990, Lexington ..."


In [16]:
def tweak_df(df: pd.DataFrame) -> pd.DataFrame:
    return(
        df
        .dropna(subset=['review_date', 'est_price', 'coffee_origin', 'aroma', 'roast_level', 'aftertaste', 'notes'])
        .assign(review_date = lambda df_: pd.to_datetime(df_['review_date'], format="%B %Y"),
                acidity = lambda df_: df_['acidity'].fillna(df_['acidity/structure']),
                agtron_external = lambda df_: (df_['agtron']
                                                       .str.split('/')
                                                       .str[0].str.strip()
                                                       .replace(['', 'NA', 'g', '0'], np.nan)
                                                       ),
                agtron_ground = lambda df_: (df_['agtron']
                                                     .str.split('/')
                                                     .str[1].str.strip()
                                                     .replace(['', 'NA', 'wb', '0'], np.nan)
                                                     )
        )
        .dropna(subset=['agtron_external', 'agtron_ground', 'acidity'])
        .astype({'agtron_external': 'float', 'agtron_ground': 'float', 'acidity': 'float'})
        .replace('', np.nan)
    )

coffee_df = tweak_df(df)
coffee_df.sample(3)

Unnamed: 0,rating,roaster,title,blind_assessment,bottom_line,roaster_location,coordinate location,og_roaster_location,roaster_location_identifier,territorial_entity_1,...,body,flavor,aftertaste,with_milk,url,acidity/structure,acidity,notes,agtron_external,agtron_ground
1196,93,Luv Moshi,Kenya Nyeri AA Hercules,"Balanced, sweetly savory. Narcissus, date, ora...","A balanced, floral-toned Kenya cup with an esp...",Tainan Village,"23.284746556006855,120.28203735551197","Tainan City, Taiwan",Q96976070,Xinying District,...,9.0,9.0,8.0,,https://www.coffeereview.com/review/kenya-nyer...,8.0,8.0,Despite challenges ranging from unclear govern...,54.0,72.0
3115,93,Equator Coffees & Teas,Costa Rica El Aguacate White Honey,"Crisply sweet, complex. Vanilla, grapefruit ze...","A tartly sweet, crisp honey-processed cup with...",San Rafael,"37.973333333333336,-122.53083333333333","San Rafael, California",Q631915,Marin County,...,8.0,9.0,8.0,,https://www.coffeereview.com/review/costa-rica...,,9.0,Produced by Alejandro Solís Blanco at El Aguac...,52.0,69.0
3367,95,Dragonfly Coffee Roasters,Ethiopia Sidama Special Lot Natural,"Complex, fruity and spice-toned. Sugared grape...",,Boulder,"40.019444444444,-105.29277777778","Boulder, Colorado",Q192517,Boulder County,...,9.0,9.0,8.0,,https://www.coffeereview.com/review/ethiopia-s...,,9.0,Sidamo (also Sidama) is a coffee-growing regio...,63.0,86.0


## Quantity and Price Data Cleaning

In [28]:
pd.set_option('display.max_rows', None)

(
    coffee_df
    .est_price.str.split("/", n=1, expand=True)
    .rename(columns={0: 'price', 1: 'quantity'})
    .assign(quantity = lambda df_: (df_['quantity']
                                    .str.replace(r"\(.*?\)", "", regex=True)
                                    .str.replace(r";.*", "", regex=True)
                                    .str.replace(".", "").str.strip()
                                    .str.replace(r".g$", " grams", regex=True)
                                    .str.replace(r"\sg$", "grams", regex=True)
                                    #.str.replace("^pound$", "1 pound")
                                    .str.strip()
                                    ),
            price = lambda df_: df_['price']
           )
    .dropna()
    .loc[lambda df_: ~df_['quantity'].str.contains('can|box|pouch|packet|tin'), :]
    .assign(quantity_value = lambda df_: (df_['quantity'].str.extract(r'(\d+)').astype(float))),
    
)

(              price          quantity  quantity_value
 1           NT $390         200 grams           200.0
 2           NT $280          8 ounces             8.0
 4           NT $290          8 ounces             8.0
 5            $40.00         150 grams           150.0
 7            $20.00         12 ounces            12.0
 9           NT $250          8 ounces             8.0
 10          NT $500         227 grams           227.0
 11           $20.00         12 ounces            12.0
 12          NT $310          8 ounces             8.0
 13          NT $550         227 grams           227.0
 14           $19.00         12 ounces            12.0
 15           $19.00         12 ounces            12.0
 18          NT $299          8 ounces             8.0
 19           $19.00         12 ounces            12.0
 24           $19.00         12 ounces            12.0
 26          NT $350          8 ounces             8.0
 28          NT $220          8 ounces             8.0
 30       

In [None]:
quantity = (
    df['est_price']
    .str.split('[/;]', n=1, expand=True)
    .dropna()
    .rename(columns={0: 'price', 1: 'amount'})
    .loc[:, 'amount']
    .str.replace('oz.*|ouncues|onces|ounce$|-ounce', 'ounces', regex=True)  # Consolidated handling for "ounces"
    .str.replace('g |g. | g. |g$', 'grams', regex=True)  # Consolidated handling for "grams"
    .str.replace(r'-gram', ' grams', regex=True)
    .str.replace(r'\([^)]*\)', '', regex=True)  # Handles any content within parentheses
    .str.replace(';|\(|\$.*|tin', '', regex=True)  # Remove unwanted characters and substrings
    .str.replace('  ', ' ', regex=True)  # Replace double spaces with single
    .str.replace('8 18 grams pouches', '152 grams', regex=False)  # Specific case replacements
    .str.replace('350 grams 12.3 ounces', '350 grams', regex=False)
    .str.strip()  # Trim whitespace
    .mask(lambda x: x.str.contains('capsule|packet|pods|vue|k-cups|sticks|tubes|cups|boxed|discs|can', case=False), np.nan)
    # Set the value to NaN if certain words are present, indicating non-standard quantities
)

print(quantity.sample(10))

In [None]:
quantity_unit = (
    quantity
    .str.split(' ', expand=True)
    .rename(columns={0: 'quantity', 1: 'unit'})
    .replace({'pound': '1 pounds'})  # Handle the special case where 'pound' should be considered as '1 pound'
    .assign(
        quantity=lambda df_: df_['quantity'].str.extract('(\d+)')[0].astype(float),  # Extract numeric part and convert to float
        unit=lambda df_: df_['unit'].str.extract('([a-zA-Z]+)')[0]  # Extract unit part
    )
    .replace({'unit': {'pounds': 'pound',}})  # Normalize singular 'pounds' to 'pound' if needed
    .assign(quantity_grams=lambda df_: df_['quantity'] * df_['unit'].map(grams_conversions))
    .fillna(value=np.nan)
    .drop(columns=[2, 3])
    .dropna()
)

quantity_unit

In [None]:

price = (
    df['est_price']
      .str.split('[/;]', n=1, expand=True)  # Split the string by '/' or ';' and expand to new DataFrame
      .iloc[:, 0]                           # Select the first column (price part before '/' or ';')
      .str.replace(',', '', regex=False)    # Remove commas from the price for proper conversion
      .str.extract(r'(\d+\.\d+|\d+)')       # Use regex to extract the complete price
      .iloc[:, 0]                           # Select the first column of the extraction result                   # Convert the extracted string to float
      .rename('nominal_price')
      .dropna()
)

price

In [None]:
currency = (
    df['est_price']
    .str.split('[/;]', n=1, expand=True)
    .loc[:, 0]
    .str.replace(',', '', regex=False)
    .str.replace(r'(\d+\.\d+|\d+)', '', regex=True)
    .str.strip()
    .str.replace(' ', '')
    .map(currency_map, na_action='ignore')
    .rename('currency')
    .dropna()
    )


currency
coffee = df.join(quantity_unit, how='left').join(price, how='left').join(currency, how='left')

coffee

In [None]:
with open('data/exchange_rates.json', 'r') as f:
    exchange_rates = json.load(f)
    
df = (df
      .dropna(subset=['review_date', 'currency'])
      .astype({'review_date': 'str'})
      .assign(exchange_rate = lambda df_: df_.apply(lambda x: exchange_rates[x['review_date']][x['currency']], axis=1))
      .assign(price_usd = lambda df_: df_['nominal_price'] / df_['exchange_rate'])
      .assign(review_date = lambda df_: pd.to_datetime(df_['review_date']))
      .round({'price_usd': 2})
)

In [None]:
# Read in CPI data
def cpi_date(filepath: str) -> pd.DataFrame:
    try:
        df = pd.read_csv(filepath).drop(columns=['HALF1', 'HALF2'])
    except Exception as e:
        print(e)
    return (df
          .melt(id_vars='Year', var_name='Month', value_name='cpi')
          .assign(date=lambda df_: pd.to_datetime(df_['Year'].astype(str) + '-' + df_['Month'], format='mixed'))
          .dropna(subset=['cpi'])
          .sort_values('date')
          .drop(columns=['Year', 'Month'])
    )

cpi = cpi_date('data/cpi.csv')
cpi_jan_2024 = cpi.loc[cpi['date'] == '2024-01-01', 'cpi'].values[0]

In [None]:
(
    df
    .join(cpi.set_index('date'), on='review_date')
    # Past dollars in terms of recent dollars = Dollar amount × Ending-period CPI / Beginning-period CPI.
    .assign(price_USD_2024 = lambda df_: np.round(df_['price_usd'] * cpi_jan_2024 / df_['cpi'], 2))
    .assign(price_USD_2024_per_100g = lambda df_: np.round(df_['price_USD_2024'] / df_['quantity_grams'] * 100, 2))

)