In [1]:
# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

from pathlib import Path
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn.objects as so

import src.config

mpl.rcParams['figure.dpi']= 300

In [6]:
def tweak_df(df):
    return (df
            .replace('na', np.nan)
            .dropna(subset=['origin_country_cluster', 'roaster_location', 'roast_level', 'body', 'aroma'])
            .assign(origin_country=lambda df_: (df_['origin_country_cluster']
                                                .str.split(',')
                                                .str[-1]
                                                .str.strip()
                                                ),
                    origin_region=lambda df_: (df_['origin_country_cluster']
                                               .str.split(',')
                                               .str[:-1]
                                               .str.join(',')
                                               .str.strip()
                                               ),
                    review_date = lambda df_: pd.to_datetime(df_['review_date'], format="%B %Y"),
                    acidity = lambda df_: df_['acidity'].fillna(df_['acidity/structure']),
                    agtron_external = lambda df_: (df_['agtron']
                                                   .str.split('/')
                                                   .str[0].str.strip()
                                                   .replace(['', 'NA', 'g', '0'], np.nan)
                                                   ),
                    agtron_ground = lambda df_: (df_['agtron']
                                                 .str.split('/')
                                                 .str[1].str.strip()
                                                 .replace(['', 'NA', 'wb', '0'], np.nan)),
            )
            .drop(columns=['acidity/structure', 'with_milk', 'bottom_line', 'agtron'])
            .dropna(subset=['acidity']) # drop rows where acidity is still NaN, these are espresso reviews
            .rename(columns={'country': 'roaster_country', 'roaster_location': 'roaster_city', 
                             'located in the administrative territorial entity': 'roaster_region',
                             'coordinate location': 'roaster_coordinates'})
            .assign(origin_region=lambda df_: df_['origin_region'].replace('', np.nan))
            .astype({'agtron_external': 'Int64', 'agtron_ground': 'Int64', 'acidity': 'float',
                     'aftertaste': 'Int64', 'aroma': 'float', 'body': 'float',
                     'rating': 'Int64', 'flavor': 'float',})
    )


# File path
data_dir = Path("../../data")
file_path = data_dir / "interim" / "raw-roast-reviews-openrefine.csv"

# Load post-openrefine data
df = tweak_df(pd.read_csv(file_path))
df.head()

Unnamed: 0,roaster_city,roaster_country,roaster_coordinates,roaster_region,roaster_location_identifier,coffee_origin,origin_country_cluster,roast_level,est_price,review_date,...,roaster,name,blind_assessment,notes,url,acidity,origin_country,origin_region,agtron_external,agtron_ground
0,Hong Kong,People's Republic of China,"22.278333333333,114.15861111111",People's Republic of China,Q8646,multiple,multiple,Medium-Light,CNY 160/250 grams,2023-11-01,...,Nodi Coffee,Windsor Blend,"Gently fruit-toned, delicately floral. Apricot...","A blend of two coffees, one Ethiopia and one R...",https://www.coffeereview.com/review/windsor-bl...,8.0,multiple,,60,78
1,Madison,United States of America,"43.07472222222222,-89.38416666666667",Dane County,Q43788,"Nyeri growing region, south-central Kenya","Nyeri growing region, Kenya",Medium-Light,$22.00/12 ounces,2023-12-01,...,JBC Coffee Roasters,Nyeri Hill Kenya,"Richly sweet-tart. Blackberry compote, cocoa n...","Produced at Nyeri Hill Estate, from trees of t...",https://www.coffeereview.com/review/nyeri-hill...,9.0,Kenya,Nyeri growing region,56,74
2,Plymouth,United States of America,"41.95861111111111,-70.66777777777777",Plymouth County,Q326295,"Monte Verde, Santa Ana Department, central El ...","Apaneca-Ilamatepec growing region, El Salvador",Medium-Light,$16.00/12 ounces,2023-12-01,...,Speedwell Coffee,El Salvador Los Cipreses,"Richly sweet, floral-toned. Nutella, magnolia,...",Produced by Rene Contreras on the family farm ...,https://www.coffeereview.com/review/el-salvado...,9.0,El Salvador,Apaneca-Ilamatepec growing region,58,74
3,Madison,United States of America,"43.07472222222222,-89.38416666666667",Dane County,Q43788,"Tarrazu, Costa Rica","Tarrazú growing region, Costa Rica",Medium-Light,$18.00/8 ounces,2023-12-01,...,JBC Coffee Roasters,Cordillera de Fuego Costa Rica,"Richly sweet, deeply spice-toned. Cinnamon, cl...","Produced by Luis Eduardo Campos, from trees of...",https://www.coffeereview.com/review/cordillera...,9.0,Costa Rica,Tarrazú growing region,59,77
4,Peoria,United States of America,"40.6925,-89.59",Peoria County,Q233129,"Los Robles de Naranjo, West Valley, Costa Rica","West Valley growing region, Costa Rica",Medium-Light,$22.00/12 ounces,2023-12-01,...,Intuition Coffee,Costa Rica Finca Tono Natural,"Gently fruity, richly nut-toned. Hazelnut butt...","Produced by Felipe and Erasmo Aguilera, entire...",https://www.coffeereview.com/review/costa-rica...,8.0,Costa Rica,West Valley growing region,57,76


In [9]:
def load_json(file_path):
    with open(file_path) as f:
        return json.load(f)
    
# Load conversions
conversions = load_json(data_dir / "conversions.json")
grams_conversions = conversions['grams_conversions']
currency_map = conversions['currency_map']

FileNotFoundError: [Errno 2] No such file or directory: '../../data/conversions.json'

In [10]:
quantity = (
    df['est_price']
    .str.split('[/;]', n=1, expand=True)
    .dropna()
    .rename(columns={0: 'price', 1: 'amount'})
    .loc[:, 'amount']
    .str.replace('oz.*|ouncues|onces|ounce$|-ounce', 'ounces', regex=True)  # Consolidated handling for "ounces"
    .str.replace('g |g. | g. |g$', 'grams', regex=True)  # Consolidated handling for "grams"
    .str.replace(r'-gram', ' grams', regex=True)
    .str.replace(r'\([^)]*\)', '', regex=True)  # Handles any content within parentheses
    .str.replace(';|\(|\$.*|tin', '', regex=True)  # Remove unwanted characters and substrings
    .str.replace('  ', ' ', regex=True)  # Replace double spaces with single
    .str.replace('8 18 grams pouches', '152 grams', regex=False)  # Specific case replacements
    .str.replace('350 grams 12.3 ounces', '350 grams', regex=False)
    .str.strip()  # Trim whitespace
    .mask(lambda x: x.str.contains('capsule|packet|pods|vue|k-cups|sticks|tubes|cups|boxed|discs|can', case=False), np.nan)
    # Set the value to NaN if certain words are present, indicating non-standard quantities
)

print(quantity.sample(10))

2006             100 grams
911              12 ounces
4645             12 ounces
5127             12 ounces
2428             12 ounces
4215             12 ounces
5732             12 ounces
6163             10 ounces
671              12 ounces
6283    250grams8.8 ounces
Name: amount, dtype: object


In [11]:
quantity_unit = (
    quantity
    .str.split(' ', expand=True)
    .rename(columns={0: 'quantity', 1: 'unit'})
    .replace({'pound': '1 pounds'})  # Handle the special case where 'pound' should be considered as '1 pound'
    .assign(
        quantity=lambda df_: df_['quantity'].str.extract('(\d+)')[0].astype(float),  # Extract numeric part and convert to float
        unit=lambda df_: df_['unit'].str.extract('([a-zA-Z]+)')[0]  # Extract unit part
    )
    .replace({'unit': {'pounds': 'pound',}})  # Normalize singular 'pounds' to 'pound' if needed
    .assign(quantity_grams=lambda df_: df_['quantity'] * df_['unit'].map(grams_conversions))
    .fillna(value=np.nan)
    .drop(columns=[2, 3])
    .dropna()
)

quantity_unit

NameError: name 'grams_conversions' is not defined

In [12]:

price = (
    df['est_price']
      .str.split('[/;]', n=1, expand=True)  # Split the string by '/' or ';' and expand to new DataFrame
      .iloc[:, 0]                           # Select the first column (price part before '/' or ';')
      .str.replace(',', '', regex=False)    # Remove commas from the price for proper conversion
      .str.extract(r'(\d+\.\d+|\d+)')       # Use regex to extract the complete price
      .iloc[:, 0]                           # Select the first column of the extraction result                   # Convert the extracted string to float
      .rename('nominal_price')
      .dropna()
)

price

0         160
1       22.00
2       16.00
3       18.00
4       22.00
        ...  
6456    14.99
6468    45.99
6470    19.95
6471       30
6725    11.95
Name: nominal_price, Length: 4619, dtype: object

In [13]:
currency = (
    df['est_price']
    .str.split('[/;]', n=1, expand=True)
    .loc[:, 0]
    .str.replace(',', '', regex=False)
    .str.replace(r'(\d+\.\d+|\d+)', '', regex=True)
    .str.strip()
    .str.replace(' ', '')
    .map(currency_map, na_action='ignore')
    .rename('currency')
    .dropna()
    )


currency
coffee = df.join(quantity_unit, how='left').join(price, how='left').join(currency, how='left')

coffee

NameError: name 'currency_map' is not defined

In [94]:
with open('data/exchange_rates.json', 'r') as f:
    exchange_rates = json.load(f)
    
df = (df
      .dropna(subset=['review_date', 'currency'])
      .astype({'review_date': 'str'})
      .assign(exchange_rate = lambda df_: df_.apply(lambda x: exchange_rates[x['review_date']][x['currency']], axis=1))
      .assign(price_usd = lambda df_: df_['nominal_price'] / df_['exchange_rate'])
      .assign(review_date = lambda df_: pd.to_datetime(df_['review_date']))
      .round({'price_usd': 2})
)

FileNotFoundError: [Errno 2] No such file or directory: 'data/exchange_rates.json'

In [80]:
# Read in CPI data
def cpi_date(filepath: str) -> pd.DataFrame:
    try:
        df = pd.read_csv(filepath).drop(columns=['HALF1', 'HALF2'])
    except Exception as e:
        print(e)
    return (df
          .melt(id_vars='Year', var_name='Month', value_name='cpi')
          .assign(date=lambda df_: pd.to_datetime(df_['Year'].astype(str) + '-' + df_['Month'], format='mixed'))
          .dropna(subset=['cpi'])
          .sort_values('date')
          .drop(columns=['Year', 'Month'])
    )

cpi = cpi_date('data/cpi.csv')
cpi_jan_2024 = cpi.loc[cpi['date'] == '2024-01-01', 'cpi'].values[0]

[Errno 2] No such file or directory: 'data/cpi.csv'


UnboundLocalError: cannot access local variable 'df' where it is not associated with a value

In [81]:
(
    df
    .join(cpi.set_index('date'), on='review_date')
    # Past dollars in terms of recent dollars = Dollar amount × Ending-period CPI / Beginning-period CPI.
    .assign(price_USD_2024 = lambda df_: np.round(df_['price_usd'] * cpi_jan_2024 / df_['cpi'], 2))
    .assign(price_USD_2024_per_100g = lambda df_: np.round(df_['price_USD_2024'] / df_['quantity_grams'] * 100, 2))

)

NameError: name 'cpi' is not defined