# Data Cleaning
- Load data that has been processed in OpenRefine and make some general tweaks
- Split est_price into quantity and price columns
    - Cleanup quantity units and convert all values to single unit
    - Cleanup and standardize currency symbols
- Load historical exchange rate data, and convert all data to historical USD using the exchange rate from the month of the 
review date
- Load consumer price index data from BLS. Use this to convert all historical USD prices to January 2024 USD prices
- Cleanup geographic data
    - Create a column for all counties
    - Create a column for US State
    - Check country column is only countries

In [159]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

pd.set_option('display.max_columns', None)
mpl.rcParams['figure.dpi']= 300

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [171]:
data_dir = Path('../../data')
file_path = data_dir / 'interim' / '05052024_roast_review_merged_openrefine.csv'
df_raw = pd.read_csv(file_path)

display(df_raw.info())
display(df_raw.sample(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6539 entries, 0 to 6538
Data columns (total 29 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   rating                            6539 non-null   int64  
 1   roaster                           6539 non-null   object 
 2   title                             6539 non-null   object 
 3   blind_assessment                  6539 non-null   object 
 4   bottom_line                       3260 non-null   object 
 5   roaster_location                  6538 non-null   object 
 6   coordinate location               6538 non-null   object 
 7   og_roaster_location               6537 non-null   object 
 8   roaster_location_identifier       6538 non-null   object 
 9   territorial_entity_1              6531 non-null   object 
 10  territorial_entity_1_identifiers  6531 non-null   object 
 11  territorial_entity_2              5889 non-null   object 
 12  territ

None

Unnamed: 0,rating,roaster,title,blind_assessment,bottom_line,roaster_location,coordinate location,og_roaster_location,roaster_location_identifier,territorial_entity_1,territorial_entity_1_identifiers,territorial_entity_2,territorial_entity_2_identifiers,roaster_country,coffee_origin,coffee_origin_country,roast_level,agtron,est_price,review_date,aroma,body,flavor,aftertaste,with_milk,url,acidity/structure,acidity,notes
5179,91,Paradise Roasters,Hawaii Ka’u AhSan Farm,"A rich, balanced lighter-roasted cup. Complex ...",,Ramsey,"45.232055555555554,-93.4605","Ramsey, Minnesota",Q1992875,Anoka County,Q110495,Minnesota,Q1527,USA,"Kau growing region, Big Island, Hawaii, USA",United States of America,Light,65/85,,December 2009,8.0,8.0,9.0,7.0,,https://www.coffeereview.com/review/hawaii-kau...,,8,Ka'u is a new Hawaii coffee district with a gr...
3174,93,JBC Coffee Roasters,Tolima Colombia,"Deeply pungent, sweet-savory. Magnolia, dried ...","A unique cup. Suggests a good IPA: sweet, thou...",Madison,"43.07472222222222,-89.38416666666667","Madison, Wisconsin",Q43788,Dane County,Q502200,Wisconsin,Q1537,USA,"Tolima Department, Colombia",Colombia,Medium-Light,59/82,$14.85/12 ounces,June 2016,9.0,9.0,9.0,8.0,,https://www.coffeereview.com/review/tolima-col...,,8,Produced by the Resguardo Indigena Nasa We'sx ...
5418,95,Willoughby's Coffee & Tea,Kenya AA Gachatha,"Dry berry notes (the famous Kenya ""black curra...",,Branford,"41.277777777778,-72.799722222222","Branford, Connecticut",Q898484,South Central Connecticut Planning Region,Q117287005,Connecticut,Q779,USA,Kenya,Kenya,Medium,47/58,,December 2008,9.0,8.0,9.0,8.0,,https://www.coffeereview.com/review/kenya-aa-g...,,8,Despite stresses brought on by social unrest a...
4816,76,Douwe Egberts,Senseo Colombian (paper pod),(As brewed in a Senseo single-serve brewing de...,,Downers Grove,"41.7947,-88.0169","Downers Grove, Illinois",Q1007011,DuPage County,Q109626,Illinois,Q1204,USA,Colombia,Colombia,Very Dark,0/50,$21.99/ 64 pods,April 2011,6.0,6.0,4.0,4.0,,https://www.coffeereview.com/review/senseo-col...,,6,"Douwe Egberts, the venerable Dutch firm become..."
6367,87,Neighbors Coffee,Guatemalan Huehuetenango,"Splendid aroma: sweet, smoky/spicy, cantaloupe...",,Oklahoma City,"35.4823,-97.5352","Oklahoma City, Oklahoma",Q34863,Oklahoma County,Q485038,Oklahoma,Q1649,USA,"Huehuetenango growing region, Guatemala",Guatemala,Medium,49/69,,September 2003,8.0,7.0,7.0,,,https://www.coffeereview.com/review/guatemalan...,,8,"Huehuetenango, a mountainous, rugged growing r..."


In [164]:
def tweak_coffee(df: pd.DataFrame) -> pd.DataFrame:
    return(
        df
        .assign(review_date = lambda df_: pd.to_datetime(df_['review_date'], format="%B %Y"),
                # Combing acidity and acidity/structure into one column, they are the same 
                # field but names used in reviews changed at one point
                acidity = lambda df_: df_['acidity'].fillna(df_['acidity/structure']),
                # Split the agtron column into one for external bean agtron data and ground 
                # bean agtron data
                agtron_external = lambda df_: (df_['agtron']
                                                       .str.split('/')
                                                       .str[0].str.strip()
                                                       .replace(['', 'NA', 'g', '0'], np.nan)
                                                       ),
                agtron_ground = lambda df_: (df_['agtron']
                                                     .str.split('/')
                                                     .str[1].str.strip()
                                                     .replace(['', 'NA', 'wb', '0'], np.nan)
                                                     )
        )
        .dropna(subset=['agtron_external', 'agtron_ground', 'acidity',
                        'review_date', 'est_price', 'coffee_origin',
                        'aroma', 'roast_level', 'aftertaste',]
        )
        .drop(columns=['with_milk', 'acidity/structure', 'agtron'])
        .astype({'agtron_external': 'float', 'agtron_ground': 'float', 'acidity': 'float'})
        .replace('', np.nan)
    )


df = df_raw.pipe(tweak_coffee)

# Save the review dates to a separate file
df.review_date.drop_duplicates().to_csv(data_dir / 'processed' / 'review_dates.csv', index=False)

display(df.info())
display(df.sample(3))


<class 'pandas.core.frame.DataFrame'>
Index: 4333 entries, 1 to 5163
Data columns (total 28 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   rating                            4333 non-null   int64         
 1   roaster                           4333 non-null   object        
 2   title                             4333 non-null   object        
 3   blind_assessment                  4333 non-null   object        
 4   bottom_line                       2860 non-null   object        
 5   roaster_location                  4333 non-null   object        
 6   coordinate location               4333 non-null   object        
 7   og_roaster_location               4333 non-null   object        
 8   roaster_location_identifier       4333 non-null   object        
 9   territorial_entity_1              4331 non-null   object        
 10  territorial_entity_1_identifiers  4331 non-null   obj

None

Unnamed: 0,rating,roaster,title,blind_assessment,bottom_line,roaster_location,coordinate location,og_roaster_location,roaster_location_identifier,territorial_entity_1,territorial_entity_1_identifiers,territorial_entity_2,territorial_entity_2_identifiers,roaster_country,coffee_origin,coffee_origin_country,roast_level,est_price,review_date,aroma,body,flavor,aftertaste,url,acidity,notes,agtron_external,agtron_ground
3572,91,Paradise Roasters,Peru Cajamarca,"Sweet, graceful, gently pungent. Walnut, peach...",,Ramsey,"45.232055555555554,-93.4605","Ramsey, Minnesota",Q1992875,Anoka County,Q110495,Minnesota,Q1527,USA,"Cajamarca growing region, Peru",Peru,Medium,$13.95/12 ounces,2015-03-01,9.0,8.0,8.0,8.0,https://www.coffeereview.com/review/peru-cajam...,8.0,Produced by small-holding farmers from trees o...,50.0,68.0
2036,90,Paradise Roasters,S.O. Espresso Brazil Fazenda Cachoeira Yellow ...,"Sweetly nut-toned, chocolaty. Hazelnut butter,...",Designed as a single-origin espresso but cuppe...,Minneapolis,"44.981944444444444,-93.26916666666666","Minneapolis, Minnesota",Q36091,Hennepin County,Q486229,Minnesota,Q1527,USA,"Mogiana growing region, Sao Paulo State, Brazil",Brazil,Medium,$14.95/12 ounces,2020-04-01,8.0,8.0,8.0,8.0,https://www.coffeereview.com/review/s-o-espres...,8.0,Produced at Fazenda Cachoeira de Grama entirel...,50.0,68.0
1016,95,RamsHead Coffee Roasters,Ethiopia Odola Washed,"Gently fruit-forward, elegantly floral. Wild r...","A pretty, quietly confident washed Ethiopia cu...",Bozeman,"45.677777777778,-111.04722222222","Bozeman, Montana",Q569678,Gallatin County,Q387978,Montana,Q1212,USA,"Guji Zone, Oromia Region, Ethiopia",Ethiopia,Light,$18.95/12 ounces,2022-05-01,9.0,9.0,10.0,8.0,https://www.coffeereview.com/review/ethiopia-o...,9.0,Produced by smallholding farmers surrounding t...,60.0,82.0


## Quantity and Price Data Cleaning
Here we will split est_price into a column for quantity, quantity unit, price, and currency. Currency symbols will be standardized and all prices will be converted to USD. Quanity units will be cleaned then used to create a column for quantity in lbs. 

In [166]:

def price_quantity_split(df: pd.DataFrame) -> pd.DataFrame:
    price_quantity = (
        df
        .replace(',', '', regex=True)
        # Split est_price into columns for price and quantity
        .est_price.str.split("/", n=1, expand=True)
        .rename(columns={0: 'price', 1: 'quantity'})
        .assign(quantity = lambda df_: (df_['quantity']
                                        # Remove anything in parentheses, including the parentheses
                                        .str.replace(r"\(.*?\)", "", regex=True)
                                        # Remove anything after a semicolon
                                        .str.replace(r";.*", "", regex=True)
                                        # Standardize units
                                        .str.replace(r".g$", " grams", regex=True)
                                        .str.replace(r"\sg$", "grams", regex=True)
                                        .str.replace(r"pound$", "1 pounds", regex=True)
                                        .str.replace(r"oz|onces|ounce$|ounces\*", "ounces", regex=True)
                                        # Removie "online" from any quantity
                                        .str.replace("online", "")
                                        .str.strip()
                                        )
            )
        .dropna()
        # Remove rows where coffee is sold in a can, box, pouch, packet, or tin
        .loc[lambda df_: ~df_['quantity'].str.contains('can|box|pouch|packet|tin'), :]
        # Split quantity into value and unit, and split price into value and currency
        .assign(quantity_value = lambda df_: (df_['quantity']
                                              .str.extract(r'(\d+)')
                                              .astype(float)
                                              ),
                quantity_unit = lambda df_: (df_['quantity']
                                             .str.replace(r"(\d+)", "", regex=True)
                                             .replace("\.", "", regex=True)
                                             .str.strip()
                                             .mask(lambda s: s == 'g', 'grams')
                                             .str.strip()
                                             ),
                price_value = lambda df_: (df_['price']
                                           .str.extract(r'(\d+\.\d+|\d+)')
                                           .astype(float)
                                           ),
                price_currency = lambda df_: (df_['price']
                                              .str.replace(",", "")
                                              .str.replace(r'(\d+\.\d+|\d+)', '', regex=True)
                                              .str.strip()
                                              )
                )
        # Now that price and quantity have been split, drop the original columns
        .drop(columns=['price', 'quantity'])
    )
    # Merge the price_quantity DataFrame with the original DataFrame
    return df.merge(price_quantity, left_index=True, right_index=True)


df = df_raw.pipe(tweak_coffee).pipe(price_quantity_split)

display(df.info())
display(df.loc[:, ['price_value', 'quantity_value']].describe())
display(df['price_currency'].value_counts())   
display(df['quantity_unit'].value_counts()) 

<class 'pandas.core.frame.DataFrame'>
Index: 4318 entries, 1 to 5163
Data columns (total 32 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   rating                            4318 non-null   int64         
 1   roaster                           4318 non-null   object        
 2   title                             4318 non-null   object        
 3   blind_assessment                  4318 non-null   object        
 4   bottom_line                       2852 non-null   object        
 5   roaster_location                  4318 non-null   object        
 6   coordinate location               4318 non-null   object        
 7   og_roaster_location               4318 non-null   object        
 8   roaster_location_identifier       4318 non-null   object        
 9   territorial_entity_1              4316 non-null   object        
 10  territorial_entity_1_identifiers  4316 non-null   obj

None

Unnamed: 0,price_value,quantity_value
count,4318.0,4318.0
mean,314.101797,35.186892
std,3831.617611,68.630375
min,4.73,1.0
25%,16.0,12.0
50%,19.7,12.0
75%,35.0,12.0
max,120000.0,554.0


price_currency
$           3458
NT $         578
CAD $         94
NT$           43
NTD $         21
HKD $         16
¥             11
RMB $          9
AUD $          9
KRW            9
NT             8
KRW $          7
TWD $          7
£              6
HK $           5
AED $          5
IDR $          3
$ NTD          3
THB $          3
THB            2
US $           2
USD $          2
#              2
RMB            2
GBP            2
$NT            1
               1
pesos          1
$NT$           1
Nt $           1
CNY $          1
KRW$           1
E              1
GTQ            1
Price: $       1
$ CAD          1
Name: count, dtype: int64

quantity_unit
ounces      3762
grams        543
pounds        12
kilogram       1
Name: count, dtype: int64

#### Cleaning Currencies



In [173]:
def clean_currency(df: pd.DataFrame) -> pd.DataFrame:
    price_currency = (
        df.price_currency
        .str.upper()
        .str.replace(r'^\$$', 'USD', regex=True)
        .str.replace('PRICE: $', 'USD')
        .str.replace('$', '')
        .str.replace('#', 'GBP')
        .str.replace('¥', 'JPY')
        .str.replace('£', 'GBP')
        .str.replace('POUND', 'GBP')
        .str.replace('PESOS', 'MXN')
        .str.replace('RMB', 'CNY')
        .str.strip()
        .mask(lambda s: s == "US", "USD")
        .mask(lambda s: s == ' ', "USD")
        .mask(lambda s: s == 'E', 'EUR')
        .mask(lambda s: s == 'NTD', 'TWD')
        .mask(lambda s: s == 'NT', 'TWD')
        .mask(lambda s: s == '', 'USD')
        .mask(lambda s: s == 'HK', 'HKD')
        .str.strip()
    )
    return df.assign(price_currency=price_currency)


df = df_raw.pipe(tweak_coffee).pipe(price_quantity_split).pipe(clean_currency)
df.loc[:, ["est_price", "price_currency"]].groupby('price_currency').sample(3, replace=True)

Unnamed: 0,est_price,price_currency
2824,AED $95.00/250 grams,AED
2824,AED $95.00/250 grams,AED
2490,AED $103.95/250 grams,AED
3159,AUD $15.00/250 grams,AUD
3181,AUD $24.00/300 grams,AUD
3159,AUD $15.00/250 grams,AUD
5097,CAD $31.20/32 oz.,CAD
4765,CAD $15.99/16 ounces,CAD
4687,CAD $52.50/12 ounces,CAD
404,RMB 399/100 grams,CNY


### Converting prices to 2024 USD
1. Convert price to USD using historical exchange rates
2. Adjust price to 2023 USD using BLS consumer price index

In [169]:
def convert_currency_to_usd(row):
    date = row['review_date'].strftime('%Y-%m-%d')
    currency = row['price_currency']
    price = row['price_value']
    if currency == 'USD':
        return price
    else:
        return np.round(price / exchange_rates[date][currency], 2)
    
with open(data_dir / 'external' / 'openex_exchange_rates.json') as f:
    exchange_rates = json.load(f)

coffee_df["price_value_usd_hist"] = coffee_df.apply(convert_currency_to_usd, axis=1)

coffee_df.loc[:, ["price_currency", "price_value", "price_value_usd_hist"]].groupby("price_currency").sample(3, replace=True)

KeyError: 'price_currency'

In [170]:

def clean_transform_cpi(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transforms a CPI DataFrame by melting and converting month strings to datetime objects.
    """
    MONTH_MAP = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
    }

    return (df
            .melt(id_vars='Year', var_name='Month', value_name='cpi')
            .assign(Month=lambda x: x['Month'].map(MONTH_MAP),
                    date=lambda x: pd.to_datetime(x[['Year', 'Month']].assign(day=1)))
            .dropna()
            .drop(columns=['Year', 'Month'])
            .rename(columns={'cpi': 'consumer_price_index'})
            .sort_values('date')
            .reset_index(drop=True)
           )
    
def create_cpi_adjusted_price(df: pd.DataFrame, file_path: Path) -> pd.DataFrame:
    """
    Adjusts historical price data to 2024 prices using CPI data.
    """
    try:
        cpi = pd.read_csv(data_dir / 'external' / 'consumer_price_index.csv', usecols=['Year', 'Jan', 'Feb', 'Mar',
                                                                                       'Apr', 'May', 'Jun',
                                                                                       'Jul', 'Aug', 'Sep',
                                                                                       'Oct', 'Nov', 'Dec'])
    except FileNotFoundError:
        raise FileNotFoundError("CPI file is not found in the specified directory.")
    
    cpi = clean_transform_cpi(cpi)
    cpi_jan_2024 = cpi.loc[cpi['date'] == '2024-01-01', 'consumer_price_index'].values[0]
    
    return (df
            .merge(cpi, left_on="review_date", right_on="date")
            .drop(columns='date')
            .assign(price_usd_2024=lambda df_: np.round(
                df_['price_value_usd_hist'] * cpi_jan_2024 / df_['consumer_price_index'], 2)
                    )
            )
      
data_dir = Path('../../data')
file_path = data_dir / 'external' / 'consumer_price_index.csv'
coffee_df = create_cpi_adjusted_price(coffee_df, file_path)

coffee_df.sample(3)

KeyError: 'price_value_usd_hist'

### Converting quantities to lbs

In [100]:
to_grams_conversion = {"ounces": 28.3495231, "pounds":453.59237, "kilogram": 1000, "grams": 1}
to_lbs_conversion = {"ounces": 1/16, "pounds":1, "kilogram": 2.20462, "grams": 0.00220462}

coffee_df['quantity_in_lbs'] = ( 
    np.round(coffee_df['quantity_value'] * 
    coffee_df['quantity_unit'].map(to_lbs_conversion), 2)
)

coffee_df.sample(3)

Unnamed: 0,rating,roaster,title,blind_assessment,bottom_line,roaster_location,coordinate location,og_roaster_location,roaster_location_identifier,territorial_entity_1,...,agtron_external,agtron_ground,quantity_value,quantity_unit,price_value,price_currency,price_value_usd_hist,consumer_price_index,price_usd_2024,quantity_in_lbs
2163,95,PT's Coffee Roasting Co.,Finca El Socorro Maracaturra,"Richly sweet, balanced, intricately layered. F...",A lovely example of the big-beaned Maracaturra...,Topeka,"39.04833,-95.67804","Topeka, Kansas",Q41057,Shawnee County,...,52.0,78.0,12.0,ounces,22.5,USD,22.5,252.439,27.49,0.75
2463,92,Allegro Coffee,Congo Muungano,"Sweetly savory, high-toned. Blackberry, freesi...",An intriguing Congo cup from a cooperative tha...,Thornton,"39.9031,-104.954","Thornton, Colorado",Q579761,Adams County,...,52.0,65.0,8.0,ounces,8.99,USD,8.99,244.955,11.32,0.5
2127,92,Magnolia Coffee,Guatemala Finca San Gerardo,"Crisp, richly sweet. Magnolia, dried persimmon...","A friendly, floral-toned Guatemala cup with un...",Charlotte,"35.226944444444,-80.843333333333","Charlotte, North Carolina",Q16565,Mecklenburg County,...,52.0,72.0,12.0,ounces,15.0,USD,15.0,252.885,18.29,0.75


In [101]:
coffee_df['price_usd_2024_per_lb'] = np.round(coffee_df['price_usd_2024'] / coffee_df['quantity_in_lbs'], 2)


coffee_df['price_usd_2024_per_lb'].describe().apply("{:.2f}".format)

count    4230.00
mean       50.81
std       123.39
min         5.36
25%        25.64
50%        30.45
75%        41.41
max      5180.00
Name: price_usd_2024_per_lb, dtype: object

In [105]:
coffee_df[coffee_df['price_usd_2024_per_lb'] < 6].url

1798    https://www.coffeereview.com/review/100-guatem...
Name: url, dtype: object

In [103]:
df[df['url'].isin(coffee_df[coffee_df['price_usd_2024_per_lb'] < 0.5].url)].est_price

Series([], Name: est_price, dtype: object)

In [104]:
coffee_df[coffee_df.url.isin(coffee_df[coffee_df['price_usd_2024_per_lb'] < 0.5].url)]

Unnamed: 0,rating,roaster,title,blind_assessment,bottom_line,roaster_location,coordinate location,og_roaster_location,roaster_location_identifier,territorial_entity_1,...,agtron_ground,quantity_value,quantity_unit,price_value,price_currency,price_value_usd_hist,consumer_price_index,price_usd_2024,quantity_in_lbs,price_usd_2024_per_lb
