In [1]:
import os
from datetime import date
import pandas as pd
import sys
sys.path.append('../src')
import data.clean as clean
import data.process as process

# Scrape raw data from Chewy

In [2]:
# Scraper run is disabled; uncomment the line below to invoke the `chewy_spider` crawl.
# NOTE(review): presumably this produces the dated raw CSV loaded further down — confirm.
#os.system("scrapy crawl chewy_spider")

# Read today's external data into a pandas df

In [3]:
# The raw CSV filename embeds today's date as day_month_year, so build that
# stamp first and then load the matching file.
# NOTE(review): this fails if no file exists for today's date — confirm the
# scrape has been run today before executing.
today = date.today()
d = date.strftime(today, '%d_%m_%Y')

raw_csv_path = f'../data/external/raw_chewy_data_{d}.csv'
df = pd.read_csv(raw_csv_path)

# Clean data

## Unpack attribute and guaranteed analysis (GA) dictionaries
Attribute information and guaranteed analysis (GA) information are stored as dictionaries in their columns. This is because different products have different fields/numbers of fields in these tables. So first things first, I expand those two columns.

In [4]:
# Expand each dictionary-valued column into separate columns, one helper call
# per source column; `ga` is True only for the guaranteed-analysis dictionary.
# (Run help(clean.dict_col_to_cols) for the helper's own documentation.)
for dict_column, is_ga in (('ga_dict', True), ('attr_dict', False)):
    df = clean.dict_col_to_cols(df=df, col=dict_column, ga=is_ga)

## Fill guaranteed analysis columns missing critical statistics

It is mandatory that cat food manufacturers report % protein min, % fat min, and % fiber max in their guaranteed analysis. So it's likely that when we don't have a value for one of these macros, but we do have a value for a very similarly named macro, a typo has been made. So here we fill our empty macro values with the values most likely to be correct.

In [5]:
# For each guaranteed-analysis column, fill missing values first from the
# unqualified column and then from the opposite-bound column.
ga_fallbacks = {
    '% protein min': ('% protein', '% protein max'),
    '% fat min': ('% fat', '% fat max'),
    '% fiber max': ('% fiber', '% fiber min'),
    '% moisture max': ('% moisture', '% moisture min'),
}
for target_col, (plain_col, opposite_col) in ga_fallbacks.items():
    filled = df[target_col].fillna(df[plain_col])
    df[target_col] = filled.fillna(df[opposite_col])

# df = df.dropna(thresh = df.shape[0] * 0.5, how = 'all', axis = 1)

## Extract kilocalories/mass or kilocalories/volume
The calories column contains information about the number of kilocalories per unit mass or volume of food. However, the units are not consistent, and often multiple values are reported. So we need to pull out the kcal/unit mass for dry food, and the kcal/unit volume for wet food (this is because total product size is given in mass for dry food and volume for wet food, and we'd like to eventually calculate U.S. dollar per kcal).

In [6]:
# help(clean.get_unit_val_col)

# Delimiter pattern shared by every extraction from the `calories` column.
# Written as a raw string so `\d` reaches the regex engine as-is instead of
# being an invalid string escape (a SyntaxWarning on recent Python versions).
calorie_split_regex = r'(?<!/) (?=\d)|,|;|:'

# (output column, unit pattern) pairs, processed in this order:
# kcal per unit of mass first, then kcal per unit of volume.
kcal_unit_specs = [
    ('kcal_per_kg', 'kcal.*kg'),
    ('kcal_per_lb', 'kcal.*lb'),
    ('kcal_per_oz', 'kcal.*oz|kcal.*ounce'),
    ('kcal_per_cup', 'kcal.*cup'),
]
for kcal_unit_name, kcal_unit_regex in kcal_unit_specs:
    df = clean.get_unit_val_col(df, column_name='calories',
                                col_split_regex=calorie_split_regex,
                                unit_name=kcal_unit_name,
                                unit_regex=kcal_unit_regex)

# kcal/item (kcal in a single can, pouch, etc.)
# NOTE(review): `kcl.*pouch` presumably catches a misspelling present in the
# scraped data — confirm it is intentional.
# NOTE(review): `\d+\.?\d+` in the denominator pattern needs at least two
# digits, so a single-digit size after the slash would not match — confirm
# this is intended.
df = clean.get_unit_val_col(df, column_name='calories',
                            col_split_regex=calorie_split_regex,
                            unit_name='kcal_per_item',
                            unit_regex=('kcal.*can|kcal.*pouch|kcl.*pouch|kcal.*unit|kcal.*bowl|'
                                        'kcal.*tray|kcal.*container|kcal.*serving|kcal.*tub|kcal.*pack'),
                            clear_inconsistent_rows=False,
                            get_denom_col=True,
                            denom_name='item_volume_for_kcal',
                            denom_regex=r'((?<=\/).*?\d+\.?\d+)')

## Extract item mass, volume, and/or count
For dry food we need the total mass of the bag; for wet food we need the total volume of the can/item unit and the number of cans/item units.

In [7]:
# Pull pack-size figures out of the comma-separated product name:
#   total_mass_lb - number of lbs per item
#   item_oz       - number of ounces per item
#   item_count    - number of items ("case of ...")
# WARNING: do NOT backfill total_mass_lb from the `weight` attribute;
# those weights are not all in lbs.
name_unit_specs = (
    ('total_mass_lb', 'lb|pound|Pound'),
    ('item_oz', 'oz|ounce'),
    ('item_count', 'case of'),
)
for name_unit, name_unit_regex in name_unit_specs:
    df = clean.get_unit_val_col(df, column_name='name',
                                col_split_regex=',',
                                unit_name=name_unit,
                                unit_regex=name_unit_regex)

# When the name gives no item count, treat the product as a single item.
df['item_count'] = df['item_count'].fillna(value=1)

## BELOW IS OUTDATED
## Do we have the necessary kcal and mass/volume values for all food?
### Dry food
For dry food, we need total mass, and kilocalories per unit of mass.
How many dry food products are missing either both `kcal_per_kg` and `kcal_per_lb`, or `total_mass_lb`?

In [8]:
# Report how many dry food products lack the values needed downstream:
# total mass, plus kcal per unit of mass (per kg or per lb).
d_food = df[df['food form'] == 'dry food']

dry_no_kcal_per_mass = d_food['kcal_per_kg'].isna() & d_food['kcal_per_lb'].isna()
dry_no_total_mass = d_food['total_mass_lb'].isna()
d_miss_val = d_food[dry_no_kcal_per_mass | dry_no_total_mass]

miss_lbs = d_miss_val[d_miss_val['total_mass_lb'].isna()]

miss_kcals = d_miss_val[d_miss_val['kcal_per_kg'].isna()
                        & d_miss_val['kcal_per_lb'].isna()]

# Filter from miss_kcals (not d_miss_val): the printed line reports this count
# as a subset of the products missing kcal/mass info, so rows that only lack
# total mass must not be counted here.
have_vol = miss_kcals[miss_kcals['item_oz'].notnull()
                      & (miss_kcals['kcal_per_oz'].notnull()
                         | miss_kcals['kcal_per_cup'].notnull())]

n_dry = len(d_food)
n_dry_missing = len(d_miss_val)
# Guard the percentage so an empty dry-food subset does not raise ZeroDivisionError.
pct_dry_missing = round(n_dry_missing / n_dry * 100) if n_dry else 0

print(f'\n {n_dry_missing} out of {n_dry} dry food products are missing at least one required value.',
      f'\n That is ~{pct_dry_missing} percent.',
      f'\n Of those {n_dry_missing}, {len(miss_lbs)} are missing total mass.',
      f'\n Of those {n_dry_missing}, {len(miss_kcals)} are missing kcal/unit of mass.',
      f'\n Of those {len(miss_kcals)} missing kcal/mass info, {len(have_vol)} have kcal/volume info.'
     )


 79 out of 564 dry food products are missing at least one required value. 
 That is ~14 percent. 
 Of those 79, 1 are missing total mass. 
 Of those 79, 78 are missing kcal/unit of mass. 
 Of those 78 missing kcal/mass info, 0 have kcal/volume info.


### Wet food
For wet food, we need item volume, item count, and either kcal per item or kcal per unit of volume.
How many wet food products are missing either `item_oz` or both of `kcal_per_oz` and `kcal_per_cup`, AND are missing `kcal_per_item`?

In [9]:
# Report how many wet food products lack the values needed downstream:
# item volume together with kcal per unit of volume, or else kcal per item.
w_food = df[df['food form'] == 'wet food']

wet_no_item_volume = w_food['item_oz'].isna()
wet_no_kcal_per_volume = w_food['kcal_per_oz'].isna() & w_food['kcal_per_cup'].isna()
wet_no_kcal_per_item = w_food['kcal_per_item'].isna()
w_miss_val = w_food[(wet_no_item_volume | wet_no_kcal_per_volume) & wet_no_kcal_per_item]

# Subset of the above that also lacks usable mass-based values
# (no kcal per kg/lb, or no total mass).
w_miss_val_and_mass = w_miss_val[(w_miss_val['kcal_per_kg'].isna()
                                  & w_miss_val['kcal_per_lb'].isna())
                                 | w_miss_val['total_mass_lb'].isna()]

n_wet = len(w_food)
n_wet_missing = len(w_miss_val)
# Guard the percentage so an empty wet-food subset does not raise ZeroDivisionError.
pct_wet_missing = round(n_wet_missing / n_wet * 100) if n_wet else 0

print(f'\n {n_wet_missing} out of {n_wet} wet food products are missing at least one required value.',
      f'\n That is ~{pct_wet_missing} percent.',
      f'\n Of those {n_wet_missing} missing kcal/volume info, '
      f'{n_wet_missing - len(w_miss_val_and_mass)} have kcal/mass info.'
     )



 116 out of 1474 wet food products are missing at least one required value. 
 That is ~8 percent. 
 Of those 116 missing kcal/volume info, 0 have kcal/mass info.
