In [13]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [14]:
df = pd.read_csv('../data/nutrition_dataset.csv')
df.head(5)

Unnamed: 0,Name,Meal,Meal_cleaned,Brand,Calories,Fat,Carbs,Protein,Amount,Units,Cals Per Unit
0,sbarro - spinach stromboli,spinach stromboli,spinach stromboli,sbarro,900,41.0,96.0,33,376.0,g,3.0
1,sbarro - caesar salad,caesar salad,caesar salad,sbarro,80,5.0,6.0,2,8.0,oz,10.0
2,"papa johns - large, pepperoni and bacon, norma...","large, pepperoni and bacon, normal crust","large , pepperoni bacon , normal crust",papa johns,700,32.0,74.0,28,2.0,slice,350.0
3,papa johns - cheese sticks per web site,cheese sticks per web site,cheese stick per web site,papa johns,185,8.0,21.0,7,2.0,sticks,93.0
4,papa johns - garlic dipping cup,garlic dipping cup,garlic dipping cup,papa johns,75,9.0,0.0,0,0.5,cup,150.0


In [15]:
df.iloc[0][['Name', 'Meal']]

Name    sbarro - spinach stromboli
Meal             spinach stromboli
Name: 0, dtype: object

## Drop these with empty Name and empty Meal

In [62]:
before = df.shape[0]

empty_name = (df['Name'].str.strip() == '') | (df['Name'].str.lower() == 'nan') | (df['Name'].str.strip() == '-')
empty_meal = (df['Meal'].str.strip() == '') | (df['Meal'].str.lower() == 'nan') | (df['Meal'].str.strip() == '-')
df.loc[empty_name, 'Name'] = None
df.loc[empty_meal, 'Meal'] = None

df.dropna(subset=['Name', 'Meal'], how='all', inplace=True)

f"Before: {before}, After: {df.shape[0]}"

'Before: 558799, After: 558424'

In [17]:
df['Meal_cleaned'] = df['Meal']

## Impute empty Meal names

In [18]:
empty_meals = df['Meal_cleaned'].str.strip() == ''
df[empty_meals] = np.nan

empty_meals = df['Meal_cleaned'].isna()
print(df[empty_meals].shape[0])
df[empty_meals].head(5)

df.loc[empty_meals, 'Meal_cleaned'] = df['Name']

13246


In [19]:
# Any left empty meals?
empty_meals = df['Meal_cleaned'].isna()
print(df[empty_meals].shape[0])

0


## Clean text

In [20]:
# Example with 5 rows

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

for i, row in df.head(5).iterrows():
    print(i, row['Meal_cleaned'])
    # Separate by whitespaces, commas, dots, etc.
    tokens = word_tokenize(row['Meal_cleaned'])
    print(tokens)

    # Remove stop words like "the", "a", "an", etc.
    tokens = [w for w in tokens if w not in stop_words]
    print(tokens)

    # Lemmatize words by e.g. making "burgers" into "burger"
    tokens = [lemmatizer.lemmatize(word, pos = "n") for word in tokens]
    print(tokens)

    # Put it all together again
    df.loc[i, 'Meal_cleaned'] = ' '.join(tokens)
    print(row['Meal_cleaned'])


0  spinach stromboli
['spinach', 'stromboli']
['spinach', 'stromboli']
['spinach', 'stromboli']
 spinach stromboli
1  caesar salad
['caesar', 'salad']
['caesar', 'salad']
['caesar', 'salad']
 caesar salad
2  large, pepperoni and bacon, normal crust
['large', ',', 'pepperoni', 'and', 'bacon', ',', 'normal', 'crust']
['large', ',', 'pepperoni', 'bacon', ',', 'normal', 'crust']
['large', ',', 'pepperoni', 'bacon', ',', 'normal', 'crust']
 large, pepperoni and bacon, normal crust
3  cheese sticks  per web site 
['cheese', 'sticks', 'per', 'web', 'site']
['cheese', 'sticks', 'per', 'web', 'site']
['cheese', 'stick', 'per', 'web', 'site']
 cheese sticks  per web site 
4  garlic dipping cup
['garlic', 'dipping', 'cup']
['garlic', 'dipping', 'cup']
['garlic', 'dipping', 'cup']
 garlic dipping cup


In [21]:
# Put together to a function
def clean_text(text):
    text = text.strip()
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    tokens = [lemmatizer.lemmatize(word, pos = "n") for word in tokens]
    return ' '.join(tokens)

# Apply
df['Meal_cleaned'] = df['Meal_cleaned'].apply(clean_text)

In [22]:
df = df[['Name', 'Meal', 'Meal_cleaned', 'Brand', 'Calories', 'Fat', 'Carbs', 'Protein',
       'Amount', 'Units', 'Cals Per Unit']]

In [23]:
df.head(5)

Unnamed: 0,Name,Meal,Meal_cleaned,Brand,Calories,Fat,Carbs,Protein,Amount,Units,Cals Per Unit
0,sbarro - spinach stromboli,spinach stromboli,spinach stromboli,sbarro,900,41.0,96.0,33,376.0,g,3.0
1,sbarro - caesar salad,caesar salad,caesar salad,sbarro,80,5.0,6.0,2,8.0,oz,10.0
2,"papa johns - large, pepperoni and bacon, norma...","large, pepperoni and bacon, normal crust","large , pepperoni bacon , normal crust",papa johns,700,32.0,74.0,28,2.0,slice,350.0
3,papa johns - cheese sticks per web site,cheese sticks per web site,cheese stick per web site,papa johns,185,8.0,21.0,7,2.0,sticks,93.0
4,papa johns - garlic dipping cup,garlic dipping cup,garlic dipping cup,papa johns,75,9.0,0.0,0,0.5,cup,150.0


## Remove "quick added calories"

In [126]:
df.drop(df[df['Meal_cleaned'].str.contains('quick added')].index, inplace=True)

## Clean units

In [82]:
df['Units'] = df['Units'].str.strip().str.lower()

## Determine base units

In [144]:
# First assumption: All foods with am amount bigegr than 10 are likely to be base units
base_units = df[df['Amount'] > 10]['Units'].unique()

# Next we jsut want to keep those with less than 30 calories per unit, since this includes most of the base units
base_units = df[df['Units'].isin(base_units) & ~(df['Cals Per Unit'] > 10)]['Units'].unique()
print(len(base_units))

base_units.sort()
len(base_units), base_units

for unit in base_units:
    mean = df[df['Units'] == unit]['Cals Per Unit'].mean()
    print(unit)

#df.iloc[7704]['Name'].strip()
#df[df['Name'].str.strip().str.len() == 0]

443

"
-
-10
-10oz
-18
-32
1
1 2
1 4
1 5
14g
15
2 5
3 10
3 4
3 5
30g
3oz 84g
9 10
about
almond
almonds
baby
ball
balls
bar
beans
berries
berry
berrys
biscuit
biscuitd
biscuits
biscuits 55g
biscuts
bisquits
bisuits
bites
blackberry
blueberries
bottle
brussels
buscuits
cakes
cal
calorie
calories
cals
can
candies
candy
capers
caplets
capsule
carb
carrots
cashew
cashews
cc
cheetos
cherries
cherry
cherry 6 8g
chews
chip
chips
chips 1
chips 15g
chips 23g
chips 28g
chips 50g
chocolate
chunks
cl
clove
container
cookies
cra
cracker
crackers
crackers 1 1oz 30g
crakers
creamer
crisps
croutons
crutons
cube
cubes
cup
cup s
cups
drained
drops
ea
each
egg
eggs
extra
fish
fl
fl oz
floz
fluid
fries
full
g
g 08dl
g 1
g 1 16oz
g 1 1oz
g 1 2
g 1 2 5can
g 1 3cup
g 1 3cup 30g
g 1 4
g 1 4cup
g 1 50g
g 1 71g
g 1 9
g 1 9oz
g 12 5g-1bar
g 125ml
g 12g-1piece
g 12g-1sachet
g 14g-1piece
g 15
g 15 5-1piece
g 15g-about
g 16
g 17g-1
g 18g-1piece
g 1bag
g 1bar
g 1cone
g 1ear
g 1muffin
g 1oz
g 1pce 6 1g
g 1row 15 45g
g

In [127]:
# Tried to identify grams by its mean calories per unit, but it didn't work
# for unit in base_units:
#     mean = df[df['Units'] == 'g']['Cals Per Unit'].mean()

#     if mean < 2 or mean > 4: continue

#     print(unit, mean)

In [254]:
# Found someone who mistypoed kg as lg
df.loc[df['Units'] == 'lg', ['Units']] = 'kg'

In [262]:
# Investigate single units
# print(df[df['Units'] == 'g']['Cals Per Unit'].describe())
# print(df[df['Units'] == 'grams']['Cals Per Unit'].describe())
# print(df[df['Units'] == 'oz']['Cals Per Unit'].describe())

unit = 'tgsp'

print(df[df['Units'] == unit]['Cals Per Unit'].describe())

df[df['Units'] == unit]

count     1.0
unique    1.0
top       9.0
freq      1.0
Name: Cals Per Unit, dtype: float64


Unnamed: 0,Name,Meal,Meal_cleaned,Brand,Calories,Fat,Carbs,Protein,Amount,Units,Cals Per Unit,Units_cleaned,Calories Per Portion
471445,lucerene - best of the egg,best of the egg,best egg,lucerene,67,0.0,3.0,13,8.0,tgsp,9.0,portion,8.375


In [263]:
base_unit_mappings = {
    'cc': 'cl',
    'cl': 'cl',
    '-flasche': 'cup',
    'cup': 'cup',
    'cup 175': 'cup',
    'cup 200g': 'cup',
    'cup 206g': 'cup',
    'cup 213g': 'cup',
    'cup 227g': 'cup',
    'cup 236g': 'cup',
    'cup 240': 'cup',
    'cup 240ml': 'cup',
    'cup 250': 'cup',
    'cup 250gm': 'cup',
    'cup 250ml': 'cup',
    'cup 251g': 'cup',
    'cup s': 'cup',
    'cup serving': 'cup',
    'cups': 'cup',
    'g 200ml': 'cup',
    'glas': 'cup',
    'glass': 'cup',
    'glass 250ml': 'cup',
    'glass-5oz': 'cup',
    'dl': 'dl',
    '-fl': 'fl oz',
    'fl': 'fl oz',
    'fl oz': 'fl oz',
    'flox': 'fl oz',
    'floz': 'fl oz',
    'fluid': 'fl oz',
    'lf': 'fl oz',
    'g': 'g',
    'g 08dl': 'g',
    'g 1': 'g',
    'g 1 16oz': 'g',
    'g 1 1oz': 'g',
    'g 1 2': 'g',
    'g 1 2 5can': 'g',
    'g 1 3cup': 'g',
    'g 1 3cup 30g': 'g',
    'g 1 4': 'g',
    'g 1 4cup': 'g',
    'g 1 50g': 'g',
    'g 1 71g': 'g',
    'g 1 9': 'g',
    'g 1 9oz': 'g',
    'g 12 5g-1bar': 'g',
    'g 125ml': 'g',
    'g 12g-1piece': 'g',
    'g 12g-1sachet': 'g',
    'g 14g-1piece': 'g',
    'g 15': 'g',
    'g 15 5-1piece': 'g',
    'g 15g-about': 'g',
    'g 16': 'g',
    'g 17g-1': 'g',
    'g 18g-1piece': 'g',
    'g 1bag': 'g',
    'g 1bar': 'g',
    'g 1cone': 'g',
    'g 1ear': 'g',
    'g 1muffin': 'g',
    'g 1oz': 'g',
    'g 1pce 6 1g': 'g',
    'g 1row 15 45g': 'g',
    'g 1sachet 35g': 'g',
    'g 1scoop': 'g',
    'g 1serve 125g': 'g',
    'g 1serve 200g': 'g',
    'g 1serve 212g': 'g',
    'g 1serve 220g': 'g',
    'g 1serve 32g': 'g',
    'g 1serve 40g': 'g',
    'g 1serve 47g': 'g',
    'g 1serve 50g': 'g',
    'g 1serve 75g': 'g',
    'g 1tablespoon': 'g',
    'g 1tbsp': 'g',
    'g 1tbspn 20g': 'g',
    'g 2': 'g',
    'g 2 3cup': 'g',
    'g 2 6': 'g',
    'g 2 71': 'g',
    'g 2 90g': 'g',
    'g 20': 'g',
    'g 20g-1piece': 'g',
    'g 20g-1small': 'g',
    'g 25': 'g',
    'g 250 350ml': 'g',
    'g 250ml 1': 'g',
    'g 25g-3crackers': 'g',
    'g 25g-3pieces': 'g',
    'g 26': 'g',
    'g 28g-4pieces': 'g',
    'g 28g-about': 'g',
    'g 2oz': 'g',
    'g 2pieces-30g': 'g',
    'g 2sl 59g': 'g',
    'g 2sl 60g': 'g',
    'g 2sl 67g': 'g',
    'g 2tbsp': 'g',
    'g 3': 'g',
    'g 3 4': 'g',
    'g 3 4cup-30g': 'g',
    'g 3 5': 'g',
    'g 30': 'g',
    'g 32g-2tbsp': 'g',
    'g 35': 'g',
    'g 36g-3pieces': 'g',
    'g 38g-1packet': 'g',
    'g 3oz': 'g',
    'g 3pieced-21g': 'g',
    'g 3pieces-26g': 'g',
    'g 4': 'g',
    'g 4 5oz': 'g',
    'g 43g': 'g',
    'g 45g-2 3cup': 'g',
    'g 49g-1 3cup': 'g',
    'g 4oz': 'g',
    'g 4pieces': 'g',
    'g 4pieces-21g': 'g',
    'g 5 3oz': 'g',
    'g 5 5': 'g',
    'g 50g-1pack': 'g',
    'g 50g-1piece': 'g',
    'g 58g-2pieces': 'g',
    'g 59g-1cup': 'g',
    'g 5pieces': 'g',
    'g 60g-1': 'g',
    'g 62 5g-5pieces': 'g',
    'g 6oz': 'g',
    'g 7 37': 'g',
    'g 7 5g 1 4c': 'g',
    'g 7 9oz': 'g',
    'g 8 3oz': 'g',
    'g 8oz': 'g',
    'g 9': 'g',
    'g 9oz': 'g',
    'g about': 'g',
    'g container': 'g',
    'g ml': 'g',
    'g one': 'g',
    'g reg': 'g',
    'g s': 'g',
    'g serve': 'g',
    'g-': 'g',
    'g-1': 'g',
    'g-1pz': 'g',
    'g-1tbls': 'g',
    'g-2': 'g',
    'g-2tbsp': 'g',
    'g-drained': 'g',
    'g-ish': 'g',
    'ge': 'g',
    'get': 'g',
    'gm': 'g',
    'gm1': 'g',
    'gms': 'g',
    'gms 2tbsp': 'g',
    'gr': 'g',
    'gr 1': 'g',
    'gr 1pack': 'g',
    'gram': 'g',
    'gram s': 'g',
    'gram w': 'g',
    'gramas': 'g',
    'grame': 'g',
    'gramm': 'g',
    'grammes': 'g',
    'grammi': 'g',
    'gramms': 'g',
    'gramos': 'g',
    'grams': 'g',
    'grams 1': 'g',
    'grams 1 2': 'g',
    'grams 1 4': 'g',
    'grams 18': 'g',
    'grams 2': 'g',
    'grams 31': 'g',
    'grams 4oz': 'g',
    'grams 6"': 'g',
    'grams 8': 'g',
    'grams aprox': 'g',
    'grams-': 'g',
    'grm': 'g',
    'grms': 'g',
    'grs': 'g',
    'kg': 'kg',
    'kg s': 'kg',
    'l': 'l',
    'liter': 'l',
    'litre': 'l',
    'litro': 'l',
    'lb': 'lb',
    'lb 16': 'lb',
    'lb s': 'lb',
    'lbs': 'lb',
    'mcg': 'mg',
    'mg': 'mg',
    'micrograms': 'mg',
    'milligram': 'mg',
    'kl': 'ml',
    'm': 'ml',
    'mil': 'ml',
    'mililitros': 'ml',
    'milliliter': 'ml',
    'milliliters': 'ml',
    'mils': 'ml',
    'ml': 'ml',
    'ml 1': 'ml',
    'ml 100ml-48g': 'ml',
    'ml 100ml-84g': 'ml',
    'ml 11 15': 'ml',
    'ml 118ml-99g': 'ml',
    'ml 1cone': 'ml',
    'ml 1glass': 'ml',
    'ml 1serve 1 4cup': 'ml',
    'ml 236ml-1packet': 'ml',
    'ml 24oz': 'ml',
    'ml 2fl': 'ml',
    'ml 2tsp': 'ml',
    'ml 3oz': 'ml',
    'ml 68ml-1bottle': 'ml',
    'ml 8 5oz': 'ml',
    'ml s': 'ml',
    'ml-3 4': 'ml',
    'mls': 'ml',
    '25': 'oz',
    '28': 'oz',
    '29': 'oz',
    '30': 'oz',
    '1oz': 'oz',
    '26g': 'oz',
    '27g': 'oz',
    '30g': 'oz',
    '30g': 'oz',
    '30grms': 'oz',
    '32g': 'oz',
    '80g 2 8oz': 'oz',
    'cup-30g': 'oz',
    'o': 'oz',
    'o z': 'oz',
    'onz': 'oz',
    'ounce': 'oz',
    'ounce 112g': 'oz',
    'ounce 15': 'oz',
    'ounce 36': 'oz',
    'ounce-': 'oz',
    'ounces': 'oz',
    'ounces 113g': 'oz',
    'ounces 162': 'oz',
    'ounces 56': 'oz',
    'ounces 8': 'oz',
    'ounces 85': 'oz',
    'ounces-': 'oz',
    'ounces1': 'oz',
    'ounches': 'oz',
    'ounzes': 'oz',
    'ournces': 'oz',
    'ox': 'oz',
    'oz': 'oz',
    'oz   25': 'oz',
    'oz  1': 'oz',
    'oz  112g': 'oz',
    'oz  113': 'oz',
    'oz  118': 'oz',
    'oz  14': 'oz',
    'oz  142g': 'oz',
    'oz  18': 'oz',
    'oz  2': 'oz',
    'oz  240ml': 'oz',
    'oz  28': 'oz',
    'oz  28g': 'oz',
    'oz  28g about': 'oz',
    'oz  312': 'oz',
    'oz  340g': 'oz',
    'oz  45': 'oz',
    'oz  591ml': 'oz',
    'oz  6': 'oz',
    'oz  85g about': 'oz',
    'oz  9chips': 'oz',
    'oz  cooked': 'oz',
    'oz  slice': 'oz',
    'oz -': 'oz',
    'oz 1': 'oz',
    'oz 1 4': 'oz',
    'oz 103 5g': 'oz',
    'oz 105g': 'oz',
    'oz 112g': 'oz',
    'oz 113': 'oz',
    'oz 113g': 'oz',
    'oz 125': 'oz',
    'oz 140g': 'oz',
    'oz 15': 'oz',
    'oz 150g': 'oz',
    'oz 153g': 'oz',
    'oz 156': 'oz',
    'oz 16': 'oz',
    'oz 165g': 'oz',
    'oz 16floz': 'oz',
    'oz 170g': 'oz',
    'oz 18g': 'oz',
    'oz 1bread': 'oz',
    'oz 1serving  85g': 'oz',
    'oz 2': 'oz',
    'oz 200g': 'oz',
    'oz 240ml': 'oz',
    'oz 24fl': 'oz',
    'oz 24floz': 'oz',
    'oz 25g': 'oz',
    'oz 28': 'oz',
    'oz 28 3g': 'oz',
    'oz 28g': 'oz',
    'oz 28g 13chips': 'oz',
    'oz 28g 31chips': 'oz',
    'oz 28g 39': 'oz',
    'oz 28g 3chips': 'oz',
    'oz 28g about': 'oz',
    'oz 28g pack': 'oz',
    'oz 28grams about': 'oz',
    'oz 3': 'oz',
    'oz 3 5g': 'oz',
    'oz 30g': 'oz',
    'oz 31 1g 1': 'oz',
    'oz 330': 'oz',
    'oz 35g pack': 'oz',
    'oz 3slices': 'oz',
    'oz 40g': 'oz',
    'oz 42': 'oz',
    'oz 42 5': 'oz',
    'oz 43g': 'oz',
    'oz 45g': 'oz',
    'oz 49 6': 'oz',
    'oz 52g': 'oz',
    'oz 56g': 'oz',
    'oz 56g  25': 'oz',
    'oz 56g 3 4': 'oz',
    'oz 57': 'oz',
    'oz 57g': 'oz',
    'oz 57g0': 'oz',
    'oz 6': 'oz',
    'oz 60': 'oz',
    'oz 70g': 'oz',
    'oz 74g': 'oz',
    'oz 8': 'oz',
    'oz 84': 'oz',
    'oz 84g': 'oz',
    'oz 85g': 'oz',
    'oz 85gm': 'oz',
    'oz 9': 'oz',
    'oz 90g': 'oz',
    'oz about': 'oz',
    'oz can': 'oz',
    'oz cup': 'oz',
    'oz s': 'oz',
    'oz slice': 'oz',
    'oz slices': 'oz',
    'oz tall': 'oz',
    'oz-': 'oz',
    'oz--about': 'oz',
    'oz-1': 'oz',
    'oz-142g': 'oz',
    'oz-1chips': 'oz',
    'oz-28g': 'oz',
    'oz-8': 'oz',
    'oz-grande': 'oz',
    'ozs': 'oz',
    'shot 1 30ml': 'oz',
    'shot-1oz': 'oz',
    'shot1': 'oz',
    'shots': 'oz',
    't': 'tbsp',
    't  30g': 'tbsp',
    'tabelspoon': 'tbsp',
    'tabelspoons': 'tbsp',
    'tabl': 'tbsp',
    'table': 'tbsp',
    'tablepspoon': 'tbsp',
    'tablesoon': 'tbsp',
    'tablesoons': 'tbsp',
    'tablespool': 'tbsp',
    'tablespoom': 'tbsp',
    'tablespoon': 'tbsp',
    'tablespoon  15g': 'tbsp',
    'tablespoon 15g': 'tbsp',
    'tablespoon- 15ml': 'tbsp',
    'tablespoons': 'tbsp',
    'tablespoons 14g': 'tbsp',
    'tablespoons 17g': 'tbsp',
    'tablespoons 20g': 'tbsp',
    'tablespoons 32g': 'tbsp',
    'tablespoons 50g': 'tbsp',
    'tablespoonss': 'tbsp',
    'tablespooons': 'tbsp',
    'tablespoos': 'tbsp',
    'tablesppon': 'tbsp',
    'tablesppons': 'tbsp',
    'tb': 'tbsp',
    'tbl': 'tbsp',
    'tblespn': 'tbsp',
    'tbls': 'tbsp',
    'tblsp': 'tbsp',
    'tblspn': 'tbsp',
    'tblspns': 'tbsp',
    'tblspoon': 'tbsp',
    'tblsps': 'tbsp',
    'tbp': 'tbsp',
    'tbpn': 'tbsp',
    'tbps': 'tbsp',
    'tbs': 'tbsp',
    'tbs 15': 'tbsp',
    'tbs 1oz': 'tbsp',
    'tbs 6 92g': 'tbsp',
    'tbs- 14g': 'tbsp',
    'tbs--21g': 'tbsp',
    'tbsb': 'tbsp',
    'tbsn': 'tbsp',
    'tbsp': 'tbsp',
    'tbsp  14g': 'tbsp',
    'tbsp  14gr  1 2': 'tbsp',
    'tbsp  15g': 'tbsp',
    'tbsp  28g': 'tbsp',
    'tbsp  46g': 'tbsp',
    'tbsp  5': 'tbsp',
    'tbsp  7g  0 25oz': 'tbsp',
    'tbsp 1': 'tbsp',
    'tbsp 12g': 'tbsp',
    'tbsp 14': 'tbsp',
    'tbsp 14g': 'tbsp',
    'tbsp 15': 'tbsp',
    'tbsp 15 2grams': 'tbsp',
    'tbsp 15g': 'tbsp',
    'tbsp 15ml': 'tbsp',
    'tbsp 17': 'tbsp',
    'tbsp 17g': 'tbsp',
    'tbsp 19': 'tbsp',
    'tbsp 20g': 'tbsp',
    'tbsp 21g': 'tbsp',
    'tbsp 28': 'tbsp',
    'tbsp 28g': 'tbsp',
    'tbsp 28g 1oz': 'tbsp',
    'tbsp 30': 'tbsp',
    'tbsp 30g': 'tbsp',
    'tbsp 30ml': 'tbsp',
    'tbsp 31': 'tbsp',
    'tbsp 31g': 'tbsp',
    'tbsp 32g': 'tbsp',
    'tbsp 34g': 'tbsp',
    'tbsp 3g': 'tbsp',
    'tbsp 45ml': 'tbsp',
    'tbsp 4g': 'tbsp',
    'tbsp 5 ml': 'tbsp',
    'tbsp 5g': 'tbsp',
    'tbsp 60ml': 'tbsp',
    'tbsp 6g': 'tbsp',
    'tbsp 7': 'tbsp',
    'tbsp 7g': 'tbsp',
    'tbsp 9': 'tbsp',
    'tbsp oz': 'tbsp',
    'tbsp s': 'tbsp',
    'tbsp-': 'tbsp',
    'tbsp-15ml': 'tbsp',
    'tbsp-30g': 'tbsp',
    'tbspn': 'tbsp',
    'tbspns': 'tbsp',
    'tbspp': 'tbsp',
    'tbsps': 'tbsp',
    'tbsps 29g': 'tbsp',
    'tbst': 'tbsp',
    'tdsp': 'tbsp',
    'tlb': 'tbsp',
    'tlbs': 'tbsp',
    'tlbsp': 'tbsp',
    'tpsp': 'tbsp',
    'tsb': 'tbsp',
    'tsbp': 'tbsp',
    'tspb': 'tbsp',
    'ttbsp': 'tbsp',
    '1tsp': 'tsp',
    't 14': 'tsp',
    'teaspon': 'tsp',
    'teaspoon': 'tsp',
    'teaspoons': 'tsp',
    'teaspooon': 'tsp',
    'tps': 'tsp',
    'ts': 'tsp',
    'tsp': 'tsp',
    'tsp  5g': 'tsp',
    'tsp  8g': 'tsp',
    'tsp  9g': 'tsp',
    'tsp 0 3': 'tsp',
    'tsp 1 3g': 'tsp',
    'tsp 2': 'tsp',
    'tsp 3gms': 'tsp',
    'tsp 5g': 'tsp',
    'tsp 5ml': 'tsp',
    'tsp 7g': 'tsp',
    'tsp s': 'tsp',
    'tspn': 'tsp',
    'tspns': 'tsp',
    'tsps': 'tsp',
}

df["Units_cleaned"] = df["Units"].apply(lambda x: base_unit_mappings[x] if x in base_unit_mappings else "portion")

#df.loc[base_unit__mask, "Units_cleaned"]
df

Unnamed: 0,Name,Meal,Meal_cleaned,Brand,Calories,Fat,Carbs,Protein,Amount,Units,Cals Per Unit,Units_cleaned,Calories Per Portion
0,sbarro - spinach stromboli,spinach stromboli,spinach stromboli,sbarro,900,41.0,96.0,33,376.0,g,3.0,g,
1,sbarro - caesar salad,caesar salad,caesar salad,sbarro,80,5.0,6.0,2,8.0,oz,10.0,oz,
2,"papa johns - large, pepperoni and bacon, norma...","large, pepperoni and bacon, normal crust","large , pepperoni bacon , normal crust",papa johns,700,32.0,74.0,28,2.0,slice,350.0,portion,350.0
3,papa johns - cheese sticks per web site,cheese sticks per web site,cheese stick per web site,papa johns,185,8.0,21.0,7,2.0,sticks,93.0,portion,92.5
4,papa johns - garlic dipping cup,garlic dipping cup,garlic dipping cup,papa johns,75,9.0,0.0,0,0.5,cup,150.0,cup,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
558821,"generic - green pepper, raw 100 g","green pepper, raw 100 g","green pepper , raw 100 g",generic,5,0.0,1.0,0,25.0,g,1.0,g,
558822,"generic - onions, sweet, raw 100 g usda data","onions, sweet, raw 100 g usda data","onion , sweet , raw 100 g usda data",generic,8,0.0,2.0,0,25.0,g,1.0,g,
558823,publix - tomato ketchup - large bottle,tomato ketchup - large bottle,tomato ketchup - large bottle,publix,30,0.0,8.0,0,2.0,tbsp,15.0,tbsp,
558824,louana - all natural pure coconut oil,all natural pure coconut oil,natural pure coconut oil,louana,40,5.0,0.0,0,0.33,tbsp,122.0,tbsp,


In [264]:
# Check the orignal units of those that we categorized with portion
print("".join([f"\n{x}" for x in df.loc[df['Units_cleaned'] == 'portion', 'Units'].unique()]))


slice
sticks
pack
pizza
spear
bun
patty
slices
skillet
roll
shot
sandwhich
cookie
donut
pouch
container
stick
packet
bagel
meal
sandwich
hash
egg
banana
cube
pieces
bottle
large
creamer
taco
pint
piece
pastry
strips 213g
order
side
medium
wrap
stalks
ragoons
4  long
serving s
halves
biscuit
chicks
crackers
and
bar
rolls
muffin
serving
pkg
burger
package
soft
waffle
tomatoes
crkr
doughnut
cup 40g
flour
links
scoop
1 2
biscuits
bar 23g
slice 28g
apple
link
scoops
box
grilled
cupcake
can
small
cake
muffins
full
"
pancake
cookies
inch
pie
capsule
chop
piece slice
level
squares
tablets
tablet
c
burrito
sheet
rings
lavash
tray
inches
crackers 2
cooked
bag
whole
us
g 2"
orange
-
rasher
raw
eggs
rashers
oatcake
focaccia
pear
-2 5"
spray
crabs
sausage
thin
of
sandwiches
pills
capsules
vegetable
tea
fruit
carrots
leaf
softgel
wings
hot
cheesestring s
each
peach
k-cup
patties
strip
tomato
bowl
berries
ea
rack
figs
punnet
omelette
tortilla
clementine
rounded
fillet
peppers
snack
pan
shells
tamale

## Define the calories per portion

In [275]:
# Those with amount 0, impute 1
df['Amount'] = df['Amount'].replace(0, 1)
df['Amount'] = df['Amount'].replace('kg', 1)
df['Amount'] = df['Amount'].astype(float)

df['Amount'].dtypes

dtype('float64')

In [280]:
df['Calories'] = df['Calories'].replace('kg', 1)
df['Calories'] = df['Calories'].astype(float)


  df['Calories'] = df['Calories'].replace('kg', 1)


In [281]:
df.loc[df['Units_cleaned'] == 'portion', 'Calories Per Portion'] = df['Calories'] / df['Amount']
df

Unnamed: 0,Name,Meal,Meal_cleaned,Brand,Calories,Fat,Carbs,Protein,Amount,Units,Cals Per Unit,Units_cleaned,Calories Per Portion
0,sbarro - spinach stromboli,spinach stromboli,spinach stromboli,sbarro,900.0,41.0,96.0,33,376.00,g,3.0,g,
1,sbarro - caesar salad,caesar salad,caesar salad,sbarro,80.0,5.0,6.0,2,8.00,oz,10.0,oz,
2,"papa johns - large, pepperoni and bacon, norma...","large, pepperoni and bacon, normal crust","large , pepperoni bacon , normal crust",papa johns,700.0,32.0,74.0,28,2.00,slice,350.0,portion,350.0
3,papa johns - cheese sticks per web site,cheese sticks per web site,cheese stick per web site,papa johns,185.0,8.0,21.0,7,2.00,sticks,93.0,portion,92.5
4,papa johns - garlic dipping cup,garlic dipping cup,garlic dipping cup,papa johns,75.0,9.0,0.0,0,0.50,cup,150.0,cup,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
558821,"generic - green pepper, raw 100 g","green pepper, raw 100 g","green pepper , raw 100 g",generic,5.0,0.0,1.0,0,25.00,g,1.0,g,
558822,"generic - onions, sweet, raw 100 g usda data","onions, sweet, raw 100 g usda data","onion , sweet , raw 100 g usda data",generic,8.0,0.0,2.0,0,25.00,g,1.0,g,
558823,publix - tomato ketchup - large bottle,tomato ketchup - large bottle,tomato ketchup - large bottle,publix,30.0,0.0,8.0,0,2.00,tbsp,15.0,tbsp,
558824,louana - all natural pure coconut oil,all natural pure coconut oil,natural pure coconut oil,louana,40.0,5.0,0.0,0,0.33,tbsp,122.0,tbsp,


## Calculate calories per gram

In [284]:
#df[df['Units_cleaned'] != 'portion']
print(df['Units_cleaned'].unique())

to_gram_factors = {
    'g': 1,
    'oz': 28.3495,
    'cup': 236.588,
    'tbsp': 14.7868,
    'fl oz': 29.5735,
    'tsp': 4.92892,
    'kg': 1000,
    'mg': 0.001,
    'lb': 453.592,
    'l': 1000,
    'dl': 100,
    'cl': 10,
    'ml': 1,
}

['g' 'oz' 'portion' 'cup' 'tbsp' 'fl oz' 'ml' 'tsp' 'kg' 'lb' 'mg' 'l'
 'cl' 'dl']


In [287]:
df['Grams'] = df['Amount'] * df['Units_cleaned'].map(to_gram_factors)
df['Calories Per Gram'] = df['Calories'] / df['Grams']
df

Unnamed: 0,Name,Meal,Meal_cleaned,Brand,Calories,Fat,Carbs,Protein,Amount,Units,Cals Per Unit,Units_cleaned,Calories Per Portion,Grams,Calories Per Gram
0,sbarro - spinach stromboli,spinach stromboli,spinach stromboli,sbarro,900.0,41.0,96.0,33,376.00,g,3.0,g,,376.000000,2.393617
1,sbarro - caesar salad,caesar salad,caesar salad,sbarro,80.0,5.0,6.0,2,8.00,oz,10.0,oz,,226.796000,0.352740
2,"papa johns - large, pepperoni and bacon, norma...","large, pepperoni and bacon, normal crust","large , pepperoni bacon , normal crust",papa johns,700.0,32.0,74.0,28,2.00,slice,350.0,portion,350.0,,
3,papa johns - cheese sticks per web site,cheese sticks per web site,cheese stick per web site,papa johns,185.0,8.0,21.0,7,2.00,sticks,93.0,portion,92.5,,
4,papa johns - garlic dipping cup,garlic dipping cup,garlic dipping cup,papa johns,75.0,9.0,0.0,0,0.50,cup,150.0,cup,,118.294000,0.634014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558821,"generic - green pepper, raw 100 g","green pepper, raw 100 g","green pepper , raw 100 g",generic,5.0,0.0,1.0,0,25.00,g,1.0,g,,25.000000,0.200000
558822,"generic - onions, sweet, raw 100 g usda data","onions, sweet, raw 100 g usda data","onion , sweet , raw 100 g usda data",generic,8.0,0.0,2.0,0,25.00,g,1.0,g,,25.000000,0.320000
558823,publix - tomato ketchup - large bottle,tomato ketchup - large bottle,tomato ketchup - large bottle,publix,30.0,0.0,8.0,0,2.00,tbsp,15.0,tbsp,,29.573600,1.014418
558824,louana - all natural pure coconut oil,all natural pure coconut oil,natural pure coconut oil,louana,40.0,5.0,0.0,0,0.33,tbsp,122.0,tbsp,,4.879644,8.197319


## Save back to CSV

In [291]:
df = df[['Name', 'Meal_cleaned', 'Brand', 'Calories', 'Fat', 'Carbs', 'Protein', 'Amount', 'Units', 'Units_cleaned', 'Calories Per Portion', 'Grams', 'Calories Per Gram']]
df.columns = ['original_name', 'name', 'brand', 'calories', 'fats', 'carbohydrates', 'proteins', 'amount', 'original_unit', 'unit', 'calories_per_portion', 'grams_per_portion', 'calories_per_gram']
df

Unnamed: 0,original_name,name,brand,calories,fats,carbophydrates,proteins,amount,original_unit,unit,calories_per_portion,grams_per_portion,calories_per_gram
0,sbarro - spinach stromboli,spinach stromboli,sbarro,900.0,41.0,96.0,33,376.00,g,g,,376.000000,2.393617
1,sbarro - caesar salad,caesar salad,sbarro,80.0,5.0,6.0,2,8.00,oz,oz,,226.796000,0.352740
2,"papa johns - large, pepperoni and bacon, norma...","large , pepperoni bacon , normal crust",papa johns,700.0,32.0,74.0,28,2.00,slice,portion,350.0,,
3,papa johns - cheese sticks per web site,cheese stick per web site,papa johns,185.0,8.0,21.0,7,2.00,sticks,portion,92.5,,
4,papa johns - garlic dipping cup,garlic dipping cup,papa johns,75.0,9.0,0.0,0,0.50,cup,cup,,118.294000,0.634014
...,...,...,...,...,...,...,...,...,...,...,...,...,...
558821,"generic - green pepper, raw 100 g","green pepper , raw 100 g",generic,5.0,0.0,1.0,0,25.00,g,g,,25.000000,0.200000
558822,"generic - onions, sweet, raw 100 g usda data","onion , sweet , raw 100 g usda data",generic,8.0,0.0,2.0,0,25.00,g,g,,25.000000,0.320000
558823,publix - tomato ketchup - large bottle,tomato ketchup - large bottle,publix,30.0,0.0,8.0,0,2.00,tbsp,tbsp,,29.573600,1.014418
558824,louana - all natural pure coconut oil,natural pure coconut oil,louana,40.0,5.0,0.0,0,0.33,tbsp,tbsp,,4.879644,8.197319


In [292]:
df.to_csv('../data/nutrition_dataset.csv', index=False)