In [86]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [66]:
# Read the raw nutrition table from the data directory (path is relative to
# the notebook's working directory) and preview the first rows.
nutrition_dataset = pd.read_csv('../data/nutrition_dataset.csv')
nutrition_dataset.head()

Unnamed: 0,Name,Meal,Brand,Calories,Fat,Carbs,Protein,Amount,Units,Cals Per Unit
0,sbarro - spinach stromboli,spinach stromboli,sbarro,900,41.0,96.0,33,376.0,g,3.0
1,sbarro - caesar salad,caesar salad,sbarro,80,5.0,6.0,2,8.0,oz,10.0
2,"papa johns - large, pepperoni and bacon, norma...","large, pepperoni and bacon, normal crust",papa johns,700,32.0,74.0,28,2.0,slice,350.0
3,papa johns - cheese sticks per web site,cheese sticks per web site,papa johns,185,8.0,21.0,7,2.0,sticks,93.0
4,papa johns - garlic dipping cup,garlic dipping cup,papa johns,75,9.0,0.0,0,0.5,cup,150.0


In [45]:
# Compare the 'Name' and 'Meal' values of the first row side by side.
nutrition_dataset[['Name', 'Meal']].iloc[0]

Name    sbarro - spinach stromboli
Meal             spinach stromboli
Name: 0, dtype: object

## Drop rows where both Name and Meal are empty

In [67]:
# Remember the row count so the effect of the cleanup can be reported.
before = len(nutrition_dataset)

# Whitespace-only strings are not NaN, so dropna would keep them;
# flag them first and convert them to real missing values.
empty_name = nutrition_dataset['Name'].str.strip().eq('')
empty_meal = nutrition_dataset['Meal'].str.strip().eq('')
nutrition_dataset.loc[empty_meal, 'Meal'] = np.nan
nutrition_dataset.loc[empty_name, 'Name'] = np.nan

# how='all': a row is removed only when BOTH 'Name' and 'Meal' are missing.
nutrition_dataset.dropna(how='all', subset=['Name', 'Meal'], inplace=True)

f"Before: {before}, After: {nutrition_dataset.shape[0]}"

'Before: 558917, After: 558826'

In [69]:
# Work on a separate 'Meal_cleaned' column so the original 'Meal' text stays untouched.
nutrition_dataset['Meal_cleaned'] = nutrition_dataset['Meal'].copy()

## Impute empty Meal names

In [70]:
# Treat whitespace-only meal names as missing. Only the 'Meal_cleaned' cell is
# blanked: assigning through a bare boolean mask (`df[mask] = np.nan`) would
# overwrite EVERY column of the matching rows, including the 'Name' value that
# is needed for the fallback below.
empty_meals = nutrition_dataset['Meal_cleaned'].str.strip() == ''
nutrition_dataset.loc[empty_meals, 'Meal_cleaned'] = np.nan

# Rows that still have no meal name (how many need imputing).
empty_meals = nutrition_dataset['Meal_cleaned'].isna()
print(nutrition_dataset[empty_meals].shape[0])

# Fall back to the full 'Name' for those rows (assignment aligns on the index).
nutrition_dataset.loc[empty_meals, 'Meal_cleaned'] = nutrition_dataset.loc[empty_meals, 'Name']

# Preview the imputed rows; as the cell's last expression it is actually displayed.
nutrition_dataset[empty_meals].head(5)

13246


In [71]:
# Sanity check: after imputation no row should be missing 'Meal_cleaned'.
empty_meals = nutrition_dataset['Meal_cleaned'].isna()
print(len(nutrition_dataset[empty_meals]))

0


## Strip surrounding whitespace and convert to lowercase

In [74]:
# Normalise the text: trim leading/trailing whitespace first, then lowercase.
meal_text = nutrition_dataset['Meal_cleaned'].str.strip()
nutrition_dataset['Meal_cleaned'] = meal_text.str.lower()

## Remove stopwords

In [95]:
# NOTE(review): these calls rely on NLTK corpora/tokenizer data being available
# locally (stopwords, wordnet, punkt) — confirm they are downloaded in this environment.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Trial run on the first five rows only, printing every intermediate stage.
for i, row in nutrition_dataset.head(5).iterrows():
    print(row['Meal_cleaned'])
    # Separate by whitespaces, commas, dots, etc.
    tokens = word_tokenize(row['Meal_cleaned'])
    print(tokens)

    # Remove stop words like "the", "a", "an", etc.
    tokens = [w for w in tokens if w not in stop_words]
    print(tokens)

    # Lemmatize words by e.g. making "burgers" into "burger"
    tokens = [lemmatizer.lemmatize(word, pos="n") for word in tokens]
    print(tokens)

    # Put it all together again.
    # `i` is the row's index LABEL (iterrows yields labels, and labels have gaps
    # after the earlier dropna), so write with the label-based `.at` accessor;
    # `.iloc[i, 'Meal_cleaned']` raises because .iloc only accepts integer positions.
    nutrition_dataset.at[i, 'Meal_cleaned'] = ' '.join(tokens)
    # Read back from the frame: `row` is a copy and would still show the old text.
    print(nutrition_dataset.at[i, 'Meal_cleaned'])


spinach stromboli
['spinach', 'stromboli']
['spinach', 'stromboli']
['spinach', 'stromboli']
spinach stromboli
caesar salad
['caesar', 'salad']
['caesar', 'salad']
['caesar', 'salad']
caesar salad
large, pepperoni and bacon, normal crust
['large', ',', 'pepperoni', 'and', 'bacon', ',', 'normal', 'crust']
['large', ',', 'pepperoni', 'bacon', ',', 'normal', 'crust']
['large', ',', 'pepperoni', 'bacon', ',', 'normal', 'crust']
large , pepperoni bacon , normal crust
cheese sticks  per web site
['cheese', 'sticks', 'per', 'web', 'site']
['cheese', 'sticks', 'per', 'web', 'site']
['cheese', 'stick', 'per', 'web', 'site']
cheese stick per web site
garlic dipping cup
['garlic', 'dipping', 'cup']
['garlic', 'dipping', 'cup']
['garlic', 'dipping', 'cup']
garlic dipping cup


## Lemmatize (reduce words to their base form)