In [38]:
import pandas as pd
import json

Here we open the json data and normalize the data such that each row corresponds to one ingredient.

The dataset has already been parsed by a CRF model to turn the freeform recipe ingredients into structured data (name, unit, quantity).

In [39]:
with open('enriched_recipes.json') as f:
    data = json.load(f)
    df = pd.json_normalize(
        data, 'parsed_ingredients',
        meta=['author',
              'photo_url',
              'prep_time_minutes',
              'rating_stars',
              'review_count',
              'title',
              'total_time_minutes',
              'url'
             ],
        record_prefix='ingredient_', errors='ignore')

The output data looks like this:

In [40]:
df

Unnamed: 0,ingredient_comment,ingredient_input,ingredient_name,ingredient_other,ingredient_qty,ingredient_unit,ingredient_range_end,author,photo_url,prep_time_minutes,rating_stars,review_count,title,total_time_minutes,url
0,chilled and cubed,"1/2 cup unsalted butter, chilled and cubed",unsalted butter,",",1/2,cup,,Stephanie,http://images.media-allrecipes.com/userphotos/...,55,4.32,46,"Basil, Roasted Peppers and Monterey Jack Cornb...",100,http://allrecipes.com/Recipe/6664/
1,chopped,1 cup chopped onion,onion,,1,cup,,Stephanie,http://images.media-allrecipes.com/userphotos/...,55,4.32,46,"Basil, Roasted Peppers and Monterey Jack Cornb...",100,http://allrecipes.com/Recipe/6664/
2,,1 3/4 cups cornmeal,cornmeal,,1 3/4,cup,,Stephanie,http://images.media-allrecipes.com/userphotos/...,55,4.32,46,"Basil, Roasted Peppers and Monterey Jack Cornb...",100,http://allrecipes.com/Recipe/6664/
3,,1 1/4 cups all-purpose flour,all-purpose flour,,1 1/4,cup,,Stephanie,http://images.media-allrecipes.com/userphotos/...,55,4.32,46,"Basil, Roasted Peppers and Monterey Jack Cornb...",100,http://allrecipes.com/Recipe/6664/
4,,1/4 cup white sugar,white sugar,,1/4,cup,,Stephanie,http://images.media-allrecipes.com/userphotos/...,55,4.32,46,"Basil, Roasted Peppers and Monterey Jack Cornb...",100,http://allrecipes.com/Recipe/6664/
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
836531,,2 tablespoons brown sugar,brown sugar,,2,tablespoon,,larkspur,http://images.media-allrecipes.com/global/reci...,32,0.0,0,Blueberry Oatmeal Cream Brulee,44,http://allrecipes.com/Recipe/258791/
836532,,2 tablespoons maple syrup,maple syrup,,2,tablespoon,,larkspur,http://images.media-allrecipes.com/global/reci...,32,0.0,0,Blueberry Oatmeal Cream Brulee,44,http://allrecipes.com/Recipe/258791/
836533,,2 tablespoons butter,butter,,2,tablespoon,,larkspur,http://images.media-allrecipes.com/global/reci...,32,0.0,0,Blueberry Oatmeal Cream Brulee,44,http://allrecipes.com/Recipe/258791/
836534,,1 cup blueberries,blueberries,,1,cup,,larkspur,http://images.media-allrecipes.com/global/reci...,32,0.0,0,Blueberry Oatmeal Cream Brulee,44,http://allrecipes.com/Recipe/258791/


Filter the data where recipe ratings are valid (more than 20 reviews and more than 0 stars).

Then, group the ingredients by ingredient name, and average each ingredient's numerical rating.

In [41]:
grouped = df.where(df['rating_stars'] > 0).where(df['review_count'] > 20).groupby("ingredient_name").agg(avg_rating=("rating_stars", "mean")).reset_index()

In [42]:
grouped

Unnamed: 0,ingredient_name,avg_rating
0,(.13 ounce) envelope unsweetened grape soft dr...,4.400000
1,(.13 ounce) envelope unsweetened orange soft d...,4.400000
2,(.14 ounce) package sugar free instant lemonad...,4.490000
3,(.18 ounce) packet sazon seasoning,4.520000
4,(.25 ounce) envelope active dry yeast,4.471250
...,...,...
9292,zucchini chunks,4.560000
9293,zucchini matchsticks,4.540000
9294,zucchini squash,4.454000
9295,zucchini squashes,4.260000


In [74]:
spices = [
    'allspice',
    'angelica',
    'anise',
    'asafoetida',
    'bay leaf',
    'basil',
    'bergamot',
    'black cumin',
    'black mustard',
    'black pepper',
    'borage',
    'brown mustard',
    'burnet',
    'caraway',
    'cardamom',
    'cassia',
    'catnip',
    'cayenne pepper',
    'celery seed',
    'chervil',
    'chicory',
    'chili pepper',
    'chives',
    'cicely',
    'cilantro',
    'cinnamon',
    'clove',
    'coriander',
    'costmary',
    'cumin',
    'curry',
    'dill',
    'fennel',
    'fenugreek',
    'filé',
    'ginger',
    'grains of paradise',
    'holy basil',
    'horehound',
    'horseradish',
    'hyssop',
    'lavender',
    'lemon balm',
    'lemon grass',
    'lemon verbena',
    'licorice',
    'lovage',
    'mace',
    'marjoram',
    'nutmeg',
    'oregano',
    'paprika',
    'parsley',
    'peppermint',
    'poppy seed',
    'rosemary',
    'rue',
    'saffron',
    'sage',
    'savory',
    'sesame',
    'sorrel',
    'star anise',
    'spearmint',
    'tarragon',
    'thyme',
    'tumeric',
    'vanilla',
    'wasabi',
    'white mustard'
]

In [95]:
spices_regex = r"|".join(spices)
df['ingredient_name'].value_counts()[:110]

salt                 33137
butter               24385
water                19700
all-purpose flour    18161
garlic               18063
                     ...  
almond extract        1141
green onion           1130
cooking spray         1129
ham                   1127
oil                   1127
Name: ingredient_name, Length: 110, dtype: int64

In [112]:
recipes_with_spices = df.loc[df['ingredient_name'].str.contains(spices_regex).fillna(False)]

replace_dict = {}

for spice in spices:
    replace_dict[f".*{spice}.*"] = spice

recipes_with_spices['spice'] = recipes_with_spices['ingredient_name'].replace(replace_dict, regex=True)

recipes_with_spices['spice'].value_counts()[:20]

13               basil
15        black pepper
33            cinnamon
34              nutmeg
89            cinnamon
              ...     
836514      peppermint
836515      peppermint
836519         vanilla
836524        cinnamon
836525        cardamom
Name: ingredient_name, Length: 104539, dtype: object