In [1]:
import numpy as np
import pandas as pd
import re
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.set_option('mode.chained_assignment', None)

We'll read in the raw scraped .json data from Hugo Darwood's Kaggle dataset as a pandas DataFrame. The goal will be to train a model that can predict whether or not a recipe adheres to a specific dietary restriction based on its ingredient makeup.

In [2]:
# Load the scraped recipe file from the local db/ folder into a DataFrame.
full_recipes = pd.read_json(path_or_buf='db/full_format_recipes.json')

We are going to extract the raw ingredients, which will serve as our predictors, and the diet tags from the 'categories' column.

In [3]:
# Peek at three randomly chosen rows (no seed is set, so the rows differ between runs).
full_recipes.sample(n=3)

Unnamed: 0,calories,categories,date,desc,directions,fat,ingredients,protein,rating,sodium,title
15240,233.0,"[Sauce, Ginger, Side, Christmas, Thanksgiving,...",2007-10-01 21:52:26,,"[Combine apple cider vinegar, onion, ginger, l...",0.0,"[2 cups apple cider vinegar, 1 cup finely chop...",1.0,3.75,17.0,"Cranberry, Pear, and Ginger Chutney"
4875,400.0,"[Fruit, Mustard, Sauté, Quick & Easy, Pear, Po...",2004-08-20 04:00:00,Combining smooth and grainy Dijon mustards res...,[Melt butter in heavy large skillet over mediu...,13.0,"[3 tablespoons unsalted butter, 4 firm medium ...",26.0,4.375,139.0,Sauteed Pork Tenderloin and Pears in Mustard-P...
1735,145.0,"[Rum, Alcoholic, Christmas, Thanksgiving, Spic...",2004-08-20 20:58:12,,[In a china or glass mug rinsed with boiling w...,4.0,"[1 teaspoon light brown sugar, 1/4 teaspoon fi...",0.0,3.125,2.0,Hot Spiced Buttered Rum


There are a few duplicate entries and missing entries, most likely since the data was scraped in batches.  We'll drop these.

In [4]:
# Count rows whose title already appeared in an earlier row.
full_recipes.duplicated(subset=['title'], keep='first').sum()

2354

In [5]:
# Keep only the first row for each title; later rows with the same title are removed in place.
full_recipes.drop_duplicates(subset=['title'], keep='first', inplace=True)

In [6]:
# Remove, in place, every row that has at least one missing value.
# NOTE(review): how='any' looks at all columns, so a recipe is dropped even when the
# gap is in a column this analysis may not use — confirm that is intended.
full_recipes.dropna(axis='index', how='any', inplace=True)

In [97]:
# Work on an independent copy of the two columns we need. The cells below assign new
# columns into `recipe_categories`; without .copy() those assignments target a slice of
# `full_recipes`, which is the pattern that triggers pandas' SettingWithCopyWarning.
recipe_categories = full_recipes[['title', 'categories']].copy()

Now we will start to clean the category tags in preparation for vectorizing them for the model.  The first transformation will be to make all letters lowercase.  The second will be to remove hyphens and other unnecessary punctuation.

In [98]:
# Lowercase every tag inside each recipe's list of categories.
recipe_categories['categories'] = recipe_categories['categories'].map(
    lambda tags: [tag.lower() for tag in tags]
)

In [99]:
# Turn each tag list into its string form, then delete the single quotes and square
# brackets of that representation together with any standalone word "name".
recipe_categories['categories'] = recipe_categories['categories'].map(
    lambda tags: re.sub(r"[\'\[\]]|\bname\b", '', str(tags))
)

In [100]:
# Rebuild the list of tags from the flattened string by splitting on the commas that
# separated the original list items, keeping each tag intact.
#
# The previous version replaced every non-word character with a space and split on
# whitespace, which broke multi-word tags into single words: 'dairy free' became
# ['dairy', 'free'] and 'non-alcoholic' became ['non', 'alcoholic']. Multi-word entries
# of the label vocabularies could then never match, and fragments such as 'alcoholic'
# or 'soy' matched with the opposite meaning. Spaces, hyphens and slashes inside a tag
# are therefore kept on purpose, because the label vocabularies contain them.
# NOTE(review): a tag that itself contains a comma would still be split in two.
recipe_categories['categories'] = recipe_categories['categories'].apply(
    # strip(' "') removes padding and any double quotes left over from the list repr
    lambda x: [tag for tag in (part.strip(' "') for part in x.split(',')) if tag]
)

Now we are going to create a new column that extracts the diet tags as defined by the recipe authors.  We'll create a vector of diet labels.

In [93]:
# Diet-related category tags to look for in each recipe's categories.
diets = [
    'alcoholic', 'dairy free', 'dairy', 'fat free', 'healthy',
    'high fiber', 'kid-friendly', 'kidney friendly', 'kosher', 'low cal',
    'low carb', 'low cholesterol', 'low fat', 'low sodium', 'low sugar',
    'low/no sugar', 'no sugar added', 'non-alcoholic', 'organic', 'paleo',
    'peanut free', 'pescatarian', 'soy free', 'soy', 'sugar conscious',
    'tree nut free', 'tree nut', 'vegan', 'vegetarian', 'wheat/gluten-free',
]

In [None]:
# Ingredient tags to look for in each recipe's categories (ten entries per row, in the
# original order).
# NOTE(review): the spellings look like they mirror the dataset's own tags (for example
# 'marscarpone', 'créme de cacao', 'brussel sprout'), so they are deliberately left
# as-is — verify against the raw data before "correcting" any of them.
elemental_ingredients = [
    'almond', 'amaretto', 'anchovy', 'anise', 'apple juice', 'apple', 'apricot', 'artichoke', 'arugula', 'asian pear',
    'asparagus', 'avocado', 'bacon', 'banana', 'barley', 'basil', 'bass', 'beef rib', 'beef shank', 'beef tenderloin',
    'beef', 'beet', 'bell pepper', 'blackberry', 'blue cheese', 'blueberry', 'bok choy', 'bourbon', 'brandy', 'bread',
    'breadcrumbs', 'brie', 'brisket', 'broccoli rabe', 'broccoli', 'brown rice', 'brussel sprout', 'buffalo', 'bulgur', 'burrito',
    'butter', 'buttermilk', 'butternut squash', 'butterscotch/caramel', 'cabbage', 'calvados', 'campari', 'cantaloupe', 'capers', 'caraway',
    'cardamom', 'carrot', 'cashew', 'cauliflower', 'caviar', 'celery', 'chambord', 'champagne', 'chard', 'chartreuse',
    'cheddar', 'cherry', 'chestnut', 'chicken', 'chickpea', 'chile pepper', 'chili', 'chive', 'chocolate', 'cilantro',
    'cinnamon', 'citrus', 'clam', 'clove', 'coconut', 'cod', 'coffee', 'cognac/armagnac', 'collard greens', 'coriander',
    'corn', 'cornmeal', 'cottage cheese', 'couscous', 'crab', 'cranberry sauce', 'cranberry', 'cream cheese', 'créme de cacao', 'crêpe',
    # The first entry of the next row was mojibake (two U+FFFD replacement characters) in
    # the original; restored as the grave-accent spelling — presumably the second
    # variant of this tag in the data; TODO confirm.
    'crème de cacao', 'cucumber', 'cumin', 'currant', 'curry', 'date', 'dill', 'dried fruit', 'duck', 'eau de vie',
    'egg nog', 'egg', 'eggplant', 'endive', 'escarole', 'fennel', 'feta', 'fig', 'fontina', 'frangelico',
    'garlic', 'gin', 'ginger', 'goat cheese', 'goose', 'gouda', 'grand marnier', 'granola', 'grape', 'grapefruit',
    'grappa', 'green bean', 'green onion/scallion', 'ground beef', 'ground lamb', 'guava', 'halibut', 'ham', 'hamburger', 'hazelnut',
    'hominy/cornmeal/masa', 'honey', 'honeydew', 'horseradish', 'hot pepper', 'hummus', 'iced coffee', 'iced tea', 'jalapeño', 'jerusalem artichoke',
    'jícama', 'kahlúa', 'kale', 'kirsch', 'kiwi', 'kumquat', 'lamb chop', 'lamb shank', 'lamb', 'leek',
    'lemon juice', 'lemon', 'lemongrass', 'lentil', 'lettuce', 'lima bean', 'lime juice', 'lime', 'lingonberry', 'lobster',
    'lychee', 'macadamia nut', 'mango', 'maple syrup', 'marsala', 'marscarpone', 'marshmallow', 'mayonnaise', 'mezcal', 'midori',
    'mint', 'molasses', 'monterey jack', 'mozzarella', 'mushroom', 'mussel', 'mustard greens', 'nectarine', 'nutmeg', 'oat',
    'oatmeal', 'octopus', 'okra', 'olive', 'onion', 'orange juice', 'orange', 'oregano', 'orzo', 'oyster',
    'papaya', 'paprika', 'parmesan', 'parsley', 'parsnip', 'passion fruit', 'pea', 'peach', 'peanut butter', 'peanut',
    'pear', 'pecan', 'pepper', 'pernod', 'persimmon', 'pickles', 'pine nut', 'pineapple', 'pistachio', 'plantain',
    'plum', 'poblano', 'pomegranate juice', 'pomegranate', 'poppy', 'pork chop', 'pork rib', 'pork tenderloin', 'pork', 'port',
    'potato', 'prosciutto', 'prune', 'pumpkin', 'quail', 'quince', 'quinoa', 'rabbit', 'rack of lamb', 'radicchio',
    'radish', 'raisin', 'raspberry', 'red wine', 'rhubarb', 'rice', 'ricotta', 'rosemary', 'rosé', 'rum',
    'rutabaga', 'rye', 'saffron', 'sage', 'sake', 'salmon', 'sardine', 'scallop', 'scotch', 'semolina',
    'sesame oil', 'sesame', 'shallot', 'sherry', 'shrimp', 'snapper', 'sorbet', 'sour cream', 'sourdough', 'soy sauce',
    'spinach', 'squash', 'squid', 'strawberry', 'sugar snap pea', 'sweet potato/yam', 'swiss cheese', 'swordfish', 'tamarind', 'tangerine',
    'tapioca', 'tarragon', 'tea', 'tequila', 'thyme', 'tilapia', 'tofu', 'tomatillo', 'tomato', 'triple sec',
    'trout', 'tuna', 'turkey', 'turnip', 'vanilla', 'veal', 'venison', 'vermouth', 'vodka', 'walnut',
    'wasabi', 'watercress', 'watermelon', 'whiskey', 'white wine', 'wild rice', 'yellow squash', 'yogurt', 'yuca', 'zucchini',
]

In [102]:
def new_label_array(x, some_list):
    """Return the items of ``x`` that also appear in ``some_list``.

    Order and duplicates follow ``x``. Membership is tested with ``in`` against
    ``some_list``, so matching is exact (whole item, case-sensitive for strings).

    Parameters
    ----------
    x : iterable
        Candidate items, e.g. the tags of one recipe.
    some_list : container
        Allowed labels to keep.

    Returns
    -------
    list
        A new list with the matching items; empty when nothing matches.

    Raises
    ------
    TypeError
        If ``x`` is not iterable.
    """
    # The earlier bare `except:` simply re-ran this same expression. For a one-shot
    # iterator that failed mid-way, the retry saw an exhausted iterator and quietly
    # returned [], hiding the error; it also trapped KeyboardInterrupt. Errors now
    # propagate to the caller.
    return [item for item in x if item in some_list]
    

In [103]:
# For every recipe, keep only those category entries that are listed in `diets`.
recipe_categories['diets'] = recipe_categories['categories'].map(
    lambda tags: new_label_array(tags, some_list=diets)
)