In [15]:
import numpy as np
import pandas as pd
import re
pd.options.mode.chained_assignment = None 

from lib.helper import diets, ingredients, new_label_array

We'll read in the raw scraped .josn data from Hugo Darwood's kaggle dataset as a pandas DataFrame. The goal will be to train a model that can predict whether or not a recipe can adhere to a specific dietary restriction based on its ingredient makeup.

In [16]:
full_recipes = pd.read_json('db/full_format_recipes.json')

We are going to extract the raw ingredients, which will serve as our predictors, and the diet tags from the 'categories' column.

In [17]:
full_recipes.sample(3)

Unnamed: 0,calories,categories,date,desc,directions,fat,ingredients,protein,rating,sodium,title
3431,,"[Vegetable, Side, Bake, Quick & Easy, Spice, H...",2004-08-20 04:00:00,,[Preheat oven to 400°F. Toss all ingredients i...,,"[4 pounds butternut squash, peeled, seeded, cu...",,3.75,,Spiced Butternut Squash
5287,83.0,"[Salad, Tomato, Side, Picnic, Vegetarian, Quic...",2009-08-03 04:00:00,,[Whisk oil and vinegar in medium bowl. Season ...,7.0,"[2 tablespoons extra-virgin olive oil, 1 table...",1.0,3.125,11.0,Heirloom Tomato Salad
2157,180.0,"[Herb, Tomato, Vegetable, Appetizer, Vegetaria...",2004-08-20 04:00:00,,[Quarter medium tomatoes and chop celery. In a...,11.0,[2 pounds medium vine-ripened tomatoes (about ...,5.0,0.0,479.0,"""Virgin Mary"" Aspic"


There are a few duplicate entries and missing entries, most likely since the data was scraped in batches.  We'll drop these.

In [18]:
full_recipes.duplicated('title', keep='first').sum()

2354

In [19]:
full_recipes.drop_duplicates('title', keep='first', inplace=True)

In [20]:
full_recipes.dropna(axis=0, inplace=True)

In [29]:
recipes = full_recipes[['title', 'categories', 'desc']]

Now we will start to clean the category tags in preparation of vectorizing them for the model.  The first transformation will be to make all letters lowercase.  The second will be to remove hypens and other unnecessary punctuations.

In [30]:
recipes['categories'] = recipes['categories'].apply(lambda x: [i.lower() for i in x])

In [31]:
recipes['categories'] = recipes['categories'].apply(lambda x: re.sub(r"[\'\[\]]|\bname\b", '', str(x)))

In [32]:
recipes['categories'] = recipes['categories'].apply(lambda x: re.sub("[^\w]", " ", x).split())

Now we are going to create new columns that extract the ingredient tags and the diet tags as defined by the recipe authors.  The diet tag will be our target labels and the ingredient tags will serve as our predictors along with the recipe title and descriptions.  

In [34]:
recipes['ingredients'] = recipes['categories'].apply(lambda x: new_label_array(x, ingredients))

In [79]:
recipes['ingredients'] = recipes['ingredients'].apply(lambda x: ' '.join(x))

In [33]:
recipes['diets'] = recipes['categories'].apply(lambda x: new_label_array(x, diets))

In [35]:
recipes.drop('categories', axis=1, inplace=True)

In [80]:
recipes.sample(5)

Unnamed: 0,title,desc,diets,ingredients
3989,"Salmon with Arugula, Tomato and Caper Sauce",Start with grilled country bread spread with o...,[],fish tomato arugula
9981,Veal Chops with Tomato-Orange-Basil Sauce,The delicious sauce is also very nice with chi...,[],veal
3405,"Chocolate-Chunk Cookies with Pecans, Dried Apr...",This recipe can be prepared in 45 minutes or l...,"[vegetarian, pescatarian, soy, kosher]",chocolate fruit nut fruit apricot cherry pecan...
18821,Chicken Tagine with Apricots and Spiced Pine N...,"""There is no typical tagine of Algeria—the cou...",[soy],chicken fruit ginger orange apricot nut spice ...
5384,"Soba with Pea Shoots, Shiitake Mushrooms, and ...",Can be prepared in 45 minutes or less.,"[vegetarian, vegan, pescatarian, dairy, kosher]",mushroom vegetable leek noodle peanut nut


In [42]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

In [53]:
hv = HashingVectorizer(stop_words='english')

In [68]:
X = hv.fit_transform(recipes['desc'])

In [44]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(recipes['diets'])

In [74]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

In [69]:
kf = KFold(n_splits=5, random_state=42)

In [71]:
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [72]:
clf = OneVsRestClassifier(SVC(kernel='linear'))
clf.fit(X_train, y_train)

OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=1)

In [73]:
clf.score(X_test, y_test)

0.46281433921883358

In a multilabel multiclass situation like this, the scoring method is a harsh method because the sample much be correct prediction of the exact combination of each labels.  While 46% is not terrible, let's see what happens to the model's accuracy when we add in the title and ingredient vectors.

In [81]:
recipes['combined_corpus'] = recipes[['title', 'desc', 'ingredients']].apply(lambda x: ' '.join(x), axis=1)

In [83]:
X = hv.fit_transform(recipes['combined_corpus'])

In [84]:
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [85]:
clf = OneVsRestClassifier(SVC(kernel='linear'))
clf.fit(X_train, y_train)

OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=1)

In [86]:
clf.score(X_test, y_test)

0.55644729802033177