# Recipe Labelling

## Set up for the work

In [None]:
! pip install snorkel

In [None]:
! pip install spacy

In [None]:
!python -m spacy download en_core_web_sm

In [1]:
! pip install nltk



In [1]:
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re
import string
from nltk.stem import PorterStemmer 

In [2]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/faculty/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/faculty/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/faculty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/faculty/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
# Display full output rather than just the last line of output
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [12]:
import pandas as pd

PATH = 'https://raw.githubusercontent.com/uceikow/DataEngineeringGroupAO/master/Recipe_dataset/'

indian = pd.read_json(PATH + "data_indian.json")
indian['label'] = 'indian'
italian = pd.read_json(PATH + "data_italian.json")
italian['label'] = 'italian'
mexican = pd.read_json(PATH + "data_mexican.json")
mexican['label'] = 'mexican'
new_recipe = pd.read_csv(PATH + "final_scrape_not_cleaned.csv")

In [6]:
# Length of each dataset
len(indian)
len(mexican)
len(italian)
len(new_recipe)

480

620

400

2340

In [13]:
# Concat them into one dataset
recipe = pd.concat([indian, italian, mexican, new_recipe],ignore_index = True)
recipe

Unnamed: 0,Title,Description,label
0,Indian Peanut Stew,"This is an easy, authentic dish from South Asi...",indian
1,Roomali Roti,"There is no leavening in this simple, tender I...",indian
2,Spicy Sweet Potato Salad,It's important to use good mayonnaise in this ...,indian
3,Chicken Saag,The classic Indian chicken and spinach dish ge...,indian
4,Paleo Slow Cooker Pork Loin,Boneless pork loin slowly cooks in a curried f...,indian
...,...,...,...
3835,Sicilian aubergine and rigatoni pasta bake,A quick way to make a pasta bake with rigatoni...,italian
3836,Meaty spinach lasagne with béchamel sauce (Las...,An especially lovely and hearty lasagne The go...,italian
3837,Italian Stuffed Chicken,This roast chicken dish has the perfect combin...,italian
3838,Sicilian Almond Biscuits,These crunchy and delicious biscuits are inspi...,italian


## Clean dataset

In [14]:
# Clean the dataset
# Lowercase
recipe = recipe.apply(lambda row: row.str.lower())

# Remove digits
recipe['Title'] = recipe.apply((lambda row: ''.join([i for i in row['Title'] if not i.isdigit()])),axis = 1)
recipe['Description'] = recipe.apply((lambda row: ''.join([i for i in row['Description'] if not i.isdigit()])),axis = 1)

# Remove punctuations
recipe['Title'] = recipe.apply((lambda row: ''.join([i for i in row['Title'] if i not in string.punctuation])),axis=1)
recipe['Description'] = recipe.apply((lambda row: ''.join([i for i in row['Description'] if i not in string.punctuation])),axis=1)

# Remove Stopwords
stop = stopwords.words('english')
recipe['Title'] = recipe['Title'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
recipe['Description'] = recipe['Description'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [15]:
recipe.head()

Unnamed: 0,Title,Description,label
0,indian peanut stew,easy authentic dish south asia appeals wide ra...,indian
1,roomali roti,leavening simple tender indian flatbread bread...,indian
2,spicy sweet potato salad,important use good mayonnaise recipe let cooke...,indian
3,chicken saag,classic indian chicken spinach dish gets richn...,indian
4,paleo slow cooker pork loin,boneless pork loin slowly cooks curried fruit ...,indian


In [16]:
# Stemming
ps = PorterStemmer()
recipe['Title'] = recipe['Title'].apply(lambda x: ' '.join([ps.stem(word) for word in x.split()]))
recipe['Description'] = recipe['Description'].apply(lambda x: ' '.join([ps.stem(word) for word in x.split()]))

In [17]:
recipe.head()

Unnamed: 0,Title,Description,label
0,indian peanut stew,easi authent dish south asia appeal wide rang ...,indian
1,roomali roti,leaven simpl tender indian flatbread bread flo...,indian
2,spici sweet potato salad,import use good mayonnais recip let cook potat...,indian
3,chicken saag,classic indian chicken spinach dish get rich s...,indian
4,paleo slow cooker pork loin,boneless pork loin slowli cook curri fruit sau...,indian


## Pattern Exploration

Before splitting the dataset and writing labelling function,  we might want to first get an idea of how our targetting labels look like. This gives us some basic information of how to start building the labelling function.

In [18]:
# Patterns from different recipes

# Filter out different recipes
recipe_ind = recipe[recipe['label'] == 'indian']
recipe_ita = recipe[recipe['label'] == 'italian']
recipe_mex = recipe[recipe['label'] == 'mexican']

# Word frequency in 'Title'
top_N = 20

title1 = recipe_ind.Title.str.cat(sep=' ')
words_in_title1 = nltk.tokenize.word_tokenize(title1)
word_dist_title1 = nltk.FreqDist(words_in_title1)

title2 = recipe_ita.Title.str.cat(sep=' ')
words_in_title2 = nltk.tokenize.word_tokenize(title2)
word_dist_title2 = nltk.FreqDist(words_in_title2)

title3 = recipe_mex.Title.str.cat(sep=' ')
words_in_title3 = nltk.tokenize.word_tokenize(title3)
word_dist_title3 = nltk.FreqDist(words_in_title3)

ind_freq = pd.DataFrame(word_dist_title1.most_common(top_N),
                    columns=['Indian', 'Frequency'])
ita_freq = pd.DataFrame(word_dist_title2.most_common(top_N),
                    columns=['Italian', 'Frequency'])
mex_freq = pd.DataFrame(word_dist_title3.most_common(top_N),
                    columns=['Mexican', 'Frequency'])

title_freq = pd.concat([ind_freq,ita_freq,mex_freq],axis = 1)
title_freq

Unnamed: 0,Indian,Frequency,Italian,Frequency.1,Mexican,Frequency.2
0,curri,365,italian,178,mexican,230
1,chicken,303,pasta,145,chicken,208
2,indian,190,sauc,141,enchilada,118
3,rice,76,tomato,134,taco,108
4,masala,75,chicken,126,bean,99
5,spici,74,spaghetti,68,salsa,98
6,lamb,73,bake,58,soup,87
7,potato,64,risotto,55,chilli,79
8,easi,61,pizza,54,beef,66
9,paneer,50,bread,50,tortilla,62


It is easier to find patterns and differences of recipes if we display titles of three recipes together. The same goes for description.

In [19]:
# Word frequency in 'Description'
top_N =20

des1 = recipe_ind.Description.str.cat(sep=' ')
words_in_des1 = nltk.tokenize.word_tokenize(des1)
word_dist_des1 = nltk.FreqDist(words_in_des1)

des2 = recipe_ita.Description.str.cat(sep=' ')
words_in_des2 = nltk.tokenize.word_tokenize(des2)
word_dist_des2 = nltk.FreqDist(words_in_des2)

des3 = recipe_mex.Description.str.cat(sep=' ')
words_in_des3 = nltk.tokenize.word_tokenize(des3)
word_dist_des3 = nltk.FreqDist(words_in_des3)

d1_freq = pd.DataFrame(word_dist_des1.most_common(top_N),
                    columns=['Ind_description', 'Frequency'])
d2_freq = pd.DataFrame(word_dist_des2.most_common(top_N),
                    columns=['Ita_description', 'Frequency'])
d3_freq = pd.DataFrame(word_dist_des3.most_common(top_N),
                    columns=['Mex_description', 'Frequency'])

des_freq = pd.concat([d1_freq,d2_freq,d3_freq],axis=1)
des_freq

Unnamed: 0,Ind_description,Frequency,Ita_description,Frequency.1,Mex_description,Frequency.2
0,curri,555,pasta,320,chicken,267
1,indian,441,italian,317,mexican,264
2,rice,325,sauc,306,tortilla,252
3,serv,312,make,293,make,231
4,dish,305,tomato,272,serv,222
5,recip,303,serv,268,bean,193
6,chicken,278,dish,259,recip,189
7,make,275,recip,253,tomato,182
8,spice,245,delici,227,chilli,165
9,use,206,chees,221,chees,163


From this point, we see that there are something more to do with data cleaning. 

- Get rid of those " ..."
- Are Verbs, Conjuctions and prepositions important in our case, if not, we can clear them, only leaving Nouns in our dataset.
- Stemming words (*curry* and *curried* should be of the same thing)

**Re-clean the dataset**

The dataset is scraped from an online recipe website. We only captured part of the description, and some of them contains "…" if the sentence does not ended when capped.

In [20]:
# Remove "..." from description
recipe['Description'] = recipe.Description.apply(lambda x: re.sub(r'…','',x))

Part-pf-speech is used to remove verbs, prepositions, conjunctions from the text.

In [21]:
# Remove verb, prepositions, conjunctions from the text

# Part of speech
recipe['tokens'] = recipe.Description.apply(lambda x: nltk.word_tokenize(x))
recipe['tagged'] = recipe.tokens.apply(lambda x: nltk.pos_tag(x))

In [22]:
# Define remove function
def re_pos(tuple_list):
    for i in tuple_list:
        if i[1] in ['VB','VBD','VBG','VBN','VBP','VBZ','CC','IN']:
            tuple_list.remove(i)
    return tuple_list

In [23]:
# Remove unwanted words and change tokens
recipe['tagged'] = recipe.tagged.apply(lambda x: re_pos(x))
recipe['tokens'] = recipe.tagged.apply(lambda x: [i[0] for i in x])
recipe['Description'] = recipe.tokens.apply(lambda x: ' '.join(x))

In [19]:
recipe.head()

Unnamed: 0,Title,Description,label,tokens,tagged
0,indian peanut stew,easy authentic dish south asia appeals wide ra...,indian,"[easy, authentic, dish, south, asia, appeals, ...","[(easy, JJ), (authentic, JJ), (dish, NN), (sou..."
1,roomali roti,simple tender indian flatbread flour oil salt,indian,"[simple, tender, indian, flatbread, flour, oil...","[(simple, JJ), (tender, NN), (indian, JJ), (fl..."
2,spicy sweet potato salad,important use good mayonnaise recipe let potatoes,indian,"[important, use, good, mayonnaise, recipe, let...","[(important, JJ), (use, NN), (good, JJ), (mayo..."
3,chicken saag,classic indian chicken spinach dish richness s...,indian,"[classic, indian, chicken, spinach, dish, rich...","[(classic, JJ), (indian, JJ), (chicken, NN), (..."
4,paleo slow cooker pork loin,boneless pork slowly cooks fruit sauce tender ...,indian,"[boneless, pork, slowly, cooks, fruit, sauce, ...","[(boneless, NN), (pork, NN), (slowly, RB), (co..."


After re-cleaning, let's see the change of patterns.

In [24]:
# Patterns after re-cleaning
recipe_ind = recipe[recipe['label'] == 'indian']
recipe_ita = recipe[recipe['label'] == 'italian']
recipe_mex = recipe[recipe['label'] == 'mexican']

top_N =30

des1 = recipe_ind.Description.str.cat(sep=' ')
words_in_des1 = nltk.tokenize.word_tokenize(des1)
word_dist_des1 = nltk.FreqDist(words_in_des1)

des2 = recipe_ita.Description.str.cat(sep=' ')
words_in_des2 = nltk.tokenize.word_tokenize(des2)
word_dist_des2 = nltk.FreqDist(words_in_des2)

des3 = recipe_mex.Description.str.cat(sep=' ')
words_in_des3 = nltk.tokenize.word_tokenize(des3)
word_dist_des3 = nltk.FreqDist(words_in_des3)

d1_freq = pd.DataFrame(word_dist_des1.most_common(top_N),
                    columns=['Ind_description', 'Frequency'])
d2_freq = pd.DataFrame(word_dist_des2.most_common(top_N),
                    columns=['Ita_description', 'Frequency'])
d3_freq = pd.DataFrame(word_dist_des3.most_common(top_N),
                    columns=['Mex_description', 'Frequency'])

des_freq = pd.concat([d1_freq,d2_freq,d3_freq],axis=1)
des_freq

Unnamed: 0,Ind_description,Frequency,Ita_description,Frequency.1,Mex_description,Frequency.2
0,curri,542,italian,322,mexican,272
1,indian,451,pasta,321,tortilla,250
2,rice,325,tomato,266,chicken,214
3,dish,303,sauc,265,serv,200
4,serv,288,dish,258,bean,191
5,recip,280,serv,244,tomato,179
6,spice,246,recip,237,recip,176
7,chicken,232,delici,218,chilli,161
8,use,185,chees,207,dish,160
9,delici,162,easi,186,salsa,156


<div class="alert alert-success">

As you might have noticed, indian food and mexican food share some similarities, such as spicy-related words, sauce-related words, rice, etc. This is somewhere that we need to keep an eye on.
</b>


**Ideas of building Labelling functions:**

- Single word (specific ones), such as curry, masala, paneer and chutney for indian repice. These words can label one type of recipe quite well because of their specialty (they will not appear in other recipes). Except for words included in top 15 frequency list, they must be other special words, which might need go through the whole dataset to find.

- Word combos, such as **curry + chicken = indian** :) You might find that chicken are used a lot in both indian and mexican recipe, while a way to label them might be find a **word combos** (function = special word + main ingrediant).

- Unique wordlist difference! Check if any of word are unique for that kind of recipe but not in other recipes.

In [108]:
# Copy the orginal dataset for further pattern exploration
recipe_ex = recipe.copy()
recipe = recipe.drop(columns = ['tokens','tagged'])

## Split the dataset

As being discussed in group meeting, we split the dataset into training, validation, development and test datasets.

If we do multi-labelling, we need to make sure that all datasets above contains same proportion of the 3 recipes. I decided to have 30% labelled data, in which 10% for dev set, 10% for validation set, and the remaining 10% for test set. We left 70% data to training set.

In [109]:
# Split the dataset
# Use ShuffleStratifiedSplit to ensure same proportion of each dataset
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.3, random_state=0)

# Get different labelled data
ind = recipe[recipe['label'] == 'indian']
ind.reset_index(drop=True,inplace=True)
ita = recipe[recipe['label'] == 'italian']
ita.reset_index(drop=True,inplace=True)
mex = recipe[recipe['label'] == 'mexican']
mex.reset_index(drop=True,inplace=True)

# Split function (leave 70% for training)
def shuffle_split(df,sss):
  X = df[['Title','Description']]
  y = df['label']
  for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    return X_train, X_test, y_train, y_test

ind_X_train, ind_X_test, ind_y_train, ind_y_test = shuffle_split(ind,sss)
ita_X_train, ita_X_test, ita_y_train, ita_y_test = shuffle_split(ita,sss)
mex_X_train, mex_X_test, mex_y_train, mex_y_test = shuffle_split(mex,sss)

In [110]:
print('indian: ',len(ind_X_train),len(ind_X_test))
print('italian: ',len(ita_X_train),len(ita_X_test))
print('mexican: ',len(mex_X_train),len(mex_X_test))

indian:  336 144
italian:  280 120
mexican:  434 186


In [111]:
# Combine training and test dataset
X_train = pd.concat([ind_X_train,ita_X_train,mex_X_train],axis=0)
y_train = pd.concat([ind_y_train,ita_y_train,mex_y_train],axis=0)
X_test =  pd.concat([ind_X_test,ita_X_test,mex_X_test],axis=0)
y_test =  pd.concat([ind_y_test,ita_y_test,mex_y_test],axis=0)

In [112]:
# Combine training dataset
train = pd.concat([X_train,y_train],axis=1)

In [113]:
# Combine the test dataset for next splitting
test = pd.concat([X_test,y_test],axis=1)
test = test.reset_index(drop=True)

Split development and validation dataset from test dataset.

In [114]:
# From randomly sampled test set get dev set and validation set.

ind_val, ind_dev = test[:48], test[48:96]
ita_val, ita_dev = test[144:184], test[184:224]
mex_val, mex_dev = test[264:326], test[326:388]
ind_test, ita_test, mex_test = test[96:144],test[224:264],test[388:450]

In [115]:
# Combine val, dev and test set

val = pd.concat([ind_val,ita_val,mex_val],axis=0)
dev = pd.concat([ind_dev,ita_dev,mex_dev],axis=0)
test_n = pd.concat([ind_test,ita_test,mex_test],axis=0)

As we split the dataset by different countries, we need to shuffle them before training.

In [116]:
from sklearn.utils import shuffle
train = shuffle(train, random_state = 42)
test = shuffle(test_n, random_state = 42)
val = shuffle(val, random_state = 42)
dev = shuffle(dev, random_state = 42)

To apply LFAnalysis, we need to change labels to number.

In [117]:
# Change labels to number
def label_to_num(df):
    df.label = df.label.apply(lambda x: 0 if x == 'indian' else(1 if x == 'italian' else 2))
    return df
    
test = label_to_num(test)
val = label_to_num(val)
dev = label_to_num(dev)

In [118]:
# Prepare for later training
df_train = train.iloc[:,:2]
df_val = val.iloc[:,:2]
df_dev = dev.iloc[:,:2]
Y_val = val.iloc[:,-1].values
Y_dev = dev.iloc[:,-1].values

# Labelling functions

In [119]:
from snorkel.labeling import labeling_function

In [120]:
# For clarity, we define constants to represent the class labels and abstaining.
ABSTAIN = -1
INDIAN = 0
ITALIAN = 1
MEXICAN = 2

## Keywords LFs

In [121]:
ind_keywords = ['curry','indian','masala','paneer','chutney','curried',
                'simmered','cumin','yogurt','coconut']

@labeling_function()
def indian_keywords(x):
        if any(word in x.Title for word in ind_keywords):
            return INDIAN
        else:
            return ABSTAIN

In [122]:
# Unique word checking
import numpy as np

ind_values = np.unique(words_in_des1)
ita_values = np.unique(words_in_des2)
mex_values = np.unique(words_in_des3)

In [123]:
# Check words in indian but not in mexican and italian
word_diff_ind_mex = np.setdiff1d(ind_values,mex_values)
word_diff_ind_ita = np.setdiff1d(ind_values,ita_values)
word_for_ind = np.intersect1d(word_diff_ind_mex,word_diff_ind_ita)

In [124]:
@labeling_function()
def ind_unique_words(x):
        if any(word in x.Description for word in word_for_ind):
            return INDIAN
        else:
            return ABSTAIN

In [125]:
# Word Combo curry + meat
@labeling_function()
def currymeat(x):
    return INDIAN if re.search(r"(?=.*curry)(?=.*(chicken|lamb|beef))", x.Description, flags=re.I) else ABSTAIN

Some words in indian recipe, such as *rice, sauce, etc* might comflict with mexican recipe, however, as country_name + rice / sauce can be a good seperator.

In [126]:
# County name + food name
@labeling_function()
def ind_food(x):
    return INDIAN if re.search(r"(?=.*indian)(?=.*(rice|sauce|potatoes))", x.Description, flags=re.I) else ABSTAIN

In [127]:
from snorkel.labeling import PandasLFApplier

lfs = [indian_keywords, currymeat, ind_food, ind_unique_words]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)
L_dev = applier.apply(df=df_dev)


  0%|          | 0/1050 [00:00<?, ?it/s][A
  3%|▎         | 33/1050 [00:00<00:03, 324.13it/s][A
  6%|▋         | 67/1050 [00:00<00:03, 325.12it/s][A
  9%|▉         | 98/1050 [00:00<00:02, 319.61it/s][A
 13%|█▎        | 135/1050 [00:00<00:02, 332.73it/s][A
 16%|█▌        | 167/1050 [00:00<00:02, 325.92it/s][A
 19%|█▉        | 202/1050 [00:00<00:02, 329.22it/s][A
 22%|██▏       | 235/1050 [00:00<00:02, 328.84it/s][A
 26%|██▌       | 270/1050 [00:00<00:02, 333.82it/s][A
 29%|██▉       | 303/1050 [00:00<00:02, 331.95it/s][A
 32%|███▏      | 335/1050 [00:01<00:02, 325.46it/s][A
 35%|███▍      | 367/1050 [00:01<00:02, 318.12it/s][A
 38%|███▊      | 399/1050 [00:01<00:02, 307.30it/s][A
 41%|████      | 430/1050 [00:01<00:02, 306.33it/s][A
 44%|████▍     | 466/1050 [00:01<00:01, 317.12it/s][A
 47%|████▋     | 498/1050 [00:01<00:01, 311.73it/s][A
 50%|█████     | 530/1050 [00:01<00:01, 301.94it/s][A
 54%|█████▎    | 563/1050 [00:01<00:01, 308.68it/s][A
 57%|█████▋    | 594/10

In [128]:
from snorkel.labeling import LFAnalysis

LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
indian_keywords,0,[0],0.16381,0.152381,0.0
currymeat,1,[0],0.021905,0.021905,0.0
ind_food,2,[0],0.020952,0.020952,0.0
ind_unique_words,3,[0],0.514286,0.174286,0.0


In [129]:
LFAnalysis(L=L_dev, lfs=lfs).lf_summary(Y=Y_dev)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
indian_keywords,0,[0],0.173333,0.146667,0.0,25,1,0.961538
currymeat,1,[0],0.02,0.02,0.0,3,0,1.0
ind_food,2,[0],0.033333,0.033333,0.0,5,0,1.0
ind_unique_words,3,[0],0.473333,0.173333,0.0,44,27,0.619718


There is one misclassified case in the first LF and nearly half misclassified in the third LF. Let's check them.

In [130]:
from snorkel.analysis import get_label_buckets

buckets1 = get_label_buckets(Y_dev, L_dev[:, 0])
df_dev.iloc[buckets1[(MEXICAN, INDIAN)]]

buckets2 = get_label_buckets(Y_dev, L_dev[:, 3])
df_dev.iloc[buckets2[(MEXICAN, INDIAN)]]

Unnamed: 0,Title,Description
333,fish tacos honeycumin cilantro slaw chipotle mayo,whats flavorful tacos stuffed fried tilapia with


Unnamed: 0,Title,Description
379,hot bean dip,bean dip sooooo good easy always make super
348,taco seasoning ii,mixture little cornstarch closely the
370,margaritas rocks,sweet sour tequila triple sec grand marnier sq...
365,mexican turkey burgers,inspired mexican tortillas american burgers de...
369,chicken taco casserole,favorite taco fixings crowdpleasing casserole ...
346,mrs espys enchilada sauce,tomato sauce water seasonings browned flour ch...
380,mexican lasagna lite,light ingredients refried beans enchilada sour
355,carrot chile cilantro soup,delicious soup combines carrots potatoes garli...
331,turkey posole,authentic rural mexican dish usually prepared ...
349,pico de gallo,fresh tomato salsa red onion jalapeno lime jui...


It mixed with mexican recipe.