# Set up environment

In [1]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

import pyspark

number_cores = int(os.environ['NUM_CPUS'])
memory_gb = int(os.environ['AVAILABLE_MEMORY_MB']) // 1024
conf = (
    pyspark.SparkConf()
        .setMaster('local[{}]'.format(number_cores))
        .set('spark.driver.memory', '{}g'.format(memory_gb))
)
sc = pyspark.SparkContext(conf=conf)

In [2]:
print(sc)

<SparkContext master=local[4] appName=pyspark-shell>


In [3]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

# Load dataset

In [4]:
# Load the dataset
indian = sqlContext.read.csv("/project/Project/DataEngineeringGroupAO/Recipe_dataset/data_indian.csv",header=True)
italian = sqlContext.read.csv("/project/Project/DataEngineeringGroupAO/Recipe_dataset/data_italian.csv",header=True)
mexican = sqlContext.read.csv("/project/Project/DataEngineeringGroupAO/Recipe_dataset/data_mexican.csv",header=True)

In [5]:
# Label the data
from pyspark.sql.functions import lit

indian = indian.withColumn("label",lit("indian"))
italian = italian.withColumn("label",lit("italian"))
mexican = mexican.withColumn("label",lit("mexican"))

In [6]:
# Combine 3 dataset into one

from functools import reduce
from pyspark.sql import DataFrame

def unionAll(dfs):
    return reduce(DataFrame.unionAll, dfs)

dfs = [indian, italian, mexican]
recipe = unionAll(dfs)
recipe.show()

+--------------------+--------------------+------+
|               Title|         Description| label|
+--------------------+--------------------+------+
|  Indian Peanut Stew|This is an easy, ...|indian|
|        Roomali Roti|There is no leave...|indian|
|Spicy Sweet Potat...|It's important to...|indian|
|        Chicken Saag|The classic India...|indian|
|Paleo Slow Cooker...|Boneless pork loi...|indian|
|Bombay Chicken an...|Chicken parts are...|indian|
|Indian Carrots, P...|Potatoes, peas an...|indian|
|Wendy's Indian Bu...|This recipe resem...|indian|
|    Indian Chickpeas|Garbanzo beans, o...|indian|
|Dal Makhani (Indi...|These richly spic...|indian|
|               Raita|Chopped tomatoes ...|indian|
|Yogurt-Marinated ...|A yogurt-based ma...|indian|
|Indian-Spiced Roa...|Spicy roasted chi...|indian|
|Cauliflower and T...|Pressed tofu cube...|indian|
|Channa Masala (Ch...|This fantastic In...|indian|
|Bengali Chicken C...|Thy this deliciou...|indian|
|  Indian Sweet Bread|A crisp a

In [7]:
recipe.count()

1500

In [8]:
# Convert it to RDD
recipe_rdd = recipe.rdd

In [9]:
recipe_rdd.take(5)

[Row(Title='Indian Peanut Stew', Description='This is an easy, authentic dish from South Asia that appeals to a wide range of tastes. The…', label='indian'),
 Row(Title='Roomali Roti', Description='There is no leavening in this simple, tender Indian flatbread of bread flour, oil, salt and…', label='indian'),
 Row(Title='Spicy Sweet Potato Salad', Description="It's important to use good mayonnaise in this recipe, and to let the cooked potatoes chill…", label='indian'),
 Row(Title='Chicken Saag', Description='The classic Indian chicken and spinach dish gets richness from sour cream.', label='indian'),
 Row(Title='Paleo Slow Cooker Pork Loin', Description='Boneless pork loin slowly cooks in a curried fruit sauce until tender and delicious.', label='indian')]

# Data Cleaning

In [10]:
# import all packages needed for data cleaning

from pyspark.sql.functions import udf, regexp_replace, lower, col
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from nltk.stem.snowball import SnowballStemmer
from pyspark.sql.types import IntegerType

In [11]:
# Lowercase

recipe = recipe.select(*[lower(col(col_name)).name(col_name) for col_name in recipe.columns])
recipe.show()

+--------------------+--------------------+------+
|               Title|         Description| label|
+--------------------+--------------------+------+
|  indian peanut stew|this is an easy, ...|indian|
|        roomali roti|there is no leave...|indian|
|spicy sweet potat...|it's important to...|indian|
|        chicken saag|the classic india...|indian|
|paleo slow cooker...|boneless pork loi...|indian|
|bombay chicken an...|chicken parts are...|indian|
|indian carrots, p...|potatoes, peas an...|indian|
|wendy's indian bu...|this recipe resem...|indian|
|    indian chickpeas|garbanzo beans, o...|indian|
|dal makhani (indi...|these richly spic...|indian|
|               raita|chopped tomatoes ...|indian|
|yogurt-marinated ...|a yogurt-based ma...|indian|
|indian-spiced roa...|spicy roasted chi...|indian|
|cauliflower and t...|pressed tofu cube...|indian|
|channa masala (ch...|this fantastic in...|indian|
|bengali chicken c...|thy this deliciou...|indian|
|  indian sweet bread|a crisp a

In [12]:
# Remove punctuation and digits

recipe_clean = recipe.select(regexp_replace('Title', "[^a-zA-Z\\s]", "").alias('title'), 
    (regexp_replace('Description', "[^a-zA-Z\\s]", "").alias('des')),'label')

In [13]:
recipe_clean.show()

+--------------------+--------------------+------+
|               title|                 des| label|
+--------------------+--------------------+------+
|  indian peanut stew|this is an easy a...|indian|
|        roomali roti|there is no leave...|indian|
|spicy sweet potat...|its important to ...|indian|
|        chicken saag|the classic india...|indian|
|paleo slow cooker...|boneless pork loi...|indian|
|bombay chicken an...|chicken parts are...|indian|
|indian carrots pe...|potatoes peas and...|indian|
|wendys indian but...|this recipe resem...|indian|
|    indian chickpeas|garbanzo beans on...|indian|
|dal makhani india...|these richly spic...|indian|
|               raita|chopped tomatoes ...|indian|
|yogurtmarinated s...|a yogurtbased mar...|indian|
|indianspiced roas...|spicy roasted chi...|indian|
|cauliflower and t...|pressed tofu cube...|indian|
|channa masala chi...|this fantastic in...|indian|
|bengali chicken c...|thy this deliciou...|indian|
|  indian sweet bread|a crisp a

In [14]:
# Remove Stopwords

# Tokenize text
tokenizer = Tokenizer(inputCol="des", outputCol="des_token")
recipe = tokenizer.transform(recipe_clean).select('title','des','des_token','label')
# tokenized.select("Description", "Des_words")\
    #.withColumn("tokens", countTokens(col("Des_words"))).show(truncate=False)

# Remove stopwords
remover = StopWordsRemover(inputCol='des_token', outputCol='des_clean')
recipe_no_stopw = remover.transform(recipe).select('title','des_clean', 'label')
recipe_no_lists = recipe_no_stopw
recipe_no_stopw.show()

+--------------------+--------------------+------+
|               title|           des_clean| label|
+--------------------+--------------------+------+
|  indian peanut stew|[easy, authentic,...|indian|
|        roomali roti|[leavening, simpl...|indian|
|spicy sweet potat...|[important, use, ...|indian|
|        chicken saag|[classic, indian,...|indian|
|paleo slow cooker...|[boneless, pork, ...|indian|
|bombay chicken an...|[chicken, parts, ...|indian|
|indian carrots pe...|[potatoes, peas, ...|indian|
|wendys indian but...|[recipe, resemble...|indian|
|    indian chickpeas|[garbanzo, beans,...|indian|
|dal makhani india...|[richly, spiced, ...|indian|
|               raita|[chopped, tomatoe...|indian|
|yogurtmarinated s...|[yogurtbased, mar...|indian|
|indianspiced roas...|[spicy, roasted, ...|indian|
|cauliflower and t...|[pressed, tofu, c...|indian|
|channa masala chi...|[fantastic, india...|indian|
|bengali chicken c...|[thy, delicious, ...|indian|
|  indian sweet bread|[crisp, s

In [15]:
recipe = recipe_no_stopw

# Pattern Exploration

In [16]:
# Filter out different recipes
# Create temp table
recipe.createOrReplaceTempView('recipes')

recipe_ind = sqlContext.sql("SELECT * FROM recipes WHERE label == 'indian'")
recipe_ita = sqlContext.sql("SELECT * FROM recipes WHERE label == 'italian'")
recipe_mex = sqlContext.sql("SELECT * FROM recipes WHERE label == 'mexican'")
# print((recipe_ind.count(), len(recipe_ind.columns)))
# print((recipe_ita.count(), len(recipe_ita.columns)))
# print((recipe_mex.count(), len(recipe_mex.columns)))

In [17]:
# We asssume that we do not know labels for the majority of data points, 
# hence further explore only test split
recipe_ind_tr, recipe_ind_ts, recipe_ind_dv = recipe_ind.randomSplit([0.8,0.2,0.1],seed = 11)
recipe_ita_tr, recipe_ita_ts, recipe_ita_dv = recipe_ita.randomSplit([0.7,0.2,0.1],seed = 11)
recipe_mex_tr, recipe_mex_ts, recipe_mex_dv = recipe_mex.randomSplit([0.7,0.2,0.1],seed = 11)

We left aside dev/val split as 10% of each of the datasets and 20% as test split to calculate accuracies of LFs. Bigger test set of "gold" is valuable to get more matches of LFs and see overall performance. 

In [18]:
# Create frequency list
import pyspark.sql.functions as f

top_n = 15

ind_counts = recipe_ind_dv.select(f.explode('des_clean').alias('col')).groupBy('col').count()
ind_des_freq = ind_counts.orderBy(ind_counts["count"].desc()).limit(top_n)

ita_counts = recipe_ita_dv.select(f.explode('des_clean').alias('col')).groupBy('col').count()
ita_des_freq = ita_counts.orderBy(ita_counts["count"].desc()).limit(top_n)

mex_counts = recipe_mex_dv.select(f.explode('des_clean').alias('col')).groupBy('col').count()
mex_des_freq = mex_counts.orderBy(mex_counts["count"].desc()).limit(top_n)

In [19]:
# View then in one dataframe
from pyspark.sql.functions import monotonically_increasing_id 

df1 = ind_des_freq.withColumn("row_id", monotonically_increasing_id())
df2 = ita_des_freq.withColumn("row_id", monotonically_increasing_id())
df3 = mex_des_freq.withColumn("row_id", monotonically_increasing_id())

des_freq = df1.join(df2,("row_id")).join(df3,("row_id")).drop("row_id")
des_freq.show()

+-----------+-----+---------+-----+---------+-----+
|        col|count|      col|count|      col|count|
+-----------+-----+---------+-----+---------+-----+
|     indian|   12|   cheese|    8|  chicken|   12|
|      curry|   12|  chicken|    8|  mexican|   11|
|    chicken|    8|    sauce|    7|     beef|    9|
|     yogurt|    5|     easy|    7|     make|    9|
|      sweet|    5|  italian|    6|tortillas|    8|
|       rice|    5|   recipe|    5|     rice|    8|
|       dish|    4|delicious|    5| tomatoes|    7|
|cauliflower|    4|   tomato|    4|   filled|    7|
| vegetarian|    4|    using|    3|   recipe|    7|
|     spiced|    4|   creamy|    3|    spicy|    7|
|       make|    4| parmesan|    3|    beans|    7|
|      cumin|    4|    basil|    3|    flour|    6|
|      quick|    4|   flavor|    3|     dish|    6|
|    mixture|    4|  breasts|    3|    great|    6|
|      sauce|    3|   simple|    3|    sauce|    6|
+-----------+-----+---------+-----+---------+-----+



In [21]:
from pyspark.sql.functions import concat_ws

# train split full / remove lists
df_tr = recipe_ind_tr.union(recipe_ita_tr)
df_tr = df_tr.union(recipe_mex_tr).withColumn("des_clean", concat_ws(" ", "des_clean"))

# test split 
df_ts = recipe_ind_ts.union(recipe_ita_ts)
df_ts = df_ts.union(recipe_mex_ts).withColumn("des_clean", concat_ws(" ", "des_clean"))
# dev/val split
df_dv = recipe_ind_dv.union(recipe_ita_dv)
df_dv = df_dv.union(recipe_mex_dv).withColumn("des_clean", concat_ws(" ", "des_clean"))

In [22]:
# drop train split for labelling
df_tr.drop('label').first()

Row(title='ada adai', des_clean='try crepelike items indianstyle breakfast made lentils rice')

In [23]:
mapping = {'indian':0, 'italian':1, 'mexican':2}

In [24]:
from pyspark.sql.functions import col, create_map, lit
from itertools import chain

mapping_func = create_map([lit(x) for x in chain(*mapping.items())])

df_dv = df_dv.withColumn("num_label", mapping_func.getItem(col("label")))
# df_dv = df_dv.select('title','des_clean', 'num_label')
df_dv = df_dv.select('title','des_clean', 'num_label')
df_dv.show()

+--------------------+--------------------+---------+
|               title|           des_clean|num_label|
+--------------------+--------------------+---------+
|anapakaya paala k...|recipe calabash s...|        0|
|basic indian curr...|wonderful indian ...|        0|
|bombay chicken an...|chicken parts bru...|        0|
|channa masala chi...|fantastic indian ...|        0|
|       chicken korma|prepare flavorful...|        0|
|cucumbercilantro ...|quick tasty india...|        0|
|curried mushroom ...|steaming curriedm...|        0|
|curried pork chop...|lean pork chops t...|        0|
|curried stew with...|lamb marinated yo...|        0|
|dairyfree caulifl...|battered fried ca...|        0|
| easy veggie samosas|quick vegetarian ...|        0|
|grilled lamb chop...|grilled lamb chop...|        0|
|indian chicken ko...|indian chicken ko...|        0|
|indian masala chi...|marinate chicken ...|        0|
|   indian pork chops|great weeknight d...|        0|
|indian vegetable ...|indian

In [25]:
print((df_dv.count(), len(df_dv.columns)))

(137, 3)


In [27]:
import numpy as np

# create column with actual values 
Y_dv = df_dv.select('num_label').rdd.flatMap(lambda x: x).collect()
Y_dv = np.array(Y_dv)
# import numpy as np
# Y_dv = np.array(df_dv.select('num_label').collect())

np.shape(Y_dv)

(137,)

In [28]:
# For clarity, we define constants to represent the class labels and abstaining.
ABSTAIN = -1
INDIAN = 0
ITALIAN = 1
MEXICAN = 2

## Keywords LFs

In [29]:
! pip install snorkel
from snorkel.labeling.apply.spark import SparkLFApplier
from snorkel.labeling import LFAnalysis
from snorkel.labeling import labeling_function
import re

Collecting snorkel
  Downloading snorkel-0.9.3-py3-none-any.whl (139 kB)
[K     |████████████████████████████████| 139 kB 13.0 MB/s eta 0:00:01
[?25hCollecting torch<1.2.0,>=1.1.0
  Downloading torch-1.1.0-cp36-cp36m-manylinux1_x86_64.whl (676.9 MB)
[K     |████████████████████████████████| 676.9 MB 2.5 kB/s s eta 0:00:01     |██████████                      | 211.1 MB 71.7 MB/s eta 0:00:07
Collecting networkx<2.4,>=2.2
  Downloading networkx-2.3.zip (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 86.3 MB/s eta 0:00:01
[?25hCollecting munkres==1.1.2
  Downloading munkres-1.1.2-py2.py3-none-any.whl (6.8 kB)
Collecting pandas<0.26.0,>=0.25.0
  Downloading pandas-0.25.3-cp36-cp36m-manylinux1_x86_64.whl (10.4 MB)
[K     |████████████████████████████████| 10.4 MB 16.5 MB/s eta 0:00:01
[?25hCollecting tqdm<5.0.0,>=4.33.0
  Downloading tqdm-4.43.0-py2.py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 257 kB/s  eta 0:00:01
Collecting scikit-learn<0.22.0

### Indian LFs

In [30]:
ind_keywords = ['curry','indian','masala','paneer','chutney','curried',
                'simmered','cumin','yogurt','coconut']

@labeling_function()
def indian_keywords(x):
        if any(word in x.title for word in ind_keywords):
            return INDIAN
        else:
            return ABSTAIN

In [39]:
# Word Combo curry + meat
@labeling_function()
def currymeat(x):
    return INDIAN if re.search(r"(?=.*curry)(?=.*(chicken|lamb|beef))", x.des_clean, flags=re.I) else ABSTAIN

In [40]:
# cooking process + food name
@labeling_function()
def cook_food(x):
    return INDIAN if re.search(r"(?=.*(quick|easy))(?=.*(rice|sauce|potatoes))", x.des_clean, flags=re.I) else ABSTAIN

In [70]:
# Word Combo Sweet + Spicy

@labeling_function()
def sweet_spicy(x):
    return INDIAN if re.search(r"(?=.*sweet)(?=.*(spicy))", x.des_clean, flags=re.I) else ABSTAIN

In [71]:
# Word Combo Slow + Cook

@labeling_function()
def slow_cook(x):
    return INDIAN if re.search(r"(?=.*slow)(?=.*(cook))", x.des_clean, flags=re.I) else ABSTAIN

### Italian LFs

In [77]:
ita_keywords = ['pasta','mozzarella', 'lasagna','pesto','dente', 'pizza']

@labeling_function()
def italian_keywords(x):
        if any(word in x.title for word in ita_keywords):
            return ITALIAN
        else:
            return ABSTAIN

In [75]:
@labeling_function()
def pasta_with(x):
    return ITALIAN if re.search(r"(?=.*pasta)(?=.*(chicken|lamb|beef|pesto|creamy|shrimps|cheese))", x.des_clean, flags=re.I) else ABSTAIN

In [84]:
@labeling_function()
def sundried_tomatoes(x):
    return ITALIAN if re.search(r"(?=.*tomatoes)(?=.*(sun-dried|sundried))", x.des_clean, flags=re.I) else ABSTAIN

In [160]:
ita_regions = ['tuscan','sicilian', 'romano', 'romaine', 'mediterranean','meditterranean' ]

@labeling_function()
def ita_regions(x):
        if any(word in x.title for word in ita_keywords):
            return ITALIAN
        else:
            return ABSTAIN

In [116]:
# #Importing dataset of first names 

# known_names = open("/project/Project/DataEngineeringGroupAO/Data_for_LF/first_names.all.txt", "r")
# known_names = list(known_names)
# known_names = [x.replace('\n', '') for x in known_names]
# known_names[50:55]

['aadison', 'aadit', 'aadith', 'aadithya', 'aaditiya']

In [152]:
# known_names = ', '.join(known_names)

In [155]:
# @labeling_function()
# def chef_name(x):
#     return ITALIAN if re.search(r"(?=.*chef)(?=.*(John))", x.des_clean, flags=re.I) else ABSTAIN

### Mexican LFs

In [165]:
mex_keywords = ['chicken','beef','cheese','corn','beans','salsa',
                'spicy','tortillas','rice','rice']

@labeling_function()
def mexican_keywords(x):
        if any(word in x.title for word in mex_keywords):
            return MEXICAN
        else:
            return ABSTAIN

In [166]:
# Word Combo beef + cheese
@labeling_function()
def beefcheese(x):
    return MEXICAN if re.search(r"(?=.*beef)(?=.*(chicken|lamb))", x.des_clean, flags=re.I) else ABSTAIN

In [167]:
# cooking process + food name
@labeling_function()
def cook_food_mexican(x):
    return MEXICAN if re.search(r"(?=.*(quick|easy))(?=.*(rice|sauce|potatoes))", x.des_clean, flags=re.I) else ABSTAIN

In [172]:
# canned + food
@labeling_function()
def canned_food(x):
    return MEXICAN if re.search(r"(?=.*(canned))(?=.*(chillies|soups|soup|sauce))", x.des_clean, flags=re.I) else ABSTAIN

In [177]:
# cooking process + food name
@labeling_function()
def chipotle(x):
    return MEXICAN if re.search(r"(?=.*(chipotle))(?=.*(chicken|shrimp|chillies|peppers|sauce))", x.des_clean, flags=re.I) else ABSTAIN

In [123]:
## Functions Aggregated

In [178]:
df_tr_rdd = df_tr.rdd
df_dv_rdd = df_dv.rdd

lfs = [indian_keywords, currymeat, cook_food, sweet_spicy, slow_cook, 
       italian_keywords, pasta_with, sundried_tomatoes, chef_name, 
       ita_regions, mexican_keywords, beefcheese, cook_food_mexican, canned_food, chipotle]

applier = SparkLFApplier(lfs=lfs)
L_train = applier.apply(df_tr_rdd)
L_dev = applier.apply(df_dv_rdd)

In [179]:
np.shape(L_dev)

(137, 15)

In [180]:
from snorkel.labeling import LFAnalysis

LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
indian_keywords,0,[0],0.169991,0.069326,0.062678
currymeat,1,[0],0.02849,0.023742,0.018044
cook_food,2,[0],0.025641,0.025641,0.025641
sweet_spicy,3,[0],0.003799,0.001899,0.001899
slow_cook,4,[0],0.020893,0.013295,0.011396
italian_keywords,5,[1],0.08452,0.08452,0.012346
pasta_with,6,[],0.0,0.0,0.0
sundried_tomatoes,7,[1],0.004748,0.002849,0.00095
chef_name,8,[1],0.006648,0.001899,0.001899
ita_regions,9,[1],0.08452,0.08452,0.012346


In [181]:
LFAnalysis(L_dev, lfs=lfs).lf_summary(Y_dv)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
indian_keywords,0,[0],0.138686,0.058394,0.043796,19,0,1.0
currymeat,1,[0],0.043796,0.043796,0.029197,6,0,1.0
cook_food,2,[0],0.021898,0.021898,0.021898,0,3,0.0
sweet_spicy,3,[0],0.007299,0.007299,0.007299,0,1,0.0
slow_cook,4,[0],0.007299,0.007299,0.007299,0,1,0.0
italian_keywords,5,[1],0.080292,0.080292,0.021898,9,2,0.818182
pasta_with,6,[],0.0,0.0,0.0,0,0,0.0
sundried_tomatoes,7,[1],0.007299,0.0,0.0,1,0,1.0
chef_name,8,[],0.0,0.0,0.0,0,0,0.0
ita_regions,9,[1],0.080292,0.080292,0.021898,9,2,0.818182
