# Set up environment

In [1]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

import pyspark

number_cores = int(os.environ['NUM_CPUS'])
memory_gb = int(os.environ['AVAILABLE_MEMORY_MB']) // 1024
conf = (
    pyspark.SparkConf()
        .setMaster('local[{}]'.format(number_cores))
        .set('spark.driver.memory', '{}g'.format(memory_gb))
)
sc = pyspark.SparkContext(conf=conf)

In [2]:
print(sc)

<SparkContext master=local[4] appName=pyspark-shell>


In [3]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

# Load dataset

#### LOAD DATASET FROM GITHUB (TBD)

In [4]:
# Load the dataset
indian = sqlContext.read.csv("/project/Project/DataEngineeringGroupAO/Recipe_dataset/data_indian.csv",header=True)
italian = sqlContext.read.csv("/project/Project/DataEngineeringGroupAO/Recipe_dataset/data_italian.csv",header=True)
mexican = sqlContext.read.csv("/project/Project/DataEngineeringGroupAO/Recipe_dataset/data_mexican.csv",header=True)

In [5]:
# Label the data
from pyspark.sql.functions import lit

indian = indian.withColumn("label",lit("indian"))
italian = italian.withColumn("label",lit("italian"))
mexican = mexican.withColumn("label",lit("mexican"))

In [6]:
# Combine 3 dataset into one

from functools import reduce
from pyspark.sql import DataFrame

def unionAll(dfs):
    return reduce(DataFrame.unionAll, dfs)

dfs = [indian, italian, mexican]
recipe = unionAll(dfs)
recipe.show()

+--------------------+--------------------+------+
|               Title|         Description| label|
+--------------------+--------------------+------+
|  Indian Peanut Stew|This is an easy, ...|indian|
|        Roomali Roti|There is no leave...|indian|
|Spicy Sweet Potat...|It's important to...|indian|
|        Chicken Saag|The classic India...|indian|
|Paleo Slow Cooker...|Boneless pork loi...|indian|
|Bombay Chicken an...|Chicken parts are...|indian|
|Indian Carrots, P...|Potatoes, peas an...|indian|
|Wendy's Indian Bu...|This recipe resem...|indian|
|    Indian Chickpeas|Garbanzo beans, o...|indian|
|Dal Makhani (Indi...|These richly spic...|indian|
|               Raita|Chopped tomatoes ...|indian|
|Yogurt-Marinated ...|A yogurt-based ma...|indian|
|Indian-Spiced Roa...|Spicy roasted chi...|indian|
|Cauliflower and T...|Pressed tofu cube...|indian|
|Channa Masala (Ch...|This fantastic In...|indian|
|Bengali Chicken C...|Thy this deliciou...|indian|
|  Indian Sweet Bread|A crisp a

In [7]:
recipe.count()

1500

In [8]:
# Convert it to RDD
recipe_rdd = recipe.rdd

In [9]:
recipe_rdd.take(5)

[Row(Title='Indian Peanut Stew', Description='This is an easy, authentic dish from South Asia that appeals to a wide range of tastes. The…', label='indian'),
 Row(Title='Roomali Roti', Description='There is no leavening in this simple, tender Indian flatbread of bread flour, oil, salt and…', label='indian'),
 Row(Title='Spicy Sweet Potato Salad', Description="It's important to use good mayonnaise in this recipe, and to let the cooked potatoes chill…", label='indian'),
 Row(Title='Chicken Saag', Description='The classic Indian chicken and spinach dish gets richness from sour cream.', label='indian'),
 Row(Title='Paleo Slow Cooker Pork Loin', Description='Boneless pork loin slowly cooks in a curried fruit sauce until tender and delicious.', label='indian')]

# Data Cleaning

In [10]:
# import all packages needed for data cleaning

from pyspark.sql.functions import udf, regexp_replace, lower, col
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from nltk.stem.snowball import SnowballStemmer
from pyspark.sql.types import IntegerType

In [11]:
# Lowercase

recipe = recipe.select(*[lower(col(col_name)).name(col_name) for col_name in recipe.columns])
recipe.show()

+--------------------+--------------------+------+
|               Title|         Description| label|
+--------------------+--------------------+------+
|  indian peanut stew|this is an easy, ...|indian|
|        roomali roti|there is no leave...|indian|
|spicy sweet potat...|it's important to...|indian|
|        chicken saag|the classic india...|indian|
|paleo slow cooker...|boneless pork loi...|indian|
|bombay chicken an...|chicken parts are...|indian|
|indian carrots, p...|potatoes, peas an...|indian|
|wendy's indian bu...|this recipe resem...|indian|
|    indian chickpeas|garbanzo beans, o...|indian|
|dal makhani (indi...|these richly spic...|indian|
|               raita|chopped tomatoes ...|indian|
|yogurt-marinated ...|a yogurt-based ma...|indian|
|indian-spiced roa...|spicy roasted chi...|indian|
|cauliflower and t...|pressed tofu cube...|indian|
|channa masala (ch...|this fantastic in...|indian|
|bengali chicken c...|thy this deliciou...|indian|
|  indian sweet bread|a crisp a

In [12]:
# Remove punctuation and digits

recipe_clean = recipe.select(regexp_replace('Title', "[^a-zA-Z\\s]", "").alias('title'), 
    (regexp_replace('Description', "[^a-zA-Z\\s]", "").alias('des')),'label')

In [13]:
recipe_clean.show()

+--------------------+--------------------+------+
|               title|                 des| label|
+--------------------+--------------------+------+
|  indian peanut stew|this is an easy a...|indian|
|        roomali roti|there is no leave...|indian|
|spicy sweet potat...|its important to ...|indian|
|        chicken saag|the classic india...|indian|
|paleo slow cooker...|boneless pork loi...|indian|
|bombay chicken an...|chicken parts are...|indian|
|indian carrots pe...|potatoes peas and...|indian|
|wendys indian but...|this recipe resem...|indian|
|    indian chickpeas|garbanzo beans on...|indian|
|dal makhani india...|these richly spic...|indian|
|               raita|chopped tomatoes ...|indian|
|yogurtmarinated s...|a yogurtbased mar...|indian|
|indianspiced roas...|spicy roasted chi...|indian|
|cauliflower and t...|pressed tofu cube...|indian|
|channa masala chi...|this fantastic in...|indian|
|bengali chicken c...|thy this deliciou...|indian|
|  indian sweet bread|a crisp a

In [14]:
# Remove Stopwords

# Tokenize text
tokenizer = Tokenizer(inputCol="des", outputCol="des_token")
recipe = tokenizer.transform(recipe_clean).select('title','des','des_token','label')
# tokenized.select("Description", "Des_words")\
    #.withColumn("tokens", countTokens(col("Des_words"))).show(truncate=False)

# Remove stopwords
remover = StopWordsRemover(inputCol='des_token', outputCol='des_clean')
recipe_no_stopw = remover.transform(recipe).select('title','des_clean', 'label')
recipe_no_lists = recipe_no_stopw
recipe_no_stopw.show()

+--------------------+--------------------+------+
|               title|           des_clean| label|
+--------------------+--------------------+------+
|  indian peanut stew|[easy, authentic,...|indian|
|        roomali roti|[leavening, simpl...|indian|
|spicy sweet potat...|[important, use, ...|indian|
|        chicken saag|[classic, indian,...|indian|
|paleo slow cooker...|[boneless, pork, ...|indian|
|bombay chicken an...|[chicken, parts, ...|indian|
|indian carrots pe...|[potatoes, peas, ...|indian|
|wendys indian but...|[recipe, resemble...|indian|
|    indian chickpeas|[garbanzo, beans,...|indian|
|dal makhani india...|[richly, spiced, ...|indian|
|               raita|[chopped, tomatoe...|indian|
|yogurtmarinated s...|[yogurtbased, mar...|indian|
|indianspiced roas...|[spicy, roasted, ...|indian|
|cauliflower and t...|[pressed, tofu, c...|indian|
|channa masala chi...|[fantastic, india...|indian|
|bengali chicken c...|[thy, delicious, ...|indian|
|  indian sweet bread|[crisp, s

In [15]:
recipe = recipe_no_stopw

# Pattern Exploration

In [40]:
# Filter out different recipes
# Create temp table
recipe.createOrReplaceTempView('recipes')

recipe_ind = sqlContext.sql("SELECT * FROM recipes WHERE label == 'indian'")
recipe_ita = sqlContext.sql("SELECT * FROM recipes WHERE label == 'italian'")
recipe_mex = sqlContext.sql("SELECT * FROM recipes WHERE label == 'mexican'")
# print((recipe_ind.count(), len(recipe_ind.columns)))
# print((recipe_ita.count(), len(recipe_ita.columns)))
# print((recipe_mex.count(), len(recipe_mex.columns)))

In [41]:
# We asssume that we do not know labels for the majority of data points, 
# hence further explore only test split
recipe_ind_tr, recipe_ind_ts, recipe_ind_d,recipe_ind_v  = recipe_ind.randomSplit([0.6,0.2,0.1,0.1],seed = 11)
recipe_ita_tr, recipe_ita_ts, recipe_ita_d, recipe_ita_v = recipe_ita.randomSplit([0.6,0.2,0.1,0.1],seed = 11)
recipe_mex_tr, recipe_mex_ts, recipe_mex_d, recipe_mex_v = recipe_mex.randomSplit([0.6,0.2,0.1,0.1],seed = 11)

We left aside dev/val split as 10% of each of the datasets and 20% as test split to calculate accuracies of LFs. Bigger test set of "gold" is valuable to get more matches of LFs and see overall performance. 

In [42]:
# Create frequency list
import pyspark.sql.functions as f

top_n = 15

ind_counts = recipe_ind_d.select(f.explode('des_clean').alias('col')).groupBy('col').count()
ind_des_freq = ind_counts.orderBy(ind_counts["count"].desc()).limit(top_n)

ita_counts = recipe_ita_d.select(f.explode('des_clean').alias('col')).groupBy('col').count()
ita_des_freq = ita_counts.orderBy(ita_counts["count"].desc()).limit(top_n)

mex_counts = recipe_mex_d.select(f.explode('des_clean').alias('col')).groupBy('col').count()
mex_des_freq = mex_counts.orderBy(mex_counts["count"].desc()).limit(top_n)

In [43]:
# View then in one dataframe
from pyspark.sql.functions import monotonically_increasing_id 

df1 = ind_des_freq.withColumn("row_id", monotonically_increasing_id())
df2 = ita_des_freq.withColumn("row_id", monotonically_increasing_id())
df3 = mex_des_freq.withColumn("row_id", monotonically_increasing_id())

des_freq = df1.join(df2,("row_id")).join(df3,("row_id")).drop("row_id")
des_freq.show()

+---------+-----+--------+-----+---------+-----+
|      col|count|     col|count|      col|count|
+---------+-----+--------+-----+---------+-----+
|    curry|   11| italian|   12|  chicken|   18|
|     dish|   10|  cheese|    8|  mexican|   13|
|   indian|    9|   sauce|    7|    salsa|   10|
|   yogurt|    8|  garlic|    7|     corn|    8|
|    cumin|    8|   pasta|    7|   recipe|    7|
|     rice|    8|   fresh|    5|   cheese|    7|
|   spices|    7|  tomato|    5|    beans|    7|
|  coconut|    6|tomatoes|    5|    green|    6|
|  chicken|    6| lasagna|    5|tortillas|    6|
|   spiced|    5|   great|    5|   cooked|    6|
|   flavor|    5|   bread|    5|    great|    6|
|delicious|    4| chicken|    5|    onion|    6|
|   recipe|    4| sausage|    5|     like|    6|
|   tossed|    4| ricotta|    5| tomatoes|    5|
| simmered|    4|    dish|    4|    spicy|    5|
+---------+-----+--------+-----+---------+-----+



In [54]:
from pyspark.sql.functions import concat_ws

# concat_ws to convert lists to strings for SparkLFApplier compatability
# train split - unlabelled
df_tr = recipe_ind_tr.union(recipe_ita_tr)
df_tr = df_tr.union(recipe_mex_tr).withColumn("des_clean", concat_ws(" ", "des_clean"))

# test split - labelled
df_ts = recipe_ind_ts.union(recipe_ita_ts)
df_ts = df_ts.union(recipe_mex_ts).withColumn("des_clean", concat_ws(" ", "des_clean"))

# dev split - labelled
df_d = recipe_ind_d.union(recipe_ita_d)
df_d = df_d.union(recipe_mex_d).withColumn("des_clean", concat_ws(" ", "des_clean"))

# val split - labelled
df_v = recipe_ind_d.union(recipe_ita_v)
df_v = df_v.union(recipe_mex_v).withColumn("des_clean", concat_ws(" ", "des_clean"))

In [45]:
# drop train label split for further labelling
df_tr.drop('label').first()

Row(title='ada adai', des_clean='try crepelike items indianstyle breakfast made lentils rice')

In [46]:
mapping = {'indian':0, 'italian':1, 'mexican':2}

In [55]:
# get numerical value for corresponding cusine to validate LFs - clear memory to 
from pyspark.sql.functions import col, create_map, lit
from itertools import chain

mapping_func = create_map([lit(x) for x in chain(*mapping.items())])

df_v = df_v.withColumn("num_label", mapping_func.getItem(col("label")))
# df_dv = df_dv.select('title','des_clean', 'num_label')
df_v = df_v.select('title','des_clean', 'num_label')
df_v.show()


+--------------------+--------------------+---------+
|               title|           des_clean|num_label|
+--------------------+--------------------+---------+
|           aloo gobi|traditional india...|        0|
|andreas dal for i...|india dal means n...|        0|
|black pepper goat...|black peppercorns...|        0|
|bombay chicken wings|chicken wings lig...|        0|
|butternut squash ...|butternut squash ...|        0|
|chaat  dahi batat...|fragrant aromatic...|        0|
|chicken roti from...|fragrant homemade...|        0|
|chickpea coconut ...|garbanzo beans ge...|        0|
|    cilantro chutney|green chile peppe...|        0|
|creamy cashew chi...|marinating curry ...|        0|
|cucumber peanut s...|fresh delicious c...|        0|
|     easy curry rice|rice side dish re...|        0|
|four seasons chic...|hot sweet sour fr...|        0|
|fruited tofu curr...|vegetarian tofu s...|        0|
|  goan pork vindaloo|traditional goan ...|        0|
|gobi masala cauli...|small 

In [56]:
# num_label for test split for accuracy - INCLUDE LABEL TO GET REAL GOLD:)
df_ts = df_ts.withColumn("num_label", mapping_func.getItem(col("label")))
df_ts = df_ts.select('title','des_clean', 'num_label')



In [38]:
# print((df_v.count(), len(df_v.columns)))

In [57]:
import numpy as np

# create column with actual values for validation and convert to array since lf_summary() works with arrays
Y_v = df_v.select('num_label').rdd.flatMap(lambda x: x).collect()
Y_v = np.array(Y_v)


In [58]:
Y_test = df_ts.select('num_label').rdd.flatMap(lambda x: x).collect()
Y_test = np.array(Y_test)

In [49]:
# For clarity, we define constants to represent the class labels and abstaining.
ABSTAIN = -1
INDIAN = 0
ITALIAN = 1
MEXICAN = 2

## Keywords LFs

In [27]:
! pip install snorkel
from snorkel.labeling.apply.spark import SparkLFApplier
from snorkel.labeling import LFAnalysis
from snorkel.labeling import labeling_function
import re

Collecting scikit-learn<0.22.0,>=0.20.2
  Using cached scikit_learn-0.21.3-cp36-cp36m-manylinux1_x86_64.whl (6.7 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.22.1


In [28]:
ind_keywords = ['curry','indian','masala','paneer','chutney','curried',
                'simmered','cumin','yogurt','coconut']

@labeling_function()
def indian_keywords(x):
        if any(word in x.title for word in ind_keywords):
            return INDIAN
        else:
            return ABSTAIN

In [29]:
# Word Combo curry + meat
@labeling_function()
def currymeat(x):
    return INDIAN if re.search(r"(?=.*curry)(?=.*(chicken|lamb|beef))", x.des_clean, flags=re.I) else ABSTAIN

In [30]:
# cooking process + food name
@labeling_function()
def cook_food(x):
    return INDIAN if re.search(r"(?=.*(quick|easy))(?=.*(rice|sauce|potatoes))", x.des_clean, flags=re.I) else ABSTAIN

In [31]:
# Word Combo Sweet + Spicy

@labeling_function()
def sweet_spicy(x):
    return INDIAN if re.search(r"(?=.*sweet)(?=.*(spicy))", x.des_clean, flags=re.I) else ABSTAIN

In [32]:
# Word Combo Slow + Cook

@labeling_function()
def slow_cook(x):
    return INDIAN if re.search(r"(?=.*slow)(?=.*(cook))", x.des_clean, flags=re.I) else ABSTAIN

In [33]:
df_tr_rdd = df_tr.rdd
df_dv_rdd = df_dv.rdd

lfs = [indian_keywords, currymeat, cook_food, sweet_spicy, slow_cook]

applier = SparkLFApplier(lfs=lfs)
L_train = applier.apply(df_tr_rdd)
L_dev = applier.apply(df_v_rdd)

In [35]:
from snorkel.labeling import LFAnalysis

LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
indian_keywords,0,[0],0.169991,0.025641,0.0
currymeat,1,[0],0.02849,0.015195,0.0
cook_food,2,[0],0.025641,0.006648,0.0
sweet_spicy,3,[0],0.003799,0.00095,0.0
slow_cook,4,[0],0.020893,0.004748,0.0


In [36]:
LFAnalysis(L_dev, lfs=lfs).lf_summary(Y_dv)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
indian_keywords,0,[0],0.138686,0.029197,0.0,19,0,1.0
currymeat,1,[0],0.043796,0.029197,0.0,6,0,1.0
cook_food,2,[0],0.021898,0.0,0.0,0,3,0.0
sweet_spicy,3,[0],0.007299,0.0,0.0,0,1,0.0
slow_cook,4,[0],0.007299,0.0,0.0,0,1,0.0


<div class="alert alert-block alert-danger">
<b>Explanation needed:</b> EXPLAIN coverage,ovarlaps,conflicts and etc.
</div>

# Label Model

In [50]:
# Compared against Y_test - actual values of L
L_test = applier.apply(df_ts.rdd)

In this section we use LabelModel presented by spark which (as documentation) claims to produce probablity-aware labels to train the further classification model. However, comparison between Label Model and Majority Vote is also provided.

In [59]:
from snorkel.labeling import LabelModel

label_model = LabelModel(cardinality=3, verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=100, seed=11)

label_model_acc = label_model.score(L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

Label Model Accuracy:     41.8%


In [61]:
from snorkel.labeling import MajorityLabelVoter

majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L=L_train)

majority_acc = majority_model.score(L=L_test, Y=Y_test, tie_break_policy="random")[
    "accuracy"
]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

Majority Vote Accuracy:   37.5%


From the above Label Model performs better in labelling test split.