# Set up environment

In [1]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

import pyspark

number_cores = int(os.environ['NUM_CPUS'])
memory_gb = int(os.environ['AVAILABLE_MEMORY_MB']) // 1024
conf = (
    pyspark.SparkConf()
        .setMaster('local[{}]'.format(number_cores))
        .set('spark.driver.memory', '{}g'.format(memory_gb))
)
sc = pyspark.SparkContext(conf=conf)

In [2]:
print(sc)

<SparkContext master=local[4] appName=pyspark-shell>


In [3]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

# Load dataset

In [4]:
# Load the dataset
indian = sqlContext.read.csv("/project/Project/DataEngineeringGroupAO/Recipe_dataset/data_indian.csv",header=True)
italian = sqlContext.read.csv("/project/Project/DataEngineeringGroupAO/Recipe_dataset/data_italian.csv",header=True)
mexican = sqlContext.read.csv("/project/Project/DataEngineeringGroupAO/Recipe_dataset/data_mexican.csv",header=True)

In [5]:
# Label the data
from pyspark.sql.functions import lit

indian = indian.withColumn("label",lit("indian"))
italian = italian.withColumn("label",lit("italian"))
mexican = mexican.withColumn("label",lit("mexican"))

In [6]:
# Combine 3 dataset into one

from functools import reduce
from pyspark.sql import DataFrame

def unionAll(dfs):
    return reduce(DataFrame.unionAll, dfs)

dfs = [indian, italian, mexican]
recipe = unionAll(dfs)
recipe.show()

+--------------------+--------------------+------+
|               Title|         Description| label|
+--------------------+--------------------+------+
|  Indian Peanut Stew|This is an easy, ...|indian|
|        Roomali Roti|There is no leave...|indian|
|Spicy Sweet Potat...|It's important to...|indian|
|        Chicken Saag|The classic India...|indian|
|Paleo Slow Cooker...|Boneless pork loi...|indian|
|Bombay Chicken an...|Chicken parts are...|indian|
|Indian Carrots, P...|Potatoes, peas an...|indian|
|Wendy's Indian Bu...|This recipe resem...|indian|
|    Indian Chickpeas|Garbanzo beans, o...|indian|
|Dal Makhani (Indi...|These richly spic...|indian|
|               Raita|Chopped tomatoes ...|indian|
|Yogurt-Marinated ...|A yogurt-based ma...|indian|
|Indian-Spiced Roa...|Spicy roasted chi...|indian|
|Cauliflower and T...|Pressed tofu cube...|indian|
|Channa Masala (Ch...|This fantastic In...|indian|
|Bengali Chicken C...|Thy this deliciou...|indian|
|  Indian Sweet Bread|A crisp a

In [7]:
# Check the size of the dataset
recipe.count()

1500

In [9]:
# Convert it to RDD
recipe_rdd = recipe.rdd

In [16]:
recipe_rdd.take(5)

[Row(Title='Indian Peanut Stew', Description='This is an easy, authentic dish from South Asia that appeals to a wide range of tastes. The…', label='indian'),
 Row(Title='Roomali Roti', Description='There is no leavening in this simple, tender Indian flatbread of bread flour, oil, salt and…', label='indian'),
 Row(Title='Spicy Sweet Potato Salad', Description="It's important to use good mayonnaise in this recipe, and to let the cooked potatoes chill…", label='indian'),
 Row(Title='Chicken Saag', Description='The classic Indian chicken and spinach dish gets richness from sour cream.', label='indian'),
 Row(Title='Paleo Slow Cooker Pork Loin', Description='Boneless pork loin slowly cooks in a curried fruit sauce until tender and delicious.', label='indian')]

# Data Cleaning

In [8]:
# import all packages needed for data cleaning

from pyspark.sql.functions import udf, regexp_replace, lower, col
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from nltk.stem.snowball import SnowballStemmer
from pyspark.sql.types import IntegerType

In [9]:
# Lowercase

recipe = recipe.select(*[lower(col(col_name)).name(col_name) for col_name in recipe.columns])
recipe.show()

+--------------------+--------------------+------+
|               Title|         Description| label|
+--------------------+--------------------+------+
|  indian peanut stew|this is an easy, ...|indian|
|        roomali roti|there is no leave...|indian|
|spicy sweet potat...|it's important to...|indian|
|        chicken saag|the classic india...|indian|
|paleo slow cooker...|boneless pork loi...|indian|
|bombay chicken an...|chicken parts are...|indian|
|indian carrots, p...|potatoes, peas an...|indian|
|wendy's indian bu...|this recipe resem...|indian|
|    indian chickpeas|garbanzo beans, o...|indian|
|dal makhani (indi...|these richly spic...|indian|
|               raita|chopped tomatoes ...|indian|
|yogurt-marinated ...|a yogurt-based ma...|indian|
|indian-spiced roa...|spicy roasted chi...|indian|
|cauliflower and t...|pressed tofu cube...|indian|
|channa masala (ch...|this fantastic in...|indian|
|bengali chicken c...|thy this deliciou...|indian|
|  indian sweet bread|a crisp a

In [115]:
# Remove punctuation and digits

recipe_clean = recipe.select(regexp_replace('Title', "[^a-zA-Z\\s]", "").alias('title'), 
    (regexp_replace('Description', "[^a-zA-Z\\s]", "").alias('des')),'label')

In [116]:
recipe_clean.show()

+--------------------+--------------------+------+
|               title|                 des| label|
+--------------------+--------------------+------+
|  indian peanut stew|this is an easy a...|indian|
|        roomali roti|there is no leave...|indian|
|spicy sweet potat...|its important to ...|indian|
|        chicken saag|the classic india...|indian|
|paleo slow cooker...|boneless pork loi...|indian|
|bombay chicken an...|chicken parts are...|indian|
|indian carrots pe...|potatoes peas and...|indian|
|wendys indian but...|this recipe resem...|indian|
|    indian chickpeas|garbanzo beans on...|indian|
|dal makhani india...|these richly spic...|indian|
|               raita|chopped tomatoes ...|indian|
|yogurtmarinated s...|a yogurtbased mar...|indian|
|indianspiced roas...|spicy roasted chi...|indian|
|cauliflower and t...|pressed tofu cube...|indian|
|channa masala chi...|this fantastic in...|indian|
|bengali chicken c...|thy this deliciou...|indian|
|  indian sweet bread|a crisp a

In [117]:
# Remove Stopwords

# Tokenize text
tokenizer = Tokenizer(inputCol="des", outputCol="des_token")
recipe = tokenizer.transform(recipe_clean).select('title','des','des_token','label')
# tokenized.select("Description", "Des_words")\
    #.withColumn("tokens", countTokens(col("Des_words"))).show(truncate=False)

# Remove stopwords
remover = StopWordsRemover(inputCol='des_token', outputCol='des_clean')
recipe_no_stopw = remover.transform(recipe).select('title','des_clean', 'label')
recipe_no_stopw.show()

+--------------------+--------------------+------+
|               title|           des_clean| label|
+--------------------+--------------------+------+
|  indian peanut stew|[easy, authentic,...|indian|
|        roomali roti|[leavening, simpl...|indian|
|spicy sweet potat...|[important, use, ...|indian|
|        chicken saag|[classic, indian,...|indian|
|paleo slow cooker...|[boneless, pork, ...|indian|
|bombay chicken an...|[chicken, parts, ...|indian|
|indian carrots pe...|[potatoes, peas, ...|indian|
|wendys indian but...|[recipe, resemble...|indian|
|    indian chickpeas|[garbanzo, beans,...|indian|
|dal makhani india...|[richly, spiced, ...|indian|
|               raita|[chopped, tomatoe...|indian|
|yogurtmarinated s...|[yogurtbased, mar...|indian|
|indianspiced roas...|[spicy, roasted, ...|indian|
|cauliflower and t...|[pressed, tofu, c...|indian|
|channa masala chi...|[fantastic, india...|indian|
|bengali chicken c...|[thy, delicious, ...|indian|
|  indian sweet bread|[crisp, s

In [118]:
recipe = recipe_no_stopw

# Pattern Exploration

In [120]:
# Filter out different recipes
# Create temp table
recipe.createOrReplaceTempView('recipes')

recipe_ind = sqlContext.sql("SELECT * FROM recipes WHERE label == 'indian'")
recipe_ita = sqlContext.sql("SELECT * FROM recipes WHERE label == 'italian'")
recipe_mex = sqlContext.sql("SELECT * FROM recipes WHERE label == 'mexican'")

In [121]:
# Create frequency list
import pyspark.sql.functions as f

top_n = 15

ind_counts = recipe_ind.select(f.explode('des_clean').alias('col')).groupBy('col').count()
ind_des_freq = ind_counts.orderBy(ind_counts["count"].desc()).limit(top_n)

ita_counts = recipe_ita.select(f.explode('des_clean').alias('col')).groupBy('col').count()
ita_des_freq = ita_counts.orderBy(ita_counts["count"].desc()).limit(top_n)

mex_counts = recipe_mex.select(f.explode('des_clean').alias('col')).groupBy('col').count()
mex_des_freq = mex_counts.orderBy(mex_counts["count"].desc()).limit(top_n)

In [122]:
# View then in one dataframe
from pyspark.sql.functions import monotonically_increasing_id 

df1 = ind_des_freq.withColumn("row_id", monotonically_increasing_id())
df2 = ita_des_freq.withColumn("row_id", monotonically_increasing_id())
df3 = mex_des_freq.withColumn("row_id", monotonically_increasing_id())

des_freq = df1.join(df2,("row_id")).join(df3,("row_id")).drop("row_id")
des_freq.show()

+---------+-----+---------+-----+---------+-----+
|      col|count|      col|count|      col|count|
+---------+-----+---------+-----+---------+-----+
|   indian|  129|  italian|   84|  chicken|  132|
|    curry|  107|    sauce|   69|   cheese|   83|
|     dish|   85|   cheese|   66|     beef|   81|
|  chicken|   83|    pasta|   54|   recipe|   79|
|   recipe|   62|  chicken|   49|  mexican|   77|
|     rice|   54|   recipe|   44|     corn|   72|
|     made|   53|     easy|   42|     make|   69|
|    spicy|   52|delicious|   39|    sauce|   67|
|   spices|   51|   garlic|   37|    beans|   66|
|     easy|   45| tomatoes|   36|    salsa|   63|
|    sauce|   44|     dish|   35|    spicy|   55|
| simmered|   40|   tomato|   35|     easy|   55|
|   yogurt|   38|     make|   32|tortillas|   54|
|     make|   36|    fresh|   32|     rice|   51|
|delicious|   35|    bread|   31|   ground|   49|
+---------+-----+---------+-----+---------+-----+

