In [1]:
!pip install pyspark
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

import pyspark

number_cores = int(os.environ['NUM_CPUS'])
memory_gb = int(os.environ['AVAILABLE_MEMORY_MB']) // 1024

conf = (
    pyspark.SparkConf()
        .setMaster('local[{}]'.format(number_cores))
        .set('spark.driver.memory', '{}g'.format(memory_gb))
)
sc = pyspark.SparkContext(conf=conf)



In [2]:
print(sc)

<SparkContext master=local[4] appName=pyspark-shell>


In [3]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [4]:
# Load the dataset
indian = sqlContext.read.csv("/project/Project/DataEngineeringGroupAO/Recipe_dataset/data_indian.csv",header=True)
italian = sqlContext.read.csv("/project/Project/DataEngineeringGroupAO/Recipe_dataset/data_italian.csv",header=True)
mexican = sqlContext.read.csv("/project/Project/DataEngineeringGroupAO/Recipe_dataset/data_mexican.csv",header=True)

In [5]:
# Label the data
from pyspark.sql.functions import lit

indian = indian.withColumn("label",lit("indian"))
italian = italian.withColumn("label",lit("italian"))
mexican = mexican.withColumn("label",lit("mexican"))

In [6]:
indian.show()

+--------------------+--------------------+------+
|               Title|         Description| label|
+--------------------+--------------------+------+
|  Indian Peanut Stew|This is an easy, ...|indian|
|        Roomali Roti|There is no leave...|indian|
|Spicy Sweet Potat...|It's important to...|indian|
|        Chicken Saag|The classic India...|indian|
|Paleo Slow Cooker...|Boneless pork loi...|indian|
|Bombay Chicken an...|Chicken parts are...|indian|
|Indian Carrots, P...|Potatoes, peas an...|indian|
|Wendy's Indian Bu...|This recipe resem...|indian|
|    Indian Chickpeas|Garbanzo beans, o...|indian|
|Dal Makhani (Indi...|These richly spic...|indian|
|               Raita|Chopped tomatoes ...|indian|
|Yogurt-Marinated ...|A yogurt-based ma...|indian|
|Indian-Spiced Roa...|Spicy roasted chi...|indian|
|Cauliflower and T...|Pressed tofu cube...|indian|
|Channa Masala (Ch...|This fantastic In...|indian|
|Bengali Chicken C...|Thy this deliciou...|indian|
|  Indian Sweet Bread|A crisp a

In [7]:
# Combine 3 dataset into one

from functools import reduce
from pyspark.sql import DataFrame

def unionAll(dfs):
    return reduce(DataFrame.unionAll, dfs)

dfs = [indian, italian, mexican]
recipe = unionAll(dfs)
recipe.show()

+--------------------+--------------------+------+
|               Title|         Description| label|
+--------------------+--------------------+------+
|  Indian Peanut Stew|This is an easy, ...|indian|
|        Roomali Roti|There is no leave...|indian|
|Spicy Sweet Potat...|It's important to...|indian|
|        Chicken Saag|The classic India...|indian|
|Paleo Slow Cooker...|Boneless pork loi...|indian|
|Bombay Chicken an...|Chicken parts are...|indian|
|Indian Carrots, P...|Potatoes, peas an...|indian|
|Wendy's Indian Bu...|This recipe resem...|indian|
|    Indian Chickpeas|Garbanzo beans, o...|indian|
|Dal Makhani (Indi...|These richly spic...|indian|
|               Raita|Chopped tomatoes ...|indian|
|Yogurt-Marinated ...|A yogurt-based ma...|indian|
|Indian-Spiced Roa...|Spicy roasted chi...|indian|
|Cauliflower and T...|Pressed tofu cube...|indian|
|Channa Masala (Ch...|This fantastic In...|indian|
|Bengali Chicken C...|Thy this deliciou...|indian|
|  Indian Sweet Bread|A crisp a

In [8]:
recipe.count()

1500

In [9]:
# Convert it to RDD
recipe_rdd = recipe.rdd

In [10]:
recipe_rdd.take(5)

[Row(Title='Indian Peanut Stew', Description='This is an easy, authentic dish from South Asia that appeals to a wide range of tastes. The…', label='indian'),
 Row(Title='Roomali Roti', Description='There is no leavening in this simple, tender Indian flatbread of bread flour, oil, salt and…', label='indian'),
 Row(Title='Spicy Sweet Potato Salad', Description="It's important to use good mayonnaise in this recipe, and to let the cooked potatoes chill…", label='indian'),
 Row(Title='Chicken Saag', Description='The classic Indian chicken and spinach dish gets richness from sour cream.', label='indian'),
 Row(Title='Paleo Slow Cooker Pork Loin', Description='Boneless pork loin slowly cooks in a curried fruit sauce until tender and delicious.', label='indian')]

Not sure if we can split data as what I did in pandas, if not, we could get training, test dataset (csv) prepared before loading them in.

# Data Cleaning

In [11]:
from pyspark.sql.functions import lower, col

recipe_rdd_1 = recipe.select(*[lower(col(col_name)).name(col_name) for col_name in recipe.columns])
recipe_rdd_1.show()

+--------------------+--------------------+------+
|               Title|         Description| label|
+--------------------+--------------------+------+
|  indian peanut stew|this is an easy, ...|indian|
|        roomali roti|there is no leave...|indian|
|spicy sweet potat...|it's important to...|indian|
|        chicken saag|the classic india...|indian|
|paleo slow cooker...|boneless pork loi...|indian|
|bombay chicken an...|chicken parts are...|indian|
|indian carrots, p...|potatoes, peas an...|indian|
|wendy's indian bu...|this recipe resem...|indian|
|    indian chickpeas|garbanzo beans, o...|indian|
|dal makhani (indi...|these richly spic...|indian|
|               raita|chopped tomatoes ...|indian|
|yogurt-marinated ...|a yogurt-based ma...|indian|
|indian-spiced roa...|spicy roasted chi...|indian|
|cauliflower and t...|pressed tofu cube...|indian|
|channa masala (ch...|this fantastic in...|indian|
|bengali chicken c...|thy this deliciou...|indian|
|  indian sweet bread|a crisp a

In [12]:
from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="Description", outputCol="Description_1")
recipe_rdd_2 = tokenizer.transform(recipe).select('Title', "Description_1", 'Label')

recipe_rdd_2.show()

+--------------------+--------------------+------+
|               Title|       Description_1| Label|
+--------------------+--------------------+------+
|  Indian Peanut Stew|[this, is, an, ea...|indian|
|        Roomali Roti|[there, is, no, l...|indian|
|Spicy Sweet Potat...|[it's, important,...|indian|
|        Chicken Saag|[the, classic, in...|indian|
|Paleo Slow Cooker...|[boneless, pork, ...|indian|
|Bombay Chicken an...|[chicken, parts, ...|indian|
|Indian Carrots, P...|[potatoes,, peas,...|indian|
|Wendy's Indian Bu...|[this, recipe, re...|indian|
|    Indian Chickpeas|[garbanzo, beans,...|indian|
|Dal Makhani (Indi...|[these, richly, s...|indian|
|               Raita|[chopped, tomatoe...|indian|
|Yogurt-Marinated ...|[a, yogurt-based,...|indian|
|Indian-Spiced Roa...|[spicy, roasted, ...|indian|
|Cauliflower and T...|[pressed, tofu, c...|indian|
|Channa Masala (Ch...|[this, fantastic,...|indian|
|Bengali Chicken C...|[thy, this, delic...|indian|
|  Indian Sweet Bread|[a, crisp

In [13]:
from pyspark.ml.feature import StopWordsRemover

# Define a list of stop words or use default list
remover = StopWordsRemover()
stopwords = remover.getStopWords() 

# Display default list
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your']

In [14]:
# Specify input/output columns
remover.setInputCol("Description_1")
remover.setOutputCol("Description_2")

# Transform existing dataframe with the StopWordsRemover
recipe_rdd_3 = remover.transform(recipe_rdd_2).select('Title', "Description_2", 'Label')

# Display
recipe_rdd_3.show()

+--------------------+--------------------+------+
|               Title|       Description_2| Label|
+--------------------+--------------------+------+
|  Indian Peanut Stew|[easy,, authentic...|indian|
|        Roomali Roti|[leavening, simpl...|indian|
|Spicy Sweet Potat...|[important, use, ...|indian|
|        Chicken Saag|[classic, indian,...|indian|
|Paleo Slow Cooker...|[boneless, pork, ...|indian|
|Bombay Chicken an...|[chicken, parts, ...|indian|
|Indian Carrots, P...|[potatoes,, peas,...|indian|
|Wendy's Indian Bu...|[recipe, resemble...|indian|
|    Indian Chickpeas|[garbanzo, beans,...|indian|
|Dal Makhani (Indi...|[richly, spiced, ...|indian|
|               Raita|[chopped, tomatoe...|indian|
|Yogurt-Marinated ...|[yogurt-based, ma...|indian|
|Indian-Spiced Roa...|[spicy, roasted, ...|indian|
|Cauliflower and T...|[pressed, tofu, c...|indian|
|Channa Masala (Ch...|[fantastic, india...|indian|
|Bengali Chicken C...|[thy, delicious, ...|indian|
|  Indian Sweet Bread|[crisp, s

In [18]:
# Removing digits from RDD

from pyspark.sql.functions import when,udf
from pyspark.sql.types import BooleanType
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.types import StructType

def is_digit(value):
    if value:
        return value.isdigit()
    else:
        return False

is_digit_udf = udf(is_digit, BooleanType())

filter_length_udf = udf(lambda row: [x for x in row if not is_digit(x)], ArrayType(StringType()))
recipe_rdd_4 = recipe_rdd_3.withColumn('Description_2', filter_length_udf(col('Description_2')))
recipe_rdd_4.show()

+--------------------+--------------------+------+
|               Title|       Description_2| Label|
+--------------------+--------------------+------+
|  Indian Peanut Stew|[easy,, authentic...|indian|
|        Roomali Roti|[leavening, simpl...|indian|
|Spicy Sweet Potat...|[important, use, ...|indian|
|        Chicken Saag|[classic, indian,...|indian|
|Paleo Slow Cooker...|[boneless, pork, ...|indian|
|Bombay Chicken an...|[chicken, parts, ...|indian|
|Indian Carrots, P...|[potatoes,, peas,...|indian|
|Wendy's Indian Bu...|[recipe, resemble...|indian|
|    Indian Chickpeas|[garbanzo, beans,...|indian|
|Dal Makhani (Indi...|[richly, spiced, ...|indian|
|               Raita|[chopped, tomatoe...|indian|
|Yogurt-Marinated ...|[yogurt-based, ma...|indian|
|Indian-Spiced Roa...|[spicy, roasted, ...|indian|
|Cauliflower and T...|[pressed, tofu, c...|indian|
|Channa Masala (Ch...|[fantastic, india...|indian|
|Bengali Chicken C...|[thy, delicious, ...|indian|
|  Indian Sweet Bread|[crisp, s

### Pattern Exploration

In [19]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

# Patterns from different recipes

#Create a  SQL temporary view (more below)
recipe_rdd_4.createOrReplaceTempView("recipe")

# Filter out different recipes
recipe_ind = sqlContext.sql("SELECT * FROM recipe WHERE Label == 'indian'")
recipe_ita = sqlContext.sql("SELECT * FROM recipe WHERE Label == 'italian'")
recipe_mex = sqlContext.sql("SELECT * FROM recipe WHERE Label == 'mexican'")

In [20]:
recipe_rdd_5 = recipe_ind.select('Title')
recipe_rdd_5.show()

+--------------------+
|               Title|
+--------------------+
|  Indian Peanut Stew|
|        Roomali Roti|
|Spicy Sweet Potat...|
|        Chicken Saag|
|Paleo Slow Cooker...|
|Bombay Chicken an...|
|Indian Carrots, P...|
|Wendy's Indian Bu...|
|    Indian Chickpeas|
|Dal Makhani (Indi...|
|               Raita|
|Yogurt-Marinated ...|
|Indian-Spiced Roa...|
|Cauliflower and T...|
|Channa Masala (Ch...|
|Bengali Chicken C...|
|  Indian Sweet Bread|
| Rosy's Palak Paneer|
|Roti Bread from I...|
|Indian Vegetable ...|
+--------------------+
only showing top 20 rows



In [21]:
from pyspark.ml.feature import RegexTokenizer

tokenizer = RegexTokenizer(inputCol="Title", outputCol="Title2", pattern= " |,|[()]")
tokenized = tokenizer.transform(recipe_rdd_5)
tokenized.show(truncate=False)

+---------------------------------------------------------+--------------------------------------------------------------+
|Title                                                    |Title2                                                        |
+---------------------------------------------------------+--------------------------------------------------------------+
|Indian Peanut Stew                                       |[indian, peanut, stew]                                        |
|Roomali Roti                                             |[roomali, roti]                                               |
|Spicy Sweet Potato Salad                                 |[spicy, sweet, potato, salad]                                 |
|Chicken Saag                                             |[chicken, saag]                                               |
|Paleo Slow Cooker Pork Loin                              |[paleo, slow, cooker, pork, loin]                             |
|Bombay Chicken 

In [22]:
import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize as WordTokenizer

def word_tokenizer(data, col):
    token=[]
    for item in data[col]:
         token.append(WordTokenizer(item))

    return token

token = word_tokenizer(tokenized, 'Title2')
tokenized.insert(index, 'token_column', token)

# wordTokenizeRDD = tokenized.select('Title2').rdd.map(word_TokenizeFunct(tokenized))
# wordTokenizeRDD.collect()

[nltk_data] Downloading package punkt to /home/faculty/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


TypeError: Column is not iterable

In [56]:
wordTokenizeRDD = tokenized.rdd.reduceByKey(lambda x,y : x + y)
wordTokenizeRDD.collect()

[('Roomali Roti', ['roomali', 'roti']),
 ('Spicy Sweet Potato Salad', ['spicy', 'sweet', 'potato', 'salad']),
 ('Raita', ['raita']),
 ('Cauliflower and Tofu Masala', ['cauliflower', 'and', 'tofu', 'masala']),
 ('Bengali Chicken Curry with Potatoes',
  ['bengali', 'chicken', 'curry', 'with', 'potatoes']),
 ('Potatoes Madras', ['potatoes', 'madras']),
 ("Steve's Chicken Korma", ["steve's", 'chicken', 'korma']),
 ('Kashmiri-Style Kidney Beans with Turnips',
  ['kashmiri-style', 'kidney', 'beans', 'with', 'turnips']),
 ('Maharaja Curry', ['maharaja', 'curry']),
 ('Curried Chicken and Rice Salad',
  ['curried', 'chicken', 'and', 'rice', 'salad']),
 ('Kashmiri Lamb', ['kashmiri', 'lamb']),
 ('Rajma (Kidney Bean Curry)', ['rajma', 'kidney', 'bean', 'curry']),
 ('Lamb Madras Curry', ['lamb', 'madras', 'curry']),
 ('Apple Chutney', ['apple', 'chutney']),
 ('Chicken Biryani, Hyderabadi Style',
  ['chicken', 'biryani', 'hyderabadi', 'style']),
 ('Mild Curry Powder', ['mild', 'curry', 'powder']),


In [53]:
def word_TokenizeFunct(x):
    splitted = [word_tokenize(x) for line in x for word in line.split(',')]
    return splitted

wordTokenizeRDD = tokenized.select('Title2').rdd.flatMap(lambda x: (x.Title2,1))
wordTokenizeRDD.collect()

[['indian', 'peanut', 'stew'],
 1,
 ['roomali', 'roti'],
 1,
 ['spicy', 'sweet', 'potato', 'salad'],
 1,
 ['chicken', 'saag'],
 1,
 ['paleo', 'slow', 'cooker', 'pork', 'loin'],
 1,
 ['bombay', 'chicken', 'and', 'rice'],
 1,
 ['indian', 'carrots', 'peas', 'and', 'potatoes'],
 1,
 ["wendy's", 'indian', 'butter', 'chicken'],
 1,
 ['indian', 'chickpeas'],
 1,
 ['dal', 'makhani', 'indian', 'lentils'],
 1,
 ['raita'],
 1,
 ['yogurt-marinated', 'salmon', 'fillets', 'dahi', 'machhali', 'masaledar'],
 1,
 ['indian-spiced', 'roasted', 'chickpeas'],
 1,
 ['cauliflower', 'and', 'tofu', 'masala'],
 1,
 ['channa', 'masala', 'chickpea', 'curry'],
 1,
 ['bengali', 'chicken', 'curry', 'with', 'potatoes'],
 1,
 ['indian', 'sweet', 'bread'],
 1,
 ["rosy's", 'palak', 'paneer'],
 1,
 ['roti', 'bread', 'from', 'india'],
 1,
 ['indian', 'vegetable', 'rice'],
 1,
 ['aloo', 'gobi', 'masala', 'cauliflower', 'and', 'potato', 'curry'],
 1,
 ['tandoori', 'fish'],
 1,
 ['punjabi', 'chicken', 'in', 'thick', 'gravy']

In [None]:
# Second Way

In [40]:
recipe.show()

+--------------------+--------------------+------+
|               Title|         Description| label|
+--------------------+--------------------+------+
|  Indian Peanut Stew|This is an easy, ...|indian|
|        Roomali Roti|There is no leave...|indian|
|Spicy Sweet Potat...|It's important to...|indian|
|        Chicken Saag|The classic India...|indian|
|Paleo Slow Cooker...|Boneless pork loi...|indian|
|Bombay Chicken an...|Chicken parts are...|indian|
|Indian Carrots, P...|Potatoes, peas an...|indian|
|Wendy's Indian Bu...|This recipe resem...|indian|
|    Indian Chickpeas|Garbanzo beans, o...|indian|
|Dal Makhani (Indi...|These richly spic...|indian|
|               Raita|Chopped tomatoes ...|indian|
|Yogurt-Marinated ...|A yogurt-based ma...|indian|
|Indian-Spiced Roa...|Spicy roasted chi...|indian|
|Cauliflower and T...|Pressed tofu cube...|indian|
|Channa Masala (Ch...|This fantastic In...|indian|
|Bengali Chicken C...|Thy this deliciou...|indian|
|  Indian Sweet Bread|A crisp a

In [64]:
recipe_rdd_5 = recipe.select("Description").rdd.flatMap(lambda x: x)
recipe_rdd_5.collect()

['This is an easy, authentic dish from South Asia that appeals to a wide range of tastes. The…',
 'There is no leavening in this simple, tender Indian flatbread of bread flour, oil, salt and…',
 "It's important to use good mayonnaise in this recipe, and to let the cooked potatoes chill…",
 'The classic Indian chicken and spinach dish gets richness from sour cream.',
 'Boneless pork loin slowly cooks in a curried fruit sauce until tender and delicious.',
 'Chicken parts are brushed with a butter and curry mixture and baked in a mixture of rice,…',
 'Potatoes, peas and carrots are cooked with Indian spices for an easy yet exotic side dish.',
 "This recipe resembles a dish from an Indian restaurant in my town. I love it. It's got a great…",
 'Garbanzo beans, onions, and spices are simmered together in this typical Northern…',
 'These richly spiced lentils are simmered for two hours in a spicy tomato sauce and finished…',
 'Chopped tomatoes and cucumbers are tossed with a sour cream, yogur

In [65]:
lowerCase_sentRDD = recipe_rdd_5.map(lambda x : x.lower())
lowerCase_sentRDD.collect()

['this is an easy, authentic dish from south asia that appeals to a wide range of tastes. the…',
 'there is no leavening in this simple, tender indian flatbread of bread flour, oil, salt and…',
 "it's important to use good mayonnaise in this recipe, and to let the cooked potatoes chill…",
 'the classic indian chicken and spinach dish gets richness from sour cream.',
 'boneless pork loin slowly cooks in a curried fruit sauce until tender and delicious.',
 'chicken parts are brushed with a butter and curry mixture and baked in a mixture of rice,…',
 'potatoes, peas and carrots are cooked with indian spices for an easy yet exotic side dish.',
 "this recipe resembles a dish from an indian restaurant in my town. i love it. it's got a great…",
 'garbanzo beans, onions, and spices are simmered together in this typical northern…',
 'these richly spiced lentils are simmered for two hours in a spicy tomato sauce and finished…',
 'chopped tomatoes and cucumbers are tossed with a sour cream, yogur

In [88]:
def sent_TokenizeFunct(x):
    return nltk.sent_tokenize(x)

sentenceTokenizeRDD = lowerCase_sentRDD.map(sent_TokenizeFunct)
sentenceTokenizeRDD.collect()

AttributeError: 'PipelinedRDD' object has no attribute '_jdf'