In [2]:
!pip install pyspark
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

import pyspark

number_cores = int(os.environ['NUM_CPUS'])
memory_gb = int(os.environ['AVAILABLE_MEMORY_MB']) // 1024

conf = (
    pyspark.SparkConf()
        .setMaster('local[{}]'.format(number_cores))
        .set('spark.driver.memory', '{}g'.format(memory_gb))
)
sc = pyspark.SparkContext(conf=conf)

Processing /home/faculty/.cache/pip/wheels/84/30/e3/c51c5cd0229631e662d29d7b578a3e5949a4c8db033ffb70aa/pyspark-2.4.5-py2.py3-none-any.whl
Collecting py4j==0.10.7
  Using cached py4j-0.10.7-py2.py3-none-any.whl (197 kB)
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.7 pyspark-2.4.5


In [3]:
print(sc)

<SparkContext master=local[4] appName=pyspark-shell>


In [4]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)


In [5]:
# Load the dataset
indian = sqlContext.read.csv("/project/Project/DataEngineeringGroupAO/Recipe_dataset/data_indian.csv",header=True)
italian = sqlContext.read.csv("/project/Project/DataEngineeringGroupAO/Recipe_dataset/data_italian.csv",header=True)
mexican = sqlContext.read.csv("/project/Project/DataEngineeringGroupAO/Recipe_dataset/data_mexican.csv",header=True)

In [7]:
# Label the data
from pyspark.sql.functions import lit

indian = indian.withColumn("label",lit("indian"))
italian = italian.withColumn("label",lit("italian"))
mexican = mexican.withColumn("label",lit("mexican"))

In [8]:
indian.show()

+--------------------+--------------------+------+
|               Title|         Description| label|
+--------------------+--------------------+------+
|  Indian Peanut Stew|This is an easy, ...|indian|
|        Roomali Roti|There is no leave...|indian|
|Spicy Sweet Potat...|It's important to...|indian|
|        Chicken Saag|The classic India...|indian|
|Paleo Slow Cooker...|Boneless pork loi...|indian|
|Bombay Chicken an...|Chicken parts are...|indian|
|Indian Carrots, P...|Potatoes, peas an...|indian|
|Wendy's Indian Bu...|This recipe resem...|indian|
|    Indian Chickpeas|Garbanzo beans, o...|indian|
|Dal Makhani (Indi...|These richly spic...|indian|
|               Raita|Chopped tomatoes ...|indian|
|Yogurt-Marinated ...|A yogurt-based ma...|indian|
|Indian-Spiced Roa...|Spicy roasted chi...|indian|
|Cauliflower and T...|Pressed tofu cube...|indian|
|Channa Masala (Ch...|This fantastic In...|indian|
|Bengali Chicken C...|Thy this deliciou...|indian|
|  Indian Sweet Bread|A crisp a

In [9]:
# Combine 3 dataset into one

from functools import reduce
from pyspark.sql import DataFrame

def unionAll(dfs):
    return reduce(DataFrame.unionAll, dfs)

dfs = [indian, italian, mexican]
recipe = unionAll(dfs)
recipe.show()

+--------------------+--------------------+------+
|               Title|         Description| label|
+--------------------+--------------------+------+
|  Indian Peanut Stew|This is an easy, ...|indian|
|        Roomali Roti|There is no leave...|indian|
|Spicy Sweet Potat...|It's important to...|indian|
|        Chicken Saag|The classic India...|indian|
|Paleo Slow Cooker...|Boneless pork loi...|indian|
|Bombay Chicken an...|Chicken parts are...|indian|
|Indian Carrots, P...|Potatoes, peas an...|indian|
|Wendy's Indian Bu...|This recipe resem...|indian|
|    Indian Chickpeas|Garbanzo beans, o...|indian|
|Dal Makhani (Indi...|These richly spic...|indian|
|               Raita|Chopped tomatoes ...|indian|
|Yogurt-Marinated ...|A yogurt-based ma...|indian|
|Indian-Spiced Roa...|Spicy roasted chi...|indian|
|Cauliflower and T...|Pressed tofu cube...|indian|
|Channa Masala (Ch...|This fantastic In...|indian|
|Bengali Chicken C...|Thy this deliciou...|indian|
|  Indian Sweet Bread|A crisp a

In [27]:
recipe.count()

1500

In [28]:
# Count number of missing values
from pyspark.sql.functions import col,sum
recipe.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in recipe.columns)).show()

# null may result in failed mapreduce tasks

+-----+-----------+-----+
|Title|Description|label|
+-----+-----------+-----+
|    0|          0|    0|
+-----+-----------+-----+



In [11]:
# Convert it to RDD
recipe_rdd = recipe.rdd

In [12]:
recipe_rdd.take(5)

[Row(Title='Indian Peanut Stew', Description='This is an easy, authentic dish from South Asia that appeals to a wide range of tastes. The…', label='indian'),
 Row(Title='Roomali Roti', Description='There is no leavening in this simple, tender Indian flatbread of bread flour, oil, salt and…', label='indian'),
 Row(Title='Spicy Sweet Potato Salad', Description="It's important to use good mayonnaise in this recipe, and to let the cooked potatoes chill…", label='indian'),
 Row(Title='Chicken Saag', Description='The classic Indian chicken and spinach dish gets richness from sour cream.', label='indian'),
 Row(Title='Paleo Slow Cooker Pork Loin', Description='Boneless pork loin slowly cooks in a curried fruit sauce until tender and delicious.', label='indian')]

Not sure if we can split data as what I did in pandas, if not, we could get training, test dataset (csv) prepared before loading them in.

# Data Cleaning

In [32]:
from pyspark.sql.functions import lower, col

recipe_spdf_1 = recipe.select(*[lower(col(col_name)).name(col_name) for col_name in recipe.columns])
recipe_spdf_1.show()

+--------------------+--------------------+------+
|               Title|         Description| label|
+--------------------+--------------------+------+
|  indian peanut stew|this is an easy, ...|indian|
|        roomali roti|there is no leave...|indian|
|spicy sweet potat...|it's important to...|indian|
|        chicken saag|the classic india...|indian|
|paleo slow cooker...|boneless pork loi...|indian|
|bombay chicken an...|chicken parts are...|indian|
|indian carrots, p...|potatoes, peas an...|indian|
|wendy's indian bu...|this recipe resem...|indian|
|    indian chickpeas|garbanzo beans, o...|indian|
|dal makhani (indi...|these richly spic...|indian|
|               raita|chopped tomatoes ...|indian|
|yogurt-marinated ...|a yogurt-based ma...|indian|
|indian-spiced roa...|spicy roasted chi...|indian|
|cauliflower and t...|pressed tofu cube...|indian|
|channa masala (ch...|this fantastic in...|indian|
|bengali chicken c...|thy this deliciou...|indian|
|  indian sweet bread|a crisp a

In [57]:
from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="Description", outputCol="Description_1")
recipe_spdf_2 = tokenizer.transform(recipe).select('Title', "Description_1", 'Label')

recipe_spdf_2.show(truncate = False)

+---------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+------+
|Title                                                    |Description_1                                                                                                      |Label |
+---------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+------+
|Indian Peanut Stew                                       |[this, is, an, easy,, authentic, dish, from, south, asia, that, appeals, to, a, wide, range, of, tastes., the…]    |indian|
|Roomali Roti                                             |[there, is, no, leavening, in, this, simple,, tender, indian, flatbread, of, bread, flour,, oil,, salt, and…]      |indian|
|Spicy Sweet Potato Salad                                 |[it's, important, to, use,

In [15]:
from pyspark.ml.feature import StopWordsRemover

# Define a list of stop words or use default list
remover = StopWordsRemover()
stopwords = remover.getStopWords() 

# Display default list
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your']

In [56]:
# Specify input/output columns
remover.setInputCol("Description_1")
remover.setOutputCol("Description_2")

# Transform existing dataframe with the StopWordsRemover
recipe_spdf_3 = remover.transform(recipe_spdf_2).select('Title', "Description_2", 'Label')

# Display
recipe_spdf_3.show()

+--------------------+--------------------+------+
|               Title|       Description_2| Label|
+--------------------+--------------------+------+
|  Indian Peanut Stew|[easy,, authentic...|indian|
|        Roomali Roti|[leavening, simpl...|indian|
|Spicy Sweet Potat...|[important, use, ...|indian|
|        Chicken Saag|[classic, indian,...|indian|
|Paleo Slow Cooker...|[boneless, pork, ...|indian|
|Bombay Chicken an...|[chicken, parts, ...|indian|
|Indian Carrots, P...|[potatoes,, peas,...|indian|
|Wendy's Indian Bu...|[recipe, resemble...|indian|
|    Indian Chickpeas|[garbanzo, beans,...|indian|
|Dal Makhani (Indi...|[richly, spiced, ...|indian|
|               Raita|[chopped, tomatoe...|indian|
|Yogurt-Marinated ...|[yogurt-based, ma...|indian|
|Indian-Spiced Roa...|[spicy, roasted, ...|indian|
|Cauliflower and T...|[pressed, tofu, c...|indian|
|Channa Masala (Ch...|[fantastic, india...|indian|
|Bengali Chicken C...|[thy, delicious, ...|indian|
|  Indian Sweet Bread|[crisp, s

In [50]:
# Removing digits from SparkDF

from pyspark.sql.functions import when,udf
from pyspark.sql.types import BooleanType
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.types import StructType

def is_digit(value):
    if value:
        return value.isdigit()
    else:
        return False

is_digit_udf = udf(is_digit, BooleanType())

filter_length_udf = udf(lambda row: [x for x in row if not is_digit(x)], ArrayType(StringType()))
recipe_spdf_4 = recipe_spdf_2.withColumn('Description_2', filter_length_udf(col('Description_2')))
recipe_spdf_4.show()

+---------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+------+
|Title                                                    |Description_1                                                                                                      |Label |
+---------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+------+
|Indian Peanut Stew                                       |[this, is, an, easy,, authentic, dish, from, south, asia, that, appeals, to, a, wide, range, of, tastes., the…]    |indian|
|Roomali Roti                                             |[there, is, no, leavening, in, this, simple,, tender, indian, flatbread, of, bread, flour,, oil,, salt, and…]      |indian|
|Spicy Sweet Potato Salad                                 |[it's, important, to, use,

In [None]:
# Remove '' (nan strings)
str_list = filter(lambda row: [x for x in row if not is_digit(x)], str_list)

### Pattern Exploration

In [44]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

# Patterns from different recipes

#Create a  SQL temporary view (more below)
recipe_spdf_4.createOrReplaceTempView("recipe")

# Filter out different recipes
recipe_ind = sqlContext.sql("SELECT * FROM recipe WHERE Label == 'indian'")
recipe_ita = sqlContext.sql("SELECT * FROM recipe WHERE Label == 'italian'")
recipe_mex = sqlContext.sql("SELECT * FROM recipe WHERE Label == 'mexican'")

In [74]:
recipe_spdf_5 = recipe_ind.select('Title')
recipe_spdf_5.show()

+--------------------+
|               Title|
+--------------------+
|  Indian Peanut Stew|
|        Roomali Roti|
|Spicy Sweet Potat...|
|        Chicken Saag|
|Paleo Slow Cooker...|
|Bombay Chicken an...|
|Indian Carrots, P...|
|Wendy's Indian Bu...|
|    Indian Chickpeas|
|Dal Makhani (Indi...|
|               Raita|
|Yogurt-Marinated ...|
|Indian-Spiced Roa...|
|Cauliflower and T...|
|Channa Masala (Ch...|
|Bengali Chicken C...|
|  Indian Sweet Bread|
| Rosy's Palak Paneer|
|Roti Bread from I...|
|Indian Vegetable ...|
+--------------------+
only showing top 20 rows



In [164]:
from pyspark.ml.feature import RegexTokenizer

tokenizer = RegexTokenizer(inputCol="Title", outputCol="Title2", pattern= "\W|s")
tokenized = tokenizer.transform(recipe_spdf_5)
tokenized.show(truncate=False)

+---------------------------------------------------------+--------------------------------------------------------------+
|Title                                                    |Title2                                                        |
+---------------------------------------------------------+--------------------------------------------------------------+
|Indian Peanut Stew                                       |[indian, peanut, tew]                                         |
|Roomali Roti                                             |[roomali, roti]                                               |
|Spicy Sweet Potato Salad                                 |[picy, weet, potato, alad]                                    |
|Chicken Saag                                             |[chicken, aag]                                                |
|Paleo Slow Cooker Pork Loin                              |[paleo, low, cooker, pork, loin]                              |
|Bombay Chicken 

In [136]:
tokenized.select('Title2').collect().

[Row(Title2=['indian', 'peanut', 'stew']),
 Row(Title2=['roomali', 'roti']),
 Row(Title2=['spicy', 'sweet', 'potato', 'salad']),
 Row(Title2=['chicken', 'saag']),
 Row(Title2=['paleo', 'slow', 'cooker', 'pork', 'loin']),
 Row(Title2=['bombay', 'chicken', 'and', 'rice']),
 Row(Title2=['indian', 'carrots', 'peas', 'and', 'potatoes']),
 Row(Title2=["wendy's", 'indian', 'butter', 'chicken']),
 Row(Title2=['indian', 'chickpeas']),
 Row(Title2=['dal', 'makhani', 'indian', 'lentils']),
 Row(Title2=['raita']),
 Row(Title2=['yogurt-marinated', 'salmon', 'fillets', 'dahi', 'machhali', 'masaledar']),
 Row(Title2=['indian-spiced', 'roasted', 'chickpeas']),
 Row(Title2=['cauliflower', 'and', 'tofu', 'masala']),
 Row(Title2=['channa', 'masala', 'chickpea', 'curry']),
 Row(Title2=['bengali', 'chicken', 'curry', 'with', 'potatoes']),
 Row(Title2=['indian', 'sweet', 'bread']),
 Row(Title2=["rosy's", 'palak', 'paneer']),
 Row(Title2=['roti', 'bread', 'from', 'india']),
 Row(Title2=['indian', 'vegetable'

In [167]:
# from nltk.tokenize import word_tokenize

# def word_TokenizeFunct(lists):
#     splitted = [word_tokenize(x) for x in lists]
#     return splitted

# wordTokenizeRDD = tokenized.select('Title2').rdd.map(word_TokenizeFunct(tokenized))
# wordTokenizeRDD.collect()

In [168]:
# wordTokenizeRDD = wordTokenizeRDD.reduceByKey(lambda x,y : x + y)
# wordTokenizeRDD.collect()

In [None]:
# Second Way

In [None]:
recipe.show()

In [169]:
recipe_rdd_5 = recipe.select("Description").rdd.flatMap(lambda x: x)
recipe_rdd_5.collect()

['This is an easy, authentic dish from South Asia that appeals to a wide range of tastes. The…',
 'There is no leavening in this simple, tender Indian flatbread of bread flour, oil, salt and…',
 "It's important to use good mayonnaise in this recipe, and to let the cooked potatoes chill…",
 'The classic Indian chicken and spinach dish gets richness from sour cream.',
 'Boneless pork loin slowly cooks in a curried fruit sauce until tender and delicious.',
 'Chicken parts are brushed with a butter and curry mixture and baked in a mixture of rice,…',
 'Potatoes, peas and carrots are cooked with Indian spices for an easy yet exotic side dish.',
 "This recipe resembles a dish from an Indian restaurant in my town. I love it. It's got a great…",
 'Garbanzo beans, onions, and spices are simmered together in this typical Northern…',
 'These richly spiced lentils are simmered for two hours in a spicy tomato sauce and finished…',
 'Chopped tomatoes and cucumbers are tossed with a sour cream, yogur

In [170]:
lowerCase_sentRDD = recipe_rdd_5.map(lambda x : x.lower())
lowerCase_sentRDD.collect()

['this is an easy, authentic dish from south asia that appeals to a wide range of tastes. the…',
 'there is no leavening in this simple, tender indian flatbread of bread flour, oil, salt and…',
 "it's important to use good mayonnaise in this recipe, and to let the cooked potatoes chill…",
 'the classic indian chicken and spinach dish gets richness from sour cream.',
 'boneless pork loin slowly cooks in a curried fruit sauce until tender and delicious.',
 'chicken parts are brushed with a butter and curry mixture and baked in a mixture of rice,…',
 'potatoes, peas and carrots are cooked with indian spices for an easy yet exotic side dish.',
 "this recipe resembles a dish from an indian restaurant in my town. i love it. it's got a great…",
 'garbanzo beans, onions, and spices are simmered together in this typical northern…',
 'these richly spiced lentils are simmered for two hours in a spicy tomato sauce and finished…',
 'chopped tomatoes and cucumbers are tossed with a sour cream, yogur

In [171]:
def sent_TokenizeFunct(x):
    return nltk.sent_tokenize(x)

sentenceTokenizeRDD = lowerCase_sentRDD.map(sent_TokenizeFunct)
sentenceTokenizeRDD.collect()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 156.0 failed 1 times, most recent failure: Lost task 2.0 in stage 156.0 (TID 248, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/anaconda/envs/Python3/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/opt/anaconda/envs/Python3/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/anaconda/envs/Python3/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 400, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/anaconda/envs/Python3/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-171-772e1a887984>", line 2, in sent_TokenizeFunct
NameError: name 'nltk' is not defined

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$15.apply(RDD.scala:990)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$15.apply(RDD.scala:990)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:989)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.GeneratedMethodAccessor97.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/anaconda/envs/Python3/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/opt/anaconda/envs/Python3/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/anaconda/envs/Python3/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 400, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/anaconda/envs/Python3/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-171-772e1a887984>", line 2, in sent_TokenizeFunct
NameError: name 'nltk' is not defined

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$15.apply(RDD.scala:990)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$15.apply(RDD.scala:990)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
