In [1]:
import findspark
findspark.init()
from pyspark import SparkContext
from nltk.corpus import stopwords

In [2]:
sc = SparkContext()

In [3]:
# Creating Spark RDD from a text file, and spliting text with flatMap() operation.  

In [4]:
raw_data = sc.textFile("shakespear_text.txt")
splited_text = raw_data.flatMap(lambda x : x.split())
print("Total number of words in splited text:",splited_text.count())

Total number of words in splited text: 961114


In [5]:
# Creating w stop words set for later deletion.

In [6]:
stop_words = set(stopwords.words('english'))

In [7]:
# Transformation of text for counting words:
# 1. Filtering data with stop_words set.
# 2. Creating a tuple for each word with its count = 1: a key - value tuples.
# 3. Counting of words with reduceByKey() - transformation combines values (adding word count) with same key.
# 4. Swaping keys and values inside tuples for sorting.
# 5. Sorting RDD to obtain words with highest count. 

In [8]:
text_no_stop_words = splited_text.filter(lambda x: x.lower() not in stop_words)
word_tuples = text_no_stop_words.map(lambda w: (w,1))
result = word_tuples.reduceByKey(lambda x,y: x + y)
result_swap = result.map(lambda x: (x[1],x[0]))                                     
result_sort = result_swap.sortByKey(ascending = False) 

In [9]:
# Printing 10 most frequent words in text. 

In [10]:
for word in result_sort.take(10):
    print("{} has {} counts".format(word[1],word[0]))

thou has 4514 counts
thy has 3918 counts
shall has 3248 counts
good has 2169 counts
would has 2133 counts
Enter has 2005 counts
thee has 1888 counts
hath has 1720 counts
like has 1642 counts
you, has 1568 counts
