# Word Count with Spark

The purpose of this project is to use Spark’s filtering, mapping, and reducing capabilities to implement a simple word-count example that summarizes the most frequent words in *Romeo and Juliet*.

In [1]:
import nltk

In [2]:
# Loading the NLTK stop words
from nltk.corpus import stopwords
try:
    stop_words = stopwords.words('english')
except LookupError:
    # The stop-word corpus is not installed on this machine yet.
    # Fetch it once (uses the `nltk` module imported in the first cell),
    # then retry so a fresh environment can still run the notebook end to end.
    nltk.download('stopwords', quiet=True)
    stop_words = stopwords.words('english')

In [6]:
from pyspark import SparkConf
# Spark settings for this notebook: a named application running on the
# local machine ('local[*]' asks Spark for a local master; per the Spark
# documentation the '*' means one worker thread per logical core).
configuration = (
    SparkConf()
    .setAppName('RomeoAndJulietCounter')
    .setMaster('local[*]')
)

In [7]:
# Configuring a SparkContext
from pyspark import SparkContext
# Only one SparkContext may be active at a time, so calling the constructor
# again when this cell is re-executed raises an error. getOrCreate returns the
# running context if there is one and otherwise builds a new one from
# `configuration` (the conf is only applied when a new context is created).
sc = SparkContext.getOrCreate(conf=configuration)

In [8]:
from textblob.utils import strip_punc
# Read the play line by line; for each line strip punctuation, lowercase it,
# and split on whitespace, flattening the per-line word lists into one RDD
# of individual tokens.
# NOTE(review): the recorded output below still contains "i’ll", so the
# typographic apostrophe apparently survives strip_punc — confirm whether
# that is intended.
tokenized = (
    sc.textFile('RomeoAndJuliet.txt')
    .flatMap(lambda raw: strip_punc(raw, all=True).lower().split())
)

In [9]:
# Removing the stop words
# `stop_words` is a list, so `word not in stop_words` would scan the whole
# list for every token. Build a frozenset once so each lookup is a hash
# probe instead; the set of words that pass the filter is unchanged.
stop_word_set = frozenset(stop_words)
filtered = tokenized.filter(lambda word: word not in stop_word_set)

In [10]:
from operator import add
# Pair every remaining word with a count of 1, then add up the 1s for each
# distinct word to obtain (word, total) tuples.
word_counts = (
    filtered
    .map(lambda token: (token, 1))
    .reduceByKey(add)
)

In [11]:
# Keep only the words that occur at least 60 times
filtered_counts = word_counts.filter(
    lambda pair: 60 <= pair[1]  # pair is a (word, count) tuple
)

In [12]:
from operator import itemgetter
# Bring the (word, count) tuples back to the driver as a plain list, then
# order that list in place from the highest count to the lowest.
sorted_items = filtered_counts.collect()
sorted_items.sort(key=itemgetter(1), reverse=True)

In [18]:
# Display the results
# Width of the longest word, used to right-align the word column.
# default=0 keeps max() from raising ValueError when `sorted_items` is empty
# (for example, when no word reaches the count threshold).
max_len = max((len(word) for word, _count in sorted_items), default=0)
for word, count in sorted_items:
    # Right-align each word so the colons and counts line up vertically.
    print(f'{word:>{max_len}}: {count}')

   romeo: 298
    thou: 277
  juliet: 178
     thy: 170
   nurse: 146
 capulet: 141
    love: 136
    thee: 135
   shall: 110
    lady: 109
   friar: 104
    come: 94
mercutio: 83
    good: 80
benvolio: 79
   enter: 75
      go: 75
    i’ll: 71
  tybalt: 69
   death: 69
   night: 68
lawrence: 67
     man: 65
    hath: 64
     one: 60
