# Lab 3 - word count with Spark

Let's run a simple wordcount example on the file 'princessbride.txt' - it's already loaded into HDFS for you. 

First, create a Spark context with 5 executors, each with 2 GB of memory.

In [1]:
# Set up Spark Context
from pyspark import SparkContext, SparkConf

# Executor memory is supplied as a system property ahead of building the configuration.
SparkContext.setSystemProperty('spark.executor.memory', '2g')

# Request five executors, then start the context against YARN in client mode.
conf = SparkConf().set('spark.executor.instances', 5)
sc = SparkContext(master='yarn-client', appName='Spark-lab3', conf=conf)

As before, load the princessbride.txt file into a Spark RDD, but now filter out any empty lines.
Print the first 5 lines

In [2]:
# Load the script as an RDD of text lines (later cells reuse `file`).
file = sc.textFile("princessbride.txt")

# Drop zero-length lines and preview the first five that remain.
(
    file
    .filter(lambda line: len(line) != 0)
    .take(5)
)

[u'Once at the top, Inigo helps Buttercup and Vizzini.  Vizzini saws at the rope with his dagger.  Fezzik and Inigo are looking down at the masked man clinging to the cliff after Vizzini has cut the rope. ',
 u"Fezzik:  He's got very good arms. ",
 u"Vizzini: Joining them. HE DIDN'T FALL? INCONCEIVABLE! ",
 u"Inigo:  You keep using that word. I do not think it means what you think it means. Pause, hushed.  My God! He's climbing! ",
 u"Vizzini:  Whoever he is, he's obviously seen us with the princess and must therefore die. You, carry her. We'll head straight for the Guilder frontier. Catch up when he's dead. If he falls, fine. If not, the sword. "]

As our first word-count program we will just separate words in a sentence by spaces. In Python you can use split(" ") to get the list of words. 
Write the word-count Spark program to display all the words and their occurrence count.

In [3]:
# Naive word count: split each line on single spaces, pair every token with 1,
# then add up the ones per token.
counts1 = (
    file
    .flatMap(lambda text: text.split(" "))
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)
counts1.collect()

[(u'', 155),
 (u'stained-glass', 1),
 (u'Black:Nods.', 1),
 (u'hand.', 1),
 (u'either.Switches', 1),
 (u"So's", 1),
 (u'carry', 1),
 (u'kill', 3),
 (u'cancels', 1),
 (u'Forces', 1),
 (u'go', 1),
 (u'uncoils', 1),
 (u'looks,', 1),
 (u'hate', 3),
 (u"I'm", 3),
 (u'Oh,', 1),
 (u'looking', 1),
 (u'certainly', 1),
 (u'depart', 1),
 (u'Domingo', 1),
 (u'just', 3),
 (u'cliff.', 1),
 (u'with', 4),
 (u'father', 3),
 (u'WHAT', 1),
 (u'better', 1),
 (u'window', 1),
 (u'black', 2),
 (u'hands.', 1),
 (u'easy', 1),
 (u'Catch', 1),
 (u'reach', 2),
 (u'through', 1),
 (u'over', 5),
 (u'hope', 1),
 (u'it.', 1),
 (u'me', 6),
 (u'it,', 2),
 (u'which', 1),
 (u'Whether', 1),
 (u'Please', 1),
 (u'Returns', 1),
 (u'duel', 1),
 (u'Pause,', 1),
 (u'they', 1),
 (u'not', 10),
 (u'However,', 1),
 (u'did', 1),
 (u'Black:Climbing', 1),
 (u'help,', 1),
 (u'lately.', 1),
 (u'Switches', 1),
 (u'Fezzik:', 2),
 (u'Kill', 1),
 (u'man.', 2),
 (u'amazing!', 1),
 (u'Black', 6),
 (u"I'M", 1),
 (u'draw', 1),
 (u'Begins', 3),
 

Notice that "words" in our case include the empty string "", and words can also contain punctuation. Write another variation of word-count that:
1. Gets rid of punctuation characters (hint: see string.punctuation in Python)
2. Removes words of length 0 or 1
3. Converts every word to lower-case

In [4]:
import string

# Cleaned-up word count: strip punctuation, split on spaces, lower-case,
# discard tokens shorter than two characters, then count per word.
file = sc.textFile("princessbride.txt")
counts2 = (
    file
    # Rebuild each line without punctuation characters. ''.join(...) always
    # yields a string; the previous filter(...) call only returned a string on
    # Python 2 (on Python 3 it returns an iterator, so .split() would fail).
    # Punctuation is deleted, not replaced by a space, so tokens joined only by
    # punctuation (e.g. "Black:Nods.") collapse into a single word.
    .map(lambda line: ''.join(ch for ch in line if ch not in string.punctuation))
    .flatMap(lambda line: line.split(" "))
    .map(lambda word: word.lower())
    # Removes the empty strings produced by consecutive spaces as well as
    # single-character words.
    .filter(lambda word: len(word) > 1)
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)
counts2.collect()

[(u'all', 1),
 (u'just', 3),
 (u'tingtingting', 1),
 (u'boot', 2),
 (u'over', 5),
 (u'switches', 1),
 (u'known', 1),
 (u'carry', 1),
 (u'kill', 4),
 (u'go', 1),
 (u'useful', 1),
 (u'smiling', 1),
 (u'hate', 3),
 (u'shrugs', 1),
 (u'certainly', 1),
 (u'stairs', 1),
 (u'father', 7),
 (u'fitting', 1),
 (u'better', 1),
 (u'lower', 1),
 (u'window', 1),
 (u'black', 30),
 (u'easy', 1),
 (u'sorry', 1),
 (u'through', 1),
 (u'hope', 1),
 (u'me', 10),
 (u'them', 1),
 (u'sword', 14),
 (u'very', 2),
 (u'clang', 4),
 (u'they', 3),
 (u'not', 12),
 (u'runs', 1),
 (u'did', 1),
 (u'youll', 2),
 (u'leave', 1),
 (u'this', 3),
 (u'spaniard', 1),
 (u'promise', 1),
 (u'catch', 1),
 (u'ruins', 1),
 (u'draw', 1),
 (u'force', 1),
 (u'fair', 1),
 (u'mind', 1),
 (u'terrain', 1),
 (u'request', 1),
 (u'dead', 1),
 (u'empties', 1),
 (u'see', 1),
 (u'us', 1),
 (u'comforting', 1),
 (u'holding', 1),
 (u'fail', 2),
 (u'unless', 1),
 (u'happen', 1),
 (u'cold', 1),
 (u'special', 1),
 (u'rope', 6),
 (u'what', 3),
 (u'wwwww

Print the top-10 words by count

In [5]:
# Order the (word, count) pairs by count, largest first. The key function
# indexes into the pair rather than using a tuple-unpacking lambda
# (`lambda (k, v): v`), which is a syntax error on Python 3.
counts3 = counts2.sortBy(lambda pair: pair[1], ascending=False)
counts3.take(10)

[(u'the', 46),
 (u'man', 44),
 (u'inigo', 42),
 (u'in', 41),
 (u'you', 41),
 (u'to', 35),
 (u'black', 30),
 (u'sword', 14),
 (u'his', 13),
 (u'my', 13)]

How many distinct words are there in total?

In [6]:
# counts3 is built from a reduceByKey result, so it holds one (word, count)
# pair per distinct word; count() therefore reports the number of distinct words.
counts3.count()

424