In [62]:
from pyspark import SparkContext
import re

In [63]:
# Obtain the SparkContext: reuse the one already running, or create it if none exists.
sc = SparkContext.getOrCreate()
# Bare expression as the last line so the notebook displays the context.
sc

In [64]:
# Load the book as an RDD of text lines (one element per line of the file).
# NOTE(review): the preview below shows a replacement character where the
# copyright sign is expected -- the file may not be UTF-8 encoded; confirm.
rdd =  sc.textFile("../datasets/book.txt")
# Preview the first 10 lines.
rdd.take(10)

['Self-Employment: Building an Internet Business of One',
 'Achieving Financial and Personal Freedom through a Lifestyle Technology Business',
 'By Frank Kane',
 '',
 '',
 '',
 'Copyright � 2015 Frank Kane. ',
 'All rights reserved worldwide.',
 '',
 '']

In [65]:
# Normalize case: lowercase every line so that counting is case-insensitive
toLower = rdd.map(lambda line: line.lower())
toLower.take(10)

['self-employment: building an internet business of one',
 'achieving financial and personal freedom through a lifestyle technology business',
 'by frank kane',
 '',
 '',
 '',
 'copyright � 2015 frank kane. ',
 'all rights reserved worldwide.',
 '',
 '']

In [66]:
# Strip punctuation: delete every character that is neither a word character
# (letter, digit, underscore) nor whitespace. Digits are NOT removed by this
# pattern ("2015" survives); hyphens and apostrophes are, so "self-employment"
# becomes "selfemployment".
# The pattern is a raw string so that \w and \s reach the regex engine as
# written instead of relying on Python's deprecated handling of unknown
# string escapes.
removePonctuation = toLower.map(lambda line: re.sub(r"[^\w\s]", "", line))
removePonctuation.take(10)

['selfemployment building an internet business of one',
 'achieving financial and personal freedom through a lifestyle technology business',
 'by frank kane',
 '',
 '',
 '',
 'copyright  2015 frank kane ',
 'all rights reserved worldwide',
 '',
 '']

In [67]:
# Tokenize: turn the RDD of lines into an RDD of words.
# str.split() with no argument splits on any run of whitespace (spaces, tabs)
# and never yields empty strings, so blank lines and doubled spaces do not add
# a bogus "" word to the counts, which split(" ") did.
splittedByWord = removePonctuation.flatMap(lambda line: line.split())
splittedByWord.take(10)

['selfemployment',
 'building',
 'an',
 'internet',
 'business',
 'of',
 'one',
 'achieving',
 'financial',
 'and']

In [68]:
# Pair every word with an initial count of 1 -> [word, 1]
groupedCount = splittedByWord.map(lambda word: [word, 1])
groupedCount.take(10)

[['selfemployment', 1],
 ['building', 1],
 ['an', 1],
 ['internet', 1],
 ['business', 1],
 ['of', 1],
 ['one', 1],
 ['achieving', 1],
 ['financial', 1],
 ['and', 1]]

In [69]:
# Add up the counts of identical words (the word is the key of each pair)
reducedCount = groupedCount.reduceByKey(lambda total, increment: total + increment)
reducedCount.take(10)

[('an', 178),
 ('internet', 22),
 ('business', 376),
 ('of', 964),
 ('achieving', 1),
 ('financial', 17),
 ('lifestyle', 44),
 ('technology', 11),
 ('frank', 10),
 ('kane', 10)]

In [70]:
# Swap each pair to [count, word] so that the count becomes the sort key
flipped = reducedCount.map(lambda pair: [pair[1], pair[0]])
flipped.take(10)

[[178, 'an'],
 [22, 'internet'],
 [376, 'business'],
 [964, 'of'],
 [1, 'achieving'],
 [17, 'financial'],
 [44, 'lifestyle'],
 [11, 'technology'],
 [10, 'frank'],
 [10, 'kane']]

In [71]:
# Order the pairs by their key (the count), lowest count first
sortedRdd = flipped.sortByKey(ascending=True)
sortedRdd.take(10)

[(1, 'achieving'),
 (1, 'contents'),
 (1, 'preparation'),
 (1, 'skillset'),
 (1, 'determination'),
 (1, 'blame'),
 (1, 'devoted'),
 (1, 'commuted'),
 (1, 'rewarded'),
 (1, 'rolemodel')]

In [72]:
# Bring every (count, word) pair back to the driver and print one line per word
for count, word in sortedRdd.collect():
    print("Word: {} Count: {}".format(word, count))

Word: achieving Count: 1
Word: contents Count: 1
Word: preparation Count: 1
Word: skillset Count: 1
Word: determination Count: 1
Word: blame Count: 1
Word: devoted Count: 1
Word: commuted Count: 1
Word: rewarded Count: 1
Word: rolemodel Count: 1
Word: marriage Count: 1
Word: ultimatum Count: 1
Word: weeks Count: 1
Word: walked Count: 1
Word: nor Count: 1
Word: heart Count: 1
Word: societys Count: 1
Word: smarts Count: 1
Word: selfsufficient Count: 1
Word: rebel Count: 1
Word: magnitude Count: 1
Word: justify Count: 1
Word: surprising Count: 1
Word: starving Count: 1
Word: quits Count: 1
Word: tenure Count: 1
Word: americans Count: 1
Word: 14 Count: 1
Word: fringe Count: 1
Word: internal Count: 1
Word: religious Count: 1
Word: questioning Count: 1
Word: society Count: 1
Word: instill Count: 1
Word: grades Count: 1
Word: graduated Count: 1
Word: absorbed Count: 1
Word: promotes Count: 1
Word: barely Count: 1
Word: fulfill Count: 1
Word: cup Count: 1
Word: ramen Count: 1
Word: noodles Cou

Word: market Count: 66
Word: thats Count: 67
Word: those Count: 68
Word: should Count: 69
Word: products Count: 69
Word: before Count: 70
Word: many Count: 70
Word: most Count: 70
Word: might Count: 70
Word: ad Count: 70
Word: good Count: 72
Word: day Count: 72
Word: ads Count: 75
Word: no Count: 76
Word: like Count: 76
Word: probably Count: 76
Word: other Count: 78
Word: yourself Count: 78
Word: only Count: 79
Word: into Count: 79
Word: sales Count: 79
Word: find Count: 80
Word: these Count: 82
Word: was Count: 85
Word: money Count: 86
Word: who Count: 87
Word: job Count: 89
Word: much Count: 89
Word: also Count: 91
Word: than Count: 92
Word: one Count: 95
Word: youll Count: 97
Word: when Count: 102
Word: even Count: 104
Word: website Count: 107
Word: may Count: 107
Word: make Count: 108
Word: company Count: 114
Word: by Count: 121
Word: some Count: 121
Word: want Count: 122
Word: get Count: 122
Word: their Count: 122
Word: customers Count: 123
Word: dont Count: 133
Word: all Count: 1

In [73]:
def wordCount(pathOfFile):
    """Print how often each word occurs in a text file, least frequent first.

    Every line is lowercased, stripped of each character that is neither a
    word character (letter, digit, underscore) nor whitespace, and split on
    whitespace. One line of the form ``Word: <word> Count: <count>`` is
    printed per distinct word, in ascending order of count.

    Relies on the module-level SparkContext ``sc``.

    Args:
        pathOfFile: Path of the text file, handed to ``sc.textFile``.

    Returns:
        None. The result is written to standard output.
    """
    lines = sc.textFile(pathOfFile)
    words = (
        lines.map(lambda line: line.lower())
        # Raw string so \w and \s reach the regex engine as written rather
        # than going through Python's deprecated unknown-escape handling.
        .map(lambda line: re.sub(r"[^\w\s]", "", line))
        # split() without an argument splits on any whitespace run and never
        # yields "", so blank lines / doubled spaces are not counted as a word.
        .flatMap(lambda line: line.split())
    )
    # Key-value pairs are tuples, the conventional element type of a pair RDD.
    wordTotals = words.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
    # Flip to (count, word) so the count is the key that sortByKey orders on.
    byCount = wordTotals.map(lambda pair: (pair[1], pair[0])).sortByKey()
    for count, word in byCount.collect():
        print("Word: {} Count: {}".format(word, count))

In [74]:
# Run the packaged word-count pipeline on the book; prints one line per distinct word.
wordCount("../datasets/book.txt")

Word: achieving Count: 1
Word: contents Count: 1
Word: preparation Count: 1
Word: skillset Count: 1
Word: determination Count: 1
Word: blame Count: 1
Word: devoted Count: 1
Word: commuted Count: 1
Word: rewarded Count: 1
Word: rolemodel Count: 1
Word: marriage Count: 1
Word: ultimatum Count: 1
Word: weeks Count: 1
Word: walked Count: 1
Word: nor Count: 1
Word: heart Count: 1
Word: societys Count: 1
Word: smarts Count: 1
Word: selfsufficient Count: 1
Word: rebel Count: 1
Word: magnitude Count: 1
Word: justify Count: 1
Word: surprising Count: 1
Word: starving Count: 1
Word: quits Count: 1
Word: tenure Count: 1
Word: americans Count: 1
Word: 14 Count: 1
Word: fringe Count: 1
Word: internal Count: 1
Word: religious Count: 1
Word: questioning Count: 1
Word: society Count: 1
Word: instill Count: 1
Word: grades Count: 1
Word: graduated Count: 1
Word: absorbed Count: 1
Word: promotes Count: 1
Word: barely Count: 1
Word: fulfill Count: 1
Word: cup Count: 1
Word: ramen Count: 1
Word: noodles Cou