In [1]:
from pyspark.sql import SparkSession
import re

In [2]:
# Obtain a SparkSession for this notebook, registered under the
# application name "WordCount".
spark = (
    SparkSession.builder
    .appName("WordCount")
    .getOrCreate()
)

In [3]:
# Display the Spark version backing this session, so the recorded results
# can be tied to a specific runtime.
spark.version

u'2.0.2'

In [4]:
def remove_punc(line):
    """Normalise one line of text so it can be split into words.

    The line is lower-cased and stripped of leading/trailing whitespace.
    Every remaining whitespace character (tab, newline, ...) is then turned
    into a plain space, and finally every character that is not an ASCII
    letter, an ASCII digit or a space is deleted.

    Args:
        line: The raw text line (a ``str``/``unicode`` value).

    Returns:
        The cleaned line, containing only ``a-z``, ``0-9`` and spaces.
        Punctuation is deleted rather than replaced, so ``"well-known"``
        becomes ``"wellknown"``.
    """
    line = line.lower().strip()
    # Convert tabs and other whitespace to spaces *before* filtering;
    # otherwise the filter below would delete them and glue the two
    # neighbouring words together ("a\tb" -> "ab").
    line = re.sub(r'\s', ' ', line)
    line = re.sub('[^0-9a-zA-Z ]', '', line)
    return line

In [5]:
# Read the book as a DataFrame of text lines. As the preview below shows,
# the result has a single column named `value`.
# NOTE(review): the path is relative; presumably the file sits in the
# notebook's working directory -- confirm for cluster deployments.
text = spark.read.text('war_and_peace.txt')
# Preview the first rows to sanity-check the load.
text.show()

+--------------------+
|               value|
+--------------------+
|The Project Guten...|
|                    |
|This eBook is for...|
|no restrictions w...|
|under the terms o...|
|eBook or online a...|
|                    |
|                    |
|Title: War and Peace|
|                    |
| Author: Leo Tolstoy|
|                    |
|Translators: Loui...|
|                    |
|Posting Date: Jan...|
|                    |
|Last Updated: Mar...|
|                    |
|   Language: English|
|                    |
+--------------------+
only showing top 20 rows



In [6]:
# Pull the first (only) field out of every Row to get an RDD of raw lines.
lines = text.rdd.map(lambda row: row[0])

# Clean each line, break it on single spaces, and discard the empty tokens
# that consecutive spaces or blank lines produce.
words = (
    lines
    .map(remove_punc)
    .flatMap(lambda cleaned: cleaned.split(' '))
    .filter(lambda token: token != '')
)

# Classic word count: emit (word, 1) and sum the ones per word.
pairs = words.map(lambda token: (token, 1))
counts = pairs.reduceByKey(lambda left, right: left + right)

In [7]:
# Take the 20 (word, count) pairs with the largest counts; negating the
# count makes the ascending ordering of takeOrdered return the biggest first.
top20counts = counts.takeOrdered(20, key=lambda pair: -pair[1])

In [8]:
# Print "<word> <count>" for each of the most frequent words.
# A single pre-formatted string is passed to print so the cell runs and
# produces identical output on both Python 2 and Python 3; the bare
# `print a, b` statement form is a syntax error on Python 3.
for item in top20counts:
    print('%s %s' % (item[0], item[1]))

the 34570
and 22159
to 16716
of 14991
a 10521
he 9809
in 8801
his 7967
that 7813
was 7329
with 5695
had 5354
it 5179
her 4700
not 4658
him 4574
at 4538
i 4106
but 4013
on 3998


In [9]:
# Stop the Spark session now that the analysis is finished.
spark.stop()