In [0]:
import requests
import pyspark
import re

# URL of full works of Shakespeare as available in the public domain on Gutenberg
gutenberg_url = 'https://www.gutenberg.org/ebooks/100.txt.utf-8'
shakespeare_response = requests.get(gutenberg_url)


In [0]:
shakespeare_response.status_code

200

In [0]:
shakespeare_text = shakespeare_response.text.split('\n')

In [0]:
# Create an RDD containing the works of Shakeseare
shakespeare=spark.sparkContext.parallelize(shakespeare_text)

In [0]:
# SHAPING

# Create an RDD, called shakespeare_flat, including all words in a single dimension, in lowercase and without punctuation using the methods map, lower and replace
word_count = shakespeare.flatMap(lambda x: x.split(" "))\
    .map(lambda x: re.sub(r"[,.!?:;'—]","",x).strip().lower())\
        .filter(lambda x: x!='')\
            .map(lambda x: (x,1))\
                .reduceByKey(lambda x,y: x+y)\
                    .sortBy(lambda x: x[1], ascending=False)




The total number of different words used by Shakespeare in his Complete Works is:

In [0]:
word_count.count()

34934

The top-10 words used by Shakespeare are:

In [0]:
display(word_count.take(10))

_1,_2
the,30428
and,28452
i,21666
to,20680
of,18856
a,16216
you,14401
my,13155
in,12426
that,11785


Of all the words he used, he used some only once. How many words were used only once?

In [0]:
once_used_words = word_count.filter(lambda x: x[1]==1)
once_used_words.count()

16703

That's quite a lot, let's take a look at some of them

In [0]:
once_used_words.take(100)

[('niggarding', 1),
 ('all-eating', 1),
 ('refigured', 1),
 ('astronomy', 1),
 ('swift-footed', 1),
 ('faring', 1),
 ('23', 1),
 ('presagers', 1),
 ('25', 1),
 ('famoused', 1),
 ('arising', 1),
 ('friend)', 1),
 ('31', 1),
 ('salving', 1),
 ('undivided', 1),
 ('43', 1),
 ('come)', 1),
 ('59', 1),
 ('pebbled', 1),
 ('61', 1),
 ('62', 1),
 ('beated', 1),
 ('65', 1),
 ('present’st', 1),
 ('choirs', 1),
 ('learned’s', 1),
 ('81', 1),
 ('found)', 1),
 ('precious)', 1),
 ('garments’', 1),
 ('stewards', 1),
 ('vermilion', 1),
 ('99', 1),
 ('intermixed’', 1),
 ('descriptions', 1),
 ('prefiguring', 1),
 ('dear-purchased', 1),
 ('118', 1),
 ('bevel', 1),
 ('123', 1),
 ('sway’st', 1),
 ('wood’s', 1),
 ('129', 1),
 ('damasked', 1),
 ('131', 1),
 ('149', 1),
 ('fistula', 1),
 ('quicklier', 1),
 ('mites', 1),
 ('christendoms', 1),
 ('naturalize', 1),
 ('soundness', 1),
 ('goers', 1),
 ('applications', 1),
 ('isbel’s', 1),
 ('cherisher', 1),
 ('steward_]', 1),
 ('principles', 1),
 ('monarchysee', 1),

Oops, there are stage directions in there, could do some better cleaning