In [6]:
# Create the Spark handles through the project's init_spark helper.
# The call returns a pair that is unpacked into `spark` and `sc`; `sc` is the
# entry point used to build RDDs in this notebook (sc.parallelize / sc.textFile).
# NOTE(review): presumably `spark` is the SparkSession and `sc` its
# SparkContext - confirm in notebook_data.common.init_spark.
from notebook_data.common import init_spark
spark, sc = init_spark.create()

Initializing Spark session ...
Initialized


In [7]:
# Local sample data: the integers 0..99 as a plain Python list.
# list(range(...)) replaces the identity comprehension, which only copied
# each value of the range one by one.
arr = list(range(100))

In [8]:
# Turn the local Python list into an RDD so it can be processed through Spark.
arr_rdd = sc.parallelize(arr)

In [9]:
# Preview: take(10) returns the first 10 elements as a plain Python list.
arr_rdd.take(10)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [10]:
# Number of elements in the RDD; should equal len(arr).
arr_rdd.count()

100

In [13]:
# Load the book (a plain-text file) as an RDD of text lines:
# every element of `rdd` is one line of the file.
shakespeare_path = 'w04/data/Complete Works of William Shakespeare.txt'
rdd = sc.textFile(shakespeare_path)

In [14]:
# Number of elements in `rdd`, i.e. the number of lines in the book
# (each element is one line of the text file).
rdd.count()

196390

In [15]:
# Preview the first ten lines of the book.
# take(10) returns the first ten elements of the RDD; it does not rank them.
rdd.take(10)

['The Project Gutenberg eBook of The Complete Works of William Shakespeare',
 '    ',
 'This ebook is for the use of anyone anywhere in the United States and',
 'most other parts of the world at no cost and with almost no restrictions',
 'whatsoever. You may copy it, give it away or re-use it under the terms',
 'of the Project Gutenberg License included with this ebook or online',
 'at www.gutenberg.org. If you are not located in the United States,',
 'you will have to check the laws of the country where you are located',
 'before using this eBook.',
 '']

In [16]:
# Derive a new RDD from `rdd` (the lines read from the text file): split every
# line on whitespace and flatten the pieces into one RDD of words.
# Case and punctuation are kept as-is, so e.g. 'The' and 'the' stay distinct.
words_rdd = rdd.flatMap(lambda text_line: text_line.split())

In [17]:
# Evaluating the RDD variable itself only shows its handle (repr), not the words;
# use an action such as take() to look at actual data.
words_rdd

PythonRDD[10] at RDD at PythonRDD.scala:53

In [18]:
# Preview the first 10 words produced by the split.
words_rdd.take(10)

['The',
 'Project',
 'Gutenberg',
 'eBook',
 'of',
 'The',
 'Complete',
 'Works',
 'of',
 'William']

In [19]:
# Pair every word with an initial count of 1 -> (word, 1),
# so that the counts can later be summed per word.
word_map_rdd = words_rdd.map(lambda token: (token, 1))

In [20]:
# Preview the first 10 (word, 1) pairs.
word_map_rdd.take(10)

[('The', 1),
 ('Project', 1),
 ('Gutenberg', 1),
 ('eBook', 1),
 ('of', 1),
 ('The', 1),
 ('Complete', 1),
 ('Works', 1),
 ('of', 1),
 ('William', 1)]

In [21]:
# Add up the 1s of identical words: the result holds one
# (word, total_occurrences) pair per distinct word.
word_count_rdd = word_map_rdd.reduceByKey(lambda left, right: left + right)

In [22]:
# Preview 10 (word, count) pairs; note they are not sorted by count.
word_count_rdd.take(10)

[('The', 4630),
 ('Project', 79),
 ('Gutenberg', 22),
 ('eBook', 4),
 ('of', 17111),
 ('Complete', 3),
 ('Works', 4),
 ('William', 35),
 ('Shakespeare', 5),
 ('This', 1218)]

In [23]:
# Flip each (word, count) pair into (count, word) so that the count becomes
# the key and the pairs can be ordered with sortByKey.
swap_word_count_rdd = word_count_rdd.map(lambda pair: (pair[1], pair[0]))

In [24]:
# Preview the first 10 swapped (count, word) pairs.
swap_word_count_rdd.take(10)

[(4630, 'The'),
 (79, 'Project'),
 (22, 'Gutenberg'),
 (4, 'eBook'),
 (17111, 'of'),
 (3, 'Complete'),
 (4, 'Works'),
 (35, 'William'),
 (5, 'Shakespeare'),
 (1218, 'This')]

In [25]:
# Order the (count, word) pairs by count, highest first, and show the
# 10 most frequent words.
# NOTE(review): this sorts the whole RDD just to read 10 rows; RDD.top() or
# RDD.takeOrdered() may be a cheaper way to get the same preview - confirm
# before changing.
(
    swap_word_count_rdd
    .sortByKey(ascending=False)
    .take(10)
)

[(25689, 'the'),
 (20717, 'I'),
 (19849, 'and'),
 (17111, 'of'),
 (17075, 'to'),
 (13730, 'a'),
 (11397, 'my'),
 (10943, 'in'),
 (9527, 'you'),
 (8361, 'is')]

In [26]:
# collect() returns every element of the RDD at once - be CAUTIOUS with large RDDs !!
# word_count_rdd.collect()

In [27]:
# Filter by a condition: keep only the words whose length exceeds
# 5 characters (i.e. 6 characters or more).
long_words_rdd = words_rdd.filter(lambda word: len(word) > 5)

In [28]:
# Preview the first 10 words that passed the length filter.
long_words_rdd.take(10)

['Project',
 'Gutenberg',
 'Complete',
 'William',
 'Shakespeare',
 'anyone',
 'anywhere',
 'United',
 'States',
 'almost']

In [31]:
# Set the directory under which checkpointed RDDs will be stored.
# NOTE(review): this is a relative path, so where it lands depends on the
# session's working directory / default filesystem - confirm it is the
# intended location (the PySpark docs ask for an HDFS path on a cluster).
sc.setCheckpointDir('checkpoints/w04/rdd')

In [32]:
# Mark the RDD for checkpointing into the directory set via sc.setCheckpointDir().
# NOTE(review): per the PySpark docs checkpoint() only marks the RDD; the data
# is presumably written when a later action (e.g. count()) runs on it. No such
# action is visible after this cell - confirm one follows, and consider
# persist() beforehand to avoid recomputing the RDD for the checkpoint.
long_words_rdd.checkpoint()