In [1]:
import os
import sys
spark_path = os.environ['SPARK_HOME']
sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
sys.path.append(spark_path + "/python/lib/py4j-0.10.9-src.zip")

import findspark
findspark.init()
import pyspark

In [2]:
number_cores = 6
memory_gb = 16
conf = (pyspark.SparkConf().setMaster('local[{}]'.format(number_cores)).set('spark.driver.memory', '{}g'.format(memory_gb)))
sc = pyspark.SparkContext(conf=conf)

In [3]:
textFile = sc.textFile("/users/trush/CSC496/SparkComputingEnvironment/data/shakespeare/shakespeare-complete.txt")
wordcount = textFile.flatMap(lambda line: line.split(" ")) \
            .map(lambda word: (word, 1)) \
            .reduceByKey(lambda a, b: a + b)
wordcount.saveAsTextFile("/users/trush/CSC496/SparkComputingEnvironment/data/output/output-wordcount-01")

In [4]:
textFile.getNumPartitions()

2

In [5]:
textFile_4 = textFile.repartition(4)
textFile_4.getNumPartitions()

4

In [6]:
wordcount = textFile_4.flatMap(lambda line: line.split(" ")) \
            .map(lambda word: (word, 1)) \
            .reduceByKey(lambda a, b: a + b)
wordcount.saveAsTextFile("/users/trush/CSC496/SparkComputingEnvironment/data/output/out-wordcount-02")

In [7]:
wordcount = textFile_4.flatMap(lambda line: line.split(" ")) \
            .map(lambda word: (word, 1)) \
            .reduceByKey(lambda a, b: a + b)
print(wordcount)

PythonRDD[25] at RDD at PythonRDD.scala:53


In [8]:
wordcount.take(10)

[('of', 16819),
 ('Shakespeare', 5),
 ('', 264326),
 ('Author:', 1),
 ('Date:', 1),
 ('[eBook', 1),
 ('16,', 1),
 ('Language:', 1),
 ('Character', 2),
 ('set', 423)]

In [9]:
local_words = wordcount.collect()
print(local_words[1:10])

[('Shakespeare', 5), ('', 264326), ('Author:', 1), ('Date:', 1), ('[eBook', 1), ('16,', 1), ('Language:', 1), ('Character', 2), ('set', 423)]


In [10]:
textFile_4.take(10)

['Title: The Complete Works of William Shakespeare',
 '',
 'Author: William Shakespeare',
 '',
 'Release Date: January 1994 [eBook #100]',
 '[Most recently updated: August 16, 2021]',
 '',
 'Language: English',
 '',
 'Character set encoding: UTF-8']

In [11]:
wordcount_1 = textFile_4.flatMap(lambda line: line.split(" "))
wordcount_1.take(10)

['Title:',
 'The',
 'Complete',
 'Works',
 'of',
 'William',
 'Shakespeare',
 '',
 'Author:',
 'William']

In [12]:
wordcount_2 = wordcount_1.map(lambda word: (word, 1))
wordcount_2.take(10)

[('Title:', 1),
 ('The', 1),
 ('Complete', 1),
 ('Works', 1),
 ('of', 1),
 ('William', 1),
 ('Shakespeare', 1),
 ('', 1),
 ('Author:', 1),
 ('William', 1)]

In [13]:
wordcount_3 = wordcount_2.reduceByKey(lambda a, b: a + b)
wordcount_3.take(10)

[('of', 16819),
 ('Shakespeare', 5),
 ('', 264326),
 ('Author:', 1),
 ('Date:', 1),
 ('[eBook', 1),
 ('16,', 1),
 ('Language:', 1),
 ('Character', 2),
 ('set', 423)]

In [14]:
import string
translator = str.maketrans('', '', string.punctuation)
wordcount_enhanced = textFile.flatMap(lambda line: line.split(" ")) \
           .map(lambda word: (word.translate(translator).lower(), 1)) \
           .reduceByKey(lambda a, b: a + b)
wordcount_enhanced.take(100)

[('project', 100),
 ('gutenberg', 31),
 ('ebook', 13),
 ('of', 18804),
 ('shakespeare', 10),
 ('', 264489),
 ('this', 7145),
 ('is', 9690),
 ('use', 360),
 ('anyone', 12),
 ('anywhere', 8),
 ('in', 12241),
 ('united', 20),
 ('other', 725),
 ('world', 674),
 ('at', 2738),
 ('no', 4051),
 ('restrictions', 2),
 ('whatsoever', 17),
 ('may', 1803),
 ('give', 1414),
 ('away', 907),
 ('reuse', 2),
 ('online', 4),
 ('are', 3720),
 ('have', 6262),
 ('check', 32),
 ('country', 163),
 ('where', 1362),
 ('before', 946),
 ('using', 14),
 ('title', 98),
 ('january', 3),
 ('1994', 1),
 ('100', 3),
 ('16', 3),
 ('language', 44),
 ('set', 509),
 ('encoding', 1),
 ('start', 39),
 ('contents', 49),
 ('ends', 55),
 ('tragedy', 25),
 ('cleopatra', 266),
 ('as', 6170),
 ('like', 1930),
 ('comedy', 12),
 ('errors', 15),
 ('prince', 773),
 ('denmark', 26),
 ('king', 2952),
 ('henry', 629),
 ('fourth', 54),
 ('sixth', 36),
 ('third', 262),
 ('john', 531),
 ('julius', 19),
 ('caesar', 507),
 ('love’s', 77),
 ('

In [15]:
wordcount_filtered = wordcount_enhanced.filter(lambda x: x[0] != '')
wordcount_filtered.take(100)

[('project', 100),
 ('gutenberg', 31),
 ('ebook', 13),
 ('of', 18804),
 ('shakespeare', 10),
 ('this', 7145),
 ('is', 9690),
 ('use', 360),
 ('anyone', 12),
 ('anywhere', 8),
 ('in', 12241),
 ('united', 20),
 ('other', 725),
 ('world', 674),
 ('at', 2738),
 ('no', 4051),
 ('restrictions', 2),
 ('whatsoever', 17),
 ('may', 1803),
 ('give', 1414),
 ('away', 907),
 ('reuse', 2),
 ('online', 4),
 ('are', 3720),
 ('have', 6262),
 ('check', 32),
 ('country', 163),
 ('where', 1362),
 ('before', 946),
 ('using', 14),
 ('title', 98),
 ('january', 3),
 ('1994', 1),
 ('100', 3),
 ('16', 3),
 ('language', 44),
 ('set', 509),
 ('encoding', 1),
 ('start', 39),
 ('contents', 49),
 ('ends', 55),
 ('tragedy', 25),
 ('cleopatra', 266),
 ('as', 6170),
 ('like', 1930),
 ('comedy', 12),
 ('errors', 15),
 ('prince', 773),
 ('denmark', 26),
 ('king', 2952),
 ('henry', 629),
 ('fourth', 54),
 ('sixth', 36),
 ('third', 262),
 ('john', 531),
 ('julius', 19),
 ('caesar', 507),
 ('love’s', 77),
 ('lost', 284),
 (