In [6]:
# Remember to install a JDK before using PySpark locally, or run this notebook
# on Google Colab instead (https://colab.google.com).
#   - JDK downloads: https://www.oracle.com/java/technologies/downloads/
# NOTE(review): the install below is unpinned; consider pinning a pyspark
# version so a re-run reproduces the same environment.

%pip install pyspark




In [6]:
import pyspark
from pyspark.sql import SparkSession

# Create the Spark session for this notebook (or reuse one that already
# exists) and keep a handle on its SparkContext for the RDD examples.
spark = (
    SparkSession.builder
    .appName("HelloWorld")
    .getOrCreate()
)
sc = spark.sparkContext
sc.setLogLevel('WARN')  # set the context's log level to WARN

In [5]:
# Turn a small local list into an RDD, square each element with map, then
# collect the results back into a Python list and print it.
nums = sc.parallelize([1, 2, 3, 4])
squares = nums.map(lambda n: n * n)
print(squares.collect())

[1, 4, 9, 16]


In [1]:
# Download the Book of Mormon as a text file from Gutenburg
!curl -L https://ia601205.us.archive.org/18/items/thebookofmormon00017gut/mormon13.txt > bookOfMormon.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1551k  100 1551k    0     0  2288k      0 --:--:-- --:--:-- --:--:-- 2292k


In [7]:
# Read all the lines of the text file into an RDD, one element per line
# (textFile handles the parallelization automatically).
lines = sc.textFile("bookOfMormon.txt")

# NOTE: top(10) returns the 10 largest elements in descending order -- for
# strings, the lines that sort last -- not the first 10 lines of the file.
lines.top(10)

['zealously striving to repair all the injuries which they had done',
 'zealous for keeping the commandments of God.',
 'youth, and ye stand in need to be nourished by your brothers. ',
 'yourselves?',
 'yourselves, that ye may have hope as well as your brethren from',
 'yourselves, that ye have been sufficiently humble?  That your',
 'yourselves, and your thoughts, and your words, and your deeds,',
 'yourselves wrath against the day of judgment.',
 'yourselves unto him that he may have power over you, to blind',
 'yourselves treasures in heaven, where nothing doth corrupt, and']

In [8]:
# Count how many lines there are (every element of the RDD, blank lines included)
lines.count()

40116

In [9]:
# How many lines are duplicates of another line?
# (total lines) - (distinct lines) = the number of repeated occurrences beyond
# the first copy of each line, not the number of distinct lines that repeat.
allCount = lines.count()
distinctCount = lines.distinct().count()

allCount - distinctCount

7651

In [14]:
# What are some of those duplicate lines?

# Pair every line with a starting count of one: (key, count) where key = line text
lineCounts = lines.map(lambda text: (text, 1))

lineCounts.take(10)

[('', 1),
 ('****This is the Project Gutenberg edition of Book of Mormon****', 1),
 ('This 13th edition should be labeled mormon13.txt or mormon13.zip', 1),
 ('***This edition is being officially released on March 8, 1992***', 1),
 ('', 1),
 ('[Date last updated: May 22, 2005]', 1),
 (' ', 1),
 ('Corrected EDITIONS of our etexts get a new NUMBER, xxxxx11.txt.', 1),
 ('VERSIONS based on separate sources get new LETTER, xxxxx10a.txt.', 1),
 ('', 1)]

In [15]:
# Reduce by key in a parallel way: pairs that share a key (identical line
# text) are matched up and their counts are summed.
# The (line, 1) pairs are rebuilt from `lines` here instead of being read back
# from `lineCounts`, so this cell no longer redefines a name in terms of
# itself and depends only on `lines`.
lineCounts = (
    lines
    .map(lambda line: (line, 1))
    .reduceByKey(lambda lineCount1, lineCount2: lineCount1 + lineCount2)
)

lineCounts.take(10)

[('', 6937),
 ('****This is the Project Gutenberg edition of Book of Mormon****', 1),
 ('This 13th edition should be labeled mormon13.txt or mormon13.zip', 1),
 ('[Date last updated: May 22, 2005]', 1),
 (' ', 2),
 ('Corrected EDITIONS of our etexts get a new NUMBER, xxxxx11.txt.', 1),
 ('to get any etext selected, entered, proofread, edited, copyright', 1),
 ('projected audience is one hundred million readers.  If our value', 1),
 ('The Goal of Project Gutenberg is to Give Away One Trillion Etext', 1),
 ('Files by the December 31, 2001.  [10,000 x 100,000,000=Trillion]', 1)]

In [22]:
# Sort the (line, count) pairs by count, largest first, and show the first 20.
sortedLineCounts = lineCounts.sortBy(lambda pair: pair[1], ascending=False)
sortedLineCounts.take(20)

[('', 6937),
 ('them.', 33),
 ('people.', 18),
 ('God.', 15),
 ('land.', 15),
 ('wilderness.', 14),
 ('saying:', 11),
 ('Chapter 2', 10),
 ('Chapter 3', 10),
 ('Chapter 4', 10),
 ('him.', 10),
 ('Israel.', 10),
 ('Chapter 1', 10),
 ('Chapter 5', 10),
 ('Chapter 6', 10),
 ('Chapter 7', 10),
 ('Chapter 8', 9),
 ('judges over the people of Nephi.', 9),
 ('Chapter 9', 9),
 ('Nephi.', 8)]

In [None]:
# What would you need to count all the words?

In [2]:
# Python tip - re.split breaks a string apart wherever the pattern matches.
# Each single non-letter character is a separator here, so a run of
# punctuation, digits and spaces leaves empty strings in the result.
import re
line = "What is this? 1. A Helicopter. 2. A plane. 3. Super hero"
nonLetter = re.compile('[^a-zA-Z]')
nonLetter.split(line)

['What',
 'is',
 'this',
 '',
 '',
 '',
 '',
 'A',
 'Helicopter',
 '',
 '',
 '',
 '',
 'A',
 'plane',
 '',
 '',
 '',
 '',
 'Super',
 'hero']

In [4]:
# Python tip - str.lower() returns a copy of the string with every letter lower-cased
"This Had somE uPPerCase LeTtErs".lower()

'this had some uppercase letters'

In [10]:
# Spark RDD tip!
# flatMap -- the function returns a list for every element, and each item of
# those lists becomes its own element in the resulting RDD.

sentences = sc.parallelize([
    "Hello World!",
    "Take your vitamins!",
    "Get enough sleep!",
])
words = sentences.flatMap(lambda text: text.split())
words.take(20)

['Hello', 'World!', 'Take', 'your', 'vitamins!', 'Get', 'enough', 'sleep!']