In [None]:
# Prerequisite: install a JDK before running this notebook, or run it on
# Google Colab (https://colab.research.google.com/) instead.
#   -  https://www.oracle.com/java/technologies/downloads/

# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install pyspark


In [None]:
import pyspark
from pyspark.sql import SparkSession

# Create the Spark session for this notebook under the application name
# "HelloWorld"; getOrCreate() reuses an already-running session if there is one.
spark = (
    SparkSession.builder
    .appName("HelloWorld")
    .getOrCreate()
)

# The SparkContext is the entry point for the RDD API used in the cells below.
sc = spark.sparkContext
sc.setLogLevel("WARN")

In [None]:
# Smoke test: turn a small list into an RDD, square every element, and
# collect the results back into a Python list.
nums = sc.parallelize([1, 2, 3, 4])
print(nums.map(lambda n: n ** 2).collect())

In [None]:
# Download the Book of Mormon as a text file from Gutenburg
!curl -L https://ia601205.us.archive.org/18/items/thebookofmormon00017gut/mormon13.txt > bookOfMormon.txt

In [None]:
# Load the text file as an RDD with one element per line
# (textFile takes care of the parallelization automatically).
lines = sc.textFile("bookOfMormon.txt")

# top(10) returns the 10 largest lines in descending sort order, not the
# first 10 lines of the file; use take(10) to preview the start of the file.
lines.top(num=10)

In [None]:
# Count how many lines the file contains (the RDD holds one element per line)
lines.count()

In [None]:
# How many lines are repeats of a line that already appeared?
# total lines - distinct lines = occurrences beyond the first of each line
distinctCount = lines.distinct().count()
allCount = lines.count()

allCount - distinctCount

In [None]:
# What are some of those duplicate lines?

# Pair every line with an initial count of 1: (key = line text, value = count)
lineCounts = lines.map(lambda text: (text, 1))

lineCounts.take(10)

In [None]:
# Reduce by key in a parallel way: pairs that share the same line text are
# matched up and their counts are summed. Note that this rebinds `lineCounts`
# to the aggregated (line, total) pairs.
lineCounts = lineCounts.reduceByKey(lambda left, right: left + right)

lineCounts.take(10)

In [None]:
# Order the (line, count) pairs by count, largest first, and show the first 20.
sortedLineCounts = lineCounts.sortBy(lambda pair: pair[1], ascending=False)
sortedLineCounts.take(20)

In [None]:
# What would you need to count all the words?

In [None]:
# Python tip: split a string on every character that is not an ASCII letter.
# Each separator character causes its own split, so a run of separators
# (such as "? 1. ") leaves empty strings in the result.
import re
line = "What is this? 1. A Helicopter. 2. A plane. 3. Super hero"
re.compile('[^a-zA-Z]').split(line)

In [None]:
# Python tip: str.lower() returns a copy of the string with every letter lowercased
"This Had somE uPPerCase LeTtErs".lower()

In [None]:
# Spark RDD tip: flatMap
# The function returns a list for each input element, and flatMap turns every
# item of those lists into its own element of the resulting RDD.

sentences = sc.parallelize(["Hello World!", "Take your vitamins!", "Get enough sleep!"])
words = sentences.flatMap(lambda text: text.split())
words.take(20)