# Prerequisites

In [None]:
# Update packages and install required java version
!apt-get update
!apt-get install openjdk-21-jdk-headless -qq > /dev/null

# download and unzip spark
!wget -nc -q https://downloads.apache.org/spark/spark-4.0.0/spark-4.0.0-bin-hadoop3.tgz
!tar xf spark-4.0.0-bin-hadoop3.tgz

# get data for labs
!wget -nc -O around_the_world_in_80_days.txt https://www.gutenberg.org/ebooks/103.txt.utf-8

# install findspark
!pip install -q findspark

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://cli.github.com/packages stable InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Hit:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Fetched 257 kB in 1s (197 kB/s)
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelea

In [None]:
import os
import findspark

# Tell the notebook where the JDK and the unpacked Spark distribution live
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-21-openjdk-amd64",
        "SPARK_HOME": "/content/spark-4.0.0-bin-hadoop3",
    }
)

# Initialise findspark so that this notebook can interact with Spark
findspark.init()


In [None]:
# what does findspark do? use the ?? magic command to find out
# Note 1: in colab, this may open in a side panel
# Note 2: this magic command is often helpful when encountering an object in a
# notebook that is unfamiliar. More information will be displayed if it exists
?? findspark

# 1. Word Count

Instructions:  
For each cell marked "double-click and add explanation here" please answer the question in your own words.  
In the section where you complete the code to perform basic NLP text cleaning and exploration tasks, the goal is to chain all of the transformations together in a single function. For learning and exploration purposes, it is acceptable to keep each step separate, but the last cell in this section should be one function with all transformations chained together.  
For steps c and f, it is acceptable to use your favorite chatbot to generate a list of common stop words (c) and punctuation (f) for use in the code. As these are common steps in NLP/text-processing tasks, there are plenty of libraries to help with this, such as nltk, but there is no need to import extra dependencies for this lab unless you are already familiar with working with them.

In [None]:
# Start (or reuse) a Spark session named "word_count" and keep a handle on
# its SparkContext, which the following cells use to build RDDs
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("word_count")
    .getOrCreate()
)

sc = spark.sparkContext

In [None]:
# Define the RDD from the book's text file downloaded in the first cell
rdd = sc.textFile('/content/around_the_world_in_80_days.txt')

In [None]:
# View the first 20 elements (lines of text) of the RDD
rdd.take(20)

In [None]:
# Example lambda function: split each line on single spaces; flatMap then
# flattens the per-line pieces into one RDD
words = rdd.flatMap(lambda line: line.split(' '))

In [None]:
# Note and explain the output of the command below
# (only the name of the RDD is evaluated here; no action is called on it)
words


A reference to the RDD object (its address), not its contents.

<ADD EXPLANATION HERE>

In [None]:
# Note and explain the output of the following command, focusing on the
# difference from the command above
# (here an action, collect(), is called on the RDD)
words.collect()

It computes a list of all the words. What we see is the list of the words in the book.

In [None]:
# nicer print
# Collect the elements first, then print them one per line
collected_words = words.collect()
for token in collected_words:
    print(token)

In [None]:
# Show the first 20 elements of the words RDD
words.take(20)

In [None]:
# Use cell magic command to help understand what the rdd.flatMap function is doing in the next cell.
# Insert a text/markdown cell and explain in your own words.

It associates each word with a tuple; this happens before the grouping.

In [None]:
# Initialize a word counter: split the lines into words, then pair every
# word with a count of 1
tokens = rdd.flatMap(lambda line: line.split(' '))
words = tokens.map(lambda token: (token, 1))

# Print every (word, 1) pair, one per line
for pair in words.collect():
    print(pair)

In [None]:
# a. count the occurrences of each word: add up the 1s that share a key,
# then bring the (word, count) pairs back as a list
counts_by_word = words.reduceByKey(lambda left, right: left + right)
wordscount = counts_by_word.collect()

In [None]:
# b. a common first step in text analysis, change all capital letters to
# lower case
# `words` holds (word, 1) pairs, so only the word part (pair[0]) is
# lower-cased and the count is carried through; calling .lower() on the
# pair itself would fail as soon as an action is run.
words_lower = words.map(lambda pair: (pair[0].lower(), pair[1]))
words_lower

In [None]:
# c. eliminate the stop words.
# `words` holds (word, 1) pairs, so the test is made on the word part
# (pair[0]); it is lower-cased first so "The" and "the" are both dropped.
STOP_WORDS = {
    "a", "an", "and", "are", "as", "at", "be", "been", "but", "by", "for",
    "from", "had", "has", "have", "he", "her", "him", "his", "i", "if",
    "in", "is", "it", "its", "my", "no", "not", "of", "on", "or", "she",
    "so", "that", "the", "their", "them", "there", "they", "this", "to",
    "was", "we", "were", "which", "who", "with", "would", "you",
}

words_no_stop = words.filter(lambda pair: pair[0].lower() not in STOP_WORDS)
words_no_stop

In [None]:
# d. sort in alphabetical order: total the counts per word, then order the
# (word, count) pairs by their key
word_totals = words.reduceByKey(lambda left, right: left + right)
word_totals.sortByKey().collect()



In [None]:
# e. sort descending by word frequency
(
    rdd
    # split each line into words
    .flatMap(lambda line: line.split(' '))
    # pair every word with a count of 1
    .map(lambda token: (token, 1))
    # total the counts per word
    .reduceByKey(lambda left, right: left + right)
    # order by the count (second field), largest first
    .sortBy(lambda pair: pair[1], ascending=False)
    .collect()
)


In [None]:
# f. remove punctuations and blank spaces
# `words` holds (word, 1) pairs, so the regex is applied to the word part
# (pair[0]) only and the count is carried through unchanged; passing the
# whole pair to re.sub would fail as soon as an action is run.
# Pairs whose word is empty after cleaning are dropped.

import re
words = (
    words
    .map(lambda pair: (re.sub(r'[^a-zA-Z0-9]', '', pair[0]), pair[1]))
    .filter(lambda pair: pair[0] != '')
)

# 2. What does the following cell block do?
Comment the code below line by line after the provided hash-tag. You should be able to explain each line while respecting the pep8 style guide of 79 characters or less per line!

In [None]:
 # Create an RDD of tuples (name, age)
dataRDD = sc.parallelize(
    [
        ("Brooke", 20),
        ("Denny", 31),
        ("Jules", 30),
        ("TD", 35),
        ("Brooke", 25),
    ]
)

# Try to understand what this code does (line by line)
agesRDD = (
    dataRDD
    # (name, age) -> (name, (age, 1)): pair each age with a counter of one
    .map(lambda person: (person[0], (person[1], 1)))
    # per name, add up the ages and the counters: (age_sum, occurrences)
    .reduceByKey(lambda acc, nxt: (acc[0] + nxt[0], acc[1] + nxt[1]))
    # (name, (age_sum, occurrences)) -> (name, mean age for that name)
    .map(lambda entry: (entry[0], entry[1][0] / entry[1][1]))
)