In [None]:
# Creating RDDs -- three approaches:
#   1. pass an existing Python collection to SparkContext's parallelize() method
#   2. load an external dataset (e.g. files on HDFS, objects in an Amazon S3
#      bucket, or the lines of a text file)
#   3. derive a new RDD from an existing RDD
# NOTE(review): `sc` is not defined in this file; it is presumably the
# SparkContext provided by the PySpark shell/notebook -- confirm before
# running this as a standalone script.

# 1st approach: parallelized collection
numRDD = sc.parallelize([1, 2, 3, 4])
# NOTE(review): a string is itself a sequence, so this presumably produces one
# element per character rather than a single 'Hello World' element -- confirm.
helloRDD = sc.parallelize('Hello World')
type(helloRDD)

# 2nd approach: external dataset, read line by line with textFile()
fileRDD = sc.textFile('README.md')
type(fileRDD)


# LOAD DATA INTO RDDs
# Understanding how Spark partitions data lets us control parallelism.

# Create an RDD with 6 partitions from a local collection.
# parallelize() names its partition-count parameter `numSlices`;
# `minPartitions` is a textFile() parameter and is rejected here.
numRDD = sc.parallelize(range(10), numSlices=6)

# Alternatively, read a file and request at least 6 partitions.
# NOTE: this rebinds numRDD, so the print below reports on the file-backed RDD.
numRDD = sc.textFile('README.md', minPartitions=6)

print('The number of partitions in numRDD is', numRDD.getNumPartitions())

# RDD operations in PySpark come in two kinds:
#   transformations -- produce a new RDD from an existing one (lazily evaluated)
#   actions         -- run a computation on an RDD and return a result

# map(): apply a function to every element of the RDD
RDD = sc.parallelize([1, 2, 3, 4])
RDD_map = RDD.map(lambda value: value * value)

# filter(): keep only the elements for which the predicate holds
RDD = sc.parallelize([1, 2, 3, 4])
RDD_filter = RDD.filter(lambda value: value > 2)

# flatMap(): each input element may contribute several output elements
RDD = sc.parallelize(['Hello world', 'How are you'])
RDD_flatmap = RDD.flatMap(lambda sentence: sentence.split(" "))

# union(): combine the elements of two RDDs into one RDD
inputRDD = sc.textFile("logs.txt")
errorRDD = inputRDD.filter(lambda line: "error" in line.split())
warningsRDD = inputRDD.filter(lambda line: "warnings" in line.split())
combinedRDD = errorRDD.union(warningsRDD)

In [None]:
# Actions are operations applied to an RDD that return a value.
# Basic RDD actions: collect(), take(N), first(), count()
# NOTE(review): RDD_map and RDD_flatmap are created in the previous cell, so
# that cell must be run first. These are bare expressions; in a notebook only
# the last one in a cell is echoed -- wrap in print() to see every result.
RDD_map.collect()
RDD_map.take(2)
RDD_map.first()
RDD_flatmap.count()

In [None]:
# Working with key/value ("pair") RDDs
# A pair RDD is a special data structure whose elements are (key, value)
# tuples: the key is an identifier, the value is the data.
# Two ways to build one: from a list of key-value tuples, or by mapping a
# regular RDD into key/value form.


def _name_age_pair(record):
    """Split a space-separated record and return its first two fields as a pair."""
    fields = record.split(' ')
    return (fields[0], fields[1])


# Pair RDD built directly from a list of key-value tuples
my_tuple = [('Sam', 23), ('Mary', 34), ('Peter', 25)]
pairRDD_tuple = sc.parallelize(my_tuple)

# Pair RDD built from a regular RDD of strings (values stay strings)
my_list = ['Sam 23', 'Mary 34', 'Peter 25']
regularRDD = sc.parallelize(my_list)
pairRDD_RDD = regularRDD.map(_name_age_pair)

# Some transformations for pair RDDs: reduceByKey(), groupByKey(), sortByKey(), join()

# reduceByKey(): merge the values that share a key using the given function.
# parallelize() accepts ONE collection, so all (key, value) pairs go inside a
# single list, each pair separated by a comma. (Without the comma, two adjacent
# list literals are parsed as a subscript expression and fail at runtime.)
regularRDD = sc.parallelize([("Messi", 23), ("Ronaldo", 34),
                             ("Neymar", 22), ("Messi", 24)])
pairRDD_reducebykey = regularRDD.reduceByKey(lambda x, y: x + y)
pairRDD_reducebykey.collect()

# sortByKey(): swap each pair to (value, key) so the reduced totals become the
# sort key, then order them from largest to smallest.
pairRDD_reducebykey_rev = pairRDD_reducebykey.map(lambda x: (x[1], x[0]))
pairRDD_reducebykey_rev.sortByKey(ascending=False).collect()

# groupByKey(): gather all values that share a key
airports = [("US", "JFK"), ("UK", "LHR"), ("FR", "CDG"), ("US", "SFO")]
regularRDD = sc.parallelize(airports)
pairRDD_group = regularRDD.groupByKey().collect()
# Each collected entry is (key, iterable of values); materialise the iterable
# as a list so the grouped values are printed.
for entry in pairRDD_group:
    print(entry[0], list(entry[1]))
    
# join(): connect two pair RDDs based on their keys
# Every pair must sit inside the single list handed to parallelize(); its
# second positional parameter is the number of slices, so a pair placed
# outside the list is not treated as data.
RDD1 = sc.parallelize([("Messi", 24), ("Ronaldo", 32), ("Neymar", 24)])
RDD2 = sc.parallelize([("Ronaldo", 40), ("Neymar", 120), ("Messi", 50)])
RDD1.join(RDD2).collect()

# Practice: reduceByKey() on a small pair RDD
# Build a pair RDD of (key, value) tuples
Rdd = sc.parallelize([(1, 2), (3, 4), (3, 6), (4, 5)])

# Sum the values belonging to each key
Rdd_Reduced = Rdd.reduceByKey(lambda left, right: left + right)

# Collect the reduced pairs and report each key with its summed value
for key, value in Rdd_Reduced.collect():
    print("Key {} has {} Counts".format(key, value))

# ADVANCED RDD actions
# reduce(): aggregate the elements of the RDD down to a single value; prefer
# it over collect() when the data is too large to bring back in full.
x = [1, 3, 4, 6]
RDD = sc.parallelize(x)
RDD.reduce(lambda a, b: a + b)

# saveAsTextFile(): each partition is written as a separate file inside the
# given output directory.
RDD.saveAsTextFile("FileName")

# coalesce(1): shrink to one partition so the output directory holds a single
# data file. A different directory is used because Spark refuses to write
# into an output directory that already exists.
RDD.coalesce(1).saveAsTextFile("FileName_single")

# More pair-RDD actions: countByKey(), collectAsMap()

# countByKey(): number of elements per key, iterated here as key/count pairs
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
for key, occurrences in rdd.countByKey().items():
    print(key, occurrences)

# collectAsMap(): return the key-value pairs of the RDD as a dictionary
sc.parallelize([(1, 2), (3, 4)]).collectAsMap()

# Practice: countByKey()
# Count how many elements of Rdd fall under each key
total = Rdd.countByKey()

# Inspect the type of the returned object
print("The type of total is", type(total))

# Walk the keys of the result and report the count stored for each one
for key in total:
    print("key", key, "has", total[key], "counts")

# Word count over a text file
# NOTE(review): `file_path` and `stop_words` are not defined in this file;
# presumably they are supplied by the surrounding environment -- confirm.

# Read the file as an RDD of lines
baseRDD = sc.textFile(file_path)

# Break every line into words on single spaces
splitRDD = baseRDD.flatMap(lambda line: line.split(' '))

# Report how many words there are in total
print("Total number of words in splitRDD:", splitRDD.count())

# Drop stop words: the comparison uses the lower-cased word, but the words
# that survive keep their original casing
splitRDD_no_stop = splitRDD.filter(lambda token: token.lower() not in stop_words)

# Pair every remaining word with an initial count of 1
splitRDD_no_stop_words = splitRDD_no_stop.map(lambda token: (token, 1))

# Add up the counts per word
resultRDD = splitRDD_no_stop_words.reduceByKey(lambda left, right: left + right)

# Show 10 (word, frequency) pairs
for pair in resultRDD.take(10):
    print(pair)

# Turn (word, frequency) into (frequency, word) so frequency is the key
resultRDD_swap = resultRDD.map(lambda pair: (pair[1], pair[0]))

# Order by frequency, highest first
resultRDD_swap_sort = resultRDD_swap.sortByKey(ascending=False)

# Report the 10 most frequent words with their frequencies
for frequency, term in resultRDD_swap_sort.take(10):
    print("{} has {} counts".format(term, frequency))

# Order the reduced pair RDD by key, largest key first
Rdd_Reduced_Sort = Rdd_Reduced.sortByKey(ascending=False)

# Collect the sorted pairs and report each key with its value
for key, value in Rdd_Reduced_Sort.collect():
    print("Key {} has {} Counts".format(key, value))

# PySpark DATAFRAME
# PySpark SQL is Spark's high-level library for working with structured data.
# It carries information about both the structure of the data and the
# computation being performed, and exposes a programming abstraction called
# the DataFrame.
# SparkSession does for DataFrames what SparkContext does for RDDs: it creates
# DataFrames, registers them as tables, executes SQL over tables, and caches
# tables.
# DataFrames can be created in two ways: from an existing RDD with
# createDataFrame(), or from various data sources with SparkSession's `read`.
# A schema describes the structure of the data in a DataFrame and helps Spark
# optimise queries on it.
# NOTE(review): `spark` is not defined in this file; it is presumably the
# SparkSession provided by the PySpark shell/notebook -- confirm.

# CREATE a DataFrame from an RDD, supplying the column names as the schema
iphones_RDD = sc.parallelize([("XR", 2018, 5.65, 2.79, 6.24),
                              ("Xs", 2018, 5.94, 2.98, 6.84)])
names = ["Model", "Year", "Height", "Width", "Weight"]
iphones_df = spark.createDataFrame(iphones_RDD, schema=names)
type(iphones_df)

# CREATE a DataFrame from a csv file
df_csv = spark.read.csv("filename.csv", header=True, inferSchema=True)

# CREATE a DataFrame from a json file
# `header` and `inferSchema` are csv() options; json() does not accept them,
# so they are omitted here. The result gets its own name so the CSV DataFrame
# above is not overwritten.
df_json = spark.read.json("filename.json")

# Practice: build a DataFrame from a list of tuples
# Sample (name, age) records
sample_list = [
    ('Mona', 20),
    ('Jennifer', 34),
    ('John', 20),
    ('Jim', 26),
]

# Distribute the records as an RDD
rdd = sc.parallelize(sample_list)

# Convert the RDD to a DataFrame, passing the column names as the schema
names_df = spark.createDataFrame(rdd, ['Name', 'Age'])

# Report the type of the resulting object
print("The type of names_df is", type(names_df))