In [57]:
import os
import sys

os.environ["SPARK_HOME"] = "/usr/spark2.4.3"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
# In below two lines, use /usr/bin/python2.7 if you want to use Python 2
os.environ["PYSPARK_PYTHON"] = "/usr/local/anaconda/bin/python" 
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/local/anaconda/bin/python"
sys.path.insert(0, os.environ["PYLIB"] +"/py4j-0.10.7-src.zip")
sys.path.insert(0, os.environ["PYLIB"] +"/pyspark.zip")

In [58]:
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("appName")
sc = SparkContext(conf=conf)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=appName, master=local[*]) created by __init__ at <ipython-input-3-470ac038cc01>:3 

In [61]:
# Show the version of Python used by Spark
print('Python version for Spark: ', sc.pythonVer)

Python version for Spark:  3.6


In [62]:
# Master is the URL of the cluster or “local” string to run in local mode.
print('Master of Spark Context: ', sc.master)

Master of Spark Context:  local[*]


In [63]:
### Use of lambda() with map()
my_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
my_list

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [64]:
# Print my_list in the console
print("Input list is", my_list)

Input list is [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [66]:
# Square all numbers in my_list
squared_list_lambda = list(map(lambda x: x**2, my_list))

In [67]:
# Print the result of the map function
print("The squared numbers are", squared_list_lambda)

The squared numbers are [1, 4, 9, 16, 25, 36, 49, 64, 81, 100]


In [68]:
### Use of lambda() with filter()
my_list2 = [10, 21, 31, 40, 51, 60, 72, 80, 93, 101]
# Print my_list2 in the console
print("Input list is:", my_list2)

Input list is: [10, 21, 31, 40, 51, 60, 72, 80, 93, 101]


In [69]:
# Filter numbers divisible by 10
filtered_list = list(filter(lambda x: (x%10 == 0), my_list2))
# Print the numbers divisible by 10
print("Numbers divisible by 10 are:", filtered_list)

Numbers divisible by 10 are: [10, 40, 60, 80]


In [73]:
mylist = ["Spark", "is", "a", "framework", "for", "Big Data processing"]
print(mylist)

['Spark', 'is', 'a', 'framework', 'for', 'Big Data processing']


In [74]:
### 1. Create RDD from a collection and turn it into parallelized collection
# Use parallelize method for parallelizing an existing collection of objects (e.g. list, array, etc).

# Create an RDD from a list of words
myRDD = sc.parallelize(mylist)

# Print out the type of the created object
print("The type of RDD is", type(myRDD))

The type of RDD is <class 'pyspark.rdd.RDD'>


In [75]:
for eachword in myRDD.collect():
    print(eachword)

Spark
is
a
framework
for
Big Data processing


In [76]:
### 2. Create RDD from external dataset
# Use textFile metod to load data from external datasets (e.g. files in HDFS, Amazon S3 objects, local text files).
# This the most common method to create RDDs

# Make sure the data file has been uploaded to the Databricks file system
file_path='Spark.md'

# Print the file_path
print("The file_path is", file_path)

# Create a fileRDD from file_path
fileRDD = sc.textFile(file_path)

# Check the type of fileRDD
print("The file type of fileRDD is", type(fileRDD))

The file_path is Spark.md
The file type of fileRDD is <class 'pyspark.rdd.RDD'>


In [77]:
### 1. Map and collection on RDD
numbRDD = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
for eachnum in numbRDD.collect():
    print(eachnum)

1
2
3
4
5
6
7
8
9
10


In [79]:
#Create map() transformation to cube numbers
ReciRDD = numbRDD.map(lambda x: 1/x)
for rv in ReciRDD.collect():
    print(rv)

1.0
0.5
0.3333333333333333
0.25
0.2
0.16666666666666666
0.14285714285714285
0.125
0.1111111111111111
0.1


In [80]:
### 2. Filter and count on RDD
# Make sure the data file has been uploaded to the Databricks file system
file_path='sample.txt'
# Create a fileRDD from file_path
fileRDD = sc.textFile(file_path)

In [83]:
for eachline in fileRDD.take(10):
    print(eachline)

Hi!

This is first class for learning Big Data Technologies.

I love teaching this course to you.

Learning is fun.

Join me in having hands-on experience in big data tools.


In [85]:
# Create filter() transformation to select the lines containing the keyword Spark
fileRDD_filter = fileRDD.filter(lambda line: 'in' in line)
for eachline in fileRDD_filter.collect():
    print(eachline)

This is first class for learning Big Data Technologies.
I love teaching this course to you.
Learning is fun.
Join me in having hands-on experience in big data tools.


In [88]:
# How many lines are there in fileRDD?
print("\n The total number of lines with the keyword Spark is", fileRDD_filter.count())


 The total number of lines with the keyword Spark is 4


In [89]:
### 3. First and take on RDD
# Print the first line filtered fileRDD
fileRDD_filter.first()

'This is first class for learning Big Data Technologies.'

In [90]:
# Print the first 3 lines of filtered fileRDD
for line in fileRDD_filter.take(3):
    print(line)

This is first class for learning Big Data Technologies.
I love teaching this course to you.
Learning is fun.


In [93]:
### 4. flatMap on RDD
# split the lines of the filtered fileRDD into individual words
fileRDD_flatMap = fileRDD_filter.flatMap(lambda x: x.split(' '))

# Print the first 10 words of the
for word in fileRDD_flatMap.take(5):
    print(word)

This
is
first
class
for


In [94]:
fileRDD.collect()

['Hi!',
 '',
 'This is first class for learning Big Data Technologies.',
 '',
 'I love teaching this course to you.',
 '',
 'Learning is fun.',
 '',
 'Join me in having hands-on experience in big data tools.']

In [97]:
### 1. ReduceBykey and Collect
# reduceByKey() operates on key, value (k,v) pairs and
# merges the values for each key

# Create PairRDD Rdd with key value pairs (player, goal)
Rdd = sc.parallelize([('Messi', 23), ('Ronaldo', 34), ('Ronaldo', 25), ('Neymar', 26), ('Messi',24)])

In [98]:
# Transform the RDD with reduceByKey() into a pair RDD 'Rdd_Reduced'
# by adding the values with the same key.
Rdd_Reduced = Rdd.reduceByKey(lambda x, y: x + y)

In [99]:
# Iterate over the result and print the output
for num in Rdd_Reduced.collect():
    print("Key {} has {} Counts".format(num[0], num[1]))

Key Ronaldo has 59 Counts
Key Messi has 47 Counts
Key Neymar has 26 Counts


In [28]:
### 2. SortByKey and Collect
# sortByKey() sorts the pair RDD based on the key
# Sort the reduced RDD with the key (i.e. player name) by descending order
Rdd_Reduced_Sort = Rdd_Reduced.sortByKey(ascending=False)

# Iterate over the result and print the output
print('\nSort by player name by descending order:\n ')
for kv in Rdd_Reduced_Sort.collect():
    print('{} has {} goals'.format(kv[0], kv[1]))


Sort by player name by descending order:
 
Ronaldo has 59 goals
Neymar has 26 goals
Messi has 47 goals


In [29]:
# Sort the reduced RDD with number of goals by ascending order
Rdd_Reduced_Rev = Rdd_Reduced.map(lambda x: (x[1], x[0])) # swap key with value
print("\nSort by number of goals by ascending order:\n")
for kv in Rdd_Reduced_Rev.sortByKey(ascending=True).collect(): # sort by number of goals ascendingly
    print('{} has {} goals'.format(kv[1], kv[0]))


Sort by number of goals by ascending order:

Neymar has 26 goals
Messi has 47 goals
Ronaldo has 59 goals


In [30]:
### 3. GroupByKey and Join
# group the airports for the same country with groupByKey() transformation
airports = [('US', 'JFK'), ('UK', 'LHR'), ('UK', 'MAN'), ('US', 'SFO'), ('UK', 'LGW'), ('FR', 'CDG')]
Rdd = sc.parallelize(airports)
Rdd_group = Rdd.groupByKey().collect()
print('\nGroup airports in each country:')
for country, airport in Rdd_group:
    print(country, list(airport))


Group airports in each country:
US ['JFK', 'SFO']
UK ['LHR', 'MAN', 'LGW']
FR ['CDG']


In [31]:
# Merge two pair RDDs together based on their key with join() transformation
Rdd1 = sc.parallelize([('Messi', 34), ('Ronaldo', 32), ('Neymar', 24)])
Rdd2= sc.parallelize([('Ronaldo', 80), ('Neymar', 50), ('Messi', 80)])
print('\nMerge RDDs:')
Rdd1.join(Rdd2).collect()


Merge RDDs:


[('Neymar', (24, 50)), ('Ronaldo', (32, 80)), ('Messi', (34, 80))]

In [33]:
### 1. CountByKey
# Keys for a pair RDD can be counted by countByKey() action.
# count the number of elements for each key
Rdd = sc.parallelize([('a', 1), ('b', 1), ('a', 2), ('a', 3)])
for key, val in Rdd.countByKey().items():
    print(key, ':', val)

a : 3
b : 1


In [34]:
### 2. Reduce
# use reduce() to compute the sum of all elements in a RDD
x = [1, 3, 5 , 7]
RDD = sc.parallelize(x)
sum = RDD.reduce(lambda x, y: x+y)
print('Result of reduce: ', sum)

Result of reduce:  16


In [35]:
### 3. SaveAsTextFile
x = ['Sam', 'Peter', 'Alice', 'Tom']
RDD = sc.parallelize(x)
RDD.saveAsTextFile('tempFile1') # check your current working directory and see if a new directory created
RDD.coalesce(1).saveAsTextFile('tempFile2') # Save as a single file

In [37]:
### 4. CollectAsMap
dict= sc.parallelize([(1,2), (3,4)]).collectAsMap()
print('Type of result: ', type(dict))
print('Result of collectAsMap: ', dict)

Type of result:  <class 'dict'>
Result of collectAsMap:  {1: 2, 3: 4}
