**Table of contents**<a id='toc0_'></a>    
- [collect()](#toc1_)    
- [count()](#toc2_)    
- [take(n)](#toc3_)    
- [first()](#toc4_)    
- [takeSample(withReplacement, num, [seed])](#toc5_)    
- [takeOrdered(num, [key])](#toc6_)    
- [saveAsTextFile(path)](#toc7_)    
- [saveAsSequenceFile(path)](#toc8_)    
- [countByKey()](#toc9_)    
- [foreach(func)](#toc10_)    
- [reduce(func)](#toc11_)    
- [fold(zeroValue)(func)](#toc12_)    
- [aggregate(zeroValue)(seqOp, combOp)](#toc13_)    
- [collectAsMap()](#toc14_)    
- [lookup(key)](#toc15_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

In [1]:
from pyspark.sql import SparkSession
# Obtain a Spark session named "RDD-Examples" (getOrCreate reuses an already
# running session if there is one) and keep a handle on its SparkContext,
# which every RDD example below uses as `sc`.
spark = (
    SparkSession.builder
    .appName("RDD-Examples")
    .getOrCreate()
)
sc = spark.sparkContext

# <a id='toc1_'></a>[collect()](#toc0_)



In [3]:
# collect() is an action: it brings every element of the RDD back to the
# driver as a regular Python list.
values = [1, 2, 3, 4]
rdd = sc.parallelize(values)
result = rdd.collect()
print(result)  # Output: [1, 2, 3, 4]


[1, 2, 3, 4]


# <a id='toc2_'></a>[count()](#toc0_)



In [4]:
# count() is an action that returns how many elements the RDD contains.
values = [1, 2, 3, 4]
rdd = sc.parallelize(values)
result = rdd.count()
print(result)  # Output: 4


4


# <a id='toc3_'></a>[take(n)](#toc0_)



In [5]:
# take(n): retrieve the FIRST n elements of the RDD as a list.

values = [1, 2, 3, 4]
rdd = sc.parallelize(values)
result = rdd.take(2)
print(result)  # Output: [1, 2]


[1, 2]


In [None]:
# Distribute a small, unsorted list of integers.
numbers = [5, 7, 3, 9, 1]
rdd = sc.parallelize(numbers)

# take(3) returns the first three elements in RDD order -- nothing is sorted.
take_result = rdd.take(3)

print("First 3 elements:", take_result)


# top(n)

In [None]:
# Distribute a small, unsorted list of integers.
numbers = [5, 7, 3, 9, 1]
rdd = sc.parallelize(numbers)

# top(3) returns the three largest elements, in descending order.
top_result = rdd.top(3)

# Show what came back.
print("Top 3 elements:", top_result)

# <a id='toc4_'></a>[first()](#toc0_)



In [6]:
# first(): retrieve only the first element of the RDD.

values = [1, 2, 3, 4]
rdd = sc.parallelize(values)
result = rdd.first()
print(result)  # Output: 1


1


# <a id='toc5_'></a>[takeSample(withReplacement, num, [seed])](#toc0_)



In [7]:
# takeSample() draws a fixed-size random sample and returns it as a list.
# No seed is given here, so the chosen elements vary from run to run.
values = [1, 2, 3, 4, 5]
rdd = sc.parallelize(values)
result = rdd.takeSample(withReplacement=False, num=3)
print(result)  # Output: a random subset of 3 elements


[3, 2, 4]


In [None]:
# takeSample(withReplacement, num, seed) returns a plain Python list (not an
# RDD). Its three arguments are:
#   1. withReplacement - whether an element may be picked more than once
#   2. num             - how many elements to take
#   3. seed            - seed for the random number generator (optional)

# Case 1: sampling WITH replacement. More values are requested (11) than the
# RDD holds (9), so the result necessarily contains duplicate values.

# Named `sample` rather than `rdd` because takeSample returns a list.
sample = sc.parallelize(range(1, 10)).takeSample(True, 11, 23)

sample

In [None]:
# Case 2: sampling WITHOUT replacement. More values are requested (50) than
# the RDD holds (9), so the sample cannot grow past the RDD itself: all 9
# elements come back, each exactly once, in a shuffled order determined by
# the seed (not necessarily the original order).

# Named `sample` rather than `rdd` because takeSample returns a list.
sample = sc.parallelize(range(1, 10)).takeSample(False, 50, 1)

sample

# <a id='toc6_'></a>[takeOrdered(num, [key])](#toc0_)



In [8]:
# takeOrdered(n) returns the n smallest elements in ascending order.
unsorted_values = [5, 2, 3, 1, 4]
rdd = sc.parallelize(unsorted_values)
result = rdd.takeOrdered(3)
print(result)  # Output: [1, 2, 3]


[1, 2, 3]


# <a id='toc7_'></a>[saveAsTextFile(path)](#toc0_)



In [9]:
# saveAsTextFile() writes the RDD as text, one element per line. Despite the
# ".txt" suffix, Spark treats the path as an output DIRECTORY and writes part
# files inside it.
# NOTE(review): re-running this cell is expected to fail if the directory
# already exists -- confirm and delete it first if needed.
output_path = "./data/output/output.txt"
rdd = sc.parallelize([1, 2, 3, 4])
rdd.saveAsTextFile(output_path)


# <a id='toc8_'></a>[saveAsSequenceFile(path)](#toc0_)



In [3]:
# saveAsSequenceFile() needs an RDD of (key, value) pairs and stores them in
# the Hadoop SequenceFile format under the given output path.
pairs = [('a', 1), ('b', 2)]
output_path = "./data/output/output.seq"
rdd = sc.parallelize(pairs)
rdd.saveAsSequenceFile(output_path)


# <a id='toc9_'></a>[countByKey()](#toc0_)



In [5]:
# countByKey() counts how many pairs share each key (the values are ignored)
# and returns the counts to the driver as a dict-like defaultdict.
pairs = [('a', 1), ('b', 2), ('a', 3)]
rdd = sc.parallelize(pairs)
result = rdd.countByKey()
print(result)  # Output: defaultdict(<class 'int'>, {'a': 2, 'b': 1})


defaultdict(<class 'int'>, {'a': 2, 'b': 1})


# <a id='toc10_'></a>[foreach(func)](#toc0_)



In [8]:
# foreach() runs a function on every element purely for its side effects and
# returns nothing.
# NOTE(review): the printing happens in the Spark worker processes, so the
# output may land in the worker/console logs instead of under this cell.
values = [1, 2, 3, 4]
rdd = sc.parallelize(values)
rdd.foreach(print)

# <a id='toc11_'></a>[reduce(func)](#toc0_)



In [7]:
# reduce() combines the elements pairwise with the given binary function;
# with addition this is the sum of the RDD.
values = [1, 2, 3, 4]
rdd = sc.parallelize(values)
result = rdd.reduce(lambda left, right: left + right)
print(result)  # Output: 10


10


# <a id='toc12_'></a>[fold(zeroValue)(func)](#toc0_)



In [8]:
# fold() is like reduce() but starts from an explicit zero value. With 0 as
# the zero value for addition, the result is simply the sum.
values = [1, 2, 3, 4]
rdd = sc.parallelize(values)
result = rdd.fold(0, lambda left, right: left + right)
print(result)  # Output: 10


10


In [None]:
# fold() applies zeroValue once inside every partition and once more when the
# per-partition results are merged, so with addition as the operator:
#     result = zeroValue * (numPartitions + 1) + sum(elements)
# Figures quoted for RDDs created without an explicit partition count assume
# 4 default partitions -- TODO confirm with rdd.getNumPartitions().
#
# Each fold below is called on the RDD created just above it (previously they
# all used a stale `rdd` left over from an earlier cell).

rdd1 = sc.parallelize([1, 2, 3, 4, 5])
total_sum = rdd1.fold(100, lambda acc, x: acc + x)
print(total_sum)  # 100 * 5 + 15 = 515 (with 4 default partitions)

rdd2 = sc.parallelize([1, 2, 3, 4, 5])
total_sum = rdd2.fold(10, lambda acc, x: acc + x)
print(total_sum)  # 10 * 5 + 15 = 65 (with 4 default partitions)

rdd3 = sc.parallelize([1, 2, 3, 4, 5])
total_sum = rdd3.fold(0, lambda acc, x: acc + x)
print(total_sum)  # 0 is neutral for addition, so this is just the sum: 15

rdd4 = sc.parallelize([1, 2, 3, 4, 5], 1)
total_sum = rdd4.fold(10, lambda acc, x: acc + x)
print(total_sum)  # 1 partition: 10 * 2 + 15 = 35

rdd3 = sc.parallelize([1, 2, 3, 4, 5])
total_sum = rdd3.fold(10, lambda acc, x: acc + x)
print(total_sum)  # 10 * 5 + 15 = 65 (with 4 default partitions)

rdd4 = sc.parallelize([1, 2, 3, 4, 5], 4)
total_sum = rdd4.fold(30, lambda acc, x: acc + x)
print(total_sum)  # 4 partitions: 30 * 4 + 15 + 30 = 165


In [None]:
## FOLD ##

"""
•	fold is similar to reduce in that it is an action that aggregates the elements of an RDD into a single result. However, fold differs from reduce in that it allows you to specify an initial value for the accumulation.
•	fold is an action in PySpark that allows you to aggregate the elements of an RDD using a given function. It takes two arguments: a zero value and a binary operator.
"""

# The zero value is applied once inside every partition and once more when
# the per-partition results are merged, so with addition:
#     result = zeroValue * (numPartitions + 1) + sum(elements)
# Figures for RDDs built without an explicit partition count assume 4 default
# partitions -- TODO confirm with rdd.getNumPartitions().


def add(acc, x):
    """Binary operator shared by every fold below."""
    return acc + x


data = [1, 2, 3, 4, 5]

rdd = sc.parallelize(data)
total_sum = rdd.fold(100, add)
print(total_sum)  # 100 * 5 + 15 = 515 (with 4 default partitions)

rdd = sc.parallelize(data)
total_sum = rdd.fold(10, add)
print(total_sum)  # 10 * 5 + 15 = 65 (with 4 default partitions)

rdd = sc.parallelize(data)
total_sum = rdd.fold(0, add)
print(total_sum)  # 0 is neutral for addition, so this is just the sum: 15


rdd4 = sc.parallelize(data, 3)
total_sum = rdd4.fold(10, add)
print(total_sum)  # 3 partitions: 10 * 3 + 15 + 10 = 55


rdd4 = sc.parallelize(data, 4)
total_sum = rdd4.fold(30, add)
print(total_sum)  # 4 partitions: 30 * 4 + 15 + 30 = 165


In [None]:
# Single partition: the zero value is added once inside the partition and
# once more in the final merge.
rdd4 = sc.parallelize([1, 2, 3, 4, 5], numSlices=1)
total_sum = rdd4.fold(10, lambda running, item: running + item)
print(total_sum)  # 10 * 2 + 15 = 35

In [None]:
# Default partitioning: the zero value is added once per partition plus once
# in the final merge. The figure below assumes 4 default partitions -- TODO
# confirm with rdd3.getNumPartitions().
elements = [1, 2, 3, 4, 5]
rdd3 = sc.parallelize(elements)
total_sum = rdd3.fold(10, lambda running, item: running + item)
print(total_sum)  # 10 * 5 + 15 = 65

In [None]:
# Four explicit partitions: the zero value (30) is added once in each of the
# 4 partitions and once more when their results are merged.
rdd4 = sc.parallelize([1, 2, 3, 4, 5], numSlices=4)
total_sum = rdd4.fold(30, lambda running, item: running + item)
print(total_sum)  # 30 * 4 + 15 + 30 = 165


# <a id='toc13_'></a>[aggregate(zeroValue)(seqOp, combOp)](#toc0_)



In [9]:
# aggregate() builds a (running_sum, element_count) pair in one pass.


def seq_op(acc, value):
    """Fold one element into a partition-local (sum, count) accumulator."""
    return (acc[0] + value, acc[1] + 1)


def comb_op(acc1, acc2):
    """Merge the (sum, count) accumulators of two partitions."""
    return (acc1[0] + acc2[0], acc1[1] + acc2[1])


rdd = sc.parallelize([1, 2, 3, 4])
result = rdd.aggregate((0, 0), seq_op, comb_op)
print(result)  # Output: (10, 4)


(10, 4)


# <a id='toc14_'></a>[collectAsMap()](#toc0_)



In [10]:
# collectAsMap() returns the (key, value) pairs of the RDD to the driver as a
# Python dict.
pairs = [('a', 1), ('b', 2)]
rdd = sc.parallelize(pairs)
result = rdd.collectAsMap()
print(result)  # Output: {'a': 1, 'b': 2}


{'a': 1, 'b': 2}


# <a id='toc15_'></a>[lookup(key)](#toc0_)



In [11]:
# lookup(key) returns the list of all values stored under the given key.
pairs = [('a', 1), ('b', 2), ('a', 3)]
rdd = sc.parallelize(pairs)
result = rdd.lookup('a')
print(result)  # Output: [1, 3]


[1, 3]
