# Algorithm Examples in Python vs PySpark
The objective is to help students understand the differences between implementing simple algorithms in plain Python and in PySpark.

In [1]:
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SparkSession
# Build (or reuse) a local-mode SparkSession and take the SparkContext from it.
# Constructing SparkContext("local") directly raises
# "ValueError: Cannot run multiple SparkContexts at once" when this cell is
# executed a second time in the same kernel; getOrCreate() returns the
# already-running session instead, so the cell is safe to re-run.
spark = SparkSession.builder.master("local").getOrCreate()
sc = spark.sparkContext

# Sum of elements in a list

In [3]:
# Python code
# Accumulate a running total by visiting every element once.
arr = [1, 2, 3, 4, 5]
result = 0
for value in arr:
    result = result + value
print(result)

15


In [4]:
# PySpark code
def add(left, right):
    """Return the sum of two partial results."""
    return left + right


# Distribute the list as an RDD, then combine its elements pairwise
# until a single total remains.
rdd = sc.parallelize([1, 2, 3, 4, 5])
result = rdd.reduce(add)
print(result)

15


# Word Count

In [5]:
# Python code
# Tally how many times each whitespace-separated word occurs.
text = "hello world hello"
word_dict = {}
for token in text.split():
    if token in word_dict:
        word_dict[token] += 1
    else:
        # First time this word is seen.
        word_dict[token] = 1
print(word_dict)


{'hello': 2, 'world': 1}


In [6]:
# PySpark code
# A one-line "document" distributed as an RDD of lines.
lines = sc.parallelize(["hello world hello"])

# Step 1: split every line into individual words.
words = lines.flatMap(lambda line: line.split(" "))
# Step 2: pair each word with an initial count of 1.
pairs = words.map(lambda word: (word, 1))
# Step 3: add up the counts that share the same word.
counts = pairs.reduceByKey(lambda a, b: a + b)
# Step 4: bring the (word, count) pairs back to the driver.
result = counts.collect()

print(dict(result))

{'hello': 2, 'world': 1}


# Finding Maximum Element

In [7]:
# Python code
# Start with the first element as the best candidate, then scan the rest
# and keep whichever value is larger.
arr = [2, 4, 1, 8, 5]
max_val = arr[0]
for candidate in arr[1:]:
    if candidate > max_val:
        max_val = candidate
print(max_val)

8


In [8]:
# PySpark code
def larger(a, b):
    """Return a when it is strictly greater than b, otherwise b."""
    if a > b:
        return a
    return b


# reduce() applies larger() pairwise until only the maximum is left.
rdd = sc.parallelize([2, 4, 1, 8, 5])
result = rdd.reduce(larger)
print(result)

8


# Counting Occurrences of a Specific Element

In [9]:
# Python code
# Count how many elements of the list are equal to the target value.
arr = [2, 3, 4, 2, 8, 2]
target = 2

result = 0
for value in arr:
    if value == target:
        result = result + 1
print(result)

3


In [10]:
# PySpark code
target = 2
rdd = sc.parallelize([2, 3, 4, 2, 8, 2])
# Keep only the elements equal to the target, then count what is left.
matches = rdd.filter(lambda value: value == target)
result = matches.count()
print(result)

3


# Finding Prime Numbers in a Range

In [12]:
# Python code

def is_prime(n):
    """Return True if ``n`` is a prime number, otherwise False.

    Uses trial division: ``n`` is prime when it is greater than 1 and no
    integer ``d`` with ``d * d <= n`` divides it evenly.

    Args:
        n: The number to test.

    Returns:
        bool: True when ``n`` is prime, False otherwise (including every
        value less than or equal to 1).
    """
    if n <= 1:
        return False
    divisor = 2
    # Bound the search with divisor * divisor <= n rather than n ** 0.5:
    # the power converts n to a float, which raises OverflowError for very
    # large integers and can round for integers above 2**53. Integer
    # multiplication is exact for any size.
    while divisor * divisor <= n:
        if n % divisor == 0:
            return False
        divisor += 1
    return True

# Inclusive bounds of the interval to search.
range_start = 2
range_end = 30

# range() excludes its stop value, hence the + 1 to include range_end.
prime_numbers = [
    candidate
    for candidate in range(range_start, range_end + 1)
    if is_prime(candidate)
]
print(prime_numbers)

[2, 3, 5, 7, 11, 13, 17, 19, 23, 29]


In [14]:
# PySpark code
# Inclusive bounds of the interval to search.
range_start = 2
range_end = 30
# range() excludes its stop value, so add 1 to keep range_end inclusive,
# matching the bounds used by the plain-Python version of this example.
rdd = sc.parallelize(range(range_start, range_end + 1))
# filter() keeps the numbers for which is_prime returns True;
# collect() brings the surviving numbers back to the driver as a list.
prime_numbers = rdd.filter(is_prime).collect()
print(prime_numbers)

[2, 3, 5, 7, 11, 13, 17, 19, 23, 29]


# Histogram Generation

In [15]:
# Python code
# Build a value -> number-of-occurrences mapping in a single pass.
arr = [1, 1, 1, 2, 2, 3]
result = {}
for value in arr:
    if value not in result:
        # First occurrence: start this value's bucket at zero.
        result[value] = 0
    result[value] += 1
print(result)

{1: 3, 2: 2, 3: 1}


In [16]:
# PySpark code
values = [1, 1, 1, 2, 2, 3]
rdd = sc.parallelize(values)
# countByValue() tallies each distinct element and returns the counts
# to the driver as a dictionary-like object.
result = rdd.countByValue()
print(result)

defaultdict(<class 'int'>, {1: 3, 2: 2, 3: 1})
