<a href="https://colab.research.google.com/github/vagzikopis/SparkWordCount/blob/main/WordCount.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Initialize the Spark Session
from pyspark.sql import SparkSession
from pyspark import SparkContext

# Build the session first so the application name is actually applied: when a
# SparkContext already exists, the builder reuses it and the appName passed
# here does not rename that running context.
spark = SparkSession.builder.appName("SherlockAssignment").getOrCreate()
# Reuse the session's own context for the RDD API instead of creating one
# independently.
sc = spark.sparkContext

In [10]:
import re

# Read the book as an RDD with one element per line of the file.
# The path below is where the file lives inside the Colab runtime.
text_rdd = sc.textFile(name="/content/SherlockHolmes.txt")

def process_words(line):
    """Split a line of text into lowercase words with punctuation removed.

    The line is lowercased, every character that is neither a word character
    nor whitespace is deleted, and the remainder is split on whitespace.
    Underscores are deleted as well: ``\\w`` treats ``_`` as a word character,
    so plain-text emphasis such as ``_the_`` would otherwise survive as
    ``_the_`` instead of ``the``.

    Args:
        line: A single line of raw text.

    Returns:
        A list of cleaned words; empty when the line holds no word characters.
    """
    # Clean and lowercase ("|_" also removes underscores, which \w would keep).
    clean_line = re.sub(r'[^\w\s]|_', '', line.lower())
    # split() without arguments never yields empty strings.
    return clean_line.split()

# Compute the average word length per starting letter:
# 1. flatMap: split every line into cleaned, lowercased words (process_words)
#    and flatten the per-line lists into one RDD of words.
# 2. filter: keep only words whose first character is in a-z. An explicit
#    range check is used because str.isalpha() also accepts letters outside
#    a-z (e.g. "é"); slicing with [:1] yields "" for an empty string, which
#    fails the comparison, so empty strings are dropped too.
# 3. map: emit (first letter, (word length, 1)) for each word.
# 4. reduceByKey: per letter, sum the word lengths and the word counts.
# 5. mapValues: divide total length by word count to get the average.
# 6. sortBy: order letters by average length, descending.
# 7. collect: action that returns the result as a local Python list.
results = (
    text_rdd.flatMap(process_words)
    .filter(lambda word: "a" <= word[:1] <= "z")
    .map(lambda word: (word[0], (len(word), 1)))
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
    .mapValues(lambda x: x[0] / x[1])
    .sortBy(lambda x: x[1], ascending=False)
    .collect()
)

# Emit one "<letter> <average>" line per entry, average rounded to one decimal
for entry in results:
    char, avg = entry
    print(f"{char} {avg:.1f}")

c 7.2
e 7.1
q 7.0
p 7.0
r 6.9
d 6.4
v 6.0
g 5.9
s 5.8
z 5.7
u 5.6
j 5.6
k 5.4
l 5.3
f 5.2
m 5.1
n 4.8
b 4.5
w 4.3
h 3.8
a 3.7
y 3.7
t 3.6
i 3.5
x 3.4
o 3.0
