# spark - word counts

## 1. set up a spark application

In [18]:
import findspark
# init() puts the local Spark installation on sys.path, so it has to run
# before pyspark is imported.
findspark.init()
import pyspark
# Last expression of the cell: displays the Spark home that was located.
findspark.find()

'C:\\Program Files\\spark-3.3.0-bin-hadoop3\\'

In [20]:
from pyspark import SparkContext
import nltk
#nltk.download('stopwords')

# getOrCreate() keeps this cell safe to re-run: constructing a second
# SparkContext while one is still alive raises an error. When a context
# already exists it is returned as-is and `conf` is not applied to it.
conf = pyspark.SparkConf().setAppName('word_comparison').setMaster('local[4]')
sc = SparkContext.getOrCreate(conf=conf)

 ## 2. Load data

In [21]:
# Lazily-built frozenset of the NLTK English stop words (None until first use).
_ENGLISH_STOP_WORDS = None


def filter_stop_words(word, *, stop_words=None):
    """Return True when *word* is not a stop word.

    By default the NLTK English stop-word list is used. It is loaded on the
    first call and kept as a frozenset, so later calls do a constant-time
    lookup instead of re-reading the corpus and scanning a list per word.

    Args:
        word: Token to test; compared as-is (no case folding is done here).
        stop_words: Optional container of words to reject instead of the
            NLTK English list. When given, NLTK is not touched.

    Returns:
        bool: False if the word is a stop word, True otherwise.
    """
    global _ENGLISH_STOP_WORDS

    if stop_words is not None:
        return word not in stop_words

    if _ENGLISH_STOP_WORDS is None:
        # Imported here so the function stays self-contained when it is
        # shipped to Spark workers.
        from nltk.corpus import stopwords
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))

    return word not in _ENGLISH_STOP_WORDS

def load_text(text_path):
    """Build the relative word-frequency table of a text file.

    Every line is lower-cased and cut into words on whitespace and on the
    characters . , ! ? ' and ". Empty fragments and English stop words
    ('I', 'you', 'a', 'the', ...) are dropped. Uses the module-level
    SparkContext `sc`.

    Args:
        text_path: Path of the text file, as accepted by `sc.textFile`.

    Returns:
        RDD of (word, frequency) pairs, where
        frequency = occurrences of the word / number of words kept.
    """
    # Turning the punctuation marks into spaces lets a single whitespace
    # split do the whole tokenisation; str.split() with no argument never
    # yields empty strings, so no separate "non-empty" filter is needed.
    punctuation_to_space = str.maketrans({mark: " " for mark in ".,!?'\""})

    vocabulary = (
        sc.textFile(text_path, minPartitions=4)
        .flatMap(lambda line: line.lower().translate(punctuation_to_space).split())
        .filter(filter_stop_words)
    )

    # Per-word occurrence counts. Cached because they feed both the total
    # below and the returned frequencies; without it the text would be read
    # and filtered again for each of them.
    occurrences = (
        vocabulary.map(lambda word: (word, 1))
        .reduceByKey(lambda count1, count2: count1 + count2)
        .cache()
    )

    # Total number of words kept = sum of all per-word counts.
    word_count = occurrences.values().sum()

    # The lambda only runs when there is at least one word, so word_count
    # is never zero at the point of division.
    word_freq = occurrences.mapValues(lambda count: count / float(word_count))

    return word_freq

# Frequency tables of the two books, built in the same order as before:
# the Iliad first, then the Odyssey.
iliad, odyssey = (
    load_text(book_path) for book_path in ('iliad.mb.txt', 'odyssey.mb.txt')
)

## 3. counting

In [22]:
# Combine both frequency tables with a full outer join keyed by word, then
# reduce every entry to (word, odyssey frequency - iliad frequency).
def _frequency_shift(entry):
    # entry is (word, (iliad frequency, odyssey frequency)); a word absent
    # from one book has None on that side after the join, hence `or 0`.
    word, (iliad_freq, odyssey_freq) = entry
    return (word, (odyssey_freq or 0) - (iliad_freq or 0))


joined = iliad.fullOuterJoin(odyssey)
join_words = joined.map(_frequency_shift)
join_words.count()

11824

In [23]:
# Peek at the first three (word, frequency difference) pairs.
join_words.take(3)

[('classics', 1.0272212176025145e-05),
 ('translated', 5.136106088012572e-06),
 ('book', 0.00012326654611230172)]

In [24]:
# Each element of join_words is (word, difference of word frequencies).
def _shift(pair):
    return pair[1]


# The 10 largest increases in frequency in the sequel (ordered by negated
# difference, so the biggest gain comes first).
emerging_words = join_words.takeOrdered(10, key=lambda pair: -_shift(pair))
# The 10 largest decreases in frequency in the sequel (most negative first).
disappearing_words = join_words.takeOrdered(10, key=_shift)

In [25]:
# The 10 words whose frequency rises the most from the Iliad to the Odyssey;
# the difference is scaled by 10000 to keep the printed numbers readable.
for term, shift in emerging_words:
    print(f"{shift * 10000:.2f}", term)

92.52 ulysses
53.63 house
48.33 telemachus
43.06 suitors
36.68 tell
33.47 ship
33.35 one
31.94 home
26.73 said
25.97 got


In [26]:
# The 10 words whose frequency drops the most from the Iliad to the Odyssey,
# printed from the smallest drop to the largest (the list is walked backwards).
for term, shift in reversed(disappearing_words):
    print(f"{shift * 10000:.2f}", term)

-28.72 jove
-31.46 horses
-40.66 fight
-44.56 spear
-47.24 ships
-54.71 achilles
-61.74 achaeans
-65.52 hector
-72.71 trojans
-89.71 son
