In [1]:
from pyspark.sql import SparkSession

# Cluster size: 5 machines x (8 cores, 16gb each) = 40 cores in total

# New API: the SparkSession entry point.
# Dynamic allocation is switched on together with the shuffle service, with a
# 30s executor idle timeout and 4 cores requested per executor.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.1.153:7077")
    .appName("martin_luther_king")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 4)
    .getOrCreate()
)

# Old API (RDD): the SparkContext owned by the session above
spark_context = spark_session.sparkContext

In [2]:
def split_line(line):
    """Break ``line`` apart at every newline character and return the pieces as a list."""
    pieces = line.split('\n')
    return pieces


# A.1.1 - read the English transcripts and count their lines.
# count() is an action: every call runs a full job over the file, so each
# result is stored once and reused for the comparison in A.1.3.
linesEN = spark_context.textFile("hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en.en")
line_count_en = linesEN.count()
print(line_count_en)

# A.1.2 - same for the Swedish transcripts
linesSWE = spark_context.textFile("hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en.sv")
line_count_sv = linesSWE.count()
print(line_count_sv)

# A.1.3 - compare the stored counts instead of re-scanning both files
if line_count_sv == line_count_en:
    print("Yes they match :)")

# A.1.4
# English partitions
print(linesEN.getNumPartitions())

# Swedish partitions
print(linesSWE.getNumPartitions())

1862234
1862234
Yes they match :)
2
3


In [3]:
# A.2.1
# lower case function 
def lower_case(line):
    """Return a new RDD in which every element of ``line`` has been lower-cased.

    Despite its name, ``line`` is a whole RDD of strings (any object exposing
    ``map`` works), not a single string.
    """
    return line.map(lambda text: text.lower())

# lower-cased copies of both corpora: English first, then Swedish
lower_english, lower_swedish = map(lower_case, (linesEN, linesSWE))

# split space function 
def split_space(line):
    """Tokenise ``line`` on single space characters and return the list of tokens.

    The separator is given explicitly, so consecutive spaces produce empty
    strings and an empty line yields ``['']`` rather than ``[]``.
    """
    return line.split(' ')

# tokenise every sentence: one list of words per line, for each language
split_english = lower_english.map(split_space)
split_swedish = lower_swedish.map(split_space)

# A.2.2 - inspect the first ten tokenised sentences of each corpus
print(split_english.take(10))
print(split_swedish.take(10))

# A.2.3 - the two tokenised corpora should still have the same number of lines
if split_english.count() == split_swedish.count():
    print("Yes they match :)")


[['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,', 'on', 'behalf', 'of', '

In [4]:
# A.3.1
def word_count(line, n=10):
    """Return the ``n`` most frequent elements of an RDD together with their counts.

    Args:
        line: RDD of hashable elements (here: individual words). Despite the
            name this is a whole RDD, not a single line of text.
        n: number of (element, count) pairs to return. Defaults to 10, the
            value that used to be hard-coded.

    Returns:
        A plain Python list of ``(element, count)`` tuples, ordered by
        descending count.
    """
    counts = line.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
    # takeOrdered returns the smallest elements under the key, so the count is
    # negated to get the largest counts first.
    return counts.takeOrdered(n, key=lambda pair: -pair[1])


# Flatten the lower-cased sentences into one RDD of individual words per
# language, tokenising with split_space exactly as in A.2.
english_corpus = lower_english.flatMap(split_space)
swedish_corpus = lower_swedish.flatMap(split_space)

# Ten most frequent English words
most_freq_english = word_count(english_corpus)
print(most_freq_english)

# Ten most frequent Swedish words
most_freq_swedish = word_count(swedish_corpus)
print(most_freq_swedish)

# A.3.2
print("")
print("Considering the number of lines, yes the word counts do seem reasonable")





[('the', 3498375), ('of', 1659758), ('to', 1539760), ('and', 1288401), ('in', 1085993), ('that', 797516), ('a', 773522), ('is', 758050), ('for', 534242), ('we', 522849)]
[('att', 1706293), ('och', 1344830), ('i', 1050774), ('det', 924866), ('som', 913276), ('för', 908680), ('av', 738068), ('är', 694381), ('en', 620310), ('vi', 539797)]

Considering the number of lines, yes the word counts do seem reasonable


In [67]:
# A.4.1

# Steps 1-2: number every tokenised sentence (RDDs from A.2), then flip each
# (sentence, index) pair so that the line number becomes the key.
en_1 = split_english.zipWithIndex()
sv_1 = split_swedish.zipWithIndex()
en_2 = en_1.map(lambda indexed: indexed[::-1])
sv_2 = sv_1.map(lambda indexed: indexed[::-1])

# Step 3: join on the line number -> (line_number, (english_words, swedish_words))
en_sw_3 = en_2.join(sv_2)

# 4: filter to exclude line pairs that have an empty/ missing sentence
def if_empty(x):
    """Filter predicate: keep a sentence pair only if both sentences contain a word.

    Args:
        x: ``(line_number, (english_tokens, swedish_tokens))`` as produced by
            joining the two indexed, tokenised corpora.

    Returns:
        ``x`` itself (truthy) when both token lists hold at least one
        non-empty token, otherwise ``None`` (falsy), so ``filter`` drops it.
    """
    english_tokens = x[1][0]
    swedish_tokens = x[1][1]
    # The tokens come from str.split(' '), which turns an empty line into
    # [''] - a list of length 1. A plain length check therefore never rejects
    # an empty sentence; test for at least one non-empty token instead.
    if any(english_tokens) and any(swedish_tokens):
        return x
    return None

# filter() keeps only the joined pairs for which if_empty returns a truthy value
en_sw_4 = en_sw_3.filter(if_empty)

# 5: filter to leave only pairs of sentences with a small number of words
def num_words(x):
    """Filter predicate: accept a pair only when both sentences have fewer than 10 tokens.

    ``x`` is ``(line_number, (english_tokens, swedish_tokens))``. Returns ``x``
    when the pair is accepted and ``None`` otherwise.
    """
    english_tokens, swedish_tokens = x[1][0], x[1][1]
    if len(swedish_tokens) >= 10 or len(english_tokens) >= 10:
        return None
    return x
    
# keep only the pairs num_words accepts (it returns None, i.e. falsy, for the rest)
en_sw_5 = en_sw_4.filter(num_words)

# 6: filter to leave only pairs of sentences with the same number of words in each sentence
def same_num(x):
    """Filter predicate: accept a pair only when both sentences have the same token count.

    ``x`` is ``(line_number, (english_tokens, swedish_tokens))``. Returns ``x``
    when the lengths agree and ``None`` otherwise.
    """
    english_tokens, swedish_tokens = x[1][0], x[1][1]
    if len(swedish_tokens) != len(english_tokens):
        return None
    return x
    
# keep only the pairs same_num accepts (it returns None, i.e. falsy, for the rest)
en_sw_6 = en_sw_5.filter(same_num)



# 7: for each sentence pair, pair up the words of the two sentences in order (word i with word i) --> use zip
def zipped(x):
    """Pair the words of the two sentences position by position.

    ``x`` is ``(line_number, (english_tokens, swedish_tokens))``. Returns a
    list of ``(english_word, swedish_word)`` tuples, cut off at the shorter
    sentence.
    """
    english_tokens, swedish_tokens = x[1][0], x[1][1]
    return [(english, swedish) for english, swedish in zip(english_tokens, swedish_tokens)]

# flatMap merges the per-sentence lists returned by zipped into one RDD of (english_word, swedish_word) tuples
en_sw_7 = en_sw_6.flatMap(zipped)

# 8: use reduce to count the number of occurrences of the word-translation-pairs
def tuple_count(line):
    """Count how many times each element occurs in the RDD ``line``.

    Returns a new RDD of ``(element, occurrences)`` pairs; nothing is
    collected to the driver here.
    """
    keyed = line.map(lambda pair: (pair, 1))
    return keyed.reduceByKey(lambda total, extra: total + extra)

# count how often each (english_word, swedish_word) pair occurs
en_sw_8 = tuple_count(en_sw_7)

# 9: print some of the most frequently occurring pairs of words
# (takeOrdered returns the smallest keys, so the count is negated)
en_sw_9 = en_sw_8.takeOrdered(10, key=lambda pair_count: -pair_count[1])
print(en_sw_9)

# yes, the majority of the translations seem reasonable

[(('is', 'är'), 10040), (('we', 'vi'), 5530), (('i', 'jag'), 5020), (('this', 'detta'), 3252), (('closed.', 'avslutad.'), 2964), (('and', 'och'), 2917), (('a', 'en'), 2888), (('it', 'det'), 2866), (('that', 'det'), 2806), (('not', 'inte'), 2650)]


In [68]:
# NOTE(review): scratch cell - only prints a fixed string and touches no Spark
# state; looks like a leftover sanity check that could be deleted.
print("hello world")

hello world


In [None]:
# release the cores for another application!
# spark_context.stop()