In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session against the master URL below.
# Dynamic allocation and the shuffle service are both enabled, the executor
# idle timeout is set to 30s, and each executor is configured with 4 cores.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.87:7077")
    .appName("yuen_ting_cheung")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 4)
    .getOrCreate()
)

# Handle to the session's SparkContext; the RDD calls below go through it.
spark_context = spark_session.sparkContext

## Question A.1

In [2]:
# A.1.1 Read the English transcripts with Spark, and count the number of lines.
english_transcript = spark_context.textFile(
    "hdfs://192.168.2.87:9000/europarl/europarl-v7.sv-en.en"
)
en_count = english_transcript.count()
print("No. of lines of English transcripts: " + str(en_count))

# A.1.2 Do the same with the other language
swedish_transcript = spark_context.textFile(
    "hdfs://192.168.2.87:9000/europarl/europarl-v7.sv-en.sv"
)
sv_count = swedish_transcript.count()
print("No. of lines of Swedish transcripts: " + str(sv_count))

# A.1.3 Verify that the line counts are the same for the two languages.
print(
    "The line counts are the same"
    if en_count == sv_count
    else "The line counts are not the same"
)

# A.1.4 Count the number of partitions (per RDD, then the sum of both).
en_p_count, sv_p_count = (
    english_transcript.getNumPartitions(),
    swedish_transcript.getNumPartitions(),
)
print("No. of partitions of English RDD: " + str(en_p_count))
print("No. of partitions of Swedish RDD: " + str(sv_p_count))
print("No. of partitions of both RDDs: " + str(en_p_count + sv_p_count))

No. of lines of English transcripts: 1862234
No. of lines of Swedish transcripts: 1862234
The line counts are the same
No. of partitions of English RDD: 2
No. of partitions of Swedish RDD: 3
No. of partitions of both RDDs: 5


## Question A.2

In [39]:
# A.2.1 Pre-process the text from both RDDs by doing the following:
#      Lowercase the text
#      Tokenize the text (split on space)

def pre_process(content):
    """Lowercase one transcript line and tokenize it on single spaces.

    Splitting on ' ' produces empty-string tokens for repeated, leading or
    trailing spaces, and turns a blank line into [''] (length 1). Those empty
    tokens are dropped so that a blank line becomes an empty list and '' is
    never treated as a word.

    Args:
        content: One line of text.

    Returns:
        The non-empty lowercase tokens in their original order; [] when the
        line contains no tokens.
    """
    return [token for token in content.lower().split(' ') if token]

# Apply the tokenizer to every line; each RDD element becomes a list of tokens.
english_processed = english_transcript.map(pre_process)
swedish_processed = swedish_transcript.map(pre_process)

# A.2.2 Inspect 10 entries from each of your RDDs to verify your pre-processing.
print(english_processed.take(10))
print(swedish_processed.take(10))

[['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,', 'on', 'behalf', 'of', '

In [40]:
# A.2.3 Verify that the line counts still match after the pre-processing.
en_count_processed = english_processed.count()
print("No. of lines of the processed English transcripts: " + str(en_count_processed))

sv_count_processed = swedish_processed.count()
print("No. of lines of the processed Swedish transcripts: " + str(sv_count_processed))

print(
    "The line counts are the same"
    if en_count_processed == sv_count_processed
    else "The line counts are not the same"
)

No. of lines of the processed English transcripts: 1862234
No. of lines of the processed Swedish transcripts: 1862234
The line counts are the same


## Question A.3

In [85]:
# A.3.1 Use Spark to compute the 10 most frequently occurring words in the English language
# corpus. Repeat for the other language.

from operator import add
# Flatten the per-line token lists into a single stream of words and pair
# every word with an initial count of 1.
en_word_list = (
    english_processed
    .flatMap(lambda tokens: tokens)
    .map(lambda word: (word, 1))
)

# Sum the counts per word.
en_word_count_list = en_word_list.reduceByKey(add)

# Ordering by the negated count returns the most frequent words first.
top10_en_word = en_word_count_list.takeOrdered(10, key=lambda word_count: -word_count[1])
print(top10_en_word)

[('the', 3498375), ('of', 1659758), ('to', 1539760), ('and', 1288401), ('in', 1085993), ('that', 797516), ('a', 773522), ('is', 758050), ('for', 534242), ('we', 522849)]


In [42]:
# Same word count as for English: flatten tokens, pair each with 1, sum per word.
sv_word_list = (
    swedish_processed
    .flatMap(lambda tokens: tokens)
    .map(lambda word: (word, 1))
)

sv_word_count_list = sv_word_list.reduceByKey(add)

# Ordering by the negated count returns the most frequent words first.
top10_sv_word = sv_word_count_list.takeOrdered(10, key=lambda word_count: -word_count[1])
print(top10_sv_word)

[('att', 1706293), ('och', 1344830), ('i', 1050774), ('det', 924866), ('som', 913276), ('för', 908680), ('av', 738068), ('är', 694381), ('en', 620310), ('vi', 539797)]


In [43]:
# A.3.2 Verify that your results are reasonable.
print(top10_en_word)
print(top10_sv_word)
# The results are reasonable because the 10 most frequently used words of both languages are very similar.
# e.g. the Swedish word 'och' corresponds to the English word 'and'; both words appear in the top 10 lists
#      and have similar counts.

[('the', 3498375), ('of', 1659758), ('to', 1539760), ('and', 1288401), ('in', 1085993), ('that', 797516), ('a', 773522), ('is', 758050), ('for', 534242), ('we', 522849)]
[('att', 1706293), ('och', 1344830), ('i', 1050774), ('det', 924866), ('som', 913276), ('för', 908680), ('av', 738068), ('är', 694381), ('en', 620310), ('vi', 539797)]


## Question A.4

In [44]:
# A.4.1 Use this parallel corpus to mine some translations in the form of word pairs, for the two
# languages. We’ll achieve this by looking for pairs of words that frequently occur in the same
# position within lines.

# 1. Key the lines by their line number (hint: ZipWithIndex()).
# zipWithIndex() pairs every tokenized line with its index, giving
# (tokens, line_number) tuples; the index is still in the value position here.
en_1 = english_processed.zipWithIndex()
sv_1 = swedish_processed.zipWithIndex()

# Show the first few entries of each RDD to check the (tokens, line_number) shape.
print(en_1.take(5))
print(sv_1.take(5))

[(['resumption', 'of', 'the', 'session'], 0), (['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], 1), (['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], 2), (['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], 3), (['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,', 'o

In [45]:
# 2. Swap the key and value - so that the line number is the key.
# (tokens, line_number) -> (line_number, tokens)
en_2 = en_1.map(lambda tokens_and_index: (tokens_and_index[1], tokens_and_index[0]))
sv_2 = sv_1.map(lambda tokens_and_index: (tokens_and_index[1], tokens_and_index[0]))

print(en_2.take(5))
print(sv_2.take(5))

[(0, ['resumption', 'of', 'the', 'session']), (1, ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.']), (2, ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.']), (3, ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.']), (4, ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,',

In [50]:
# 3. Join the two RDDs together according to the line number key, so you have pairs of lines with the same line number.
rdd3 = (
    en_2
    .join(sv_2)    # -> (line_number, (english_tokens, swedish_tokens))
    .sortByKey()   # order by line number
)
print(rdd3.take(5))

[(0, (['resumption', 'of', 'the', 'session'], ['återupptagande', 'av', 'sessionen'])), (1, (['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['jag', 'förklarar', 'europaparlamentets', 'session', 'återupptagen', 'efter', 'avbrottet', 'den', '17', 'december.', 'jag', 'vill', 'på', 'nytt', 'önska', 'er', 'ett', 'gott', 'nytt', 'år', 'och', 'jag', 'hoppas', 'att', 'ni', 'haft', 'en', 'trevlig', 'semester.'])), (2, (['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['som', 'ni', 'kunnat',

In [62]:
# 4. Filter to exclude line pairs that have an empty/missing “corresponding” sentence.
# A sentence counts as present only if it has at least one non-empty token.
# A plain length check is not enough: ''.split(' ') returns [''] (length 1),
# so a blank line would otherwise pass as a one-word sentence. any() rejects
# both [''] and [].
rdd4 = rdd3.filter(lambda x: any(x[1][0]) and any(x[1][1]))
print(rdd4.take(5))

[(0, (['resumption', 'of', 'the', 'session'], ['återupptagande', 'av', 'sessionen'])), (1, (['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['jag', 'förklarar', 'europaparlamentets', 'session', 'återupptagen', 'efter', 'avbrottet', 'den', '17', 'december.', 'jag', 'vill', 'på', 'nytt', 'önska', 'er', 'ett', 'gott', 'nytt', 'år', 'och', 'jag', 'hoppas', 'att', 'ni', 'haft', 'en', 'trevlig', 'semester.'])), (2, (['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['som', 'ni', 'kunnat',

In [63]:
# 5. Filter to leave only pairs of sentences with a small number of words per sentence,
# this should give a more reliable translation (you can experiment).
# Keep a pair only when the longer of its two sentences has fewer than 10 tokens.
rdd5 = rdd4.filter(lambda row: max(len(row[1][0]), len(row[1][1])) < 10)
print(rdd5.take(5))

[(0, (['resumption', 'of', 'the', 'session'], ['återupptagande', 'av', 'sessionen'])), (5, (['please', 'rise,', 'then,', 'for', 'this', "minute'", 's', 'silence.'], ['jag', 'ber', 'er', 'resa', 'er', 'för', 'en', 'tyst', 'minut.'])), (6, (['(the', 'house', 'rose', 'and', 'observed', 'a', "minute'", 's', 'silence)'], ['(parlamentet', 'höll', 'en', 'tyst', 'minut.)'])), (7, (['madam', 'president,', 'on', 'a', 'point', 'of', 'order.'], ['fru', 'talman!', 'det', 'gäller', 'en', 'ordningsfråga.'])), (13, (['madam', 'president,', 'on', 'a', 'point', 'of', 'order.'], ['fru', 'talman!', 'det', 'gäller', 'en', 'ordningsfråga.']))]


In [65]:
# 6. Filter to leave only pairs of sentences with the same number of words in each sentence.
# row is (line_number, (english_tokens, swedish_tokens)).
rdd6 = rdd5.filter(lambda row: len(row[1][0]) == len(row[1][1]))

# Number of remaining sentence pairs, followed by a small sample.
print(rdd6.count())
print(rdd6.take(5))

74008
[(50, (['agenda'], ['arbetsplan'])), (96, (['that', 'did', 'not', 'happen.'], ['så', 'blev', 'inte', 'fallet.'])), (98, (['this', 'is', 'an', 'important', 'matter.'], ['detta', 'är', 'en', 'viktig', 'fråga.'])), (141, (['i', 'congratulate', 'him', 'on', 'his', 'excellent', 'report.'], ['jag', 'gratulerar', 'honom', 'till', 'hans', 'utmärkta', 'betänkande.'])), (183, (['the', 'debate', 'is', 'closed.'], ['jag', 'förklarar', 'debatten', 'avslutad.']))]


In [79]:
# 7. For each sentence pair, map to give a list of word pairs (in order) from the two
# sentences. We no longer need the line numbers. (hint: use python’s built in zip() function)
# values() drops the line-number key; zip(*pair) then walks the English and
# Swedish token lists in parallel, producing (english_word, swedish_word) tuples.
rdd7 = rdd6.values().map(lambda sentence_pair: list(zip(*sentence_pair)))
print(rdd7.take(10))

[[('agenda', 'arbetsplan')], [('that', 'så'), ('did', 'blev'), ('not', 'inte'), ('happen.', 'fallet.')], [('this', 'detta'), ('is', 'är'), ('an', 'en'), ('important', 'viktig'), ('matter.', 'fråga.')], [('i', 'jag'), ('congratulate', 'gratulerar'), ('him', 'honom'), ('on', 'till'), ('his', 'hans'), ('excellent', 'utmärkta'), ('report.', 'betänkande.')], [('the', 'jag'), ('debate', 'förklarar'), ('is', 'debatten'), ('closed.', 'avslutad.')], [('the', 'omröstningen'), ('vote', 'kommer'), ('will', 'att'), ('take', 'äga'), ('place', 'rum'), ('tomorrow', 'i'), ('at', 'morgon'), ('12', 'kl.'), ('p.m.', '12.00.')], [('transport', 'transport'), ('of', 'av'), ('dangerous', 'farligt'), ('goods', 'gods'), ('by', 'på'), ('road', 'väg')], [('the', 'jag'), ('debate', 'förklarar'), ('is', 'debatten'), ('closed.', 'avslutad.')], [('the', 'omröstningen'), ('vote', 'kommer'), ('will', 'att'), ('take', 'äga'), ('place', 'rum'), ('tomorrow', 'i'), ('at', 'morgon'), ('12', 'kl.'), ('p.m.', '12.00.')], [('t

In [88]:
# 8. Use reduce to count the number of occurrences of the word-translation-pairs.
# Emit ((english_word, swedish_word), 1) for every word pair of every sentence,
# then sum the ones per distinct pair.
rdd8 = (
    rdd7
    .flatMap(lambda word_pairs: [(word_pair, 1) for word_pair in word_pairs])
    .reduceByKey(add)
)
print(rdd8.take(10))

[(('that', 'så'), 122), (('his', 'hans'), 161), (('excellent', 'utmärkta'), 27), (('closed.', 'avslutad.'), 2980), (('goods', 'gods'), 3), (('by', 'på'), 10), (('not', 'vänta'), 1), (('hope.', 'sig.'), 1), (('to', 'hur'), 4), (('but', 'på'), 1)]


In [89]:
# 9. Print some of the most frequently occurring pairs of words.
# Ordering by the negated count lists the pairs with the highest counts first.
print(rdd8.takeOrdered(10, key=lambda pair_count: -pair_count[1]))

[(('is', 'är'), 10070), (('we', 'vi'), 5539), (('i', 'jag'), 5040), (('this', 'detta'), 3257), (('closed.', 'avslutad.'), 2980), (('and', 'och'), 2926), (('a', 'en'), 2892), (('it', 'det'), 2868), (('that', 'det'), 2807), (('not', 'inte'), 2652)]


In [None]:
# Do your translations seem reasonable?
# Yes, the 10 most frequently occurring word pairs are reasonable translations,
# e.g. ('is', 'är'), ('we', 'vi') and ('and', 'och').

In [90]:
# Release the cores for another application!
# Stops the SparkContext created at the top of the notebook; the RDDs defined
# above cannot be used after this call.
spark_context.stop()