In [None]:
import re
from collections import Counter,defaultdict 		
import numpy as np

words_sv = re.findall(r'\w+', open('/content/europarl-v7.sv-en.lc.sv').read())
words_en = re.findall(r'\w+', open('/content/europarl-v7.sv-en.lc.en').read())


### **(a) Warmup**

In [None]:
words_freq_sv = Counter(words_sv)
words_freq_en = Counter(words_en)
total_words_sv = sum(words_freq_sv.values())
total_words_en = sum(words_freq_en.values())
print("10 most frequent words in English language")
print(words_freq_en.most_common(10))
print("10 most frequent words in Swedish language")
print(words_freq_sv.most_common(10))
print("\nThe probability that randomly picked word is 'speaker'")
print((words_freq_en['speaker']/total_words_en) +( words_freq_sv['speaker']/total_words_sv))
print("The probability that randomly picked word is 'zebra'")
print((words_freq_en['zebra']/total_words_en) +( words_freq_sv['zebra']/total_words_sv))

10 most frequent words in English language
[('the', 19327), ('of', 9344), ('to', 8814), ('and', 6949), ('in', 6124), ('is', 4400), ('that', 4357), ('a', 4271), ('we', 3223), ('this', 3222)]
10 most frequent words in Swedish language
[('att', 9181), ('och', 7038), ('i', 5954), ('det', 5687), ('som', 5028), ('för', 4959), ('av', 4013), ('är', 3840), ('en', 3724), ('vi', 3211)]

The probability that randomly picked word is 'speaker'
3.890702388502196e-05
The probability that randomly picked word is 'zebra'
0.0


### **(b) Language modeling**

In [None]:
def cal_prob_bigram(bigram_freq,words_freq):
  prob = {}
  for bigram in bigram_freq.items():
    prob[bigram[0]] = bigram[1]/words_freq[bigram[0][0]]
  return prob

def cal_prob_sentence(input_string,bigram_prob ):
  list_words=input_string.split(' ')
  bigram_list = [(list_words[i],list_words[i+1]) for i in range(len(list_words)-1)]
  prob_sentence = 1
  count_bigrams_common = 0
  for bi in bigram_list:
    if bi in bigram_prob:
      count_bigrams_common+=1
      prob_sentence *= bigram_prob[bi]
  if(count_bigrams_common==0):
    prob_sentence = 0
  return prob_sentence

In [None]:
data =  open('/content/europarl-v7.sv-en.lc.en').read().split(' ')
bigram_freq = Counter((data[idx],data[idx+1]) for idx in range(len(data) - 1)) 
words_freq = Counter(data)
bigram_prob = cal_prob_bigram(bigram_freq, words_freq)

In [None]:
input1 ="sweety if you  wish"
input2 = "sweety thank you"
input3 = "at the request of a french member , mr zimeray , a petition has already been presented , which many people signed , including myself . however , i would ask you , in accordance with the line which is now constantly followed by the european parliament and by the whole of the european community , to make representations , using the weight of your prestigious office and the institution you represent , to the president and to the governor of texas , mr bush , who has the power to order a stay of execution and to reprieve the condemned person .this is all in accordance with the principles that we have always upheld ."
input4 = "I like zebra"

print("Probabilities of different sentences: ")
print(input1)
print(cal_prob_sentence(input1,bigram_prob))
print('\n')
print(input2)
print(cal_prob_sentence(input2,bigram_prob))
print('\n')
print(input3)
print(cal_prob_sentence(input3,bigram_prob))
print('\n')
print(input4)
print(cal_prob_sentence(input4,bigram_prob))
print('\n')

Probabilities of different sentences: 
sweety if you  wish
0.08854166666666667


sweety thank you
0.20512820512820512


at the request of a french member , mr zimeray , a petition has already been presented , which many people signed , including myself . however , i would ask you , in accordance with the line which is now constantly followed by the european parliament and by the whole of the european community , to make representations , using the weight of your prestigious office and the institution you represent , to the president and to the governor of texas , mr bush , who has the power to order a stay of execution and to reprieve the condemned person .this is all in accordance with the principles that we have always upheld .
8.52526689177826e-189


I like zebra
0




### **(c) Translation modeling**

In [None]:
eng = open('/content/europarl-v7.sv-en.lc.en').read()
sve = open('/content/europarl-v7.sv-en.lc.sv').read()

eng_sentences = eng.split('\n')[:10000]
#eng_sentences.pop()
sve_sentences = sve.split('\n')[:10000]
#sve_sentences.pop()

In [None]:
t  =  defaultdict ( float )

for i in range(len(eng_sentences)):
  e = eng_sentences[i]
  s = sve_sentences[i]
  eng_words = list(set(e.split()))
  sve_words = list(set(s.split()))
  eng_words.append("NULL")
  for each_s in sve_words:
    for each_e in eng_words:
      t [( each_s, each_e  )] =  1.0 



In [None]:
def get_top(t,word,count):
  dict_ = dict( filter(lambda elem: elem[0][1] == word, t.items()))
  dict_sorted = dict(sorted(dict_.items(), key=lambda item: item[1],reverse=True))
  return(list(dict_sorted.items())[:count])

num_iterations = 5
for ite in range(num_iterations):
  count  =  defaultdict ( float )
  total_e  =  defaultdict ( float )
  total_s  =  defaultdict ( float )
  for i in range(len(eng_sentences)):
    e = eng_sentences[i].split(' ')
    s = sve_sentences[i].split(' ')
    
    for s_word in s:
      #print(s_word)
      total_s [ s_word ] =  0.0
      for e_word in e:
        total_s [ s_word ] +=  t [(s_word  , e_word )]

    for s_word in s:
      for e_word in e:
        sigma = t [(s_word  , e_word )]/total_s[ s_word ] # Compute alignment prob.
        count[(e_word  , s_word )] += sigma # Update pseudocount
        total_e[e_word]+= sigma # Update pseudocount



In [None]:
t

defaultdict(float,
            {('den', 'december'): 1.0,
             ('den', 'wish'): 1.0,
             ('den', 'session'): 1.0,
             ('den', 'a'): 1.0,
             ('den', 'period'): 1.0,
             ('den', 'once'): 1.0,
             ('den', 'year'): 1.0,
             ('den', 'would'): 1.0,
             ('den', 'festive'): 1.0,
             ('den', '17'): 1.0,
             ('den', '1999'): 1.0,
             ('den', 'parliament'): 1.0,
             ('den', 'new'): 1.0,
             ('den', 'to'): 1.0,
             ('den', 'hope'): 1.0,
             ('den', 'on'): 1.0,
             ('den', 'and'): 1.0,
             ('den', 'i'): 1.0,
             ('den', ','): 1.0,
             ('den', 'like'): 1.0,
             ('den', 'friday'): 1.0,
             ('den', 'european'): 1.0,
             ('den', 'of'): 1.0,
             ('den', 'in'): 1.0,
             ('den', 'resumed'): 1.0,
             ('den', 'adjourned'): 1.0,
             ('den', 'happy'): 1.0,
             ('den', 'a

### **(d) Decoding**

In [None]:
input = "thank you"
list_input = input.split(" ")
output = []
for l in list_input:
  sve_word = get_top(t,l,1)
  output.append(sve_word[0][0][0])
print(" ".join(output))

skall den
