In [3]:
import pandas as pd
import string
import numpy as np

def word_split(text):
    """Tokenise ``text`` into words, treating ASCII punctuation as whitespace.

    Every character of ``string.punctuation`` is mapped to a space and the
    result is split on runs of whitespace, so ``"it's"`` becomes
    ``['it', 's']``. Letter case and non-ASCII characters are left untouched.

    Returns a list of word strings (empty when ``text`` has no words).
    """
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    return text.translate(punct_to_space).split()

def word_counts(text):
    """Count how often each word occurs in ``text``.

    The text is tokenised with ``word_split``.

    Returns
    -------
    pandas.DataFrame
        Indexed by word, with a single integer column ``'Counts'``, sorted by
        count in descending order. For a text without any words the frame is
        empty but still has the ``'Counts'`` column.
    """
    from collections import Counter

    counts = Counter(word_split(text))
    # Build the column directly instead of transposing a one-row frame: the
    # 'Counts' column then exists even when there are no words, so the sort
    # below cannot fail with a KeyError on empty input.
    df = pd.DataFrame({'Counts': pd.Series(counts, dtype='int64')})
    return df.sort_values(by='Counts', ascending=False)

def lang_model(text,word_list):
    """Score a word sequence with a bigram model estimated from ``text``.

    The score is

        c(w_1) / N  *  prod_i (1 + c(w_i, w_{i+1})) / (1 + c(w_i))

    where ``c`` are unigram / adjacent-bigram counts in ``text`` (tokenised
    with ``word_split``) and ``N`` is the number of tokens in ``text``.
    The ``1 +`` terms keep an unseen bigram from zeroing the whole product.

    Parameters
    ----------
    text : str
        Training corpus.
    word_list : sequence of str
        Words of the sentence to score (list or tuple).

    Returns
    -------
    numpy.float64
        The score. A first word that never occurs in ``text`` gives 0.0;
        unseen words later in the sequence are treated as having count 0.

    Raises
    ------
    ValueError
        If ``word_list`` is empty or ``text`` contains no words.
    """
    from collections import Counter

    sequence = list(word_list)
    if not sequence:
        raise ValueError('word_list must contain at least one word')
    words = word_split(text)
    if not words:
        raise ValueError('text must contain at least one word')

    unigram_counts = Counter(words)
    # A single pass over the corpus collects every adjacent-pair count, so the
    # corpus is not rescanned once per bigram of the query sentence.
    bigram_counts = Counter(zip(words, words[1:]))

    # Counter yields 0 for unseen keys, so out-of-vocabulary words do not raise.
    # np.float64 keeps the return type the same as a pandas-based count lookup.
    p = np.float64(unigram_counts[sequence[0]]) / len(words)
    for prev_word, next_word in zip(sequence, sequence[1:]):
        cnt = 1 + bigram_counts[(prev_word, next_word)]
        p = p * cnt / (1 + unigram_counts[prev_word])
    return p

def trans_model(text_en,text_sv,lines_en,lines_sv,df_trans):
    """Run one EM iteration of an IBM Model 1-style word translation model.

    For every aligned sentence pair and every Swedish word, the current
    translation weights of the English words in the same sentence are
    normalised into alignment probabilities (E-step). These are accumulated
    as expected counts and then normalised per English word (M-step).
    No NULL word is used.

    Parameters
    ----------
    text_en, text_sv : str
        Texts whose vocabularies (via ``word_counts``) define the rows
        (English) and columns (Swedish) of the returned table.
    lines_en, lines_sv : sequence of str
        Sentence-aligned lines; ``lines_en[k]`` is paired with ``lines_sv[k]``.
        Extra trailing entries in ``lines_en`` are ignored.
    df_trans : pandas.DataFrame
        Current translation table, indexed by English word with Swedish words
        as columns.

    Returns
    -------
    pandas.DataFrame
        Updated table of the same layout. Every row with a non-zero total
        expected count sums to 1; rows with a zero total are NaN.

    Raises
    ------
    ValueError
        If ``lines_en`` has fewer entries than ``lines_sv``.
    KeyError
        If a word in the lines is missing from ``df_trans`` or from the
        vocabularies derived from ``text_en`` / ``text_sv``.
    """
    if len(lines_en) < len(lines_sv):
        raise ValueError('lines_en must have at least as many lines as lines_sv')

    vocab_en = word_counts(text_en).index
    vocab_sv = word_counts(text_sv).index
    # Allocate the accumulators as float tables up front. An empty frame that
    # is filled with 0 afterwards has object dtype, which makes every update
    # and the final division considerably slower.
    df_counts = pd.DataFrame(0.0, index=vocab_en, columns=vocab_sv)
    df_counts_en = pd.DataFrame(0.0, index=vocab_en, columns=['c(e)'])

    for line_en, line_sv in zip(lines_en, lines_sv):
        words_en = word_split(line_en)
        words_sv = word_split(line_sv)
        for sv_word in words_sv:
            # E-step: weights of all English words in this sentence for sv_word.
            weights = [df_trans.at[en_word, sv_word] for en_word in words_en]
            sum_trans = sum(weights)
            if sum_trans == 0:
                # No probability mass to distribute (also covers an empty
                # English line); skip instead of dividing by zero.
                continue
            for en_word, weight in zip(words_en, weights):
                align_prob = weight / sum_trans
                df_counts.at[en_word, sv_word] += align_prob
                df_counts_en.at[en_word, 'c(e)'] += align_prob

    # M-step: normalise expected counts per English word (row-wise).
    return df_counts.div(df_counts_en['c(e)'], axis=0)

def final_translate(df_trans,sentence,text):
    """Translate ``sentence`` word by word and pick the most fluent candidate.

    For each whitespace-separated word of ``sentence`` the three index entries
    of ``df_trans`` with the highest value in that word's column are taken as
    candidate translations. Every combination of candidates is scored with
    ``lang_model(text, combination)`` and the highest-scoring one is returned.

    Parameters
    ----------
    df_trans : pandas.DataFrame
        Translation table; columns are source-language words, the index holds
        target-language words.
    sentence : str
        Source sentence; every word must be a column of ``df_trans``.
    text : str
        Target-language corpus passed to ``lang_model``.

    Returns
    -------
    tuple of str
        The best-scoring combination (first one wins on ties). An empty tuple
        is returned for an empty sentence or an empty translation table.

    Notes
    -----
    The number of combinations is ``3 ** len(words)`` and each one triggers a
    ``lang_model`` call, so this is only practical for short sentences.
    """
    from itertools import product

    word_list = sentence.split()
    if not word_list:
        return ()

    candidates = []
    for word in word_list:
        # The sorted frame is carried over to the next word on purpose: it
        # keeps tie ordering identical to sorting cumulatively.
        df_trans = df_trans.sort_values(by=word,ascending=False)
        candidates.append(df_trans[[word]].head(3).index.values.tolist())

    score = [(option_list, lang_model(text, option_list))
             for option_list in product(*candidates)]
    if not score:
        return ()
    # Return the combination with the highest language-model score, not the
    # one that merely happens to be enumerated last.
    return max(score, key=lambda scored: scored[1])[0]

def final_translate_simplify(df_trans,sentence):
    """Greedy word-by-word translation using only the translation table.

    For each whitespace-separated word of ``sentence``, the index entry of
    ``df_trans`` with the highest value in that word's column is selected.

    Returns a list with one single-element list per input word, e.g.
    ``[['question'], ['is'], ['tough']]``. A word that is not a column of
    ``df_trans`` raises ``KeyError``.
    """
    picks = []
    ranked = df_trans
    for source_word in sentence.split():
        # Each sort starts from the previously sorted frame, exactly as a
        # cumulative re-sort would, so tie ordering is unchanged.
        ranked = ranked.sort_values(by=source_word, ascending=False)
        picks.append(ranked.index[:1].tolist())
    return picks

In [4]:
# Warmup
# Read both sides of the parallel corpus. The files are opened with an explicit
# encoding (the Swedish side contains å/ä/ö; Europarl is distributed as UTF-8)
# and inside `with` blocks so the handles are closed again.
with open('/content/sample_data/europarl-v7.sv-en.lc.en', encoding='utf-8') as f:
    text_en = f.read()
df_en = word_counts(text_en)
print(df_en.head(10).index.values)

with open('/content/sample_data/europarl-v7.sv-en.lc.sv', encoding='utf-8') as f:
    text_sv = f.read()
df_sv = word_counts(text_sv)
print(df_sv.head(10).index.values)

# Total number of English tokens; tokenise the corpus once and reuse the size.
n_words_en = len(word_split(text_en))

# probability of the word 'speaker'
speaker = df_en.loc['speaker','Counts']/n_words_en
print(speaker)

# probability of the word 'zebra'
# Left disabled: `df_en.loc` raises KeyError for a word that is not in the
# vocabulary (presumably 'zebra' does not occur in the corpus - TODO confirm).
# zebra = df_en.loc['zebra','Counts']/len(word_split(text_en))
# print(zebra)

['the' 'of' 'to' 'and' 'in' 'is' 'that' 'a' 'we' 'this']
['att' 'och' 'i' 'det' 'som' 'för' 'av' 'är' 'en' 'vi']
3.890656976337024e-05


In [5]:
# Language modeling
# Interactive demo: read a sentence from the user and score it with the bigram
# model estimated from the full English corpus (`text_en`, loaded in the warmup
# cell). The sentence is split on whitespace only, so punctuation stays
# attached to the words.
# NOTE(review): the corpus file name contains `.lc.`, so it is presumably
# lower-cased and the input should be lower-case too - TODO confirm.
# NOTE(review): `input()` blocks a non-interactive "Restart & Run All".
sentence = input(" please input a sentence:")
word_list = sentence.split()
p = lang_model(text_en,word_list)
print(p)

 please input a sentence:i am going to work
4.1590933067782277e-08


In [6]:
# Translation modeling
# Train on the first 3000 aligned sentence pairs only. Files are read with an
# explicit encoding and closed again by the `with` blocks.
with open('/content/sample_data/europarl-v7.sv-en.lc.en', encoding='utf-8') as f:
    lines_en = f.readlines()
lines_en_partial = lines_en[0:3000]
text_en_partial = ''.join(lines_en_partial)

with open('/content/sample_data/europarl-v7.sv-en.lc.sv', encoding='utf-8') as f:
    lines_sv = f.readlines()
lines_sv_partial = lines_sv[0:3000]
text_sv_partial = ''.join(lines_sv_partial)

# initiate word alignment probability table
# Rows are English words, columns Swedish words. Every entry starts at the same
# value (0.25); building the frame from the scalar gives a float table directly
# instead of an object-dtype frame that is filled afterwards.
df_en = word_counts(text_en_partial)
df_sv = word_counts(text_sv_partial)
df_trans = pd.DataFrame(0.25, index=df_en.index, columns=df_sv.index)

# Number of EM iterations. After each one, print the ten Swedish words with the
# highest translation weight for 'european' to watch the table converge.
t = 5
for i in range(t):
    df_trans = trans_model(text_en_partial,text_sv_partial,lines_en_partial,lines_sv_partial,df_trans)
    df_trans_sort = df_trans.sort_values(axis=1,by='european',ascending=False)
    print(df_trans_sort.loc['european'].index.values[0:10])

['att' 'och' 'europeiska' 'i' 'för' 'en' 'av' 'det' 'den' 'som']
['europeiska' 'att' 'i' 'och' 'för' 'en' 'den' 'av' 'det' 'som']
['europeiska' 'i' 'att' 'den' 'en' 'och' 'europeisk' 'för' 'av' 'unionen']
['europeiska' 'europeisk' 'den' 'i' 'en' 'för' 'att' 'och' 'av' 'unionen']
['europeiska' 'europeisk' 'den' 'i' 'en' 'för' 'att' 'och' 'av' 'unionen']


In [27]:
# Decoding-Simplified method
# Translate a few Swedish sample sentences word by word with the trained
# table, taking the single best-ranked English word for every input word.
for sentence in ('jag går till skolan', 'gott nytt år', 'frågan är hård'):
    print('Input as:',sentence,'\nTranslated into:',final_translate_simplify(df_trans,sentence))

Input as: jag går till skolan 
Translated into: [['i'], ['yesterday'], ['access'], ['school']]
Input as: gott nytt år 
Translated into: [['sound'], ['millennium'], ['year']]
Input as: frågan är hård 
Translated into: [['question'], ['is'], ['tough']]
