In [6]:
import nltk
from nltk.tokenize import word_tokenize
import re
from collections import defaultdict
import math

In [4]:
DEVELOPMENT_DOCS = 'dataset/devel.docs'
DEVELOPMENT_QUERIES = 'dataset/devel.queries'
DEVELOPMENT_QREL = 'dataset/devel.qrel'
BITEXT_ENG = 'dataset/bitext.en'
BITEXT_DE = 'dataset/bitext.de'
NEWSTEST_ENG = 'dataset/newstest.en'

In [2]:
stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = nltk.stem.PorterStemmer() 

def tokenize(line, tokenizer=word_tokenize):
    utf_line = line.lower()
    return [token for token in tokenizer(utf_line)]

def extract_and_tokenize_terms(doc):
    terms = []
    for token in tokenize(doc):
        if token not in stopwords:
            if not re.search(r'\d',token) and not re.search(r'[^A-Za-z-]',token): 
                terms.append(stemmer.stem(token.lower()))
    return terms

documents = {}
f = open(DEVELOPMENT_DOCS)
for line in f:
    doc = line.split("\t")
    terms = extract_and_tokenize_terms(doc[1])
    documents[doc[0]] = terms
f.close()

In [5]:
inverted_index = defaultdict(set)

for docid, terms in documents.items():
    for term in terms:
        inverted_index[term].add(docid)

In [9]:
NO_DOCS = len(documents)
AVG_LEN_DOC = sum([len(doc) for doc in documents.values()])/len(documents)

def tf_idf_score(k1,b,term,docid):  
    ft = len(inverted_index[term]) 
    term = stemmer.stem(term.lower())
    fdt =  documents[docid].count(term)
    idf_comp = math.log((NO_DOCS - ft + 0.5)/(ft+0.5))
    tf_comp = ((k1 + 1)*fdt)/(k1*((1-b) + b*(len(documents[docid])/AVG_LEN_DOC))+fdt)
    return idf_comp * tf_comp

def create_tf_idf(k1,b):
    tf_idf = defaultdict(dict)
    for term in set(inverted_index.keys()):
        for docid in inverted_index[term]:
            tf_idf[term][docid] = tf_idf_score(k1,b,term,docid)
    return tf_idf

tf_idf = create_tf_idf(1.5,0.5)

In [15]:
def get_qtf_comp(k3,term,fqt):
    return ((k3+1)*fqt[term])/(k3 + fqt[term])

def retr_docs(query,result_count):
    q_terms = [stemmer.stem(term.lower()) for term in query.split() if term not in stopwords]
    fqt = {}
    for term in q_terms:
        fqt[term] = fqt.get(term,0) + 1
    scores = {}
    for word in fqt.keys():
        for document in inverted_index[word]:
            scores[document] = scores.get(document,0) + (tf_idf[word][document]*get_qtf_comp(0,word,fqt))
    return sorted(scores.items(),key = lambda x : x[1] , reverse=True)[:result_count]  

retr_docs("Berlin",5)

[('197203', 9.582576604808242),
 ('305807', 9.386382888517126),
 ('3354', 9.355859503705103),
 ('315275', 9.262398784650085),
 ('237254', 9.170402811746479)]

In [16]:
f = open(BITEXT_ENG)
train_sentences = []
for line in f:
    train_sentences.append(tokenize(line))
f.close()    

def check_for_unk_train(word,unigram_counts):
    if word in unigram_counts:
        return word
    else:
        unigram_counts[word] = 0
        return "UNK"

def convert_sentence_train(sentence,unigram_counts):
    return ["<s1>"] + ["<s2>"] + [check_for_unk_train(token.lower(),unigram_counts) for token in sentence] + ["</s2>"]+ ["</s1>"]

def get_counts(sentences):
    unigram_counts = {}
    for sentence in sentences:
        sentence = convert_sentence_train(sentence, unigram_counts)
        for i in range(len(sentence) - 2):
            unigram_counts[sentence[i]] = unigram_counts.get(sentence[i],0) + 1
    unigram_counts["</s1>"] = unigram_counts["<s1>"]
    unigram_counts["</s2>"] = unigram_counts["<s2>"]
    return unigram_counts

In [17]:
unigram_counts = get_counts(train_sentences)

In [55]:
unigram_counts

{'$': 170,
 '10,000': 9,
 'gold': 23,
 '?': 1390,
 '<s1>': 21750,
 '<s2>': 21750,
 'UNK': 20865,
 'san': 15,
 'francisco': 8,
 '–': 1105,
 'it': 2385,
 'has': 1488,
 'never': 138,
 'been': 673,
 'easy': 72,
 'to': 6281,
 'have': 1351,
 'a': 5782,
 'rational': 7,
 'conversation': 4,
 'about': 473,
 'the': 18147,
 'value': 46,
 'of': 8389,
 '.': 18225,
 'wouldn': 7,
 '’': 3098,
 't': 220,
 'you': 297,
 'know': 104,
 'but': 2134,
 'january': 27,
 '1980': 59,
 'was': 1531,
 'arguably': 6,
 '“': 993,
 'freak': 0,
 'peak': 8,
 '”': 1013,
 'during': 123,
 'period': 43,
 'heightened': 3,
 'geo-political': 2,
 'instability': 17,
 'one': 811,
 'answer': 55,
 ',': 15137,
 'course': 200,
 'is': 5466,
 'complete': 17,
 'collapse': 33,
 'us': 831,
 'dollar': 77,
 'yes': 29,
 'had': 370,
 'great': 167,
 'run': 50,
 'so': 667,
 'too': 290,
 'did': 241,
 'worldwide': 32,
 'housing': 16,
 'prices': 77,
 'until': 94,
 'couple': 9,
 'years': 422,
 'ago': 143,
 'instead': 58,
 'european': 356,
 'union': 18

In [18]:
token_count = sum(unigram_counts.values())

def check_for_unk_test(word,unigram_counts):
    if word in unigram_counts and unigram_counts[word] > 0:
        return word
    else:
        return "UNK"

def convert_sentence_test(sentence,unigram_counts):
    return ["<s1>"] + ["<s2>"] + [check_for_unk_test(word.lower(),unigram_counts) for word in sentence] + ["</s2>"]  + ["</s1>"]

def get_log_prob_addk(word,unigram_counts,k):
    return math.log((unigram_counts[word] + k)/(token_count + k*len(unigram_counts)))

def get_sent_log_prob_addk(sentence, unigram_counts,k):
    sentence = convert_sentence_test(sentence, unigram_counts)
    return sum([get_log_prob_addk(word, unigram_counts,k) for word in sentence])

def calculate_perplexity_uni(sentences,unigram_counts, token_count, k):
    total_log_prob = 0
    test_token_count = 0
    for sentence in sentences:
        test_token_count += len(sentence) + 2
        total_log_prob += get_sent_log_prob_addk(sentence,unigram_counts,k)
    return math.exp(-total_log_prob/test_token_count)

f = open(NEWSTEST_ENG)

test_sents = []
for line in f:
    test_sents.append(tokenize(line))
f.close()

In [19]:
ks = [0.0001,0.01,0.1,1,10]
for k in ks:
    print(str(k) +": " + str(calculate_perplexity_uni(test_sents,unigram_counts,token_count,k)))

0.0001: 631.5624344258267
0.01: 631.6372709016225
0.1: 632.3728621611093
1: 643.2578962510463
10: 814.7925245672897


In [23]:
from nltk.translate import IBMModel1
from nltk.translate import AlignedSent

eng_sents = []
de_sents = []

f = open(BITEXT_ENG)
for line in f:
    terms = tokenize(line)
    eng_sents.append(terms)
f.close()

f = open(BITEXT_DE)
for line in f:
    terms = tokenize(line)
    de_sents.append(terms)
f.close()

In [24]:
paral_sents = list(zip(eng_sents,de_sents))

In [25]:
eng_de_bt = [AlignedSent(E,G) for E,G in paral_sents]
eng_de_m = IBMModel1(eng_de_bt, 5)

de_eng_bt = [AlignedSent(G,E) for E,G in paral_sents]
de_eng_m = IBMModel1(de_eng_bt, 5)

In [28]:
combined_align = []
for i in range(len(eng_de_bt)):
    forward = {x for x in eng_de_bt[i].alignment}
    back_reversed = {x[::-1] for x in de_eng_bt[i].alignment}
    combined_align.append(forward.intersection(back_reversed))

In [31]:
de_eng_count = defaultdict(dict)
for i in range(len(de_eng_bt)):
    for item in combined_align[i]:
        de_eng_count[de_eng_bt[i].words[item[1]]][de_eng_bt[i].mots[item[0]]] =  de_eng_count[de_eng_bt[i].words[item[1]]].get(de_eng_bt[i].mots[item[0]],0) + 1

eng_de_count = defaultdict(dict)
for i in range(len(eng_de_bt)):
    for item in combined_align[i]:
        eng_de_count[eng_de_bt[i].words[item[0]]][eng_de_bt[i].mots[item[1]]] =  eng_de_count[eng_de_bt[i].words[item[0]]].get(eng_de_bt[i].mots[item[1]],0) + 1

In [32]:
de_eng_prob = defaultdict(dict)
for de in de_eng_count.keys():
    for eng in de_eng_count[de].keys():
        de_eng_prob[de][eng] = de_eng_count[de][eng]/sum(de_eng_count[de].values())


eng_de_prob = defaultdict(dict)
for eng in eng_de_count.keys():
    for de in eng_de_count[eng].keys():
        eng_de_prob[eng][de] = eng_de_count[eng][de]/sum(eng_de_count[eng].values())

In [50]:
print(de_eng_prob['frage'])
print(de_eng_prob['handlung'])
print(de_eng_prob['haus'])
print(de_eng_prob['die'])
print(de_eng_prob['englisch'])


{'question': 1.0}
{'spans': 0.5, 'side': 0.5}
{'house': 0.625, 'charity': 0.125, 'hospitalized': 0.125, 'offset': 0.125}
{'the': 1.0}
{'english': 0.875, 'significantly': 0.125}


In [34]:
def de_eng_noisy(german):
    noisy={}
    for eng in de_eng_prob[german].keys():
        noisy[eng] = eng_de_prob[eng][german]+ get_log_prob_addk(eng,unigram_counts,0.0001)
    return noisy

In [60]:
print(eng_de_prob['happy'])
print(eng_de_prob['the'])
print(eng_de_prob['sad'])
print(eng_de_prob['english'])
print(eng_de_prob['house'])

{'irische': 0.25, 'glücklich': 0.25, 'portemonnaie': 0.25, 'vorläufig': 0.25}
{'den': 0.030258662762323085, 'die': 0.7484952009110135, 'der': 0.2010736944851147, 'das': 0.014803969415975272, 'des': 0.002277533756303888, 'dem': 0.0016268098259313486, 'im': 0.0014641288433382138}
{'traurigen': 0.5, 'vielzahl': 0.5}
{'englisch': 0.7777777777777778, 'englische': 0.1111111111111111, 'megalomanischen': 0.1111111111111111}
{'haus': 0.8333333333333334, 'ade': 0.16666666666666666}


In [58]:
print(de_eng_prob['frage'])
print(de_eng_prob['handlung'])
print(de_eng_prob['haus'])
print(de_eng_prob['die'])
print(de_eng_prob['englisch'])

{'question': 1.0}
{'spans': 0.5, 'side': 0.5}
{'house': 0.625, 'charity': 0.125, 'hospitalized': 0.125, 'offset': 0.125}
{'the': 1.0}
{'english': 0.875, 'significantly': 0.125}


In [59]:
def de_eng_direct(query):
    query_english = [] 
    query_tokens = tokenize(query)
    
    for token in query_tokens:
        try:
            query_english.append(max(de_eng_prob[token], key=de_eng_prob[token].get))
        except:
            query_english.append(token)
    return " ".join(query_english)

def de_eng_noisy_translate(query):  
    query_english = [] 
    query_tokens = tokenize(query)
    for token in query_tokens:
        try:
            query_english.append(max(de_eng_noisy(token), key=de_eng_noisy(token).get))
        except:
            query_english.append(token)
    return " ".join(query_english)
f = open(DEVELOPMENT_QUERIES)
lno = plno = 0
german_qs, test_query_trans_sents = {}, []
for line in f:
    lno+=1
    query_id = line.split('\t')[0]
    query_german = line.split('\t')[1]  
    german_qs[query_id] = query_german.strip()
    translation = str(de_eng_noisy_translate(query_german))
    if plno<5:
        print(query_id + "\n" + "German: " + str(query_german) + "\n" + "English: " + translation +"\n\n")
        plno+=1
    test_query_trans_sents.append(translation)
    if lno==100:
        break
f.close()

82
German: der ( von engl . action : tat , handlung , bewegung ) ist ein filmgenre des unterhaltungskinos , in welchem der fortgang der äußeren handlung von zumeist spektakulär inszenierten kampf - und gewaltszenen vorangetrieben und illustriert wird .

English: the ( , leninism . action : rattling , side , movement ) is a filmgenre the unterhaltungskinos , in paulson the fortgang the trumpet side , zumeist spektakulär inszenierten fight - and gewaltszenen annan and illustriert is .


116
German: die ( einheitenzeichen : u für unified atomic mass unit , veraltet amu für atomic mass unit ) ist eine maßeinheit der masse .

English: the ( einheitenzeichen : u for unified atomic manipulation unit , regime amu for atomic manipulation unit ) is a befuddled the masse .


240
German: der von lateinisch actualis , " wirklich " , auch aktualitätsprinzip , uniformitäts - oder gleichförmigkeitsprinzip , englisch uniformitarianism , ist die grundlegende wissenschaftliche methode in der .

English: 