In [1]:
import collections
import copy
from tqdm import tqdm
import itertools
from multiprocessing import Pool, Process
import numpy as np
import statistics
from IPython.display import clear_output

In [39]:
# Load the training corpus as one flat list of whitespace-separated tokens.
# Splitting the whole text on whitespace yields the same tokens as splitting
# it into lines first, but it no longer discards the final line when the file
# does not end with a newline (the previous `split("\n")[:-1]` dropped it
# silently).
# NOTE(review): the encoding is pinned because the corpus contains non-ASCII
# letters and the default would otherwise depend on the machine's locale;
# confirm that the file really is UTF-8.
with open("4B2data.txt", "r", encoding="utf-8") as f:
    knowledgebase = f.read().split()

print(knowledgebase[0])

def calculate_ngrams(knowledge, N, normalize = False, floor = 0.000000001):
    """Count N-grams of symbols over a collection of strings.

    Every item of ``knowledge`` is split into its characters, wrapped in one
    ``"<BOS>"`` and one ``"<EOS>"`` marker, and each window of ``N``
    consecutive symbols is counted. Items shorter than ``N - 2`` characters
    are too short to fill a single window and contribute nothing.

    Args:
        knowledge: iterable of strings (any sequence of symbols works).
        N: window length.
        normalize: when False the raw counts are returned; when True the
            natural log of each count divided by the total number of counted
            windows is returned instead.
        floor: probability assigned to N-grams that were never counted; only
            used when ``normalize`` is True. Defaults to the value that was
            previously hard-coded.

    Returns:
        A ``collections.defaultdict`` keyed by N-tuples. Missing keys default
        to ``0`` for raw counts and to ``np.log(floor)`` for the normalised
        model. Note that looking a missing key up with ``[]`` stores it in
        the dictionary, as with any defaultdict.
    """
    counts = collections.defaultdict(int)
    for line in knowledge:
        symbols = ["<BOS>"] + list(line) + ["<EOS>"]
        for start in range(len(symbols) - N + 1):
            counts[tuple(symbols[start:start + N])] += 1

    if not normalize:
        return counts

    # The total is only needed for normalisation, so it is computed after the
    # early return above rather than unconditionally.
    total = sum(counts.values())
    # Evaluate the fallback once instead of on every missing-key lookup.
    floor_logp = np.log(floor)
    logprobs = collections.defaultdict(lambda: floor_logp)
    for gram, count in counts.items():
        logprobs[gram] = np.log(count / total)
    return logprobs
    
# Train the normalised (log relative frequency) 6-gram model on the corpus.
ngrams = calculate_ngrams(knowledgebase, 6, normalize=True)
# NOTE(review): both probes are 4-tuples while the model is keyed by 6-tuples,
# so each lookup can only return the default for unseen n-grams.
print(ngrams[('d', 'u', 'p', 'a')], ngrams[('c', 'b', 'a', 'q')], sep="\n")


oysteóeursuątyśhceszkupowakwięcec
-20.72326583694641
-20.72326583694641


In [40]:
# Sanity-check inputs: two words that differ in exactly two letters.
w1, w2 = "dziewczyna", "dziejszyna"

# Raw (un-normalised) 6-gram counts of each word on its own.
w1n, w2n = (calculate_ngrams([word], 6) for word in (w1, w2))

def intersect(model, query):
    """Score ``query`` against ``model`` as a count-weighted sum.

    Args:
        model: mapping from n-gram to score (log-probabilities in this
            notebook). May be a ``collections.defaultdict``.
        query: mapping from n-gram to the number of times it occurs.

    Returns:
        The sum over the query's n-grams of ``model[ngram] * query[ngram]``,
        as a float (``0.0`` for an empty query).

    Raises:
        KeyError: if ``model`` is a plain mapping without a default and the
            query contains an n-gram it does not know.
    """
    # Indexing a defaultdict with a missing key stores that key, so scoring
    # used to grow the model with every unseen n-gram it was asked about.
    # Ask the factory for the default directly and leave the model untouched.
    default_factory = getattr(model, "default_factory", None)

    probability = 0.0
    for key, count in query.items():
        if default_factory is None or key in model:
            score = model[key]
        else:
            score = default_factory()
        probability += score * count
    return probability

# Score both words against the corpus model, one result per line; the higher
# (less negative) total belongs to the word the model rates as more likely.
print(intersect(ngrams, w1n), intersect(ngrams, w2n), sep="\n")


-85.04364509293124
-95.36970868539701


In [41]:
# Evaluate plain n-gram scoring on the test pairs. Every line holds two
# whitespace-separated words; a pair earns 1.0 when the first word scores
# higher than the second, 0.5 on a tie and 0.0 otherwise.
# NOTE(review): the encoding is pinned so the result does not depend on the
# machine's locale; confirm that the file really is UTF-8.
with open("4B2test.txt", "r", encoding="utf-8") as f:
    lines = f.read().split("\n")
    # A file that ends with a newline produces one empty trailing element.
    # Drop only that element, so a final pair without a trailing newline is
    # still evaluated (the previous unconditional `[:-1]` discarded it).
    if lines[-1] == "":
        lines.pop()

    marks = []
    for line in lines:
        tc, tw = line.split()

        cc = calculate_ngrams([tc], 6)
        wc = calculate_ngrams([tw], 6)

        pc = intersect(ngrams, cc)
        pw = intersect(ngrams, wc)

        if pc == pw:
            marks.append(0.5)
        elif pc < pw:
            marks.append(0.0)
        else:
            marks.append(1.0)

    print(statistics.mean(marks))

0.853885


In [58]:
# Alphabet for the substitution search: every distinct character that occurs
# anywhere in the corpus (order is whatever the set yields).
letters = list(set(itertools.chain.from_iterable(knowledgebase)))

def performHC(model, query, alphabet=None, N=6):
    """Measure how far ``query`` scores from its best one/two-letter variant.

    Every string obtained from ``query`` by overwriting one or two positions
    with letters from ``alphabet`` is scored with ``intersect`` on its N-gram
    counts, and the best score found is compared with the score of ``query``
    itself.

    Args:
        model: n-gram model passed to ``intersect``.
        query: the word to examine.
        alphabet: re-iterable collection of replacement letters. Defaults to
            the module-level ``letters`` list.
        N: n-gram order used for scoring; defaults to 6, the order the
            notebook trains its model with.

    Returns:
        ``abs(baseline - best)``, where ``baseline`` is the score of
        ``query`` and ``best`` the highest score among the candidates.
        ``best`` starts from the sentinel ``-100000.0``, so that value is
        used when there are no candidates (empty query or empty alphabet).
    """
    if alphabet is None:
        alphabet = letters

    def score(symbols):
        return intersect(model, calculate_ngrams(["".join(symbols)], N))

    baseline = intersect(model, calculate_ngrams([query], N))
    original = list(query)
    best = -100000.0

    # Substituting (pos1, a) then (pos2, b) gives the same string as
    # (pos2, b) then (pos1, a), and when both positions coincide only the
    # second letter survives. Enumerating single substitutions plus ordered
    # pairs pos1 < pos2 therefore covers exactly the candidates of the full
    # four-way loop while scoring each of them once.
    for pos1 in range(len(original)):
        for letter in alphabet:
            candidate = original.copy()
            candidate[pos1] = letter
            best = max(best, score(candidate))

        for pos2 in range(pos1 + 1, len(original)):
            for letter1 in alphabet:
                for letter2 in alphabet:
                    candidate = original.copy()
                    candidate[pos1] = letter1
                    candidate[pos2] = letter2
                    best = max(best, score(candidate))

    return abs(baseline - best)
    
# Example call on a single word; as the last expression of the cell, the
# returned distance is displayed as the cell's output.
performHC(ngrams, "kamiqnie")

35.26486040601285

In [60]:
# The substitution search is slow, so it is evaluated only on the tail of the
# test file, starting at this zero-based line index.
HC_EVAL_START = 99000

# NOTE(review): the encoding is pinned so the result does not depend on the
# machine's locale; confirm that the file really is UTF-8.
with open("4B2test.txt", "r", encoding="utf-8") as f:
    lines = f.read().split("\n")
    # A file that ends with a newline produces one empty trailing element.
    # Drop only that element, so a final pair without a trailing newline is
    # still evaluated (the previous `[99000:-1]` slice always cut the last
    # element off, whatever it contained).
    if lines[-1] == "":
        lines.pop()

    marks = []
    for line in tqdm(lines[HC_EVAL_START:]):
        tc, tw = line.split()

        pc = performHC(ngrams, tc)
        pw = performHC(ngrams, tw)

        # performHC returns a distance to the best nearby string, so here the
        # smaller value wins: the pair earns 1.0 when the first word is
        # closer to its best variant than the second word is.
        if pc == pw:
            marks.append(0.5)
        elif pc > pw:
            marks.append(0.0)
        else:
            marks.append(1.0)

    print(statistics.mean(marks))

100%|██████████| 1000/1000 [16:00<00:00,  1.04it/s]

0.8655



