# Odległość edycyjna

In [2]:
from spacy.tokenizer import Tokenizer
from spacy.lang.pl import Polish
from bisect import bisect
import random

## Odległość Levenshteina

In [3]:
def LevenshteinDistance(word1, word2):
    """Compute the Levenshtein distance between two sequences with back-pointers.

    Each matrix cell is a tuple ``(cost, predecessor)`` where ``cost`` is the
    edit distance between ``word1[:i]`` and ``word2[:j]`` and ``predecessor``
    is the flattened index ``row * (len(word2) + 1) + column`` of the cell the
    cost was derived from.

    Args:
        word1: Source sequence (rows of the matrix).
        word2: Target sequence (columns of the matrix).

    Returns:
        A pair ``(last_cell, dist_matrix)``; ``last_cell[0]`` is the distance.
    """
    width = len(word2) + 1
    dist_matrix = [[(0, 0)] * width for _ in range(len(word1) + 1)]

    # First column: only deletions are possible, so the predecessor is the
    # cell directly above. (Pointing every border cell at (0, 0) would make a
    # traceback skip all but one of the border steps.)
    for i in range(1, len(word1) + 1):
        dist_matrix[i][0] = (i, width * (i - 1))

    # First row: only insertions are possible, predecessor is the cell to the left.
    for j in range(1, width):
        dist_matrix[0][j] = (j, j - 1)

    for j in range(1, width):
        for i in range(1, len(word1) + 1):
            substitution_cost = 0 if word1[i - 1] == word2[j - 1] else 1

            deletion = dist_matrix[i - 1][j][0] + 1
            insertion = dist_matrix[i][j - 1][0] + 1
            substitution = dist_matrix[i - 1][j - 1][0] + substitution_cost
            best = min(deletion, insertion, substitution)

            # Tie-breaking order: deletion, then insertion, then diagonal.
            if best == deletion:
                predecessor = width * (i - 1) + j
            elif best == insertion:
                predecessor = width * i + j - 1
            else:
                predecessor = width * (i - 1) + j - 1
            dist_matrix[i][j] = (best, predecessor)

    return dist_matrix[-1][-1], dist_matrix

def visualization(word1, word2):
    """Print the edit distance between two words and the edit steps.

    The steps are recovered by following the back-pointers stored in the
    matrix returned by ``LevenshteinDistance`` from the bottom-right cell
    towards the top-left one. Moves that do not increase the cost (matching
    characters) produce no output line.
    """
    dist, dist_matrix = LevenshteinDistance(word1, word2)
    width = len(dist_matrix[0])
    row, col = len(dist_matrix) - 1, width - 1

    steps = []  # gathered from the last edit to the first
    while row > 0 or col > 0:
        cost, pointer = dist_matrix[row][col]
        prev_row, prev_col = divmod(pointer, width)
        # Only moves that raised the cost by one correspond to an actual edit.
        if dist_matrix[prev_row][prev_col][0] == cost - 1:
            if prev_row == row - 1 and prev_col == col - 1:
                steps.append(f'{word2[:prev_col]}*{word2[prev_col]}*{word1[prev_row+1:]} (substituted {word1[prev_row]}->{word2[prev_col]})\n')
            elif prev_row == row - 1:
                steps.append(f'{word2[:prev_col]}**{word1[prev_row+1:]} (subtracted {word1[prev_row]})\n')
            elif prev_col == col - 1:
                steps.append(f'{word2[:prev_col]}*{word2[prev_col]}*{word1[prev_row:]} (added {word2[prev_col]})\n')
        row, col = prev_row, prev_col

    output = "".join(reversed(steps))
    print(f'Distance between {word1} and {word2} equals: {dist[0]}\nSteps:')
    print(output)

In [4]:
# Word pairs used to demonstrate the edit-distance visualization.
data = [
    ("los", "kloc"),
    ("Łódź", "Lodz"),
    ("kwintesencja", "quintessence"),
    ("ATGAATCTTACCGCCTCG", "ATGAGGCTCTGGCCCCTG"),
    ("wojtk", "wjeek"),
]

for source_word, target_word in data:
    visualization(source_word, target_word)
    print("-----------")

Distance between los and kloc equals: 2
Steps:
*k*los (added k)
klo*c* (substituted s->c)

-----------
Distance between Łódź and Lodz equals: 3
Steps:
*L*ódź (substituted Ł->L)
L*o*dź (substituted ó->o)
Lod*z* (substituted ź->z)

-----------
Distance between kwintesencja and quintessence equals: 5
Steps:
*q*wintesencja (substituted k->q)
q*u*intesencja (substituted w->u)
quintes*s*encja (added s)
quintessenc*e*a (substituted j->e)
quintessence** (subtracted a)

-----------
Distance between ATGAATCTTACCGCCTCG and ATGAGGCTCTGGCCCCTG equals: 7
Steps:
ATGA*G*TCTTACCGCCTCG (substituted A->G)
ATGAG*G*CTTACCGCCTCG (substituted T->G)
ATGAGGCT*C*TACCGCCTCG (added C)
ATGAGGCTCT*G*CCGCCTCG (substituted A->G)
ATGAGGCTCTG*G*CCGCCTCG (added G)
ATGAGGCTCTGGCC**CCTCG (subtracted G)
ATGAGGCTCTGGCCCCT**G (subtracted C)

-----------
Distance between wojtk and wjeek equals: 3
Steps:
w**jtk (subtracted o)
wj*e*k (substituted t->e)
wje*e*k (added e)

-----------


## Najdłuższy wspólny podciąg

### LCS wśród tokenów

#### Algorytmy znajdowania długości LCS

In [1]:
# Hunt-Szymański algorithm
def lcs(list1, list2, visualize=False):
    """Compute the LCS length of two sequences with a threshold list.

    A sorted list of positions in ``list2`` is maintained, always terminated
    by the sentinel ``len(list2)``. For every element of ``list1`` its
    matching positions in ``list2`` are processed from right to left; each
    position not yet in the list either replaces the first larger entry or is
    inserted just before the sentinel.

    Args:
        list1: First sequence.
        list2: Second sequence (must support ``len`` and slicing when
            ``visualize`` is true).
        visualize: When true, print the threshold list and the slices of
            ``list2`` it delimits before each element and once at the end.

    Returns:
        ``(length, thresholds)`` where ``length == len(thresholds) - 1``.
    """
    thresholds = [len(list2)]

    def show_segments():
        # Print the thresholds followed by the consecutive slices they cut out.
        print(thresholds)
        start = 0
        for end in thresholds:
            print(list2[start:end])
            start = end

    for item in list1:
        matches = [idx for idx, candidate in enumerate(list2) if candidate == item]

        if visualize:
            show_segments()

        # Right-to-left order keeps one element of list1 from being used twice.
        for pos in reversed(matches):
            slot = bisect(thresholds, pos)
            if slot != bisect(thresholds, pos - 1):
                # pos is already stored as a threshold - nothing to update.
                continue
            if slot < len(thresholds) - 1:
                thresholds[slot] = pos
            else:
                thresholds.insert(slot, pos)

        if visualize:
            print(f'->{item}')
            print("------------")

    if visualize:
        show_segments()

    return len(thresholds) - 1, thresholds

In [5]:
# Standard algorithm
def lcs2(word1, word2):
    """Compute the LCS length of two sequences with the classic DP table.

    ``table[i][j]`` holds the LCS length of ``word1[:i]`` and ``word2[:j]``;
    row 0 and column 0 stay at zero.

    Returns:
        ``(length, table)`` where ``length`` is the bottom-right entry.
    """
    row_count, col_count = len(word1) + 1, len(word2) + 1
    table = [[0] * col_count for _ in range(row_count)]

    for r, left in enumerate(word1, start=1):
        above, current = table[r - 1], table[r]
        for c, right in enumerate(word2, start=1):
            if left == right:
                current[c] = above[c - 1] + 1
            else:
                current[c] = max(current[c - 1], above[c])

    return table[-1][-1], table

In [6]:
# Trace the Hunt-Szymański threshold list step by step on a small example.
lcs("zcbbda", "abcabbabaz", visualize=True)

[10]
abcabbabaz
->z
------------
[9, 10]
abcabbaba
z
->c
------------
[2, 10]
ab
cabbabaz
->b
------------
[1, 4, 10]
a
bca
bbabaz
->b
------------
[1, 4, 5, 10]
a
bca
b
babaz
->d
------------
[1, 4, 5, 10]
a
bca
b
babaz
->a
------------
[0, 3, 5, 6, 10]

abc
ab
b
abaz


(4, [0, 3, 5, 6, 10])

In [7]:
# Same example with the table-based algorithm; displays (length, table).
lcs2("zcbbda", "abcabbabaz")

(4,
 [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
  [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
  [0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2],
  [0, 0, 1, 1, 1, 2, 3, 3, 3, 3, 3],
  [0, 0, 1, 1, 1, 2, 3, 3, 3, 3, 3],
  [0, 1, 1, 1, 2, 2, 3, 4, 4, 4, 4]])

#### Tokenizacja

In [8]:
# Build a plain tokenizer for Polish and tokenize the whole text at once.
nlp = Polish()
tokenizer = Tokenizer(nlp.vocab)
# Explicit encoding: the text contains Polish diacritics, and the platform
# default encoding is not UTF-8 everywhere.
with open('romeo-i-julia-700.txt', 'r', encoding='utf-8') as data:
    tokens = tokenizer(data.read())

In [9]:
def delete_random(data, keep_percent=97):
    """Return a copy of ``data`` with a random subset of elements removed.

    The relative order of the remaining elements is preserved.

    Args:
        data: Indexable sequence with a length.
        keep_percent: Percentage of elements to keep, between 0 and 100.
            The number kept is ``keep_percent * len(data) // 100``.

    Returns:
        A list with the elements that were kept.

    Raises:
        ValueError: If ``keep_percent`` is outside the range 0..100.
    """
    if not 0 <= keep_percent <= 100:
        raise ValueError("keep_percent must be between 0 and 100")
    keep_count = int(keep_percent * len(data) // 100)
    # Sample indices (not elements) so that the original order can be restored.
    kept_indices = sorted(random.sample(range(len(data)), keep_count))
    return [data[i] for i in kept_indices]

In [10]:
# Two independently thinned copies of the token stream.
text1 = delete_random(tokens)
text2 = delete_random(tokens)
# lcs() is the Hunt-Szymański implementation, lcs2() the standard DP table;
# the printed labels must name the algorithm that produced each value.
lcs_length, ranges = lcs(text1, text2)
print(f'LCS length for text1 and text2 equals given by Hunt-Szymański algorithm: {lcs_length}')
lcs_length, dist_matrix = lcs2(text1, text2)
print(f'LCS length for text1 and text2 equals given by standard algorithm: {lcs_length}')

LCS length for text1 and text2 equals given by standard algorithm: 2135
LCS length for text1 and text2 equals given by Hunt-Szymański algorithm: 2135


### Implementacja diff

#### Odtwarzanie LCS

In [11]:
def get_lcs(dist_matrix, word1, word2, i, j):
    """Reconstruct a longest common subsequence from an LCS length table.

    Walks back from cell ``(i, j)`` towards the table border. Implemented
    iteratively so that long inputs do not exceed the recursion limit and the
    result is not rebuilt by repeated list concatenation.

    Args:
        dist_matrix: Table as returned by ``lcs2`` (``dist_matrix[i][j]`` is
            the LCS length of ``word1[:i]`` and ``word2[:j]``).
        word1: First sequence.
        word2: Second sequence.
        i: Number of leading elements of ``word1`` to consider.
        j: Number of leading elements of ``word2`` to consider.

    Returns:
        A list with the elements of the common subsequence, in order.
    """
    reversed_lcs = []
    while i > 0 and j > 0:
        if word1[i - 1] == word2[j - 1]:
            # Equal elements always belong to the subsequence: take the diagonal.
            reversed_lcs.append(word1[i - 1])
            i -= 1
            j -= 1
        elif dist_matrix[i][j - 1] > dist_matrix[i - 1][j]:
            j -= 1
        else:
            # On ties move up, matching the original tie-breaking.
            i -= 1
    reversed_lcs.reverse()
    return reversed_lcs
    

In [12]:
def diff(file1, file2):
    """Print a line-based diff of two sequences and return their LCS.

    Elements of ``file1`` that are not part of the common subsequence are
    printed as ``< index: element``; elements of ``file2`` that are not part
    of it are printed as ``> index: element``. Indices are zero-based
    positions within the respective sequence.

    Returns:
        The common subsequence (as reconstructed by ``get_lcs``).
    """
    _, table = lcs2(file1, file2)
    common = get_lcs(table, file1, file2, len(file1), len(file2))
    size1, size2 = len(file1), len(file2)
    pos1 = pos2 = 0

    # Every common element acts as an anchor: report whatever precedes it in
    # each input, then step over the anchor in both.
    for anchor in common:
        while pos1 < size1 and file1[pos1] != anchor:
            print(f'< {pos1}: {file1[pos1]}')
            pos1 += 1
        while pos2 < size2 and file2[pos2] != anchor:
            print(f'> {pos2}: {file2[pos2]}')
            pos2 += 1
        pos1 += 1
        pos2 += 1

    # Whatever remains after the last anchor exists in one input only.
    while pos1 < size1:
        print(f'< {pos1}: {file1[pos1]}')
        pos1 += 1
    while pos2 < size2:
        print(f'> {pos2}: {file2[pos2]}')
        pos2 += 1

    return common
        

In [13]:
# Small character-level example; each list element plays the role of a line.
diff(list("zcbbda"), list("abcabbazba"))

< 0: z
> 0: a
> 1: b
> 3: a
< 4: d
> 7: z
> 8: b
> 9: a


['c', 'b', 'b', 'a']

#### Podział tokenów na linie

In [14]:
# Tokenize the text line by line and remember, for every token, the line it
# belongs to and its position within that line.
lines_of_tokens = []
tokens_indices = []
# Explicit encoding: the text contains Polish diacritics, and the platform
# default encoding is not UTF-8 everywhere.
with open('romeo-i-julia-700.txt', 'r', encoding='utf-8') as data:
    for line_no, line in enumerate(data):
        tokens_ = tokenizer(line)
        lines_of_tokens.append(tokens_)
        tokens_indices.extend((line_no, t) for t in range(len(tokens_)))

In [15]:
def delete_random_words(lines_of_tokens, tokens_indices, keep_percent=97):
    """Rebuild text lines after randomly dropping a share of the tokens.

    Args:
        lines_of_tokens: One indexable collection of tokens per line.
        tokens_indices: ``(line_no, token_no)`` pairs addressing every token
            in ``lines_of_tokens``.
        keep_percent: Percentage of tokens to keep, between 0 and 100. The
            number kept is ``keep_percent * len(tokens_indices) // 100``.

    Returns:
        A list with one string per input line. Every kept token is followed
        by a single space; tokens equal to ``"\\n"`` are never emitted.

    Raises:
        ValueError: If ``keep_percent`` is outside the range 0..100.
    """
    if not 0 <= keep_percent <= 100:
        raise ValueError("keep_percent must be between 0 and 100")
    keep_count = int(keep_percent * len(tokens_indices) // 100)
    # Sorting the sampled indices restores the original token order.
    kept = sorted(random.sample(range(len(tokens_indices)), keep_count))

    pieces = [[] for _ in lines_of_tokens]
    for index in kept:
        line_no, token_no = tokens_indices[index]
        text = str(lines_of_tokens[line_no][token_no])
        if text != "\n":
            pieces[line_no].append(text + ' ')

    return ["".join(parts) for parts in pieces]

In [16]:
# Two independently thinned versions of the same text, to be compared by diff.
lines1, lines2 = (
    delete_random_words(lines_of_tokens, tokens_indices),
    delete_random_words(lines_of_tokens, tokens_indices),
)

Poniżej przedstawiono wynik funkcji *diff*. Symbol $<$ oznacza linię występującą tylko w pierwszym pliku (usuniętą), a symbol $>$ — linię występującą tylko w drugim pliku (dodaną). Po symbolu podawany jest numer linii w odpowiednim pliku (liczony od zera), a na końcu sama linia.

In [17]:
# Print the line-level differences; the returned common lines are kept in LCS.
LCS = diff(lines1, lines2)

< 15:   * MERKUCJO krewny księcia 
> 15:   * MERKUCJO — krewny księcia 
< 19:   * JAN — brat z tegoż zgromadzenia 
> 19:   * — brat z tegoż zgromadzenia 
< 22:   * ABRAHAM — służący Montekiego 
< 23:   * APTEKARZ 
< 24: * TRZECH MUZYKANTÓW 
> 22:   ABRAHAM — służący Montekiego 
> 23:   APTEKARZ 
> 24:   * TRZECH MUZYKANTÓW 
< 28:   * PANI MONTEKI — małżonka Montekiego 
> 28: * PANI MONTEKI — małżonka Montekiego 
< 37: Rzecz się przez większą część sztuki w Weronie, przez część piątego aktu w Mantui. 
> 37: Rzecz odbywa się przez większą część sztuki w Weronie, przez część piątego aktu w Mantui. 
< 46: Tam, gdzie się rzecz ta rozgrywa, Weronie, 
> 46: Tam, gdzie się rzecz rozgrywa, w Weronie, 
< 50: Z łon tych dwu wrogów wzięło życie, 
< 51: Pod najstraszliwszą z gwiazd, kochanków dwoje; 
> 50: Z łon tych dwu wrogów wzięło bowiem życie, 
> 51: Pod najstraszliwszą z gwiazd, dwoje; 
< 61: Jest w nim co złego, my usuniem błędy… 
> 61: Jest w nim co złego, usuniem błędy… 
< 77: Dalipan, Grz