In [1]:
#!jupyter nbconvert --to='script' repair.ipynb

In [107]:
def string_to_symbol_list(string):
    """Encode ``string`` as a list of small integer symbols.

    Every distinct item of ``string`` receives the next unused integer
    (starting at 0) the first time it is encountered, so symbol ids follow
    the order of first appearance.

    Returns:
        tuple: ``(symbol_list, char_to_symbol_dict)`` - the encoded sequence
        and the item -> symbol mapping that was built while encoding.
    """
    codes = {}
    # len(codes) is evaluated before the insertion happens, so it is exactly
    # the next free symbol id whenever ``ch`` has not been seen yet.
    encoded = [codes.setdefault(ch, len(codes)) for ch in string]
    return (encoded, codes)

In [108]:
class KTupleInfo:
    """Bookkeeping record for one k-tuple.

    Attributes (all start at the values set in ``__init__``):
        count: number of recorded occurrences.
        last: sequence index associated with the most recent occurrence.
        pos_in_queue: index of the tuple inside its priority-queue bucket.
    """

    def __init__(self):
        self.count, self.last, self.pos_in_queue = 0, -1, -1

    def __repr__(self):
        return f"KTupleInfo(count={self.count}, last={self.last}, pos_in_queue={self.pos_in_queue})"

    @classmethod
    def print_dict(cls, items):
        """Print a ``{key: KTupleInfo}`` mapping as a fixed-width text table."""
        header = f"{'Key':<6} {'Count':<6} {'Last':<6} {'PosInQueue':<12}"
        print(header)
        print("-" * 32)
        for key, info in items.items():
            row = f"{str(key):<6} {info.count:<6} {info.last:<6} {info.pos_in_queue:<12}"
            print(row)


class SequenceElement:
    """One slot of the symbol sequence.

    Attributes (all start at the values set in ``__init__``):
        symbol: the symbol stored in this slot.
        pos: index of this slot in the sequence.
        prev_k_tuple: sequence index of a linked earlier element, or -1.
        next_k_tuple: sequence index of a linked later element, or -1.
    """

    def __init__(self):
        self.symbol = None
        self.pos = self.prev_k_tuple = self.next_k_tuple = -1

    def __repr__(self):
        fields = (
            f"symbol={self.symbol}, prev_k_tuple={self.prev_k_tuple}, "
            f"pos={self.pos}, next_k_tuple={self.next_k_tuple}"
        )
        return "SequenceElement(" + fields + ")"

    @classmethod
    def print_list(cls, items):
        """Print an iterable of elements as a fixed-width text table."""
        header = f"{'Index':<6} {'Symbol':<10} {'Pos':<6} {'PrevKTuple':<12} {'NextKTuple':<12}"
        print(header)
        print("-" * 50)
        for idx, elem in enumerate(items):
            row = f"{str(idx):<6} {str(elem.symbol):<10} {elem.pos:<6} {str(elem.prev_k_tuple):<12} {str(elem.next_k_tuple):<12}"
            print(row)




def construct_active_k_tuples_and_sequence(symbol_list, k):
    """Build the linked sequence and the k-tuple table for ``symbol_list``.

    One ``SequenceElement`` is created per input symbol. A k-tuple is
    identified by the position of its LAST symbol: the element at index ``i``
    (for ``i >= k - 1``) represents ``symbol_list[i - k + 1 : i + 1]``.
    Elements representing the same k-tuple are chained through
    ``prev_k_tuple`` / ``next_k_tuple`` (sequence indices, -1 = no link).

    Overlapping occurrences are counted individually, e.g. ``[0, 0, 0]`` with
    ``k == 2`` yields a count of 2 for ``(0, 0)``.

    Args:
        symbol_list: indexable, sliceable sequence of hashable symbols.
        k: tuple length, must be >= 1.

    Returns:
        tuple: ``(sequence, active_k_tuples)`` where ``sequence`` is the list
        of ``SequenceElement`` and ``active_k_tuples`` maps each k-tuple to a
        ``KTupleInfo`` holding its count and the index of its last occurrence.
        Inputs shorter than ``k`` (including the empty list) produce an empty
        ``active_k_tuples``.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    active_k_tuples = {}
    sequence = []

    for i, symbol in enumerate(symbol_list):
        # prev_k_tuple / next_k_tuple keep their -1 defaults from
        # SequenceElement.__init__ unless linked below.
        elem = SequenceElement()
        elem.symbol = symbol
        elem.pos = i
        sequence.append(elem)

        if i < k - 1:
            # Not enough symbols seen yet for a full k-tuple to end here.
            # Iterating over the input (instead of range(k - 1)) also keeps
            # inputs shorter than k - 1 from raising IndexError.
            continue

        k_tuple = tuple(symbol_list[i - k + 1:i + 1])
        info = active_k_tuples.get(k_tuple)
        if info is None:
            # First occurrence of this k-tuple.
            info = KTupleInfo()
            info.count = 1
            active_k_tuples[k_tuple] = info
        else:
            # Append this occurrence to the chain of earlier occurrences.
            elem.prev_k_tuple = info.last
            sequence[info.last].next_k_tuple = i
            info.count += 1
        info.last = i

    return sequence, active_k_tuples


In [109]:
# We use a dict keyed by priority (the occurrence count) as the priority queue;
# it is not worth bothering with heapq here.

def construct_priority_queue(active_k_tuples):
    """Group k-tuples into buckets keyed by their occurrence count.

    Returns a dict mapping ``count -> list of k-tuples`` with that count, in
    the iteration order of ``active_k_tuples``. As a side effect, every info
    object gets ``pos_in_queue`` set to the tuple's index inside its bucket.
    """
    buckets = {}
    for k_tuple, info in active_k_tuples.items():
        bucket = buckets.setdefault(info.count, [])
        # The tuple is appended next, so the current length is its slot.
        info.pos_in_queue = len(bucket)
        bucket.append(k_tuple)
    return buckets

In [116]:
def replace_active_k_tuple(priority_queue, active_k_tuples, sequence, k_tuple, new_symbol, k):
    """Replace every occurrence of the pair ``k_tuple`` in ``sequence`` with ``new_symbol``.

    Works only for k == 2 (pairs); a scheme for replacing longer tuples is
    still to be designed.

    Occurrences are visited from the last one to the first by following the
    ``prev_k_tuple`` chain starting at ``active_k_tuples[k_tuple].last``. For
    each occurrence the first element is marked as removed (its symbol becomes
    -1) and the last element receives ``new_symbol``. Removed elements reuse
    their ``prev_k_tuple`` / ``next_k_tuple`` fields as shortcuts to the
    nearest live neighbour on each side.

    Args:
        priority_queue: currently unused.
        active_k_tuples: k-tuple -> KTupleInfo table. Read for the chain head
            of ``k_tuple``; the info objects of neighbouring pairs that get
            destroyed are aliased into a local table and have their ``count``
            decremented in place.
        sequence: list of SequenceElement, modified in place.
        k_tuple: the pair to replace; must be a key of ``active_k_tuples``.
        new_symbol: symbol written in place of each occurrence.
        k: currently unused.

    Returns:
        None.

    NOTE(review): the statistics of newly created pairs are collected in the
    local ``changed_k_tuples`` dict, which is neither returned nor merged back
    into ``active_k_tuples`` / ``priority_queue`` - presumably unfinished.
    """
    #works only for k=2
    #then need to come up with way to replace all the deleted tuples

    changed_k_tuples = {}

    #NotASymbol: marks a removed element. Note it shares the value -1 with the
    #"no index" sentinel used by the prev/next links.
    NAS = -1
    def get_prev_index(node):
        #index of the nearest live element to the left; return -1 if not found
        if node.pos == 0:
            return -1
        elif sequence[node.pos-1].symbol != NAS:
            return node.pos-1
        else:
            #left neighbour was removed: follow its shortcut
            return sequence[node.pos-1].prev_k_tuple

    def get_next_index(node):
        #index of the nearest live element to the right; return -1 if not found
        if node.pos == len(sequence)-1:
            return -1
        elif sequence[node.pos+1].symbol != NAS:
            return node.pos+1
        else:
            #right neighbour was removed: follow its shortcut. SequenceElement
            #has no `.next` attribute; the shortcut lives in `.next_k_tuple`
            #(see update_pointers), mirroring get_prev_index.
            return sequence[node.pos+1].next_k_tuple

    def update_active_k_tuples(pair_head, old_pair, new_pair):
        #account for old_pair disappearing and new_pair appearing at pair_head
        if new_pair not in changed_k_tuples:
            #first such pair to change

            #when removing the first pair, pointer in active_k_tuples also needs to be changed

            changed_k_tuples[new_pair] = KTupleInfo()
            changed_k_tuples[new_pair].count = 1
            changed_k_tuples[new_pair].last = pair_head.pos
            #list_begin is not declared in KTupleInfo.__init__; it is attached here
            changed_k_tuples[new_pair].list_begin = pair_head.pos
            pair_head.prev_k_tuple = -1
            pair_head.next_k_tuple = -1
        else:
            #there were other such pairs
            changed_k_tuples[new_pair].count += 1
            #occurrences are visited right to left, so the previously recorded
            #one lies further right and becomes this head's "next"
            pair_head.next_k_tuple = changed_k_tuples[new_pair].last
            changed_k_tuples[new_pair].last = pair_head.pos

        if old_pair not in changed_k_tuples:
            #same object as in active_k_tuples, so the decrement below is
            #visible there too
            changed_k_tuples[old_pair] = active_k_tuples[old_pair]
        changed_k_tuples[old_pair].count -= 1

    def update_pointers(removed_symbol):
        #update k_tuple linking: unlink the element from its occurrence chain
        if removed_symbol.prev_k_tuple != -1:
            sequence[removed_symbol.prev_k_tuple].next_k_tuple = removed_symbol.next_k_tuple
        if removed_symbol.next_k_tuple != -1:
            sequence[removed_symbol.next_k_tuple].prev_k_tuple = removed_symbol.prev_k_tuple

        #make shortcuts for the deleted node
        if removed_symbol.pos != 0:
            if sequence[removed_symbol.pos-1].symbol == NAS:
                removed_symbol.prev_k_tuple = sequence[removed_symbol.pos-1].prev_k_tuple
            else:
                removed_symbol.prev_k_tuple = removed_symbol.pos -1
        if removed_symbol.pos != len(sequence)-1:
            if sequence[removed_symbol.pos+1].symbol == NAS:
                removed_symbol.next_k_tuple = sequence[removed_symbol.pos+1].next_k_tuple
            else:
                removed_symbol.next_k_tuple = removed_symbol.pos +1


    last_k_tuple_pos = active_k_tuples[k_tuple].last

    while last_k_tuple_pos != -1:

        last = sequence[last_k_tuple_pos]
        first = sequence[get_prev_index(last)] #always finds an index
        #read the chain link now, before the helpers below overwrite it
        last_k_tuple_pos = last.prev_k_tuple

        #update pairs
        prev_index = get_prev_index(first)
        if prev_index != -1:
            #if it is not the leftmost pair in the sequence
            prev_symbol = sequence[prev_index]
            old_prev_pair = (prev_symbol.symbol, first.symbol)
            new_prev_pair = (prev_symbol.symbol, new_symbol)
            #the head of the new pair is the head of the previous pair
            update_active_k_tuples(last, old_prev_pair, new_prev_pair)

        next_index = get_next_index(last)
        if next_index != -1:
            next_symbol = sequence[next_index]
            old_next_pair = (last.symbol, next_symbol.symbol)
            new_next_pair = (new_symbol, next_symbol.symbol)
            update_active_k_tuples (next_symbol, old_next_pair, new_next_pair)
            #NOTE(review): next_symbol is not removed, yet update_pointers
            #overwrites its prev/next_k_tuple with neighbour shortcuts -
            #confirm this is intended.
            update_pointers(next_symbol)

        #update the symbol
        first.symbol = -1
        last.symbol = new_symbol
        update_pointers(first)



# 

# tests

In [117]:
from pprint import pprint

In [118]:
def debug(string, k_tuple, new_symbol):
    """Print the sequence for ``string`` before and after replacing ``k_tuple``.

    The string is encoded with ``string_to_symbol_list``, the pair structures
    are built for k == 2, and ``replace_active_k_tuple`` is run with
    ``new_symbol``. Only the sequence table is printed; ``active_k_tuples``
    and ``priority_queue`` can be inspected the same way with
    ``KTupleInfo.print_dict`` / ``print`` when needed.
    """
    rule = "-" * 32

    def dump_sequence(title, elements):
        # title, rule, table, rule - the layout used for every dump
        print(title)
        print(rule)
        SequenceElement.print_list(elements)
        print(rule)

    symbols = string_to_symbol_list(string)[0]
    sequence, active_k_tuples = construct_active_k_tuples_and_sequence(symbols, 2)
    priority_queue = construct_priority_queue(active_k_tuples)

    dump_sequence("sequence before", sequence)
    replace_active_k_tuple(priority_queue, active_k_tuples, sequence, k_tuple, new_symbol, 2)
    dump_sequence("sequence after", sequence)

# Replace the pair (1, 2) with the new symbol 4 in "012312012" and print the sequence before/after.
debug("012312012", (1,2), 4)

sequence before
--------------------------------
Index  Symbol     Pos    PrevKTuple   NextKTuple  
--------------------------------------------------
0      0          0      -1           -1          
1      1          1      -1           7           
2      2          2      -1           5           
3      3          3      -1           -1          
4      1          4      -1           -1          
5      2          5      2            8           
6      0          6      -1           -1          
7      1          7      1            -1          
8      2          8      5            -1          
--------------------------------
sequence after
--------------------------------
Index  Symbol     Pos    PrevKTuple   NextKTuple  
--------------------------------------------------
0      0          0      -1           -1          
1      -1         1      0            2           
2      4          2      -1           8           
3      3          3      2            5           
4  

In [82]:
# Replace the pair (1, 2) with the new symbol 3 in "012012" and print the sequence before/after.
debug("012012", (1,2), 3)

sequence before
--------------------------------
Index  Symbol     Pos    PrevKTuple   NextKTuple  
--------------------------------------------------
0      0          0      -1           -1          
1      1          1      -1           4           
2      2          2      -1           5           
3      0          3      -1           -1          
4      1          4      1            -1          
5      2          5      2            -1          
--------------------------------
sequence after
--------------------------------
Index  Symbol     Pos    PrevKTuple   NextKTuple  
--------------------------------------------------
0      0          0      -1           -1          
1      -1         1      4            4           
2      3          2      -1           5           
3      0          3      -1           -1          
4      -1         4      -1           -1          
5      3          5      2            -1          
--------------------------------


In [77]:
# Edge case: the whole sequence is the single pair being replaced, so it has no neighbours.
debug("01", (0,1), 3)

sequence before
--------------------------------
Index  Symbol     Pos    PrevKTuple   NextKTuple  
--------------------------------------------------
0      0          0      -1           -1          
1      1          1      -1           -1          
--------------------------------
sequence after
--------------------------------
Index  Symbol     Pos    PrevKTuple   NextKTuple  
--------------------------------------------------
0      -1         0      -1           -1          
1      3          1      -1           -1          
--------------------------------


In [63]:
# Check the encoding: repeated characters share one symbol id, ids follow order of first appearance.
string_to_symbol_list("ala ma kota")

([0, 1, 0, 2, 3, 0, 2, 4, 5, 6, 0],
 {'a': 0, 'l': 1, ' ': 2, 'm': 3, 'k': 4, 'o': 5, 't': 6})

In [65]:
# Build the pair structures (k = 2) for "0101" and print the sequence and the pair table.
s, a = construct_active_k_tuples_and_sequence(string_to_symbol_list("0101")[0], 2)
SequenceElement.print_list(s)
KTupleInfo.print_dict(a)

Index  Symbol     Pos    PrevKTuple   NextKTuple  
--------------------------------------------------
0      0          0      -1           -1          
1      1          1      -1           3           
2      0          2      -1           -1          
3      1          3      1            -1          
Key    Count  Last   PosInQueue  
--------------------------------
(0, 1) 2      3      -1          
(1, 0) 1      2      -1          


In [66]:
# Build the pair structures (k = 2) for "000", where the two occurrences of the pair overlap.
s, a = construct_active_k_tuples_and_sequence(string_to_symbol_list("000")[0], 2)
SequenceElement.print_list(s)
KTupleInfo.print_dict(a)

Index  Symbol     Pos    PrevKTuple   NextKTuple  
--------------------------------------------------
0      0          0      -1           -1          
1      0          1      -1           2           
2      0          2      1            -1          
Key    Count  Last   PosInQueue  
--------------------------------
(0, 0) 2      2      -1          


In [67]:
# Build the pair structures (k = 2) for "012012" and print the sequence and the pair table.
s, a = construct_active_k_tuples_and_sequence(string_to_symbol_list("012012")[0], 2)
SequenceElement.print_list(s)
KTupleInfo.print_dict(a)

Index  Symbol     Pos    PrevKTuple   NextKTuple  
--------------------------------------------------
0      0          0      -1           -1          
1      1          1      -1           4           
2      2          2      -1           5           
3      0          3      -1           -1          
4      1          4      1            -1          
5      2          5      2            -1          
Key    Count  Last   PosInQueue  
--------------------------------
(0, 1) 2      4      -1          
(1, 2) 2      5      -1          
(2, 0) 1      3      -1          


In [11]:
from pprint import pprint

In [69]:
# Show the pair table before and after construct_priority_queue (which fills in
# pos_in_queue on each entry), then print the resulting count -> pairs buckets.
sequence, active_k_tuples = construct_active_k_tuples_and_sequence(string_to_symbol_list("012012")[0], 2)
print("active_k_tuples before")
KTupleInfo.print_dict(active_k_tuples)
priority_queue = construct_priority_queue(active_k_tuples)
print("active_k_tuples after")
KTupleInfo.print_dict(active_k_tuples)
print("priority queue")
pprint(priority_queue)

active_k_tuples before
Key    Count  Last   PosInQueue  
--------------------------------
(0, 1) 2      4      -1          
(1, 2) 2      5      -1          
(2, 0) 1      3      -1          
active_k_tuples after
Key    Count  Last   PosInQueue  
--------------------------------
(0, 1) 2      4      0           
(1, 2) 2      5      1           
(2, 0) 1      3      0           
priority queue
{1: [(2, 0)], 2: [(0, 1), (1, 2)]}
