In [1]:
#!jupyter nbconvert --to='script' repair.ipynb

In [1]:
def string_to_symbol_list(string):
    """Encode ``string`` as a list of integer symbols.

    Each distinct character receives the next unused integer (starting at 0)
    the first time it is encountered, so symbols are numbered in order of
    first appearance.

    Returns:
        A tuple ``(symbol_list, char_to_symbol_dict)`` where ``symbol_list``
        holds one integer per input character and ``char_to_symbol_dict``
        maps each character to its integer.
    """
    codes = {}
    # len(codes) is evaluated before setdefault inserts, so a character seen
    # for the first time gets the number of distinct characters seen so far.
    encoded = [codes.setdefault(ch, len(codes)) for ch in string]
    return (encoded, codes)

In [56]:
class KTupleInfo:
    """Bookkeeping record for one distinct k-tuple of symbols."""

    def __init__(self):
        # count: number of recorded occurrences of the k-tuple.
        # last: sequence index associated with the most recent occurrence
        #       (-1 until one is recorded).
        # pos_in_queue: index inside its priority-queue bucket (-1 = not queued).
        self.count, self.last, self.pos_in_queue = 0, -1, -1

    @classmethod
    def print_dict(cls, items):
        """Print a ``{key: KTupleInfo}`` mapping as a fixed-width text table."""
        header = f"{'Key':<6} {'Count':<6} {'Last':<6} {'PosInQueue':<12}"
        print(header)
        print("-" * 32)
        for i, item in items.items():
            row = f"{str(i):<6} {item.count:<6} {item.last:<6} {item.pos_in_queue:<12}"
            print(row)

class SequenceElement:
    """One slot of the symbol sequence, with links to equal k-tuples."""

    def __init__(self):
        # symbol: the symbol stored in this slot (None until assigned).
        self.symbol = None
        # pos: index of this slot in the sequence.
        # prev_k_tuple / next_k_tuple: indices of the previous / next slot
        # linked to this one, or -1 when there is none.
        self.pos = self.prev_k_tuple = self.next_k_tuple = -1

    @classmethod
    def print_list(cls, items):
        """Print a list of ``SequenceElement`` objects as a fixed-width text table."""
        header = f"{'Index':<6} {'Symbol':<10} {'Pos':<6} {'PrevKTuple':<12} {'NextKTuple':<12}"
        print(header)
        print("-" * 50)
        for i, item in enumerate(items):
            row = f"{str(i):<6} {str(item.symbol):<10} {item.pos:<6} {str(item.prev_k_tuple):<12} {str(item.next_k_tuple):<12}"
            print(row)




def construct_active_k_tuples_and_sequence(symbol_list, k):
    """Build the linked symbol sequence and the table of k-tuples it contains.

    Every input symbol becomes one ``SequenceElement``. A k-tuple is
    identified with the position of its last symbol: for each position
    ``i >= k - 1`` the tuple ``symbol_list[i - k + 1 : i + 1]`` is recorded,
    and elements ending equal k-tuples are chained together through
    ``prev_k_tuple`` / ``next_k_tuple`` (-1 marks the end of a chain).
    Overlapping occurrences are all counted.

    Args:
        symbol_list: Sequence of hashable symbols (must support slicing).
        k: Length of the tuples to track; must be at least 1.

    Returns:
        ``(sequence, active_k_tuples)`` where ``sequence`` is a list of
        ``SequenceElement`` (one per input symbol) and ``active_k_tuples``
        maps each distinct k-tuple to a ``KTupleInfo`` holding its occurrence
        count and the position of its last occurrence. ``pos_in_queue`` is
        left at -1.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        # Non-positive k would make the slices below wrap around through
        # negative indices and silently produce meaningless links.
        raise ValueError(f"k must be a positive integer, got {k!r}")

    active_k_tuples = {}
    sequence = []

    for i, symbol in enumerate(symbol_list):
        elem = SequenceElement()
        elem.symbol = symbol
        elem.pos = i
        elem.prev_k_tuple = -1
        elem.next_k_tuple = -1
        sequence.append(elem)

        if i < k - 1:
            # Not enough symbols yet to end a complete k-tuple. Inputs shorter
            # than k therefore yield elements but no tuples instead of failing.
            continue

        k_tuple = tuple(symbol_list[i - k + 1:i + 1])
        info = active_k_tuples.get(k_tuple)
        if info is None:
            # First occurrence: start a new chain.
            info = KTupleInfo()
            info.count = 1
            active_k_tuples[k_tuple] = info
        else:
            # Repeat occurrence: append this element to the existing chain.
            elem.prev_k_tuple = info.last
            sequence[info.last].next_k_tuple = i
            info.count += 1
        info.last = i

    return sequence, active_k_tuples


In [57]:
# We use a dict keyed by priority (the occurrence count) as the priority queue;
# heapq is not worth the bother here.

def construct_priority_queue(active_k_tuples):
    """Group k-tuples into buckets keyed by their occurrence count.

    Iterates ``active_k_tuples`` in its own order. Each tuple is appended to
    the list stored under its ``count``, and the tuple's info object has
    ``pos_in_queue`` set to the index it received inside that list.

    Returns:
        A dict mapping ``count`` to the list of k-tuples having that count.
    """
    buckets = {}

    for k_tuple, info in active_k_tuples.items():
        bucket = buckets.setdefault(info.count, [])
        # Record the slot before appending: it equals the bucket's current length.
        info.pos_in_queue = len(bucket)
        bucket.append(k_tuple)

    return buckets

In [58]:
def replace_active_k_tuple(priority_queue, active_k_tuples, sequence, k_tuple, new_symbol, k):
    """Replace the most recent occurrence of the pair ``k_tuple`` with ``new_symbol``.

    Work in progress: only pairs (k == 2) are handled, and only the single
    occurrence recorded in ``active_k_tuples[k_tuple].last`` is rewritten.

    Effects on the arguments:
      * ``sequence``: the pair's first element has its symbol set to -1 (the
        "not a symbol" marker) and its last element receives ``new_symbol``.
        For each newly formed neighbouring pair, ``prev_k_tuple`` of the slot
        used as that pair's position is re-pointed.
      * ``active_k_tuples``: the ``count`` of every neighbouring pair that the
        replacement destroys is decremented in place.
      * ``priority_queue`` and ``k`` are accepted but not used yet.

    The table of changed pairs is built locally and discarded. New pairs are
    not merged into ``active_k_tuples`` and the priority queue is not updated.

    Args:
        priority_queue: Count -> list of k-tuples (currently unused).
        active_k_tuples: Mapping from k-tuple to its ``KTupleInfo``.
        sequence: List of ``SequenceElement`` as built by
            ``construct_active_k_tuples_and_sequence``.
        k_tuple: The pair to replace; must be a key of ``active_k_tuples``.
        new_symbol: Symbol that stands for the replaced pair.
        k: Tuple length (currently unused; the code assumes 2).

    Returns:
        None.
    """
    # "Not a symbol": value stored in SequenceElement.symbol for emptied slots.
    NAS = -1

    # Pairs whose counts changed because of this replacement.
    changed_k_tuples = {}

    def get_prev_index(node):
        """Index of the nearest non-empty slot left of ``node``, or -1."""
        pos = node.pos - 1
        # Compare the stored symbol, not the element object itself, so that
        # slots emptied by earlier replacements are actually skipped.
        while pos >= 0 and sequence[pos].symbol == NAS:
            pos -= 1
        return pos

    def get_next_index(node):
        """Index of the nearest non-empty slot right of ``node``, or -1."""
        pos = node.pos + 1
        while pos < len(sequence) and sequence[pos].symbol == NAS:
            pos += 1
        # Report "no neighbour" with the same -1 sentinel as get_prev_index so
        # the caller's check also works when the pair ends the sequence.
        return pos if pos < len(sequence) else -1

    last = sequence[active_k_tuples[k_tuple].last]
    first = sequence[get_prev_index(last)]

    def update_counts_link_new_prev_k_tuples(pair_pos, old_pair, new_pair):
        """Register ``new_pair`` at ``pair_pos`` and retire one ``old_pair``."""
        if new_pair not in changed_k_tuples:
            # First such pair to change: it starts a new chain.
            changed_k_tuples[new_pair] = KTupleInfo()
            changed_k_tuples[new_pair].count = 1
            changed_k_tuples[new_pair].last = pair_pos
            sequence[pair_pos].prev_k_tuple = -1
        else:
            # There were other such pairs: link back to the previous one.
            changed_k_tuples[new_pair].count += 1
            sequence[pair_pos].prev_k_tuple = changed_k_tuples[new_pair].last
            changed_k_tuples[new_pair].last = pair_pos

        # Technically the first pair to be inserted is never in
        # changed_k_tuples, but fixing that needs too much code and it does no
        # harm. The entry aliases the active_k_tuples object, so the decrement
        # below is visible to the caller.
        if old_pair not in changed_k_tuples:
            changed_k_tuples[old_pair] = active_k_tuples[old_pair]
        changed_k_tuples[old_pair].count -= 1

    # Update the pair formed with the left neighbour, unless the replaced pair
    # is the leftmost one in the sequence.
    prev_index = get_prev_index(first)
    if prev_index != -1:
        # NOTE(review): the new left pair is recorded at first.pos, although
        # after the replacement new_symbol is stored in `last` - confirm.
        pair_pos = first.pos
        prev_symbol = sequence[prev_index]
        old_prev_pair = (prev_symbol.symbol, first.symbol)
        new_prev_pair = (prev_symbol.symbol, new_symbol)

        update_counts_link_new_prev_k_tuples(pair_pos, old_prev_pair, new_prev_pair)

    # Update the pair formed with the right neighbour, unless the replaced
    # pair is the rightmost one in the sequence.
    next_index = get_next_index(last)
    if next_index != -1:
        # A pair's position is that of its last symbol, i.e. the neighbour.
        pair_pos = next_index
        next_symbol = sequence[next_index]
        old_next_pair = (last.symbol, next_symbol.symbol)
        new_next_pair = (new_symbol, next_symbol.symbol)

        update_counts_link_new_prev_k_tuples(pair_pos, old_next_pair, new_next_pair)

    # Update the symbols: the first slot becomes empty, the last holds the
    # replacement.
    first.symbol = NAS
    last.symbol = new_symbol

    # TODO: walk the remaining occurrences and replace them too, e.g.
    # while last.prev_k_tuple != -1:
    #     last = sequence[last.prev_k_tuple]

    # TODO: REPLACE SYMBOLS! TAKE CARE OF POINTERS!

# tests

In [59]:
from pprint import pprint

In [64]:
# Demo: build the structures for "012012", dump them, replace the pair (0, 1)
# with the new symbol 3, then dump them again to inspect the effect.
sequence, active_k_tuples = construct_active_k_tuples_and_sequence(string_to_symbol_list("012012")[0], 2)
priority_queue = construct_priority_queue(active_k_tuples)

# --- state before the replacement ---
print("-"*32, "active_k_tuples before", "-"*32, sep="\n")
KTupleInfo.print_dict(active_k_tuples)
print("-"*32, "priority queue before", "-"*32, sep="\n")
print(priority_queue)
print("-"*32, "sequence before", "-"*32, sep="\n")
SequenceElement.print_list(sequence)
print("-"*32)

replace_active_k_tuple(priority_queue, active_k_tuples, sequence, (0,1), 3, 2)

# --- state after the replacement ---
print("-"*32, "active_k_tuples after", "-"*32, sep="\n")
KTupleInfo.print_dict(active_k_tuples)
print("-"*32, "priority queue after", "-"*32, sep="\n")
print(priority_queue)
print("-"*32, "sequence after", "-"*32, sep="\n")
SequenceElement.print_list(sequence)
print("-"*32)


--------------------------------
active_k_tuples before
--------------------------------
Key    Count  Last   PosInQueue  
--------------------------------
(0, 1) 2      4      0           
(1, 2) 2      5      1           
(2, 0) 1      3      0           
--------------------------------
priority queue before
--------------------------------
{2: [(0, 1), (1, 2)], 1: [(2, 0)]}
--------------------------------
sequence before
--------------------------------
Index  Symbol     Pos    PrevKTuple   NextKTuple  
--------------------------------------------------
0      0          0      -1           -1          
1      1          1      -1           4           
2      2          2      -1           5           
3      0          3      -1           -1          
4      1          4      1            -1          
5      2          5      2            -1          
--------------------------------
--------------------------------
active_k_tuples after
--------------------------------
Key    C

In [63]:
# Example: characters are numbered in order of first appearance.
string_to_symbol_list("ala ma kota")

([0, 1, 0, 2, 3, 0, 2, 4, 5, 6, 0],
 {'a': 0, 'l': 1, ' ': 2, 'm': 3, 'k': 4, 'o': 5, 't': 6})

In [65]:
# "0101": the pair (0, 1) occurs twice, so the element ending its second
# occurrence should link back to the element ending the first.
s, a = construct_active_k_tuples_and_sequence(string_to_symbol_list("0101")[0], 2)
SequenceElement.print_list(s)
KTupleInfo.print_dict(a)

Index  Symbol     Pos    PrevKTuple   NextKTuple  
--------------------------------------------------
0      0          0      -1           -1          
1      1          1      -1           3           
2      0          2      -1           -1          
3      1          3      1            -1          
Key    Count  Last   PosInQueue  
--------------------------------
(0, 1) 2      3      -1          
(1, 0) 1      2      -1          


In [66]:
# "000": the two occurrences of (0, 0) overlap; both are counted.
s, a = construct_active_k_tuples_and_sequence(string_to_symbol_list("000")[0], 2)
SequenceElement.print_list(s)
KTupleInfo.print_dict(a)

Index  Symbol     Pos    PrevKTuple   NextKTuple  
--------------------------------------------------
0      0          0      -1           -1          
1      0          1      -1           2           
2      0          2      1            -1          
Key    Count  Last   PosInQueue  
--------------------------------
(0, 0) 2      2      -1          


In [67]:
# "012012": (0, 1) and (1, 2) each occur twice, (2, 0) once.
s, a = construct_active_k_tuples_and_sequence(string_to_symbol_list("012012")[0], 2)
SequenceElement.print_list(s)
KTupleInfo.print_dict(a)

Index  Symbol     Pos    PrevKTuple   NextKTuple  
--------------------------------------------------
0      0          0      -1           -1          
1      1          1      -1           4           
2      2          2      -1           5           
3      0          3      -1           -1          
4      1          4      1            -1          
5      2          5      2            -1          
Key    Count  Last   PosInQueue  
--------------------------------
(0, 1) 2      4      -1          
(1, 2) 2      5      -1          
(2, 0) 1      3      -1          


In [11]:
from pprint import pprint

In [69]:
# construct_priority_queue writes pos_in_queue into each KTupleInfo as a side
# effect, so the table is printed before and after the call to show the change.
sequence, active_k_tuples = construct_active_k_tuples_and_sequence(string_to_symbol_list("012012")[0], 2)
print("active_k_tuples before")
KTupleInfo.print_dict(active_k_tuples)
priority_queue = construct_priority_queue(active_k_tuples)
print("active_k_tuples after")
KTupleInfo.print_dict(active_k_tuples)
print("priority queue")
# pprint sorts dict keys, so the buckets are shown in ascending count order.
pprint(priority_queue)

active_k_tuples before
Key    Count  Last   PosInQueue  
--------------------------------
(0, 1) 2      4      -1          
(1, 2) 2      5      -1          
(2, 0) 1      3      -1          
active_k_tuples after
Key    Count  Last   PosInQueue  
--------------------------------
(0, 1) 2      4      0           
(1, 2) 2      5      1           
(2, 0) 1      3      0           
priority queue
{1: [(2, 0)], 2: [(0, 1), (1, 2)]}
