In [1]:
def string_to_symbol_list(string):
    last_symbol = 0
    symbol_list = []
    char_to_symbol_dict = {}

    for x in string:
        if x not in char_to_symbol_dict:
            char_to_symbol_dict[x] = last_symbol
            last_symbol += 1
        symbol_list.append(char_to_symbol_dict[x])
    
    return (symbol_list, char_to_symbol_dict)

In [74]:
class KTupleInfo:
    def __init__(self):
        self.count = 0
        self.last = -1
        self.pos_in_queue = -1

    def __repr__(self):
        return f"KTupleInfo(count={self.count}, last={self.last}, pos_in_queue={self.pos_in_queue})"


class SequenceElement:
    def __init__(self):
        self.symbol = None
        self.next = -1
        self.prev = -1
        self.prev_k_tuple = -1

    def __repr__(self):
        return (f"SequenceElement(symbol={self.symbol}, next={self.next}, "
                f"prev={self.prev}, prev_k_tuple={self.prev_k_tuple})")


def construct_active_k_tuples_and_sequence(symbol_list, k):
    active_k_tuples = {}
    sequence = []

    for i in range(k - 1):
        elem = SequenceElement()
        elem.symbol = symbol_list[i]
        elem.next = i + 1
        elem.prev = i - 1
        sequence.append(elem)

    for i in range(k - 1, len(symbol_list)):
        k_tuple = tuple(symbol_list[i - k + 1:i + 1])
        if k_tuple not in active_k_tuples:
            info = KTupleInfo()
            info.count = 1
            info.last = i
            active_k_tuples[k_tuple] = info

            elem = SequenceElement()
            elem.symbol = symbol_list[i]
            elem.next = i + 1
            elem.prev = i - 1
            sequence.append(elem)
        else:
            prev_index = active_k_tuples[k_tuple].last
            active_k_tuples[k_tuple].count += 1
            active_k_tuples[k_tuple].last = i

            elem = SequenceElement()
            elem.symbol = symbol_list[i]
            elem.next = i + 1
            elem.prev = i - 1
            elem.prev_k_tuple = prev_index
            sequence.append(elem)

    sequence[-1].next = -1

    return sequence, active_k_tuples


In [75]:
# we use a dict based on priorities to manage priority queue
# no use for bothering with heapq

def construct_priority_queue(active_k_tuples):
    priority_queue = {}
    
    for k, v in active_k_tuples.items():
        if v.count not in priority_queue:
            priority_queue[v.count] = []
        v.pos_in_queue = len(priority_queue[v.count])
        priority_queue[v.count].append(k)
        
    return priority_queue

In [95]:
def replace_active_k_tuple(priority_queue, active_k_tuples, sequence, k_tuple, new_symbol,k):
    #works only for k=2
    #then need to come up with way to replace all the deleted tuples
    
    changed_k_tuples = {}

    last = sequence[active_k_tuples[k_tuple].last]
    first = sequence[last.prev]

    def change(pair_pos, old_pair, new_pair):
        if new_pair not in changed_k_tuples:
            #first such pair to change
            changed_k_tuples[new_pair] = KTupleInfo()
            changed_k_tuples[new_pair].count = 1
            changed_k_tuples[new_pair].last = pair_pos
            sequence[pair_pos].prev_k_tuple = -1
        else:
            #there were other such pairs
            changed_k_tuples[new_pair].count += 1
            sequence[pair_pos].prev_k_tuple = changed_k_tuples[new_pair].last
            changed_k_tuples[new_pair].last = pair_pos

        #technically the first to be inserted never is in changed_k_tuples but too much code required to fix that, it does no harm
        if old_pair not in changed_k_tuples:
            changed_k_tuples[old_pair] = active_k_tuples[old_pair]
        changed_k_tuples[old_pair].count -= 1
        
    #update pairs
    if first.prev != -1:
        #if it is not the leftmost pair in the sequence 

        pair_pos = last.prev #so the position of first; position of pair is stored in its last symbol 
        prev_symbol = sequence[sequence[first.prev].prev]
        old_prev_pair = (prev_symbol.symbol, first.symbol)
        new_prev_pair = (prev_symbol.symbol, new_symbol)

        change(pair_pos, old_prev_pair, new_prev_pair)

    if last.next != -1:
        
        pair_pos = first.next #so the position of last; position of pair is stored in its last symbol 
        next_symbol = sequence[sequence[last.next].next]
        old_next_pair = (last.symbol, next_symbol.symbol) 
        new_next_pair = (new_symbol, next_symbol.symbol) 

        change(pair_pos, old_next_pair, new_next_pair)

    #update the symbol
    sequence[first.next].symbol = -1

    # while last.prev_k_tuple != -1:
    #     last = sequence[last.prev_k_tuple]

    #REPLACE SYMBOLS! TAKE CARE OF POINTERS!

# tests

In [96]:
string_to_symbol_list("ala ma kota")

([0, 1, 0, 2, 3, 0, 2, 4, 5, 6, 0],
 {'a': 0, 'l': 1, ' ': 2, 'm': 3, 'k': 4, 'o': 5, 't': 6})

In [97]:
construct_active_k_tuples_and_sequence(string_to_symbol_list("0101")[0], 2)

([SequenceElement(symbol=0, next=1, prev=-1, prev_k_tuple=-1),
  SequenceElement(symbol=1, next=2, prev=0, prev_k_tuple=-1),
  SequenceElement(symbol=0, next=3, prev=1, prev_k_tuple=-1),
  SequenceElement(symbol=1, next=-1, prev=2, prev_k_tuple=1)],
 {(0, 1): KTupleInfo(count=2, last=3, pos_in_queue=-1),
  (1, 0): KTupleInfo(count=1, last=2, pos_in_queue=-1)})

In [98]:
construct_active_k_tuples_and_sequence(string_to_symbol_list("0000")[0], 2)

([SequenceElement(symbol=0, next=1, prev=-1, prev_k_tuple=-1),
  SequenceElement(symbol=0, next=2, prev=0, prev_k_tuple=-1),
  SequenceElement(symbol=0, next=3, prev=1, prev_k_tuple=1),
  SequenceElement(symbol=0, next=-1, prev=2, prev_k_tuple=2)],
 {(0, 0): KTupleInfo(count=3, last=3, pos_in_queue=-1)})

In [99]:
construct_active_k_tuples_and_sequence(string_to_symbol_list("012012")[0], 2)

([SequenceElement(symbol=0, next=1, prev=-1, prev_k_tuple=-1),
  SequenceElement(symbol=1, next=2, prev=0, prev_k_tuple=-1),
  SequenceElement(symbol=2, next=3, prev=1, prev_k_tuple=-1),
  SequenceElement(symbol=0, next=4, prev=2, prev_k_tuple=-1),
  SequenceElement(symbol=1, next=5, prev=3, prev_k_tuple=1),
  SequenceElement(symbol=2, next=-1, prev=4, prev_k_tuple=2)],
 {(0, 1): KTupleInfo(count=2, last=4, pos_in_queue=-1),
  (1, 2): KTupleInfo(count=2, last=5, pos_in_queue=-1),
  (2, 0): KTupleInfo(count=1, last=3, pos_in_queue=-1)})

In [100]:
from pprint import pprint

In [101]:
sequence, active_k_tuples = construct_active_k_tuples_and_sequence(string_to_symbol_list("012012")[0], 2)
pprint("active_k_tuples before")
pprint(active_k_tuples)
priority_queue = construct_priority_queue(active_k_tuples)
pprint("active_k_tuples after")
pprint(active_k_tuples)
pprint("priority queue")
pprint(priority_queue)

'active_k_tuples before'
{(0, 1): KTupleInfo(count=2, last=4, pos_in_queue=-1),
 (1, 2): KTupleInfo(count=2, last=5, pos_in_queue=-1),
 (2, 0): KTupleInfo(count=1, last=3, pos_in_queue=-1)}
'active_k_tuples after'
{(0, 1): KTupleInfo(count=2, last=4, pos_in_queue=0),
 (1, 2): KTupleInfo(count=2, last=5, pos_in_queue=1),
 (2, 0): KTupleInfo(count=1, last=3, pos_in_queue=0)}
'priority queue'
{1: [(2, 0)], 2: [(0, 1), (1, 2)]}


In [102]:
sequence, active_k_tuples = construct_active_k_tuples_and_sequence(string_to_symbol_list("012012")[0], 2)
priority_queue = construct_priority_queue(active_k_tuples)

pprint("active_k_tuples before")
pprint(active_k_tuples)
pprint("priority queue before")
pprint(priority_queue)
pprint("sequence after")
pprint(sequence)

replace_active_k_tuple(priority_queue, active_k_tuples, sequence, (0,1), 3, 2)

pprint("active_k_tuples after")
pprint(active_k_tuples)
pprint("priority queue after")
pprint(priority_queue)
pprint("sequence after")
pprint(sequence)

'active_k_tuples before'
{(0, 1): KTupleInfo(count=2, last=4, pos_in_queue=0),
 (1, 2): KTupleInfo(count=2, last=5, pos_in_queue=1),
 (2, 0): KTupleInfo(count=1, last=3, pos_in_queue=0)}
'priority queue before'
{1: [(2, 0)], 2: [(0, 1), (1, 2)]}
'sequence after'
[SequenceElement(symbol=0, next=1, prev=-1, prev_k_tuple=-1),
 SequenceElement(symbol=1, next=2, prev=0, prev_k_tuple=-1),
 SequenceElement(symbol=2, next=3, prev=1, prev_k_tuple=-1),
 SequenceElement(symbol=0, next=4, prev=2, prev_k_tuple=-1),
 SequenceElement(symbol=1, next=5, prev=3, prev_k_tuple=1),
 SequenceElement(symbol=2, next=-1, prev=4, prev_k_tuple=2)]


KeyError: (1, 0)