In [1]:
def string_to_symbol_list(string):
    """Encode ``string`` as a list of small integer symbols.

    Each distinct character receives the next unused integer, starting at 0,
    in order of first appearance.

    Args:
        string: Iterable of hashable items (typically a ``str``).

    Returns:
        A ``(symbol_list, char_to_symbol_dict)`` tuple: the encoded sequence
        and the character -> integer mapping that produced it.
    """
    codes = {}
    encoded = []

    for ch in string:
        # len(codes) is evaluated before a possible insert, so an unseen
        # character gets exactly the next free id; a known one keeps its id.
        encoded.append(codes.setdefault(ch, len(codes)))

    return (encoded, codes)

In [66]:
def construct_active_k_tuples_and_sequence(symbol_list, k):
    """Build the index-linked sequence and the k-tuple occurrence table.

    Args:
        symbol_list: Sliceable sequence of hashable symbols (e.g. a list of ints).
        k: Length of the tuples to track; must be at least 1.

    Returns:
        A ``(sequence, active_k_tuples)`` tuple.

        ``sequence`` holds one dict per input position ``i``:
          * ``"symbol"``: ``symbol_list[i]``
          * ``"next"``: ``i + 1``, or ``-1`` for the final node
          * ``"prev"``: ``i - 1`` (so ``-1`` for the first node)
          * ``"prev_k_tuple"``: index at which the previous occurrence of the
            k-tuple ending at ``i`` ended, or ``-1`` if there is none (always
            ``-1`` for the first ``k - 1`` nodes, where no k-tuple ends yet)

        ``active_k_tuples`` maps each k-tuple to
        ``{"count": occurrences, "last": end index of the latest occurrence}``.
        Overlapping occurrences are all counted.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    active_k_tuples = {}
    sequence = []

    for i, symbol in enumerate(symbol_list):
        node = {
            "symbol": symbol,
            "next": i + 1,
            "prev": i - 1,
            "prev_k_tuple": -1,
        }

        # A k-tuple can only end here once k symbols have been seen.
        if i >= k - 1:
            k_tuple = tuple(symbol_list[i - k + 1:i + 1])
            entry = active_k_tuples.get(k_tuple)
            if entry is None:
                # first occurrence of this k-tuple
                active_k_tuples[k_tuple] = {"count": 1, "last": i}
            else:
                # chain back to the previous occurrence before moving "last"
                entry["count"] += 1
                node["prev_k_tuple"] = entry["last"]
                entry["last"] = i

        sequence.append(node)

    # Terminate the list; guarded so that empty input returns ([], {}).
    if sequence:
        sequence[-1]["next"] = -1

    return (sequence, active_k_tuples)

In [67]:
# The priority queue is a plain dict keyed by priority (the occurrence count);
# each value is the list of k-tuples with that count, so heapq is not needed.

def construct_priority_queue(active_k_tuples):
    """Group k-tuples into buckets keyed by their occurrence count.

    Side effect: every entry of ``active_k_tuples`` gains a
    ``"pos_in_queue"`` key holding the tuple's index inside its bucket.

    Args:
        active_k_tuples: Mapping of k-tuple -> dict with at least a
            ``"count"`` key.

    Returns:
        Dict mapping each count to the list of k-tuples having that count,
        in the iteration order of ``active_k_tuples``.
    """
    buckets = {}

    for k_tuple, info in active_k_tuples.items():
        bucket = buckets.setdefault(info["count"], [])
        # record the slot before appending so it equals the tuple's index
        info["pos_in_queue"] = len(bucket)
        bucket.append(k_tuple)

    return buckets

In [64]:
def replace_active_k_tuple(priority_queue, active_k_tuples, sequence, k_tuple, new_symbol, k):
    """Replace every non-overlapping occurrence of ``k_tuple`` by ``new_symbol``.

    Occurrences are located through the ``"prev_k_tuple"`` chain starting at
    ``active_k_tuples[k_tuple]["last"]`` and processed left to right, so for
    overlapping runs the leftmost occurrence wins (``0 0 0`` -> ``N 0``).

    For each replaced occurrence the first node keeps its slot and receives
    ``new_symbol``; the remaining ``k - 1`` nodes are unlinked and marked dead
    (``"symbol"`` set to ``None``, all pointers set to ``-1``). Because the
    first node of an occurrence is the one kept, index 0 stays the head of
    the linked list.

    Afterwards ``active_k_tuples`` and ``priority_queue`` are cleared and
    rebuilt in place from the live nodes. This is one full pass over the
    sequence per call rather than an incremental patch of the affected
    neighbours; it keeps all three structures consistent and works for any
    ``k``, not only pairs.

    Args:
        priority_queue: Dict count -> list of k-tuples; rebuilt in place.
        active_k_tuples: Dict k-tuple -> {"count", "last", "pos_in_queue"};
            rebuilt in place.
        sequence: List of node dicts with "symbol", "next", "prev" and
            "prev_k_tuple" keys, whose head is at index 0; mutated in place.
        k_tuple: The tuple of symbols to replace.
        new_symbol: Replacement symbol. Must not be ``None``, which is
            reserved for dead nodes.
        k: Length of the tracked tuples.

    Returns:
        The number of occurrences that were replaced.

    Raises:
        ValueError: If ``len(k_tuple) != k``.
        KeyError: If ``k_tuple`` is not present in ``active_k_tuples``.
    """
    if len(k_tuple) != k:
        raise ValueError("k_tuple must contain exactly k symbols")

    # Snapshot the end positions first: the chain is about to be invalidated.
    end_positions = []
    position = active_k_tuples[k_tuple]["last"]
    while position != -1:
        end_positions.append(position)
        position = sequence[position]["prev_k_tuple"]
    end_positions.reverse()  # chain runs right-to-left; process left-to-right

    consumed = set()  # indices already used by a replacement in this call
    replaced = 0

    for end in end_positions:
        members = _occurrence_members(sequence, end, k_tuple, consumed)
        if members is None:
            # overlaps an occurrence that was already replaced
            continue

        keeper_index = members[0]
        keeper = sequence[keeper_index]
        successor = sequence[end]["next"]

        # Collapse the occurrence into its first node and bridge the gap.
        keeper["symbol"] = new_symbol
        keeper["next"] = successor
        if successor != -1:
            sequence[successor]["prev"] = keeper_index

        for index in members[1:]:
            sequence[index].update(
                {"symbol": None, "next": -1, "prev": -1, "prev_k_tuple": -1}
            )

        consumed.update(members)
        replaced += 1

    _rebuild_k_tuple_index(priority_queue, active_k_tuples, sequence, k)
    return replaced


def _occurrence_members(sequence, end, k_tuple, consumed):
    """Return the node indices (left to right) spelling ``k_tuple`` and ending at ``end``, or None if it no longer matches."""
    members = []
    index = end
    for expected in reversed(k_tuple):
        if index == -1 or index in consumed or sequence[index]["symbol"] != expected:
            return None
        members.append(index)
        index = sequence[index]["prev"]
    members.reverse()
    return members


def _rebuild_k_tuple_index(priority_queue, active_k_tuples, sequence, k):
    """Recompute counts, "last", "prev_k_tuple" chains and queue buckets from the live nodes, in place."""
    active_k_tuples.clear()
    priority_queue.clear()

    window = []  # indices of the most recent (at most k) live nodes
    index = 0 if sequence else -1
    while index != -1:
        node = sequence[index]
        node["prev_k_tuple"] = -1

        window.append(index)
        if len(window) > k:
            window.pop(0)

        if len(window) == k:
            current = tuple(sequence[j]["symbol"] for j in window)
            entry = active_k_tuples.get(current)
            if entry is None:
                active_k_tuples[current] = {"count": 1, "last": index}
            else:
                entry["count"] += 1
                node["prev_k_tuple"] = entry["last"]
                entry["last"] = index

        index = node["next"]

    # Same bucket layout as the initial queue: count -> list of k-tuples,
    # with each entry remembering its slot in "pos_in_queue".
    for current, entry in active_k_tuples.items():
        bucket = priority_queue.setdefault(entry["count"], [])
        entry["pos_in_queue"] = len(bucket)
        bucket.append(current)

# tests

In [59]:
# Smoke test: characters are numbered 0, 1, 2, ... in order of first appearance,
# and the character -> number mapping is returned alongside the encoded list.
string_to_symbol_list("ala ma kota")

([0, 1, 0, 2, 3, 0, 2, 4, 5, 6, 0],
 {'a': 0, 'l': 1, ' ': 2, 'm': 3, 'k': 4, 'o': 5, 't': 6})

In [60]:
# Build the linked sequence and the pair table (k=2) for "0101".
# NOTE(review): the recorded output shows 'next': 4 on the final node, but the
# function resets the last node's "next" to -1 -- the output appears to come
# from an earlier revision; re-run this cell to refresh it.
construct_active_k_tuples_and_sequence(string_to_symbol_list("0101")[0], 2)

([{'symbol': 0, 'next': 1, 'prev': -1, 'prev_k_tuple': -1},
  {'symbol': 1, 'next': 2, 'prev': 0, 'prev_k_tuple': -1},
  {'symbol': 0, 'next': 3, 'prev': 1, 'prev_k_tuple': -1},
  {'symbol': 1, 'next': 4, 'prev': 2, 'prev_k_tuple': 1}],
 {(0, 1): {'count': 2, 'last': 3}, (1, 0): {'count': 1, 'last': 2}})

In [55]:
# Overlapping case: "0000" contains the pair (0, 0) three times (k=2).
# NOTE(review): the recorded output has "next"/"prev" values that do not match
# what the function assigns (next = i + 1, prev = i - 1, last next = -1) --
# it appears to come from an earlier revision; re-run this cell to refresh it.
construct_active_k_tuples_and_sequence(string_to_symbol_list("0000")[0], 2)

([{'symbol': 0, 'next': -1, 'prev': 1, 'prev_k_tuple': -1},
  {'symbol': 0, 'next': 0, 'prev': 2, 'prev_k_tuple': -1},
  {'symbol': 0, 'next': 1, 'prev': 3, 'prev_k_tuple': 1},
  {'symbol': 0, 'next': 2, 'prev': 4, 'prev_k_tuple': 2}],
 {(0, 0): {'count': 3, 'last': 3}})

In [61]:
# Build the linked sequence and the pair table (k=2) for "012012".
# NOTE(review): the recorded output shows 'next': 6 on the final node, but the
# function resets the last node's "next" to -1 -- the output appears to come
# from an earlier revision; re-run this cell to refresh it.
construct_active_k_tuples_and_sequence(string_to_symbol_list("012012")[0], 2)

([{'symbol': 0, 'next': 1, 'prev': -1, 'prev_k_tuple': -1},
  {'symbol': 1, 'next': 2, 'prev': 0, 'prev_k_tuple': -1},
  {'symbol': 2, 'next': 3, 'prev': 1, 'prev_k_tuple': -1},
  {'symbol': 0, 'next': 4, 'prev': 2, 'prev_k_tuple': -1},
  {'symbol': 1, 'next': 5, 'prev': 3, 'prev_k_tuple': 1},
  {'symbol': 2, 'next': 6, 'prev': 4, 'prev_k_tuple': 2}],
 {(0, 1): {'count': 2, 'last': 4},
  (1, 2): {'count': 2, 'last': 5},
  (2, 0): {'count': 1, 'last': 3}})

In [62]:
from pprint import pprint

In [63]:
# Demo: construct_priority_queue mutates active_k_tuples in place (it adds a
# "pos_in_queue" key to every entry), so the table is dumped before and after.
sequence, active_k_tuples = construct_active_k_tuples_and_sequence(string_to_symbol_list("012012")[0], 2)
# pprint on a str prints its repr, hence the quoted labels in the output.
pprint("active_k_tuples before")
pprint(active_k_tuples)
priority_queue = construct_priority_queue(active_k_tuples)
pprint("active_k_tuples after")
pprint(active_k_tuples)
pprint("priority queue")
pprint(priority_queue)

'active_k_tuples before'
{(0, 1): {'count': 2, 'last': 4},
 (1, 2): {'count': 2, 'last': 5},
 (2, 0): {'count': 1, 'last': 3}}
'active_k_tuples after'
{(0, 1): {'count': 2, 'last': 4, 'pos_in_queue': 0},
 (1, 2): {'count': 2, 'last': 5, 'pos_in_queue': 1},
 (2, 0): {'count': 1, 'last': 3, 'pos_in_queue': 0}}
'priority queue'
{1: [(2, 0)], 2: [(0, 1), (1, 2)]}
