In [1]:
from __future__ import annotations

from collections import Counter
from random import randint, choice, choices
from time import time

from efnlp import CharLanguage, SuffixTree, SuffixTreeSet

In [2]:
# Minimal example: a single tree rooted at token 2 records that the prefix
# [0, 1, 2] was followed by token 3. The value returned by parse() is the
# cell's displayed output.
demo_tree = SuffixTree(2)
demo_tree.parse([0, 1, 2], 3)

SuffixTree(token=2, children={2: SuffixTree(token=2, children={1: SuffixTree(token=1, children={0: SuffixTree(token=0, children={}, nexts={3: Target(token=3, count=1, probability=0.0)}, sampler=[[], []])}, nexts={3: Target(token=3, count=1, probability=0.0)}, sampler=[[], []])}, nexts={3: Target(token=3, count=1, probability=0.0)}, sampler=[[], []])}, nexts={3: Target(token=3, count=1, probability=0.0)}, sampler=[[], []])

In [3]:
# Build a small suffix-tree set from a random corpus so it can be inspected.
#   L: number of distinct tokens (tokens are drawn from 0..L-1)
#   B: number of preceding tokens used as the prefix for each target
#   N: corpus length
L, B, N = 10, 3, 150
S = SuffixTreeSet(L)
C = [randint(0, L - 1) for _ in range(N)]

# Comma-joined corpus, used by the assertion cells for substring checks.
Cs = ','.join(map(str, C))

# Record every (B-token prefix -> next token) pair. The range stops at N
# (exclusive) so the last token C[N-1] is also recorded as a target;
# range(B, N-1) would silently skip that final pair.
for i in range(B, N):
    S.parse(C[i - B:i], C[i])

# NOTE(review): judging by the reprs, normalize() fills in the `probability`
# and `sampler` fields from the raw counts — confirm against efnlp.
S.normalize()

S

SuffixTreeSet(size=10, trees=[SuffixTree(token=0, children={7: SuffixTree(token=7, children={2: SuffixTree(token=2, children={}, nexts={1: Target(token=1, count=1, probability=1.0)}, sampler=[[1], [1.0]])}, nexts={1: Target(token=1, count=1, probability=1.0)}, sampler=[[1], [1.0]]), 8: SuffixTree(token=8, children={3: SuffixTree(token=3, children={}, nexts={8: Target(token=8, count=1, probability=1.0)}, sampler=[[8], [1.0]]), 4: SuffixTree(token=4, children={}, nexts={5: Target(token=5, count=1, probability=1.0)}, sampler=[[5], [1.0]])}, nexts={8: Target(token=8, count=1, probability=0.5), 5: Target(token=5, count=1, probability=0.5)}, sampler=[[8, 5], [0.5, 0.5]]), 4: SuffixTree(token=4, children={6: SuffixTree(token=6, children={}, nexts={6: Target(token=6, count=1, probability=1.0)}, sampler=[[6], [1.0]])}, nexts={6: Target(token=6, count=1, probability=1.0)}, sampler=[[6], [1.0]]), 6: SuffixTree(token=6, children={0: SuffixTree(token=0, children={}, nexts={5: Target(token=5, count=

In [4]:
# Sanity check: every prefix stored in S must occur as a contiguous run of
# tokens in the corpus. Both the corpus and the needle are wrapped in commas
# so a match can only begin and end on token boundaries; a bare substring test
# would accept "1,2" inside "11,2" once tokens have more than one digit.
corpus_delimited = f",{Cs},"
for p in S.prefixes():
    tokens = [str(t) for t in p]
    ps = ','.join(tokens)
    # ['', *tokens, ''] joins to ",t1,...,tk," (and to "," for an empty prefix)
    assert ','.join(['', *tokens, '']) in corpus_delimited, f"prefix {ps} was not found in corpus"

In [5]:
# Show one of the stored prefixes, picked at random.
stored_prefixes = S.prefixes()
choice(stored_prefixes)

[4, 9, 5]

In [6]:
# Sanity check: every (prefix, next-token) pattern stored in S must occur as a
# contiguous run of tokens in the corpus. Both the corpus and the needle are
# wrapped in commas so a match can only begin and end on token boundaries.
corpus_delimited = f",{Cs},"
for p in S.patterns():
    # p[0] is the prefix token sequence, p[1] the token recorded after it
    tokens = [str(t) for t in p[0]] + [str(p[1])]
    ps = ','.join(tokens)
    assert ','.join(['', *tokens, '']) in corpus_delimited, f"pattern {ps} was not found in corpus"

In [7]:
# Pick a random stored prefix, print what S.search returns for it, then tally
# ten samples drawn for that same prefix.
p = choice(S.prefixes())
found = S.search(p)
print(found)

draws = Counter()
for _ in range(10):
    draws[S.sample(p)] += 1
draws

[7, 7, 4]


Counter({9: 10})

In [8]:
# Load the raw training text. The encoding is passed explicitly because the
# default used by open() depends on the platform/locale settings.
with open("data/tinywillspeare.txt", "r", encoding="utf-8") as f:
    data = f.read()

# Build a character-level language from the corpus; its repr lists the size
# and the char->token (stot) / token->char (ttos) maps.
# NOTE(review): this rebinds `L`, which the random-corpus cell used as an int
# token count — from here on `L` is the language object.
L = CharLanguage.from_corpus(data)
L

CharLanguage(size=65, stot={'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}, ttos={0: '\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: "'", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q', 30: 'R', 31: 'S', 32: 'T', 33: 'U', 34: 'V', 35: 'W', 36: 'X', 37: 'Y', 38: 'Z', 39: 'a', 40: 'b', 41: 'c', 42: 'd', 43: 'e', 44

In [9]:
# Fresh tree set sized to the language, plus the corpus encoded as tokens.
S = SuffixTreeSet(L.size)
C = L.encode(data)

# Encoding followed by decoding must reproduce the original text exactly.
round_trip = L.decode(C)
assert round_trip == data

In [None]:
# Parse the whole encoded corpus into S and report the wall-clock time taken.
started = time()

# B: number of preceding tokens used as the prefix; N: corpus length in tokens
B, N = 5, len(C)

# The range stops at N (exclusive) so the final token C[N-1] is also recorded
# as a target; range(B, N-1) would drop the last (prefix, target) pair.
for i in range(B, N):
    S.parse(C[i - B:i], C[i])
S.normalize()

print(f"done in {time() - started} seconds")

In [None]:
# Generate 1000 tokens starting from token 0, decode them to text, and report
# the wall-clock time taken.
started = time()

code = [0]
pieces = [L.decode(0)]
for _ in range(1000):
    # Condition on the last B tokens once that many exist, else on all so far.
    prefix = code[-B:] if len(code) >= B else code
    code.append(S.sample(prefix))
    pieces.append(L.decode(code[-1]))
shake = "".join(pieces)

elapsed = time() - started
print(f"done in {elapsed} seconds")
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
print(shake)