In [45]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [46]:
import os

from transformers import GPT2Tokenizer
from dawg import PyDawg

# Location of the serialized DAWG (WikiText-2 raw, going by the file name).
# Set the DAWG_PATH environment variable to point at a local copy; when it is
# unset the original cluster path is used, so existing behaviour is unchanged.
dawg_path = os.environ.get(
    "DAWG_PATH",
    "/net/nfs.cirrascale/allennlp/davidw/proj/proj-rusty-dawg/rusty-dawg/dawg/wikitext-2-raw.dawg",
)

# Make sure the tokenizer matches the one used to construct the DAWG.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Handle used by every query cell below: wraps the DAWG file plus the tokenizer.
py_dawg = PyDawg(dawg_path, tokenizer)

In [47]:
# Substring found in the Wikitext 2 train data.
# NOTE(review): the first "Valkyira" (sic) looks deliberate -- the printed result
# reports a match covering all 14 query tokens; confirm before "fixing" the spelling.
query = "As with previous Valkyira Chronicles games , Valkyria Chronicles III"

# Get suffix contexts
# The result prints as a dict of parallel lists ('tokens', 'suffix_contexts',
# 'context_counts') with one entry per query token.
# Presumably 'suffix_contexts' is the matched-suffix length at each position -- TODO confirm.
suffix_contexts = py_dawg.get_suffix_context(query)

print(suffix_contexts)

{'tokens': [1722, 351, 2180, 569, 18354, 8704, 17740, 1830, 837, 569, 18354, 7496, 17740, 6711], 'suffix_contexts': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 'context_counts': [234, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [48]:
# Return a list of all substrings in the DAWG that match the query.
# Relies on `query` defined in an earlier cell; run that cell first.
matching_substrings = py_dawg.get_matching_substrings(query)

# Each entry prints as a dict with 'tokens', 'count' and 'text' keys.
for entry in matching_substrings:
    print(entry)

{'tokens': (1722,), 'count': 234, 'text': 'As'}
{'tokens': (1722, 351), 'count': 9, 'text': 'As with'}
{'tokens': (1722, 351, 2180), 'count': 1, 'text': 'As with previous'}
{'tokens': (1722, 351, 2180, 569), 'count': 1, 'text': 'As with previous V'}
{'tokens': (1722, 351, 2180, 569, 18354), 'count': 1, 'text': 'As with previous Valky'}
{'tokens': (1722, 351, 2180, 569, 18354, 8704), 'count': 1, 'text': 'As with previous Valkyira'}
{'tokens': (1722, 351, 2180, 569, 18354, 8704, 17740), 'count': 1, 'text': 'As with previous Valkyira Chronicles'}
{'tokens': (1722, 351, 2180, 569, 18354, 8704, 17740, 1830), 'count': 1, 'text': 'As with previous Valkyira Chronicles games'}
{'tokens': (1722, 351, 2180, 569, 18354, 8704, 17740, 1830, 837), 'count': 1, 'text': 'As with previous Valkyira Chronicles games,'}
{'tokens': (1722, 351, 2180, 569, 18354, 8704, 17740, 1830, 837, 569), 'count': 1, 'text': 'As with previous Valkyira Chronicles games, V'}
{'tokens': (1722, 351, 2180, 569, 18354, 8704, 177

The counts are weird. There are ~15K occurrences of "as" in the train corpus.
If I call `dawg.get_count()` before transitioning, I get way too many occurrences (like 2M). If I call it after, I get too few (like 200). How do I get the counts?

Also, we have redundant substrings. For instance, "As with previous" only occurs as a substring of "As with previous V" (both have a count of 1). I think that if `s` is a substring of `t`, we should keep `s` only if its count is strictly greater than the count of `t`, and otherwise throw it out.

In [49]:
# Sentence whose pieces only partially match the corpus (see the printed entries);
# presumably chosen as an out-of-corpus example -- TODO confirm.
query = "Usain bolt set the world record in the 100-meter dash."
# NOTE(review): suffix_context is assigned but never used in this cell -- print it or drop it.
suffix_context = py_dawg.get_suffix_context(query)
matching_substrings = py_dawg.get_matching_substrings(query)

# Print one matched substring (tokens, count, decoded text) per line.
for substring in matching_substrings:
    print(substring)  

{'tokens': (5842,), 'count': 4, 'text': 'Us'}
{'tokens': (391,), 'count': 271, 'text': 'ain'}
{'tokens': (18100,), 'count': 4, 'text': ' bolt'}
{'tokens': (900,), 'count': 725, 'text': ' set'}
{'tokens': (900, 262), 'count': 29, 'text': ' set the'}
{'tokens': (262, 995), 'count': 347, 'text': ' the world'}
{'tokens': (262, 995, 1700), 'count': 2, 'text': ' the world record'}
{'tokens': (995, 1700, 287), 'count': 1, 'text': ' world record in'}
{'tokens': (995, 1700, 287, 262), 'count': 1, 'text': ' world record in the'}
{'tokens': (995, 1700, 287, 262, 1802), 'count': 1, 'text': ' world record in the 100'}
{'tokens': (12,), 'count': 17027, 'text': '-'}
{'tokens': (14470,), 'count': 7, 'text': ' dash'}
{'tokens': (13,), 'count': 8668, 'text': '.'}


Apparently the token "meter" never appears?

In [43]:
# Query made of a single word repeated four times.
query = "foo foo foo foo"
matching_substrings = py_dawg.get_matching_substrings(query)
# Print one matched substring (tokens, count, decoded text) per line.
for entry in matching_substrings:
    print(entry)

{'tokens': (21943,), 'count': 2347039, 'text': 'foo'}
{'tokens': (21943, 22944), 'count': 2347039, 'text': 'foo foo'}
{'tokens': (21943, 22944, 22944), 'count': 2347039, 'text': 'foo foo foo'}
{'tokens': (21943, 22944, 22944, 22944), 'count': 2347039, 'text': 'foo foo foo foo'}


In [44]:
# Same query text as the preceding cell, passed as a literal rather than via `query`.
# The bare expression's value is shown as the cell output.
py_dawg.get_suffix_context("foo foo foo foo")

{'tokens': [21943, 22944, 22944, 22944],
 'suffix_contexts': [0, 0, 0, 0],
 'context_counts': [2347039, 2347039, 2347039, 2347039]}

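Weird bug here: every prefix of "foo foo foo foo" is reported with the same count (2,347,039), and every suffix context is 0. What happened? Does the "-" character do something funky?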

In [37]:
# Hyphenated query with no spaces, used to check how "-" is handled during matching.
query = "foo-bar"
matching_substrings = py_dawg.get_matching_substrings(query)

# Print one matched substring (tokens, count, decoded text) per line.
for substring in matching_substrings:
    print(substring)  

{'tokens': (21943,), 'count': 2347039, 'text': 'foo'}
{'tokens': (12,), 'count': 17027, 'text': '-'}
{'tokens': (5657,), 'count': 126, 'text': 'bar'}
