In [8]:
def convert_line_old(line, lineno=None):
    """Turn an ``offset:    title`` text line into ``offset<TAB>title``.

    Args:
        line: A ``str`` line in which the offset and the title are separated
            by a colon followed by four spaces.
        lineno: Unused; kept so existing callers that pass it keep working.

    Returns:
        The line with the first ``':    '`` replaced by a tab.  A line that
        does not contain the separator is returned unchanged.
    """
    # Split on the first separator only: a title may itself contain
    # ':    ', and that text must not be turned into extra tab columns.
    parts = line.split(':    ', 1)
    return '\t'.join(parts)

In [20]:
def convert_line(line):
    """Rewrite a bytes line ``offset:    title`` as ``offset<TAB>title``.

    Everything before the first colon is taken as the offset.  The colon and
    the four bytes that follow it are dropped, and the remainder (including
    any line terminator) is kept as-is.

    Raises:
        AssertionError: if the line contains no colon.
    """
    colon_at = line.find(b':')
    assert colon_at != -1
    # The title starts after the colon plus four padding bytes.
    title_at = colon_at + 5
    fields = [line[:colon_at], line[title_at:]]
    return b'\t'.join(fields)

In [21]:
def convert_file(path):
    """Convert an ``offset:    title`` index file into a tab-separated file.

    Reads ``path`` in binary mode, passes every line through
    ``convert_line`` and writes the results to a file whose name is ``path``
    with ``.txt`` replaced by ``.tsv``.  Lines that cannot be converted are
    printed with their zero-based line number and skipped.

    Args:
        path: Name of the source file; must contain ``.txt``.

    Raises:
        ValueError: if ``path`` contains no ``.txt``.  The output name would
            then equal the input name, and opening it for writing would
            truncate the source file.
    """
    dst_path = path.replace('.txt', '.tsv')
    if dst_path == path:
        raise ValueError(
            "cannot derive a .tsv output name from %r (no '.txt' in it)" % (path,)
        )

    with open(path, 'rb') as src, open(dst_path, 'wb') as dst:
        for (srclineno, srcline) in enumerate(src):
            try:
                # convert_line takes the line only; passing the line number
                # as well raised TypeError for every single line.
                converted = convert_line(srcline)
            except Exception:
                print("[%d]: %s" % (srclineno, srcline))
                continue
            # srcline still carries its own line terminator and convert_line
            # keeps it, so nothing is appended here (the former `+ '\r'`
            # also mixed bytes with str, which raises TypeError).
            # Write errors are deliberately not swallowed.
            dst.write(converted)
                

In [22]:
# Sample bytes line in the "offset:    title" source format, used to try out convert_line.
line = b'2615:    AccessibleComputing'

In [23]:
convert_line(line)

b'2615\tAccessibleComputing'

In [1]:
import string
import datrie

In [2]:
string.ascii_letters

'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

In [3]:
string.printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [4]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [5]:
# Alphabet handed to the titles trie.  string.printable already contains
# every string.punctuation character, so the second operand only repeats them.
allowed = "".join((string.printable, string.punctuation))

In [6]:
# Trie keyed by title; its alphabet is the `allowed` character set.
titles_trie = datrie.Trie(allowed)
# Trie keyed by offset strings, so its alphabet is limited to decimal digits.
titles_offset_trie = datrie.Trie(string.digits)
# NOTE(review): this alphabet has lowercase letters and punctuation only (no
# digits or whitespace), and no other visible cell references this trie —
# confirm it is still needed.
titles_lower_trie = datrie.Trie(string.ascii_lowercase + string.punctuation)

In [7]:
def convert_word(line):
    """Split an ``offset<TAB>word`` text line into ``(word, offset)``.

    The offset is everything before the first tab and is returned as a
    string.  A single trailing newline, if present, is removed from the word.

    Raises:
        AssertionError: if the line contains no tab.
    """
    tab_at = line.find('\t')
    assert tab_at != -1
    # Drop one trailing newline before slicing out the word.
    body = line[:-1] if line[-1] == '\n' else line
    return (body[tab_at + 1:], line[:tab_at])

In [8]:
def add_to_trie(trie, offset_trie, key, value):
    """Record a title/offset pair in both lookup structures.

    ``trie`` maps a title to a sorted list of integer offsets and
    ``offset_trie`` maps the offset string to a sorted list of titles.  When
    the title is not already lowercase and its lowercase form has no entry
    in ``trie`` yet, an alias entry holding the negated offset is added
    under the lowercase form.

    Args:
        trie: Mapping supporting ``in`` and item get/set, title -> list of int.
        offset_trie: Mapping supporting ``in`` and item get/set,
            offset string -> list of titles.
        key: The title.
        value: The offset as a string; parsed with ``int()``.

    Raises:
        ValueError: if ``value`` cannot be parsed as an integer.
    """
    offset = int(value)

    # NOTE(review): the in-place append/sort below relies on the mapping
    # returning the stored list object itself (true for dict) — confirm for
    # the trie type in use.
    if key in trie:
        offsets = trie[key]
        # Compare the parsed integer: the list holds ints, so testing the
        # raw string was always "not in" and appended duplicate offsets.
        if offset not in offsets:
            offsets.append(offset)
            offsets.sort()
    else:
        trie[key] = [offset]

    if value in offset_trie:
        titles = offset_trie[value]
        if key not in titles:
            titles.append(key)
            titles.sort()
    else:
        offset_trie[value] = [key]

    lower_key = key.lower()
    # Nothing more to do when the title is already lowercase.  (The former
    # comparison was against the offset string rather than the title.)
    if lower_key == key:
        return

    # The negative offset marks the entry as a case-folded alias.
    if lower_key not in trie:
        trie[lower_key] = [-offset]

In [9]:
def add_word(line):
    """Parse one ``offset<TAB>word`` line and add it to the title tries."""
    # convert_word returns (word, offset), which lines up with the
    # (key, value) parameters of add_to_trie.
    add_to_trie(titles_trie, titles_offset_trie, *convert_word(line))

In [37]:
# Collects (exception, line) pairs for lines that could not be added to the tries.
failed = []

In [20]:
# Tab-separated input file that the loading loop reads line by line.
tsv_path = 'titles-1m.tsv'

In [21]:
# File name used when saving/loading the titles trie: the input name with a .trie extension.
tsv_trie = tsv_path.replace('.tsv', '.trie')

In [22]:
# NOTE(review): derived file name for the offsets trie, but no visible cell
# uses it — the offsets trie is saved under a hard-coded name instead.
tsv_offsets_trie = tsv_trie.replace('.trie', '_offsets.trie')

In [23]:
# Load every line of the TSV file into the title tries, tolerating a
# limited number of bad lines before giving up.
max_failed = 10
# Start from an empty failure list on every run of this cell; otherwise
# failures left over from an earlier run count towards max_failed.
failed = []
with open(tsv_path, 'rb') as f:
    for bytes_line in f:
        # Decode before the try block so that `line` recorded in `failed`
        # is always the line that actually caused the error.
        line = bytes_line.decode('utf-8', 'replace')
        try:
            add_word(line)
        except Exception as e:
            failed.append((e, line))
            # Too many bad lines: stop and surface the latest error.
            if len(failed) >= max_failed:
                raise

KeyboardInterrupt: 

In [24]:
len(titles_trie)

416341

In [25]:
titles_trie.save(tsv_trie)

In [26]:
# NOTE(review): hard-coded file name; tsv_offsets_trie holds a derived name
# for this trie that is never used — confirm which one is intended.
titles_offset_trie.save('titles_by_offset.trie')

In [18]:
# NOTE(review): this cell's execution count (18) is out of order with its
# neighbours; if it ran right before the following len(failed) cell, that
# cell would raise TypeError instead of showing a count.
failed = None

In [28]:
len(failed)

10

In [29]:
failed[0]

(TypeError('unorderable types: int() < tuple()'),
 '886354688\t1911 encyclopedia\n')

In [45]:
# NOTE(review): `words` is not defined in any visible cell — it appears to
# come from kernel state outside this excerpt; confirm before re-running.
words.save('words.trie')

In [46]:
# NOTE(review): `offset_to_words` is not defined in any visible cell — it
# appears to come from kernel state outside this excerpt; confirm before re-running.
offset_to_words.save('words_by_offset.trie')

In [27]:
%timeit datrie.Trie.load(tsv_trie)

1 loops, best of 3: 3.8 s per loop


In [48]:
words.keys()[:10]

['A',
 'Aani',
 'Aaron',
 'Aaronic',
 'Aaronical',
 'Aaronite',
 'Aaronitic',
 'Aaru',
 'Ab',
 'Ababdeh']

In [35]:
%timeit words.prefixes('aaron')

The slowest run took 18.40 times longer than the fastest. This could mean that an intermediate result is being cached 
1000000 loops, best of 3: 195 ns per loop


In [44]:
%timeit words['aaron']

The slowest run took 21.18 times longer than the fastest. This could mean that an intermediate result is being cached 
10000000 loops, best of 3: 109 ns per loop


In [47]:
words.suffixes('Aa')

['ron']

In [49]:
words.items('aa')

[('aaron', [44])]