In [7]:
import os
import pickle
from timeit import default_timer

In [15]:
# Load every word of the full dictionary into a flat dict (word -> True)
# and report how long the load took.
word_dict = {}
# Forward slashes only: the previous "assets\dictionaries" spelling relied on a
# Windows-only separator and on "\d" being an unrecognised escape sequence.
with open("../assets/dictionaries/full_dictionary.txt", "r") as f:
    start = default_timer()
    # Iterate the file object directly instead of materialising readlines().
    for line in f:
        line = line.strip()
        if line:  # skip blank lines so "" is never recorded as a word
            word_dict[line] = True
    end = default_timer()
    print(f"Took {end-start:.2f} s")

Took 0.38 s


In [18]:
# Loading the flat dict is fast, but searching it is slow; a word tree needs to be built from this.

In [62]:
class WordTree:
    """A character tree (trie) node.

    Each node maps single characters to child nodes; following the characters
    of a word from the root leads to the node representing that word.
    """

    def __init__(self, parent=None):
        """Create an empty node whose parent is ``parent`` (None for the root)."""
        # Children of this node, keyed by the next character.
        self.leafs = {}
        # True when the path from the root to this node spells a complete word.
        self.word = False
        # The node one level up, or None for the root.
        self.parent = parent

    def add_word(self, word):
        """Insert ``word`` below this node. An empty word is ignored."""
        self.attach_word(word)

    def attach_word(self, word):
        """Walk/create one node per character of ``word`` and mark the last as a word.

        Nodes are only constructed when a character has no child yet, so
        re-inserting shared prefixes does not allocate throwaway nodes.
        An empty ``word`` leaves the tree untouched.
        """
        if not word:
            return

        node = self
        for char in word:
            child = node.leafs.get(char)
            if child is None:
                child = WordTree(parent=node)
                node.leafs[char] = child
            node = child
        node.word = True

    def get_leaf(self, word):
        """Return True if ``word`` was added below this node as a complete word.

        Returns False when the path does not exist or only exists as a prefix
        of longer words. For an empty ``word`` the node's own ``word`` flag is
        returned.
        """
        node = self
        for char in word:
            node = node.leafs.get(char)
            if node is None:
                return False
        return node.word

In [58]:
# Build the word tree from the compact dictionary, keeping the stripped words
# in `lines`, and report how long the construction took.
word_tree = WordTree()
lines = []
# Forward slashes only: the previous "assets\dictionaries" spelling relied on a
# Windows-only separator and on "\d" being an unrecognised escape sequence.
with open("../assets/dictionaries/compact_dictionary.txt", "r") as f:
    start = default_timer()
    # Iterate the file object directly instead of materialising readlines().
    for line in f:
        line = line.strip()
        lines.append(line)
        word_tree.add_word(line)
    end = default_timer()
    print(f"Took {end-start:.2f} s")

Took 8.01 s


In [59]:
class WordTreeSerialiser:
    """Flattens a word tree into a compact string.

    The tree is walked depth-first and emitted as a token stream:
    ``*`` marks the current node as a complete word, ``<`` steps back up to
    the parent, and any other token is the key of the child being entered.
    Child keys are expected to be single characters.
    """

    def serialise(self, tree):
        """Return the serialised form of ``tree`` as a single string."""
        return ''.join(self.tokenise(tree))

    def tokenise(self, tree):
        """Return the list of tokens describing ``tree`` and everything below it.

        Recurses through ``tokenise`` itself, so subtrees are not joined into a
        string and split back into characters at every level.
        """
        tokens = []

        if tree.word:
            tokens.append("*")

        for char, leaf in tree.leafs.items():
            tokens.append(char)                 # descend into the child
            tokens.extend(self.tokenise(leaf))  # everything below the child
            tokens.append("<")                  # climb back to this node

        return tokens

In [60]:
# Flatten the in-memory word tree into its single-string representation.
serialiser = WordTreeSerialiser()
string = serialiser.serialise(tree=word_tree)

In [61]:
# Size of the serialised tree versus the total character count of the raw words.
print(len(string))
print(sum(map(len, lines)))

2143905
3011338


In [68]:
class WordTreeLoader:
    """Rebuilds a ``WordTree`` from its serialised token string."""

    def load(self, string):
        """Parse ``string`` one character at a time and return the root node.

        Tokens:
          ``*``  mark the current node as a complete word
          ``<``  move back up to the current node's parent
          other  descend into (creating if needed) the child keyed by the token

        Any character other than the two control tokens is a branch key, so
        words containing digits, apostrophes or hyphens are loaded correctly
        rather than being dropped. ``*`` and ``<`` themselves cannot appear
        inside words in this format.

        If a ``<`` climbs above the root, the position and remaining input
        are printed and the partially built tree is returned.
        """
        tree = WordTree()
        base = tree

        for i, token in enumerate(string):
            if token == '*':
                base.word = True
            elif token == '<':
                base = base.parent
            else:
                # Reuse an existing child so a repeated key never sends the
                # cursor into a node that is not attached to the tree.
                child = base.leafs.get(token)
                if child is None:
                    child = WordTree(base)
                    base.leafs[token] = child
                base = child

            if base is None:
                # More '<' tokens than levels: the input is unbalanced.
                print(f"Ended at {i}: {string[i:]}")
                break

        return tree

In [70]:
# Time rebuilding the word tree from its serialised string. The measured span
# covers both constructing the loader and running load(); `word_tree` is
# rebound to the freshly loaded tree.
start = default_timer()
loader = WordTreeLoader()
word_tree = loader.load(string)
end = default_timer()
print(f"Took {end-start:.2f}s")

Took 4.58s


In [74]:
# Sanity check on the reloaded tree: look up a word; the result is the cell output.
word_tree.get_leaf("hello")

True

In [77]:
# Store as branch (string) -> is_word (bool):
#   key present      => the string is a prefix (branch) of at least one word
#   value is True    => the string is itself a complete word
dictionary = {}
# Forward slashes only: the previous "assets\dictionaries" spelling relied on a
# Windows-only separator and on "\d" being an unrecognised escape sequence.
with open("../assets/dictionaries/compact_dictionary.txt", "r") as f:
    start = default_timer()
    # Iterate the file object directly instead of materialising readlines().
    for line in f:
        line = line.strip()
        # NOTE(review): `lines` is also filled by the cell that builds the word
        # tree, so this append duplicates entries — confirm whether it is needed.
        lines.append(line)

        if not line:
            # A blank line would otherwise mark "" as a complete word.
            continue

        # Register every proper prefix (including "") as a branch, without
        # downgrading a prefix that is already known to be a word.
        for prefix_len in range(len(line)):
            dictionary.setdefault(line[:prefix_len], False)

        # override branch if word
        dictionary[line] = True

    end = default_timer()
    print(f"Took {end-start:.2f} s")

Took 1.88 s


In [91]:
# Persist the prefix dictionary to disk using the highest pickle protocol
# available in this interpreter.
with open('dictionary.pickle', 'wb') as f:
    pickle.dump(dictionary, f, protocol=pickle.HIGHEST_PROTOCOL)

In [96]:
# Time reading the pickled dictionary back from disk (open + unpickle).
# Caution: pickle.load can execute arbitrary code, so only load pickle files
# from a trusted source.
start = default_timer()
with open('dictionary.pickle', 'rb') as f:
    loaded_dict = pickle.load(f)
end = default_timer()
print(f"Took {end-start:.2f}s")

Took 0.45s
