In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive so that
# files stored on Drive can be opened by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Load the word list: one entry per line, with surrounding whitespace
# (including the trailing newline) stripped from each line.
with open("/content/drive/MyDrive/brown_nouns.txt") as src:
    words = list(map(str.strip, src))

class TrieNode:
    """A single node of a character trie."""

    def __init__(self):
        # Maps a character to the child node reached by following it.
        self.children = {}
        # Number of inserted words whose path reaches this node
        # (including words that end here).
        self.count = 0
        # Number of inserted words that end exactly at this node.
        self.end = 0


class Trie:
    """Character trie used to split words into a stem and a suffix.

    With ``suffix=False`` words are stored left to right (a prefix trie).
    With ``suffix=True`` every word is reversed before being stored or looked
    up, so the trie indexes word endings (a suffix trie).
    """

    def __init__(self, suffix=False):
        self.root = TrieNode()
        self.suffix = suffix

    def insert(self, word):
        """Add ``word`` to the trie, updating pass-through and end counts."""
        chars = word[::-1] if self.suffix else word
        node = self.root
        node.count += 1
        for ch in chars:
            if ch not in node.children:
                node.children[ch] = TrieNode()
            node = node.children[ch]
            node.count += 1
        node.end += 1

    def _best_split_index(self, word, method="max_children"):
        """Return how many leading trie characters belong before the split.

        The word (reversed when ``self.suffix`` is set) is walked through the
        trie. Every node on the path with at least two children is a candidate
        split point and is scored according to ``method``:

        * ``"max_children"`` -- the number of children of the node;
        * ``"entropy"`` -- ``-sum(p * log2(p))`` over the children, where
          ``p`` is ``child.count / node.count``.

        Any other ``method`` value is scored like ``"max_children"``.

        The highest-scoring candidate wins; on equal scores the earliest one
        is kept. If the path contains no branching node (or the word leaves
        the trie before one is found), ``len(word)`` is returned, meaning
        "do not split".
        """
        # Local import: the entropy branch needs ``math``, which the
        # surrounding module does not import.
        import math

        chars = word[::-1] if self.suffix else word
        node = self.root
        best_idx = None
        best_score = -1.0

        for i, ch in enumerate(chars):
            node = node.children.get(ch)
            if node is None:
                # The rest of the word is not in the trie.
                break

            if len(node.children) < 2:
                # Not a branching point, so not a split candidate.
                continue

            if method == "entropy":
                # NOTE: ``node.count`` also includes words that end at this
                # node, so the child probabilities may sum to less than 1.
                total = node.count
                score = -sum(
                    (child.count / total) * math.log2(child.count / total)
                    for child in node.children.values()
                )
            else:
                score = float(len(node.children))

            # Strict comparison keeps the earliest candidate on ties.
            if score > best_score:
                best_score = score
                best_idx = i + 1

        return len(chars) if best_idx is None else best_idx

    def split_word(self, word, method="max_children"):
        """Split ``word`` into ``(stem, suffix)`` at the best branching point.

        In prefix mode the first ``split_len`` characters form the stem. In
        suffix mode the last ``split_len`` characters form the suffix. When
        no split applies, the whole word is returned as the stem together
        with an empty suffix.
        """
        split_len = self._best_split_index(word, method=method)
        if self.suffix:
            if split_len < len(word):
                stem = word[:-split_len]
                suffix = word[-split_len:]
            else:
                stem, suffix = word, ""
        else:
            stem = word[:split_len]
            suffix = word[split_len:]
        return stem, suffix



# Build one trie over the words as written and one over the reversed words.
prefix_trie = Trie(suffix=False)
suffix_trie = Trie(suffix=True)

for w in words:
    for trie in (prefix_trie, suffix_trie):
        trie.insert(w)

# Side-by-side comparison of the two segmentations for the first 30 words.
print(f"{'Word':15s} | {'Prefix Trie':20s} | {'Suffix Trie':20s}")
print("-"*65)

for w in words[:30]:
    cells = []
    for trie in (prefix_trie, suffix_trie):
        stem, tail = trie.split_word(w)
        # Show "stem+suffix" only when a suffix was actually split off.
        if tail:
            cells.append(f"{stem}+{tail}")
        else:
            cells.append(stem)

    pre_result, suf_result = cells
    print(f"{w:15s} | {pre_result:20s} | {suf_result:20s}")


Word            | Prefix Trie          | Suffix Trie         
-----------------------------------------------------------------
investigation   | in+vestigation       | investigati+on      
primary         | p+rimary             | primar+y            
election        | e+lection            | electi+on           
evidence        | e+vidence            | evidenc+e           
irregularities  | i+rregularities      | irregularitie+s     
place           | p+lace               | plac+e              
jury            | ju+ry                | jur+y               
presentments    | p+resentments        | presentment+s       
charge          | c+harge              | charg+e             
election        | e+lection            | electi+on           
praise          | p+raise              | prais+e             
thanks          | t+hanks              | thank+s             
manner          | ma+nner              | mann+er             
election        | e+lection            | electi+on           
term