# AC算法

Aho-Corasick（AC）算法是一种多模式串匹配算法。

它可以在一次文本扫描中同时匹配多个模式串，常用于实现敏感词过滤。

In [49]:
class Node:
    """One node of the pattern trie (one state of the Aho-Corasick automaton).

    Args:
        char: Label of the edge leading into this node.
        is_end_char: Whether a stored word ends exactly at this node.
    """

    def __init__(self, char, is_end_char=False):
        # Label of the edge from the parent to this node.
        self.char = char
        # True when a complete word terminates at this node.
        self.is_end_char = is_end_char
        # Length of the word ending here; stays 0 until Trie.insert sets it.
        self.length = 0
        # Outgoing edges: label -> child Node.
        self.children = {}
        # Failure link; None until Trie.buildFailNode assigns it.
        self.fail = None

    def __repr__(self):
        # __repr__ must return a str; labels may be non-strings (for example
        # the ints obtained by iterating a bytes pattern), so convert.
        return str(self.char)

    def to_dict(self):
        """Return the node's attribute dictionary.

        This is the live ``vars(self)`` mapping, not a copy: changes made to
        the returned dict change the node.
        """
        return vars(self)

class Trie:
    """Trie of words with Aho-Corasick failure links for multi-pattern search.

    Typical use: ``insert`` every pattern, then call ``match`` (prints each
    occurrence) or ``iter_matches`` (yields each occurrence) on a text.

    Failure links are rebuilt automatically before matching whenever a word
    has been inserted since the last build, so calling ``buildFailNode`` by
    hand is optional.  Only insertions made through ``insert`` are tracked;
    nodes edited directly require an explicit ``buildFailNode`` call.
    """

    def __init__(self):
        # Sentinel root node; its '/' label is never used for lookups.
        self.root = Node('/')
        # True while the trie may contain nodes without a computed fail link.
        self._fail_links_stale = False

    def insert(self, word):
        """Add ``word`` to the trie, creating missing nodes along its path.

        Args:
            word: Iterable of hashable symbols with a ``len`` (e.g. a str).
        """
        node = self.root
        for char in word:
            child = node.children.get(char)
            if child is None:
                child = node.children[char] = Node(char)
            node = child
        node.is_end_char = True
        node.length = len(word)
        # Newly created nodes have no fail link yet.
        self._fail_links_stale = True

    def find(self, word):
        """Return True if exactly ``word`` was inserted, otherwise False."""
        node = self.root
        for char in word:
            node = node.children.get(char)
            if node is None:
                return False
        return bool(node.is_end_char)

    def buildFailNode(self):
        """Compute the failure link of every node with a breadth-first walk.

        A node's failure link is found by following its parent's failure
        chain until some node has a child with the same edge label; when the
        chain runs out, the link points to the root.
        """
        # Function-scope import keeps this block self-contained.
        from collections import deque

        root = self.root
        # deque gives O(1) pops from the left; list.pop(0) is O(n).
        queue = deque([root])
        while queue:
            node = queue.popleft()
            for char, child in node.children.items():
                if node is root:
                    # Depth-1 nodes always fall back to the root.
                    child.fail = root
                else:
                    fallback = node.fail
                    while fallback is not None and char not in fallback.children:
                        fallback = fallback.fail
                    # root.fail is None, so None here means the chain ran out.
                    if fallback is None:
                        child.fail = root
                    else:
                        child.fail = fallback.children[char]
                queue.append(child)
        self._fail_links_stale = False

    def iter_matches(self, text):
        """Yield ``(start, end, matched)`` for every stored word found in ``text``.

        ``matched`` equals ``text[start:end]``.  Matches are produced in order
        of their end position; among words ending at the same position, the
        automaton's current state is reported first, then the nodes along its
        failure chain.

        Args:
            text: Sequence to scan; it must support slicing.
        """
        if self._fail_links_stale:
            # Without this, nodes added after the last build still have
            # fail=None and following them would raise AttributeError.
            self.buildFailNode()

        root = self.root
        state = root
        for pos, char in enumerate(text):
            # Fall back along failure links until a state accepts ``char``.
            while char not in state.children and state is not root:
                state = state.fail
            state = state.children.get(char, root)

            # Report the current state and every node on its failure chain
            # that marks the end of a stored word.
            suffix = state
            while suffix is not root:
                if suffix.is_end_char:
                    start = pos - suffix.length + 1
                    yield start, pos + 1, text[start:pos + 1]
                suffix = suffix.fail

    def match(self, text):
        """Print one line per occurrence of a stored word in ``text``.

        Returns None; use ``iter_matches`` to obtain the matches as data.
        """
        for start, end, word in self.iter_matches(text):
            print('match -> text[%s:%s]: %s' % (start, end, word))
                    
                
            



# Demo: store three patterns, then scan a sample text for all of them.
trie = Trie()
trie.insert('abce')
trie.insert('bcd')
trie.insert('ce')
# Compute the failure links once every pattern has been inserted.
trie.buildFailNode()

# Prints one "match -> text[start:end]: word" line per occurrence found.
trie.match('cececcecasdasdabce')

match -> text[0:2]: ce
match -> text[2:4]: ce
match -> text[5:7]: ce
match -> text[14:18]: abce
match -> text[16:18]: ce
