# KMP算法(Knuth-Morris-Pratt算法)




In [163]:
def _get_next(pattern):
    """Build the KMP failure table (the "next" array) for ``pattern``.

    ``table[i]`` is the index of the last character of the longest proper
    prefix of ``pattern[:i + 1]`` that is also a suffix of it, or ``-1``
    when no such prefix exists.

    Args:
        pattern: The sequence to preprocess (typically a ``str``).

    Returns:
        A list with exactly one entry per element of ``pattern``; an empty
        list for an empty pattern.
    """
    if not pattern:
        # No characters means no table entries (one entry per character).
        return []

    table = [-1]
    matched = -1  # last index of the border currently being extended
    for k in range(1, len(pattern)):
        # Fall back to shorter and shorter borders until one can be
        # extended by pattern[k], or until no border is left (-1).
        while matched != -1 and pattern[matched + 1] != pattern[k]:
            matched = table[matched]

        if pattern[matched + 1] == pattern[k]:
            matched += 1

        table.append(matched)
    return table


def kmp(base, pattern):
    """Find the first occurrence of ``pattern`` in ``base`` using KMP.

    Args:
        base: The text to search in.
        pattern: The text to search for.

    Returns:
        The string ``'base[start:end]: match'`` describing the first match,
        where ``base[start:end] == pattern``, or ``False`` when ``pattern``
        does not occur in ``base``. An empty pattern matches at position 0
        (the same convention as ``str.find``).
    """
    pattern_len = len(pattern)
    if pattern_len == 0:
        # Guard: indexing pattern[0] below would raise IndexError.
        return 'base[%s:%s]: %s' % (0, 0, base[0:0])

    fail = _get_next(pattern)
    matched = 0  # number of pattern characters matched so far
    for pos, char in enumerate(base):
        # On a mismatch, reuse the longest border of the matched prefix
        # instead of restarting from the beginning of the pattern.
        while matched > 0 and char != pattern[matched]:
            matched = fail[matched - 1] + 1

        if char == pattern[matched]:
            matched += 1

        if matched == pattern_len:
            # pos is the index of the LAST matched character, so the match
            # spans [pos - pattern_len + 1, pos + 1).
            end = pos + 1
            start = end - pattern_len
            return 'base[%s:%s]: %s' % (start, end, base[start:end])

    return False

    
# 'ababacd' does not occur in 'ababaeabac', so this call evaluates to False.
kmp('ababaeabac', 'ababacd')

False

## next数组分析


next 数组的下标对应模式串的下标，值对应**模式串开头到这个下标这段字符串**的**最长可匹配前缀子串最后一个字符的下标**（不存在可匹配的前缀子串时为 -1）。

以`base = 'ababacd'`为例：


`base[1] != base[0]`, 没有匹配的字符，并且已经没有次长可匹配前缀子串和次长可匹配后缀子串，`next[1] = -1`

`base[2] == base[0]`, 最长可匹配前缀子串和最长可匹配后缀子串是`a`，`next[2] = 0`

`base[3] == base[1]`, 上一个位置的最长可匹配前缀子串最后一个字符的下标是 0（`next[2] = 0`），因为`base[3] == base[1]`，最长可匹配前缀子串和最长可匹配后缀子串扩展为`ab`，`next[3] = 1`

`base[4] == base[2]`, 上一个位置的最长可匹配前缀子串最后一个字符的下标是 1（`next[3] = 1`），因为`base[4] == base[2]`，最长可匹配前缀子串和最长可匹配后缀子串扩展为`aba`，`next[4] = 2`

`base[5] != base[3]`，回退到 `next[2] = 0` 后 `base[5] != base[1]`，再回退到 `next[0] = -1` 后 `base[5] != base[0]`，没有匹配的字符，所以 `next[5] = -1`

`base[6] != base[0]`, 没有匹配的字符，`next[6] = -1`

In [164]:
def _get_next(pattern):
    """Build the KMP failure table for ``pattern``, printing a trace.

    ``table[i]`` is the index of the last character of the longest proper
    prefix of ``pattern[:i + 1]`` that is also a suffix of it, or ``-1``
    when no such prefix exists. For every position ``k >= 1`` three lines
    are printed: the state before the step (with the result of the first
    comparison), the state after the step, and a separator.

    Args:
        pattern: The sequence to preprocess (typically a ``str``).

    Returns:
        A list with exactly one entry per element of ``pattern``; an empty
        list for an empty pattern.
    """
    if not pattern:
        # No characters means no table entries (one entry per character).
        return []

    table = [-1]
    matched = -1  # last index of the border currently being extended
    for k in range(1, len(pattern)):
        # Before the step: position, current border end, table so far, and
        # whether the border can be extended without falling back.
        print(k, matched, table, pattern[matched + 1] == pattern[k])

        # Fall back through shorter borders until one can be extended.
        while matched != -1 and pattern[matched + 1] != pattern[k]:
            matched = table[matched]

        if pattern[matched + 1] == pattern[k]:
            matched += 1

        table.append(matched)
        # After the step: the new border end and the extended table.
        print(k, matched, table)
        print('-' * 20)
    return table


# Trace the table construction for 'ababacd'; the cell output below shows each step.
_get_next('ababacd')

1 -1 [-1] False
1 -1 [-1, -1]
--------------------
2 -1 [-1, -1] True
2 0 [-1, -1, 0]
--------------------
3 0 [-1, -1, 0] True
3 1 [-1, -1, 0, 1]
--------------------
4 1 [-1, -1, 0, 1] True
4 2 [-1, -1, 0, 1, 2]
--------------------
5 2 [-1, -1, 0, 1, 2] False
5 -1 [-1, -1, 0, 1, 2, -1]
--------------------
6 -1 [-1, -1, 0, 1, 2, -1] False
6 -1 [-1, -1, 0, 1, 2, -1, -1]
--------------------


[-1, -1, 0, 1, 2, -1, -1]

In [165]:
def _get_next(pattern):
    """Compute the KMP "next" table by direct prefix/suffix comparison.

    For each prefix ``pattern[:end]`` the proper prefixes and the non-empty
    proper suffixes are collected as sets; the longest string present in
    both determines the entry, stored as the index of its last character
    (its length minus one), or ``-1`` when the two sets share nothing.

    Returns a list with one entry per character of ``pattern``.
    """
    table = []
    known_prefixes = set()  # proper prefixes of pattern[:end], grown each step
    for end in range(1, len(pattern) + 1):
        known_prefixes.add(pattern[:end - 1])

        # Every non-empty proper suffix of pattern[:end].
        tails = {pattern[start:end] for start in range(1, end)}

        shared = known_prefixes & tails
        if shared:
            longest = max(shared, key=len)
            table.append(len(longest) - 1)
        else:
            table.append(-1)
    return table

# Same pattern as in the earlier cells, so the resulting tables can be compared.
_get_next('ababacd')

[-1, -1, 0, 1, 2, -1, -1]