In [80]:
import codecs
from time import time

import numpy as np

## Наивный алгоритм

In [81]:
def naive_matcher(text, pattern):
    n = len(text)
    m = len(pattern)
    results = []
    for i in range(n - m + 1):
        j = 0
        while j < m:
            if text[i + j] != pattern[j]:
                break
            j += 1
        if j == m:
            results.append((i, i + j - 1))
    return results

## Алгоритм Рабина-Карпа

In [82]:
def RK_matcher(text, pattern):
    n = len(text)
    m = len(pattern)
    p = 101
    alphabet_len = 256 # num characters in alphabet

    p_pow = np.zeros(max(n, m), dtype=np.int64)
    p_pow[0] = 1   
    for i in range(1, len(p_pow)):
        p_pow[i] = (p_pow[i - 1] * p) % alphabet_len
    
    h = np.zeros(n + 1, dtype=np.int64)
    
    for i in range(n):
        h[i + 1] = (h[i] + (ord(text[i]) - ord('a') + 1) * p_pow[i]) % alphabet_len
    
    h_s = 0
    for i in range(m):
        h_s = (h_s + (ord(pattern[i]) - ord('a') + 1) * p_pow[i]) % alphabet_len
    
    results = []
    i = 0
    for i in range(n - m + 1):
        curr_h = (h[i + m] + alphabet_len - h[i]) % alphabet_len
        if curr_h == (h_s * p_pow[i]) % alphabet_len:
            results.append((i, i + m - 1))
    return results

## Алгоритм Кнутта-Мориса-Пратта

In [83]:
def compute_prefix(pattern):
    m = len(pattern)
    pref_fun = [0] * len(pattern)
    k = 0
    for i in range(1, m):
        while k > 0 and pattern[k] != pattern[i]:
            k = pref_fun[k - 1]
        if pattern[k] == pattern[i]:
            k += 1
        pref_fun[i] = k
    return pref_fun

def KMP_matcher(text, pattern):
    n = len(text)
    m = len(pattern)
    results = []
    prefix_fun = compute_prefix(pattern)
    q = 0

    for i in range(n):
        while q > 0 and pattern[q] != text[i]:
            q = prefix_fun[q - 1]
        if pattern[q] == text[i]:
            q += 1
        if q == m:
            results.append((i + 1 - q, i))
            q = prefix_fun[q - 1]
    return results

## Алгоритм Ахо-Карасика 

In [105]:
class Node:
    def __init__(self):
        self.next = {}
        self.fail = None
        self.is_word = False


class AhoCorasick:
    def __init__(self, patterns):
        self.root = Node()
        self.make_bor(patterns)
        self.set_links()

    def add_pattern(self, pattern):
        tmp = self.root
        for char in pattern:
            tmp = tmp.next.setdefault(char, Node())
        tmp.is_word = True

    def make_bor(self, patterns):
        if not isinstance(patterns, list):
            self.add_pattern(patterns)
        else:
            for pattern in patterns:
                self.add_pattern(pattern)

    def set_links(self):
        queue = [self.root]
        while queue:
            temp = queue.pop()
            p = None
            for key in temp.next.keys():
                if temp == self.root:
                    temp.next[key].fail = self.root
                else:
                    p = temp.fail
                    while p is not None:
                        if key in p.next:
                            temp.next[key].fail = p.next[key]
                            break
                        p = p.fail
                    if p is None:
                        temp.next[key].fail = self.root
                queue.append(temp.next[key])

    def search(self, text):
        results = set()
        start_index = 0

        for curr_position in range(len(text)):
            word = text[curr_position]
            end_index = curr_position
            p = self.root
            while word in p.next:
                if p == self.root:
                    start_index = curr_position
                p = p.next[word]
                if p.is_word:
                    if (start_index, end_index) not in results:
                        results.add((start_index, end_index))
                if p.next and end_index + 1 < len(text):
                    end_index += 1
                    word = text[end_index]
                else:
                    break
                while (word not in p.next) and (p != self.root):
                    p = p.fail
                    start_index += 1
                if p == self.root:
                    break
        return list(results)
    
def AC_matcher(text, pattern):
    aho_corasick = AhoCorasick(pattern)
    return aho_corasick.search(text)

## Measurements of benchmark

In [118]:
def measure_function(fun, text, pattern, expected, iterations=1):
    total_time = 0
    for i in range(iterations):
        start = time()
        result = fun(text, pattern)
        if i == 0:
            if result != expected:
                print("RESULT IS WRONG:", result, "EXPECTED:", expected)
            else:
                print('result:', result)
        end = time()
        total_time += (end - start)
    print('total time:', total_time / iterations, '\n')


def test(text, pattern):
    expected = naive_matcher(text, pattern)
    
    print('\tНаивный алгоритм')
    measure_function(naive_matcher, text, pattern, expected)
    
    print('\tАлгоритм Рабина-Карпа')
    measure_function(RK_matcher, text, pattern, expected)
    
    print('\tАлгоритм Кнута-Морриса-Пратта')
    measure_function(KMP_matcher, text, pattern, expected)
    
    print('\tАлгоритм Ахо-Карасика')
    measure_function(AC_matcher, text, pattern, expected)
    
def start_testing():
    for bench_type in ['bad', 'good']:
        for i in range(1, 5):
            text = codecs.open(f"benchmarks/{bench_type}_t_{i}.txt", "r", "utf_8_sig")
            pattern = codecs.open(f"benchmarks/{bench_type}_w_{i}.txt", "r", "utf_8_sig")
            print(f'TEST FILE {bench_type} №{i}')
            test(text.read(), pattern.read())

In [119]:
start_testing()

TEST FILE bad №1
	Наивный алгоритм
result: [(8, 9)]
total time: 0.0 

	Алгоритм Рабина-Карпа
result: [(8, 9)]
total time: 0.0 

	Алгоритм Кнута-Морриса-Пратта
result: [(8, 9)]
total time: 0.0 

	Алгоритм Ахо-Карасика
result: [(8, 9)]
total time: 0.0 

TEST FILE bad №2
	Наивный алгоритм
result: [(90, 99)]
total time: 0.0 

	Алгоритм Рабина-Карпа
result: [(90, 99)]
total time: 0.00035572052001953125 

	Алгоритм Кнута-Морриса-Пратта
result: [(90, 99)]
total time: 0.0 

	Алгоритм Ахо-Карасика
result: [(90, 99)]
total time: 0.0031938552856445312 

TEST FILE bad №3
	Наивный алгоритм
result: [(900, 999)]
total time: 0.023932695388793945 

	Алгоритм Рабина-Карпа
result: [(900, 999)]
total time: 0.008287668228149414 

	Алгоритм Кнута-Морриса-Пратта
result: [(900, 999)]
total time: 0.001033782958984375 

	Алгоритм Ахо-Карасика
result: [(900, 999)]
total time: 0.6296796798706055 

TEST FILE bad №4
	Наивный алгоритм
result: [(4000, 4999)]
total time: 1.147019863128662 

	Алгоритм Рабина-Карпа
resu