<a href="https://colab.research.google.com/github/walkerjian/DailyCode/blob/main/StringMatchingAlgorithms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Naive String Matching
def naive_string_matching(text, pattern):
    """
    Naive String Matching Algorithm

    Compares the pattern against every window of the same width in the
    text, left to right, and returns the start index of the first window
    that equals the pattern. Returns False when no window matches.

    Note that a hit at index 0 is falsy as well, so callers must test the
    result with ``is not False`` rather than plain truthiness.

    Time Complexity: O(N * k)
    """
    width = len(pattern)
    window_starts = range(len(text) - width + 1)
    hits = (
        start
        for start in window_starts
        if text[start:start + width] == pattern
    )
    # The generator is lazy, so scanning stops at the first matching window.
    return next(hits, False)

In [2]:
# KMP Algorithm
def kmp_string_matching(text, pattern):
    """
    KMP (Knuth-Morris-Pratt) String Matching Algorithm

    Returns the start index of the first occurrence of pattern in text,
    or False when the pattern does not occur. An empty pattern matches at
    index 0, consistent with the naive matcher. Because index 0 is falsy,
    callers must test the result with ``is not False``.

    Time Complexity: O(N + k)
    """
    def compute_lps(pattern):
        # lps[i] = length of the longest proper prefix of pattern[:i + 1]
        # that is also a suffix of it.
        lps = [0] * len(pattern)
        length = 0
        for i in range(1, len(pattern)):
            # Fall back through shorter borders until one can be extended.
            while length and pattern[i] != pattern[length]:
                length = lps[length - 1]
            if pattern[i] == pattern[length]:
                length += 1
            lps[i] = length
        return lps

    m = len(pattern)
    if m == 0:
        # Without this guard pattern[0] below would raise IndexError.
        return 0

    lps = compute_lps(pattern)
    j = 0  # number of pattern characters currently matched
    for i, ch in enumerate(text):
        # On a mismatch reuse the longest border instead of rescanning text.
        while j and pattern[j] != ch:
            j = lps[j - 1]
        if pattern[j] == ch:
            j += 1
            if j == m:
                return i - m + 1
    return False

In [3]:
# Boyer-Moore Algorithm
def boyer_moore_string_matching(text, pattern):
    """
    Boyer-Moore String Matching Algorithm (using bad character heuristic)

    Returns the start index of the first occurrence of pattern in text,
    or False when the pattern does not occur. Because index 0 is falsy,
    callers must test the result with ``is not False``.

    The bad-character table is a dict keyed by character, so any Unicode
    character is supported (a fixed 256-slot table would raise IndexError
    for code points above 255).

    Time Complexity: Best case can be sub-linear, worst case is O(N * k).
    """
    def bad_character_heuristic(pattern):
        # Map each character to the index of its last occurrence in pattern.
        return {ch: idx for idx, ch in enumerate(pattern)}

    m = len(pattern)
    n = len(text)
    last_seen = bad_character_heuristic(pattern)

    s = 0  # current alignment of pattern against text
    while s <= n - m:
        # Compare right-to-left within the current alignment.
        j = m - 1
        while j >= 0 and pattern[j] == text[s + j]:
            j -= 1
        if j < 0:
            return s
        # Align the mismatched text character with its last occurrence in
        # the pattern (-1 when absent); always advance by at least one.
        s += max(1, j - last_seen.get(text[s + j], -1))
    return False

In [4]:
# Z-Algorithm
def z_algorithm_string_matching(text, pattern):
    """
    Z-Algorithm String Matching

    Returns the start index of the first occurrence of pattern in text,
    or False when the pattern does not occur. An empty pattern matches at
    index 0, consistent with the naive matcher. Because index 0 is falsy,
    callers must test the result with ``is not False``.

    The Z-array is computed over ``pattern + text`` with no separator
    character, and a position counts as a match when its Z-value is at
    least len(pattern). This avoids reserving a sentinel such as "$",
    which produced wrong answers whenever the text itself contained it.

    Time Complexity: O(N + k)
    """
    def compute_z_array(s):
        # z[i] = length of the longest substring starting at i that is
        # also a prefix of s; [l, r] is the rightmost such match so far.
        n = len(s)
        z = [0] * n
        l, r = 0, 0
        for i in range(1, n):
            if i <= r:
                # Reuse the value already computed inside the current box.
                z[i] = min(r - i + 1, z[i - l])
            while i + z[i] < n and s[z[i]] == s[i + z[i]]:
                z[i] += 1
            if i + z[i] - 1 > r:
                l, r = i, i + z[i] - 1
        return z

    k = len(pattern)
    if k == 0:
        return 0

    combined = pattern + text
    z_array = compute_z_array(combined)

    # Only positions inside the text part (index >= k) are candidates; a
    # Z-value >= k there means combined[i:i + k] equals the pattern.
    for i in range(k, len(combined)):
        if z_array[i] >= k:
            return i - k
    return False

In [10]:
def rabin_karp(text, pattern, base=256, mod=10**9 + 7):
    """
    Rabin-Karp String Matching Algorithm (rolling polynomial hash)

    Returns the start index of the first occurrence of pattern in text,
    or False when the pattern does not occur. An empty pattern matches at
    index 0, consistent with the naive matcher. Because index 0 is falsy,
    callers must test the result with ``is not False``.

    ``base`` and ``mod`` parameterise the polynomial hash. Every hash hit
    is confirmed by a direct substring comparison, so collisions can only
    cost time, never correctness.

    Time Complexity: O(N + k) on average, O(N * k) in the worst case
    (when many windows collide with the pattern hash).
    """
    n, k = len(text), len(pattern)
    if k == 0:
        # Guard: otherwise pow(base, -1, mod) below raises ValueError when
        # base has no inverse modulo mod (and always on Python < 3.8).
        return 0
    if k > n:
        return False

    # Compute initial hashes
    h, window = 0, 0
    for i in range(k):
        h = (h * base + ord(pattern[i])) % mod
        window = (window * base + ord(text[i])) % mod

    # Weight of the leading character of a window: base**(k - 1) mod mod.
    highest_base_power = pow(base, k - 1, mod)

    for i in range(n - k + 1):
        if h == window and text[i:i+k] == pattern:
            return i

        if i + k < n:
            # Roll the window: drop text[i], shift left, append text[i + k].
            window = (window - ord(text[i]) * highest_base_power) * base + ord(text[i+k])
            window %= mod

    return False

In [5]:
def test_string_matching(matching_function):
    """Test function for string matching routines using a function pointer."""
    test_cases = [
        ("hello world", "world"),
        ("hello world", "earth"),
        ("abcdefgh", "cde"),
        ("abcdefgh", "xyz"),
        ("a" * 100 + "b", "b"),
        ("a" * 100 + "b", "aa"),
        ("openai", "pena"),
        ("openai", "openai"),
        ("openai", "openaiz"),
        ("abracadabra", "abra")
    ]

    results = []

    for text, pattern in test_cases:
        result = matching_function(text, pattern)
        results.append((text, pattern, result))

    return results


In [11]:
# Run every matcher through the shared test harness, in a fixed order.
(
    naive_results,
    kmp_results,
    boyer_moore_results,
    z_algorithm_results,
    rabin_karp_results,
) = map(
    test_string_matching,
    (
        naive_string_matching,
        kmp_string_matching,
        boyer_moore_string_matching,
        z_algorithm_string_matching,
        rabin_karp,
    ),
)

# Bare trailing expression: the notebook renders this tuple as the cell output.
naive_results, kmp_results, boyer_moore_results, z_algorithm_results, rabin_karp_results

([('hello world', 'world', 6),
  ('hello world', 'earth', False),
  ('abcdefgh', 'cde', 2),
  ('abcdefgh', 'xyz', False),
  ('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab',
   'b',
   100),
  ('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab',
   'aa',
   0),
  ('openai', 'pena', 1),
  ('openai', 'openai', 0),
  ('openai', 'openaiz', False),
  ('abracadabra', 'abra', 0)],
 [('hello world', 'world', 6),
  ('hello world', 'earth', False),
  ('abcdefgh', 'cde', 2),
  ('abcdefgh', 'xyz', False),
  ('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab',
   'b',
   100),
  ('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab',
   'aa',
   0),
  ('openai', 'pena', 1),
  ('openai', 'openai', 0),
  ('openai', 'openaiz', False),
  ('abracadabra', 'abra', 0)],
 [('hello world', 'world', 6

In [13]:
import textwrap
def display_algorithm_results(name, description, results):
    """Print a word-wrapped report for one algorithm.

    Prints the algorithm name, its description wrapped to 80 columns, one
    wrapped line per ``(text, pattern, result)`` entry in ``results``, and
    a closing rule of 80 dashes. An entry counts as "not found" only when
    its result is the object ``False``, so a hit at index 0 is reported
    as found.
    """
    line_width = 80
    print(f"{name}:\n")
    print(textwrap.fill(description, width=line_width))
    print("\nResults:")
    for text, pattern, result in results:
        if result is False:
            status = "not found"
        else:
            status = f"found at index {result}"
        print(
            textwrap.fill(
                f"Pattern '{pattern}' in text '{text}' was {status}.",
                width=line_width,
            )
        )
    print("-" * line_width)
# Print a word-wrapped report (description plus per-case results) for each
# algorithm, using the result lists produced by the test harness.
display_algorithm_results(
    "Naive String Matching",
    ("The Naive String Matching algorithm checks for a pattern match at every "
     "position in the text. It is suitable for simple scenarios where the text and "
     "pattern are relatively small. Time Complexity: O(N * k) where N is the length "
     "of the text and k is the length of the pattern."),
    naive_results
)

display_algorithm_results(
    "KMP (Knuth-Morris-Pratt) Algorithm",
    ("The KMP algorithm uses preprocessing to determine how much of the previous "
     "comparison can be skipped when a mismatch occurs. It is efficient for scenarios "
     "where the same pattern is searched frequently in various texts. Time Complexity: "
     "O(N + k) due to preprocessing of the pattern."),
    kmp_results
)

display_algorithm_results(
    "Boyer-Moore Algorithm",
    ("The Boyer-Moore algorithm uses the bad character heuristic to skip sections of "
     "the text, reducing the number of comparisons. It's efficient for searching small "
     "patterns in large texts. Time Complexity: Best case can be sub-linear, but the "
     "worst case is O(N * k). On average, it often performs better than other algorithms "
     "in practical scenarios."),
    boyer_moore_results
)

display_algorithm_results(
    "Z-Algorithm",
    ("The Z-algorithm constructs the Z-array for a combined string of 'pattern + special "
     "character + text'. Each value in the Z-array represents the length of the substring "
     "starting from that position which is also a prefix of the entire string. Time Complexity: "
     "O(N + k) due to the Z-array construction."),
    z_algorithm_results
)

# The worst case for Rabin-Karp is O(N * k): when many windows collide with
# the pattern hash, each collision costs a full substring comparison.
display_algorithm_results(
    "Rabin-Karp Algorithm",
    ("The Rabin-Karp algorithm uses a rolling hash to quickly filter out substrings that "
     "cannot match the pattern. Time Complexity: The worst-case time complexity of the "
     "Rabin-Karp algorithm is O(N * k), when many hash collisions force full comparisons, "
     "but its average and best-case time complexity is O(N + k) under reasonable "
     "assumptions."),
    rabin_karp_results
)


Naive String Matching:

The Naive String Matching algorithm checks for a pattern match at every position
in the text. It is suitable for simple scenarios where the text and pattern are
relatively small. Time Complexity: O(N * k) where N is the length of the text
and k is the length of the pattern.

Results:
Pattern 'world' in text 'hello world' was found at index 6.
Pattern 'earth' in text 'hello world' was not found.
Pattern 'cde' in text 'abcdefgh' was found at index 2.
Pattern 'xyz' in text 'abcdefgh' was not found.
Pattern 'b' in text 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab' was found at index 100.
Pattern 'aa' in text 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab' was found at index 0.
Pattern 'pena' in text 'openai' was found at index 1.
Pattern 'openai' in text 'openai' was found at index 0.
Pattern 'openaiz' in text 'openai' was not found.
Pattern 'abra' in 