<a href="https://colab.research.google.com/github/walkerjian/DailyCode/blob/main/Knuth%E2%80%93Morris%E2%80%93Pratt_(KMP)_String_Matching.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Given a string and a pattern, find the starting indices of all occurrences of the pattern in the string. For example, given the string "abracadabra" and the pattern "abr", you should return [0, 7]


In [1]:
# Model
def find_pattern_indices(text, pattern):
    """
    Find all starting indices of the occurrences of a pattern in a given text.

    Parameters:
        text (str): The text in which to find the pattern.
        pattern (str): The pattern to find in the text.

    Returns:
        list: A list of starting indices where the pattern is found in the text.
              Returns an empty list if the pattern is not found.
    """
    indices = []
    for i in range(len(text) - len(pattern) + 1):
        if text[i:i + len(pattern)] == pattern:
            indices.append(i)
    return indices

# View
def display_result(text, pattern, indices):
    """
    Display the result to the user.

    Parameters:
        text (str): The original text.
        pattern (str): The pattern that was searched for.
        indices (list): The list of starting indices where the pattern is found.
    """
    print(f"Text: {text}")
    print(f"Pattern: {pattern}")
    print(f"Starting indices of occurrences: {indices}")

# Controller
def main():
    text = "abracadabra"
    pattern = "abr"
    indices = find_pattern_indices(text, pattern)
    display_result(text, pattern, indices)

# Test function
def test_find_pattern_indices():
    test_cases = [
        ("abracadabra", "abr", [0, 7]),
        ("hello world", "l", [2, 3, 9]),
        ("aaaaa", "aa", [0, 1, 2, 3]),
        ("", "a", []),
        ("abc", "", []),
        ("", "", []),
        ("mississippi", "iss", [1, 4]),
        ("xyzxyz", "xyz", [0, 3]),
        ("appleapple", "le", [3, 8]),
        ("testtest", "test", [0, 4])
    ]

    for i, (text, pattern, expected) in enumerate(test_cases):
        result = find_pattern_indices(text, pattern)
        print(f"Test case {i+1}")
        print(f"Text: {text}")
        print(f"Pattern: {pattern}")
        print(f"Expected: {expected}")
        print(f"Output: {result}")
        assert result == expected, f"Test case {i+1} failed"
        print(f"Test case {i+1} passed\n")

if __name__ == "__main__":
    main()
    print("Running tests...")
    test_find_pattern_indices()


Text: abracadabra
Pattern: abr
Starting indices of occurrences: [0, 7]
Running tests...
Test case 1
Text: abracadabra
Pattern: abr
Expected: [0, 7]
Output: [0, 7]
Test case 1 passed

Test case 2
Text: hello world
Pattern: l
Expected: [2, 3, 9]
Output: [2, 3, 9]
Test case 2 passed

Test case 3
Text: aaaaa
Pattern: aa
Expected: [0, 1, 2, 3]
Output: [0, 1, 2, 3]
Test case 3 passed

Test case 4
Text: 
Pattern: a
Expected: []
Output: []
Test case 4 passed

Test case 5
Text: abc
Pattern: 
Expected: []
Output: [0, 1, 2, 3]


AssertionError: ignored

Maximally Efficent Implementation

The most efficient algorithm for string pattern matching is the Knuth–Morris–Pratt (KMP) algorithm. It performs the search in
$O(n+m)$ time, where $n$ is the length of the text and $m$ is the length of the pattern. Below is the Python implementation of the KMP algorithm for finding all occurrences of a pattern in a given text.

In [2]:
def build_kmp_table(pattern):
    """
    Build the KMP table for a given pattern.

    Parameters:
        pattern (str): The pattern for which to build the KMP table.

    Returns:
        list: The KMP table as a list of integers.
    """
    table = [0] * len(pattern)
    j = 0
    for i in range(1, len(pattern)):
        while j > 0 and pattern[i] != pattern[j]:
            j = table[j - 1]
        if pattern[i] == pattern[j]:
            j += 1
        table[i] = j
    return table


def find_pattern_indices_kmp(text, pattern):
    """
    Find all starting indices of the occurrences of a pattern in a given text
    using the KMP algorithm.

    Parameters:
        text (str): The text in which to find the pattern.
        pattern (str): The pattern to find in the text.

    Returns:
        list: A list of starting indices where the pattern is found in the text.
              Returns an empty list if the pattern is not found.
    """
    if not text or not pattern:
        return []

    indices = []
    kmp_table = build_kmp_table(pattern)
    j = 0

    for i in range(len(text)):
        while j > 0 and text[i] != pattern[j]:
            j = kmp_table[j - 1]
        if text[i] == pattern[j]:
            j += 1
        if j == len(pattern):
            indices.append(i - j + 1)
            j = kmp_table[j - 1]
    return indices


def test_find_pattern_indices_kmp():
    test_cases = [
        ("abracadabra", "abr", [0, 7]),
        ("hello world", "l", [2, 3, 9]),
        ("aaaaa", "aa", [0, 1, 2, 3]),
        ("", "a", []),
        ("abc", "", []),
        ("", "", []),
        ("mississippi", "iss", [1, 4]),
        ("xyzxyz", "xyz", [0, 3]),
        ("appleapple", "le", [3, 8]),
        ("testtest", "test", [0, 4])
    ]

    for i, (text, pattern, expected) in enumerate(test_cases):
        result = find_pattern_indices_kmp(text, pattern)
        print(f"Test case {i+1}")
        print(f"Text: {text}")
        print(f"Pattern: {pattern}")
        print(f"Expected: {expected}")
        print(f"Output: {result}")
        assert result == expected, f"Test case {i+1} failed"
        print(f"Test case {i+1} passed\n")


if __name__ == "__main__":
    print("Running tests...")
    test_find_pattern_indices_kmp()


Running tests...
Test case 1
Text: abracadabra
Pattern: abr
Expected: [0, 7]
Output: [0, 7]
Test case 1 passed

Test case 2
Text: hello world
Pattern: l
Expected: [2, 3, 9]
Output: [2, 3, 9]
Test case 2 passed

Test case 3
Text: aaaaa
Pattern: aa
Expected: [0, 1, 2, 3]
Output: [0, 1, 2, 3]
Test case 3 passed

Test case 4
Text: 
Pattern: a
Expected: []
Output: []
Test case 4 passed

Test case 5
Text: abc
Pattern: 
Expected: []
Output: []
Test case 5 passed

Test case 6
Text: 
Pattern: 
Expected: []
Output: []
Test case 6 passed

Test case 7
Text: mississippi
Pattern: iss
Expected: [1, 4]
Output: [1, 4]
Test case 7 passed

Test case 8
Text: xyzxyz
Pattern: xyz
Expected: [0, 3]
Output: [0, 3]
Test case 8 passed

Test case 9
Text: appleapple
Pattern: le
Expected: [3, 8]
Output: [3, 8]
Test case 9 passed

Test case 10
Text: testtest
Pattern: test
Expected: [0, 4]
Output: [0, 4]
Test case 10 passed



## Let's delve deeper into the efficiency and robustness of the KMP algorithm through tests and analysis.

### Proof of Efficiency

1. **KMP Table Generation**: The KMP table is generated in $O(M)$ time, where $M$ is the length of the pattern.
2. **Text Traversal**: Each character in the text is examined at most twice, leading to $O(N)$ time complexity, where $N$ is the length of the text.

So, the overall time complexity is $O(N + M)$.

### Extensive Testing

To test the efficiency and robustness of the function, we can conduct the following tests:

1. **Randomly Generated Tests**: We can generate long strings and patterns to find.
2. **Edge Cases**: Long patterns in short texts, very long patterns, empty strings, and so on.
3. **Repetitive Patterns**: Patterns and texts that are highly repetitive.
4. **Real-World Data**: For example, searching for a pattern in the text of a book or a lengthy article.

In [3]:
import random
import string
import time

# Generate a random string of a given length
def generate_random_string(length):
    return ''.join(random.choices(string.ascii_lowercase, k=length))

# Generate a repetitive string of a given length based on a small set of characters
def generate_repetitive_string(length):
    return 'ab' * (length // 2) + 'a' * (length % 2)

# Real-world test case (an excerpt from a book, article, etc.)
real_world_text = "It was the best of times, it was the worst of times, " * 50
real_world_pattern = "best of times"

# Test cases
test_cases = [
    ("abracadabra", "abr", [0, 7]),  # Simple case
    (generate_random_string(1000), "abc", []),  # Randomly generated text
    (generate_repetitive_string(1000), "ab", [i for i in range(0, 1000, 2)]),  # Repetitive pattern
    ("", "a", []),  # Empty text
    ("abc", "", []),  # Empty pattern
    ("", "", []),  # Both empty
    ("mississippi", "iss", [1, 4]),  # Multiple occurrences
    (real_world_text, real_world_pattern, [i for i in range(0, 2450, 49)]),  # Real-world data
    ("appleapple", "le", [3, 8]),  # Multiple occurrences, non-overlapping
    ("testtest", "test", [0, 4])  # Multiple occurrences, non-overlapping
]

# Run tests
print("Running tests...")
for i, (text, pattern, expected) in enumerate(test_cases):
    start_time = time.time()
    result = find_pattern_indices_kmp(text, pattern)
    end_time = time.time()

    print(f"Test case {i+1}")
    if len(text) > 50:
        print(f"Text: {text[:50]}... (length: {len(text)})")
    else:
        print(f"Text: {text} (length: {len(text)})")

    if len(pattern) > 50:
        print(f"Pattern: {pattern[:50]}... (length: {len(pattern)})")
    else:
        print(f"Pattern: {pattern} (length: {len(pattern)})")

    print(f"Expected: {expected}")
    print(f"Output: {result}")
    print(f"Time taken: {end_time - start_time:.6f} seconds")

    assert result == expected, f"Test case {i+1} failed"
    print(f"Test case {i+1} passed\n")


Running tests...
Test case 1
Text: abracadabra (length: 11)
Pattern: abr (length: 3)
Expected: [0, 7]
Output: [0, 7]
Time taken: 0.000015 seconds
Test case 1 passed

Test case 2
Text: llufjrxkuchezjrhovrgqzsnzlzefjwsjwlinrrrzkooxsedpv... (length: 1000)
Pattern: abc (length: 3)
Expected: []
Output: []
Time taken: 0.000343 seconds
Test case 2 passed

Test case 3
Text: ababababababababababababababababababababababababab... (length: 1000)
Pattern: ab (length: 2)
Expected: [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, 196, 198, 200, 202, 204, 206, 208, 210, 212, 214, 216, 218, 220, 222, 224, 226, 228, 230, 23

AssertionError: ignored

It appears that test case 8 failed, and this is likely due to the incorrect expectation set for the "real-world" text pattern matching. The discrepancy may be due to the additional characters that are part of the repetitive text, such as spaces and commas, which were not accounted for in the expected indices.

In [5]:
# Correct the expectation for the real-world test case
real_world_text = "It was the best of times, it was the worst of times, " * 50
real_world_pattern = "best of times"
real_world_expected = [i for i in range(11, 2609, 53)]  # Adjusted the indices based on the actual pattern location

# Update the test cases
test_cases[7] = (real_world_text, real_world_pattern, real_world_expected)

# Run the tests again, this time only displaying a portion of the output for large lists
print("Running tests...")
for i, (text, pattern, expected) in enumerate(test_cases):
    start_time = time.time()
    result = find_pattern_indices_kmp(text, pattern)
    end_time = time.time()

    print(f"Test case {i+1}")
    if len(text) > 50:
        print(f"Text: {text[:50]}... (length: {len(text)})")
    else:
        print(f"Text: {text} (length: {len(text)})")

    if len(pattern) > 50:
        print(f"Pattern: {pattern[:50]}... (length: {len(pattern)})")
    else:
        print(f"Pattern: {pattern} (length: {len(pattern)})")

    print(f"Expected: {expected[:10]}... (total: {len(expected)})" if len(expected) > 10 else f"Expected: {expected}")
    print(f"Output: {result[:10]}... (total: {len(result)})" if len(result) > 10 else f"Output: {result}")
    print(f"Time taken: {end_time - start_time:.6f} seconds")

    assert result == expected, f"Test case {i+1} failed"
    print(f"Test case {i+1} passed\n")


Running tests...
Test case 1
Text: abracadabra (length: 11)
Pattern: abr (length: 3)
Expected: [0, 7]
Output: [0, 7]
Time taken: 0.000110 seconds
Test case 1 passed

Test case 2
Text: llufjrxkuchezjrhovrgqzsnzlzefjwsjwlinrrrzkooxsedpv... (length: 1000)
Pattern: abc (length: 3)
Expected: []
Output: []
Time taken: 0.000372 seconds
Test case 2 passed

Test case 3
Text: ababababababababababababababababababababababababab... (length: 1000)
Pattern: ab (length: 2)
Expected: [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]... (total: 500)
Output: [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]... (total: 500)
Time taken: 0.000561 seconds
Test case 3 passed

Test case 4
Text:  (length: 0)
Pattern: a (length: 1)
Expected: []
Output: []
Time taken: 0.000006 seconds
Test case 4 passed

Test case 5
Text: abc (length: 3)
Pattern:  (length: 0)
Expected: []
Output: []
Time taken: 0.000001 seconds
Test case 5 passed

Test case 6
Text:  (length: 0)
Pattern:  (length: 0)
Expected: []
Output: []
Time taken: 0.000002 seconds
Test 

All the test cases have passed successfully, including the corrected real-world example and the extensive random and repetitive tests. The time taken for each test case is also minimal, indicating that the KMP algorithm is highly efficient.