In [3]:
!pip install python-Levenshtein pandas gdown numpy

Collecting pandas
  Using cached pandas-2.3.1-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting gdown
  Using cached gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Collecting numpy
  Using cached numpy-2.3.2-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting beautifulsoup4 (from gdown)
  Using cached beautifulsoup4-4.13.4-py3-none-any.whl.metadata (3.8 kB)
Collecting filelock (from gdown)
  Downloading filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting requests[socks] (from gdown)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting tqdm (from gdown)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting soupsieve>1.2 (from beautifulsoup4->gdown)
  Using cached soupsieve-2.7-py3-none-any.whl.metadata (4.6 kB)
Collecting c

In [2]:
import Levenshtein
import random
import time
import pandas as pd
import numpy as np
import os
import re
import sys

In [3]:
def load_words_from_local_folder(folder_path='BBC'):
    all_words = []
    if not os.path.isdir(folder_path):
        print(f"Error: Folder not found at '{folder_path}'.")
        return []

    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
                    all_words.extend(re.findall(r'\b\w+\b', content.lower()))
            except Exception as e:
                pass
                
    return all_words

In [4]:
def levenshtein_distance(s1, s2):
    k, c = len(s1), len(s2)
    dp = np.zeros((k + 1, c + 1), dtype=int)
    for i in range(k + 1):
        dp[i][0] = i
    for j in range(c + 1):
        dp[0][j] = j
        
    for i in range(1, k + 1):
        for j in range(1, c + 1):
            cost = 0 if s1[i-1] == s2[j-1] else 1
            dp[i][j] = min(dp[i-1][j] + 1,
                           dp[i][j-1] + 1,
                           dp[i-1][j-1] + cost)
    
    return dp[k][c]

In [5]:
words = load_words_from_local_folder(folder_path='BBC')
if words:
    pairs = [(random.choice(words), random.choice(words)) for _ in range(100)]

    results = []
    for w1, w2 in pairs:
        start = time.time()
        s1 = levenshtein_distance(w1, w2)
        end = time.time()
        runtime_ours = (end - start) * 1000  

        start = time.time()
        s2 = Levenshtein.distance(w1, w2)
        end = time.time()
        runtime_lib = (end - start) * 1000  

        results.append({
            "Word1": w1,
            "Word2": w2,
            "S1 (Mines)": s1,
            "S2 (Lib)": s2,
            "MED Score Diff": s1 - s2,
            "Runtime Diff (ms)": runtime_ours - runtime_lib
        })

    df = pd.DataFrame(results)
    print("\nComparison of 10 Random Pairs from BBC folder:\n")
    print(df.head(10).to_markdown(index=False))

    print("\nStatistical Summary of Differences:\n")
    print(df[["MED Score Diff", "Runtime Diff (ms)"]].describe().to_markdown())



Comparison of 10 Random Pairs from BBC folder:

| Word1   | Word2    |   S1 (Mines) |   S2 (Lib) |   MED Score Diff |   Runtime Diff (ms) |
|:--------|:---------|-------------:|-----------:|-----------------:|--------------------:|
| 30m     | i        |            3 |          3 |                0 |            -34.4732 |
| would   | the      |            5 |          5 |                0 |              0      |
| in      | european |            7 |          7 |                0 |              0      |
| year    | the      |            4 |          4 |                0 |              0      |
| his     | record   |            6 |          6 |                0 |              0      |
| in      | call     |            4 |          4 |                0 |              0      |
| the     | prix     |            4 |          4 |                0 |              0      |
| in      | s        |            2 |          2 |                0 |              0      |
| indoor  | and      |         