In [1]:
from nltk.corpus import twitter_samples
from collections import Counter
import re
import nltk

nltk.download('twitter_samples')

def tokenize(text):
    return re.findall(r'\b[a-z]+\b', text.lower())

tweets = twitter_samples.strings()
tokens = []
for tweet in tweets:
    tokens.extend(tokenize(tweet))

word_freq = Counter(tokens)
total_words = sum(word_freq.values())


[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


In [3]:
with open("shakespeare.txt", encoding='utf-8') as f:
    shakespeare_text = f.read()

shakespeare_words = tokenize(shakespeare_text)


In [4]:
def delete_letter(word):
    return [word[:i] + word[i+1:] for i in range(len(word))]

def replace_letter(word):
    letters = 'abcdefghijklmnopqrstuvwxyz'
    return [word[:i] + l + word[i+1:] for i in range(len(word)) for l in letters if l != word[i]]

def insert_letter(word):
    letters = 'abcdefghijklmnopqrstuvwxyz'
    return [word[:i] + l + word[i:] for i in range(len(word)+1) for l in letters]

def transpose_letters(word):
    return [word[:i] + word[i+1] + word[i] + word[i+2:] for i in range(len(word)-1)]

def edits1(word):
    return set(delete_letter(word) + replace_letter(word) + insert_letter(word) + transpose_letters(word))

def edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1))


In [5]:
def word_probability(word):
    return word_freq[word] / total_words if word in word_freq else 1 / total_words


In [6]:
def min_edit_distance(s1, s2):
    m, n = len(s1), len(s2)
    dp = [[0]*(n+1) for _ in range(m+1)]

    for i in range(m+1): dp[i][0] = i
    for j in range(n+1): dp[0][j] = j

    for i in range(1, m+1):
        for j in range(1, n+1):
            cost = 0 if s1[i-1] == s2[j-1] else 1
            dp[i][j] = min(
                dp[i-1][j] + 1,   # delete
                dp[i][j-1] + 1,   # insert
                dp[i-1][j-1] + cost  # substitute
            )
    return dp[m][n]


In [7]:
def correction(word):
    candidates = (
        [word] if word in word_freq else
        edits1(word) & word_freq.keys() or
        edits2(word) & word_freq.keys() or
        [word]
    )
    return max(candidates, key=word_probability)


In [8]:
import random

def make_typos(word):
    if len(word) < 4:
        return word  # занадто коротке
    return random.choice([
        lambda w: delete_letter(w)[0],
        lambda w: replace_letter(w)[0],
        lambda w: insert_letter(w)[0],
        lambda w: transpose_letters(w)[0],
    ])(word)

# Створення тестової вибірки
test_sample = [w for w in set(shakespeare_words) if w in word_freq][:100]
typo_pairs = [(w, make_typos(w)) for w in test_sample]

# Оцінка
def evaluate(pairs):
    correct = 0
    for correct_word, typo in pairs:
        predicted = correction(typo)
        if predicted == correct_word:
            correct += 1
    return correct / len(pairs)

accuracy = evaluate(typo_pairs)
print("Accuracy:", accuracy)


Accuracy: 0.75


In [9]:
# Кусок тексту для тесту замість файлу
sample_text = """
To be, or not to be, that is the question:
Whether 'tis nobler in the mind to suffer
The slings and arrows of outrageous fortune,
Or to take arms against a sea of troubles
"""

# Токенізація
sample_words = tokenize(sample_text)

# Вибірка слів для тесту: ті, що є у словнику частот
test_sample = [w for w in set(sample_words) if w in word_freq]

# Створення пар (правильне слово, слово з помилкою)
typo_pairs = [(w, make_typos(w)) for w in test_sample]

# Оцінка точності
accuracy = evaluate(typo_pairs)
print("Accuracy on sample text:", accuracy)


Accuracy on sample text: 0.9545454545454546


In [10]:
import json

# Збереження словника частот та total_words у JSON
model_data_serializable = {
    "word_freq": dict(word_freq),
    "total_words": total_words
}

with open("word_freq_model.json", "w", encoding="utf-8") as f:
    json.dump(model_data_serializable, f, ensure_ascii=False, indent=2)

print("Model saved to word_freq_model.json")


Model saved to word_freq_model.json


In [11]:
from collections import Counter
import json

def load_model(filename="word_freq_model.json"):
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)
    return Counter(data["word_freq"]), data["total_words"]

# Приклад завантаження:
# loaded_word_freq, loaded_total_words = load_model()
# print("Loaded words count:", len(loaded_word_freq))
