In [2]:
!pip install Levenshtein

Collecting Levenshtein
  Downloading Levenshtein-0.25.0-cp310-cp310-win_amd64.whl (98 kB)
     -------------------------------------- 98.4/98.4 kB 402.7 kB/s eta 0:00:00
Collecting rapidfuzz<4.0.0,>=3.1.0
  Downloading rapidfuzz-3.7.0-cp310-cp310-win_amd64.whl (1.6 MB)
     ---------------------------------------- 1.6/1.6 MB 2.4 MB/s eta 0:00:00
Installing collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.25.0 rapidfuzz-3.7.0




In [4]:
import pandas as pd
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model
import Levenshtein

In [46]:
with open("dict.txt",'r',encoding="UTF-8") as file:
    lines = file.readlines()

In [47]:
df = pd.DataFrame()

In [48]:
lines = [l.strip() for l in lines]

In [49]:
df["right"] = lines

In [50]:
df

Unnamed: 0,right
0,абажур
1,абажурчик
2,абазин
3,абазинец
4,абазинцы
...,...
62022,ящерка
62023,ящерогад
62024,ящичек
62025,ящичник


In [51]:
import random

def make_typo(word):
    error_type = random.choice(["replace", "omit", "swap", "add"])
    typo = ""
    
    if error_type == "replace":
        index = random.randint(0, len(word)-1)
        new_letter = random.choice([chr(i) for i in range(1072, 1104) if chr(i) != word[index]])
        typo = word[:index] + new_letter + word[index+1:]
    elif error_type == "omit":
        index = random.randint(0, len(word)-2)
        typo = word[:index] + word[index+1:]
    elif error_type == "swap":
        index = random.randint(0, len(word)-2)
        typo = word[:index] + word[index+1] + word[index] + word[index+2:]
    elif error_type == "add":
        index = random.randint(0, len(word)-1)
        new_letter = random.choice([chr(i) for i in range(1072, 1104)])
        typo = word[:index] + new_letter + word[index:]

    return typo


In [52]:
test_df = df.sample(100, replace=False)

In [53]:
test_df

Unnamed: 0,right
43090,равнинность
57063,французомания
16388,индюшонок
33590,папежество
17163,кавалерия
...,...
58094,художница
16641,интроскопия
10175,грехопадение
16686,инфразвук


In [54]:
data = {"right":[],"wrong":[]}
for word in test_df["right"]:
    for _ in range(100):
        data["right"].append(word)
        data["wrong"].append(make_typo(word))

In [55]:
data = pd.DataFrame(data)

In [56]:
data

Unnamed: 0,right,wrong
0,равнинность,ргавнинность
1,равнинность,равниность
2,равнинность,равниннсть
3,равнинность,равнинноисть
4,равнинность,гравнинность
...,...,...
9995,новокрещенец,новокрвещенец
9996,новокрещенец,новокрещенц
9997,новокрещенец,новозкрещенец
9998,новокрещенец,новоркещенец


In [30]:
def correct_word(word, dictionary):
    word = word.lower()
    min_distance = float('inf')
    corrected_word = None
    
    for correct_word in dictionary:
        distance = Levenshtein.distance(word, correct_word)
        if distance < min_distance:
            min_distance = distance
            corrected_word = correct_word
    
    return corrected_word

dictionary = list(data["right"])

data["corrected"] = data["wrong"].apply(lambda x: correct_word(x, dictionary))

In [94]:
data

{'right': ['cat', 'dog', 'apple', 'house'],
 'wrong': ['cot', 'dgo', 'aple', 'houss']}

In [86]:
all_words = list(data['right']) + list(data['wrong'])

tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_words)

sequences_right = tokenizer.texts_to_sequences(data['right'])
sequences_wrong = tokenizer.texts_to_sequences(data['wrong'])

In [87]:
max_length = max(max(len(seq) for seq in sequences_right), max(len(seq) for seq in sequences_wrong))
sequences_right_padded = pad_sequences(sequences_right, maxlen=max_length, padding='post')
sequences_wrong_padded = pad_sequences(sequences_wrong, maxlen=max_length, padding='post')


In [88]:
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 50
lstm_units = 128

input_layer = Input(shape=(max_length,))


embedding_layer = Embedding(vocab_size, embedding_dim, input_length=max_length)(input_layer)


lstm_layer = LSTM(lstm_units)(embedding_layer)


output_layer = Dense(vocab_size, activation='softmax')(lstm_layer)

In [89]:



model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


model.fit(sequences_wrong_padded, sequences_right_padded, epochs=10, batch_size=32, validation_split=0.2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x15138ec65f0>

In [90]:
def correct_spelling(wrong_word):
    sequence = tokenizer.texts_to_sequences([wrong_word])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post')
    predicted_sequence = model.predict(padded_sequence)
    predicted_index = np.argmax(predicted_sequence, axis=-1)
    predicted_word = tokenizer.index_word[predicted_index[0]]
    return predicted_word

# Пример использования
incorrect_word = "теполечение"
corrected_word = correct_spelling(incorrect_word)
print(f"Неправильное написание: {incorrect_word}, Предсказанное исправление: {corrected_word}")


Неправильное написание: теполечение, Предсказанное исправление: cat


In [69]:
test = data.sample(100, replace=False)

In [70]:
test

Unnamed: 0,right,wrong
9689,интроскопия,инвроскопия
1903,бронебойщик,бронебойик
8753,квинтет,квинет
1829,заколка,заоклка
5035,менеджмент,енеджмент
...,...,...
4139,дактилография,дактилоярафия
1739,натравщица,натарвщица
9677,интроскопия,итнроскопия
7214,микроцефал,микрогцефал


In [77]:
import random

wr = ""
res = ""
for i in range(3):
    word = test["wrong"].iloc[random.randint(0,100)]
    wr+= word + " "
    
    res += correct_spelling(word) + " "



In [78]:
wr

'проглатыванеие многожекнство всгполаскивание '

In [79]:
res

'проглатывание многоженство всполаскивание '

In [93]:
import ipywidgets as widgets
from IPython.display import display

def correct_words(b):
    # Получаем текст из поля ввода
    input_text = input_text_area.value.strip()
    # Заменяем слова с ошибками на исправленные
    corrected_text = correct_text(input_text)
    # Выводим исправленный текст
    output_text_area.value = corrected_text

def correct_text(input_text):
    sequence = tokenizer.texts_to_sequences([input_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post')
    predicted_sequence = model.predict(padded_sequence)
    predicted_index = np.argmax(predicted_sequence, axis=-1)
    predicted_word = tokenizer.index_word[predicted_index[0]]
    return predicted_word

# Создаем текстовое поле для ввода текста
input_text_area = widgets.Textarea(
    value='',
    placeholder='Введите текст',
    description='Текст:',
    disabled=False,
    layout={'width': '50%'}
)

# Создаем кнопку "Пуск"
run_button = widgets.Button(description="Пуск")
run_button.on_click(correct_words)

# Создаем текстовую область для вывода исправленного текста
output_text_area = widgets.Textarea(
    value='',
    placeholder='Исправленный текст',
    description='Исправлено:',
    disabled=True,
    layout={'width': '50%'}
)

# Отображаем элементы интерфейса
display(input_text_area)
display(run_button)
display(output_text_area)


Textarea(value='', description='Текст:', layout=Layout(width='50%'), placeholder='Введите текст')

Button(description='Пуск', style=ButtonStyle())

Textarea(value='', description='Исправлено:', disabled=True, layout=Layout(width='50%'), placeholder='Исправле…

