# Spelling Correction using Seq2Seq LSTM 

NOTE :  This dataset has been synthetically created for teaching and demonstration purposes. In real-world production use, spelling corrector models—especially sequence-to-sequence models with attention—should be trained on millions of diverse and domain-relevant examples to generalize well and provide robust performance.


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Concatenate, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split

In [9]:

# Load synthetic spelling dataset
df = pd.read_csv('synthetic_spelling_20k.csv')

# Clean and lowercase
input_texts = df['input'].astype(str).str.lower().tolist()
target_texts = df['target'].astype(str).str.lower().tolist()
target_texts = [f"<start> {txt} <end>" for txt in target_texts]

# Train-test split
train_input, val_input, train_target, val_target = train_test_split(input_texts, target_texts, test_size=0.2, random_state=42)

# Tokenizers
input_tokenizer = Tokenizer(oov_token='<unk>')
input_tokenizer.fit_on_texts(train_input)
input_seqs = input_tokenizer.texts_to_sequences(train_input)
input_maxlen = max(len(seq) for seq in input_seqs)
input_padded = pad_sequences(input_seqs, maxlen=input_maxlen, padding='post')
input_vocab_size = len(input_tokenizer.word_index) + 1

target_tokenizer = Tokenizer(oov_token='<unk>', filters='')
target_tokenizer.fit_on_texts(train_target)
target_seqs = target_tokenizer.texts_to_sequences(train_target)
target_maxlen = max(len(seq) for seq in target_seqs)
target_padded = pad_sequences(target_seqs, maxlen=target_maxlen, padding='post')
target_vocab_size = len(target_tokenizer.word_index) + 1

# Decoder input and output
decoder_input_data = np.concatenate([np.zeros((len(target_padded), 1)), target_padded[:, :-1]], axis=1)
decoder_output_data = to_categorical(target_padded, num_classes=target_vocab_size)

# Model parameters
embedding_dim = 64
lstm_units = 64

# Encoder
encoder_inputs = Input(shape=(input_maxlen,))
encoder_embedding = Embedding(input_vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

# Decoder
decoder_inputs = Input(shape=(target_maxlen,))
decoder_embedding = Embedding(target_vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

# Attention
attention_layer = Attention()([decoder_outputs, encoder_outputs])
concat_layer = Concatenate(axis=-1)([decoder_outputs, attention_layer])
decoder_dense = Dense(target_vocab_size, activation='softmax')(concat_layer)

# Compile model
model = Model([encoder_inputs, decoder_inputs], decoder_dense)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train
model.fit([input_padded, decoder_input_data], decoder_output_data, batch_size=32, epochs=10, validation_split=0.2)

# Inference function
def correct_spelling(sentence):
    sentence = sentence.lower()
    seq = input_tokenizer.texts_to_sequences([sentence])
    pad_seq = pad_sequences(seq, maxlen=input_maxlen, padding='post')
    decoder_seq = np.zeros((1, target_maxlen))
    decoder_seq[0, 0] = target_tokenizer.word_index['<start>']
    result = ""
    for i in range(1, target_maxlen):
        preds = model.predict([pad_seq, decoder_seq])
        pred_id = np.argmax(preds[0, i - 1, :])
        pred_word = target_tokenizer.index_word.get(pred_id, '')
        if pred_word == '<end>' or pred_word == '<unk>':
            break
        result += pred_word + ' '
        decoder_seq[0, i] = pred_id
    return result.strip()




Epoch 1/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.4703 - loss: 2.5991 - val_accuracy: 1.0000 - val_loss: 0.0771
Epoch 2/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0472 - val_accuracy: 1.0000 - val_loss: 0.0130
Epoch 3/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0103 - val_accuracy: 1.0000 - val_loss: 0.0054
Epoch 4/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0047 - val_accuracy: 1.0000 - val_loss: 0.0030
Epoch 5/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0026 - val_accuracy: 1.0000 - val_loss: 0.0019
Epoch 6/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0017 - val_accuracy: 1.0000 - val_loss: 0.0013
Epoch 7/10
[1m400/400[0m 

In [24]:

# Example
sample_input = "they need to acommodate all the invited guests."
print("Input:", sample_input)
print("Corrected:", correct_spelling(sample_input))


Input: they need to acommodate all the invited guests.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Corrected: they need to accommodate all the invited guests.


## NOTE