In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
import pickle
import re
import matplotlib.pyplot as plt

In [2]:
def preprocess_text(text):
        """
        Clean and preprocess text data.

        Args:
            text (str): Input text

        Returns:
            str: Cleaned text
        """
        # Convert to lowercase
        text = text.lower()

        # Remove extra whitespaces
        text = re.sub(r'\s+', ' ', text)

        # Remove special characters but keep basic punctuation
        text = re.sub(r'[^\w\s.,!?;:\'-]', '', text)

        # Strip leading/trailing whitespace
        text = text.strip()

        return text

In [3]:
import os

In [4]:
!pip install kagglehub



In [5]:
import kagglehub
input_column='Ungrammatical Statement'
target_column='Standard English'
path = kagglehub.dataset_download("satishgunjal/grammar-correction")
filepath=os.path.join(path,'Grammar Correction.csv')
df = pd.read_csv(filepath)
df = df.dropna(subset=[input_column, target_column])

Downloading from https://www.kaggle.com/api/v1/datasets/download/satishgunjal/grammar-correction?dataset_version_number=1...


100%|██████████| 62.4k/62.4k [00:00<00:00, 62.7MB/s]

Extracting files...





In [6]:
df.head()

Unnamed: 0,Serial Number,Error Type,Ungrammatical Statement,Standard English
0,1,Verb Tense Errors,I goes to the store everyday.,I go to the store everyday.
1,2,Verb Tense Errors,They was playing soccer last night.,They were playing soccer last night.
2,3,Verb Tense Errors,She have completed her homework.,She has completed her homework.
3,4,Verb Tense Errors,He don't know the answer.,He doesn't know the answer.
4,5,Verb Tense Errors,The sun rise in the east.,The sun rises in the east.


In [7]:
# Preprocess text
input_texts = df[input_column].apply(preprocess_text).tolist()
target_texts = df[target_column].apply(preprocess_text).tolist()

In [38]:
print(input_texts)

['i goes to the store everyday.', 'they was playing soccer last night.', 'she have completed her homework.', "he don't know the answer.", 'the sun rise in the east.', 'i am eat pizza for lunch.', 'the students studies for the exam.', 'the car need to be repaired.', 'she will goes to the party tonight.', 'they watches the movie together.', 'the flowers is blooming in spring.', 'she think she can finish the project.', 'the dogs barks at the mail carrier.', 'the kids plays video games after school.', 'the computer not working properly.', 'he had sleep for ten hours.', 'i walk to work every day last month.', 'she will be write a book next year.', 'the chef cook dinner for the guests.', 'they plants a tree in the garden.', 'i has been to paris three times.', 'the cat catch the mouse yesterday.', 'the airplane fly over the city.', 'he do his homework every evening.', 'they was at the concert last night.', 'the computer running slow today.', 'she buy a new dress for the party.', 'the birds si

In [39]:
print(target_texts)

['i go to the store everyday.', 'they were playing soccer last night.', 'she has completed her homework.', "he doesn't know the answer.", 'the sun rises in the east.', 'i am eating pizza for lunch.', 'the students study for the exam.', 'the car needs to be repaired.', 'she will go to the party tonight.', 'they watch the movie together.', 'the flowers bloom in spring.', 'she thinks she can finish the project.', 'the dogs bark at the mail carrier.', 'the kids play video games after school.', 'the computer is not working properly.', 'he had slept for ten hours.', 'i walked to work every day last month.', 'she will be writing a book next year.', 'the chef cooks dinner for the guests.', 'they plant a tree in the garden.', 'i have been to paris three times.', 'the cat caught the mouse yesterday.', 'the airplane flies over the city.', 'he does his homework every evening.', 'they were at the concert last night.', 'the computer is running slow today.', 'she buys a new dress for the party.', 'th

In [8]:
# Tokenize sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(input_texts + target_texts)

In [9]:
# Convert sentences to sequences
input_sequences = tokenizer.texts_to_sequences(input_texts) #The sequences list would then look like [[1, 2, 3, 4], [6, 2, 3, 4]]
target_sequences = tokenizer.texts_to_sequences(target_texts)


In [10]:
# Pad sequences to the same length, [[1, 2, 3], [4, 5]] -> array([[1, 2, 3, 0],[4, 5, 0, 0]])
max_len = max(max(len(seq) for seq in input_sequences), max(len(seq) for seq in target_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_len, padding='post')
target_sequences = pad_sequences(target_sequences, maxlen=max_len, padding='post')

In [11]:
# Define the model
# Number of unique words in your vocabulary + 1: An additional index for out-of-vocabulary words or padding.
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 50

In [12]:
#Build LSTM Model
model = Sequential([
    #convert input sequences of integers (where each integer represents a word index) into dense vectors of fixed size (embedding_dim).
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
    #return the full sequence of outputs, not just the last output.
    LSTM(100, return_sequences=True),
    LSTM(100, return_sequences=True),
    LSTM(100, return_sequences=True),
    # Number of units in the output layer, which corresponds to the size of the vocabulary. Each unit represents a word in the vocabulary.
    Dense(vocab_size, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])




In [13]:
model.summary()

In [14]:
# Train the model
model.fit(input_sequences, target_sequences, epochs=20, batch_size=1)

Epoch 1/20
[1m2018/2018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 8ms/step - accuracy: 0.6182 - loss: 3.2212
Epoch 2/20
[1m2018/2018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 8ms/step - accuracy: 0.6662 - loss: 2.2547
Epoch 3/20
[1m2018/2018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 8ms/step - accuracy: 0.6932 - loss: 1.9853
Epoch 4/20
[1m2018/2018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 9ms/step - accuracy: 0.7183 - loss: 1.7344
Epoch 5/20
[1m2018/2018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 8ms/step - accuracy: 0.7289 - loss: 1.5745
Epoch 6/20
[1m2018/2018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 8ms/step - accuracy: 0.7423 - loss: 1.4207
Epoch 7/20
[1m2018/2018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 8ms/step - accuracy: 0.7602 - loss: 1.2502
Epoch 8/20
[1m2018/2018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 8ms/step - accuracy: 0.7724 - loss: 1.1450
Epoch 9/20
[1m2

<keras.src.callbacks.history.History at 0x7bb4cd9267e0>

In [15]:
# Function to correct a sentence
def correct_sentence(sentence):
    sequence = tokenizer.texts_to_sequences([sentence])
    sequence = pad_sequences(sequence, maxlen=max_len, padding='post')
    predicted_sequence = model.predict(sequence)
    #Convert the predicted sequence from one-hot encoded format to integer indices.
    #finds the index of the maximum value along the last axis (the vocabulary axis), gives the index of the most probable word for each position in the sequence.
    predicted_sequence = tf.argmax(predicted_sequence, axis=-1).numpy()[0]
    #Convert the integer indices back to words using the tokenizer, excluding padding tokens '0'.
    corrected_sentence = ' '.join(tokenizer.index_word[idx] for idx in predicted_sequence if idx != 0)
    return corrected_sentence

In [16]:
# Test the correction function
input_sentence = "apples is healthy."
corrected_sentence = correct_sentence(input_sentence)
print("Original Sentence:")
print(input_sentence)
print("\nCorrected Sentence:")
print(corrected_sentence)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 215ms/step
Original Sentence:
apples is healthy.

Corrected Sentence:
please is a


In [17]:
# Test the correction function
input_sentence = "I goes to Dubai"
corrected_sentence = correct_sentence(input_sentence)
print("Original Sentence:")
print(input_sentence)
print("\nCorrected Sentence:")
print(corrected_sentence)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Original Sentence:
I goes to Dubai

Corrected Sentence:
i want to to


# Encoder Decoder Model

An Encoder-Decoder Model is a type of deep learning architecture commonly used for sequence-to-sequence (seq2seq) tasks, such as machine translation, text summarization, and grammar correction. It consists of two key components:
1. Encoder
- The encoder takes an input sequence (e.g., a sentence) and compresses it into a fixed-size context vector (also called a "hidden state" or "thought vector").
- Typically, LSTMs or GRUs (types of recurrent neural networks) are used in the encoder.
- It processes the input word by word and learns important features while discarding unnecessary details.
2. Decoder
- The decoder takes the compressed context vector from the encoder and generates the output sequence one word at a time.
- It is also an LSTM or GRU-based network, but instead of encoding, it predicts words step by step.
- The decoder learns to produce the correct sequence based on the encoded information.


In [18]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding


In [19]:
!wget https://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2025-11-23 06:58:23--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-11-23 06:58:23--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2025-11-23 07:01:02 (5.17 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [20]:
# Load GloVe embeddings
def load_glove_embeddings(glove_path, embedding_dim):
    embeddings_index = {}
    with open(glove_path, "r", encoding="utf-8") as f:
        #line = "apple 0.12 -0.34 0.45 ..." values = ["apple", "0.12", "-0.34", "0.45", ...]
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            # embeddings_index["apple"] = np.array([0.12, -0.34, 0.45])
            embeddings_index[word] = coefs
    return embeddings_index


In [21]:
# Path to GloVe file and embedding dimension
glove_path = "glove.6B.100d.txt"  # Make sure you have downloaded it
embedding_dim = 100
glove_embeddings = load_glove_embeddings(glove_path, embedding_dim)


In [22]:

input_texts = df[input_column].apply(preprocess_text).tolist()
target_texts = df[target_column].apply(preprocess_text).tolist()


In [23]:
# Tokenize input and output sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(input_texts + target_texts)


In [24]:
input_sequences = tokenizer.texts_to_sequences(input_texts)
target_sequences = tokenizer.texts_to_sequences(target_texts)


In [25]:
# Get vocabulary size
vocab_size = len(tokenizer.word_index) + 1


In [26]:
# Pad sequences
max_seq_length = max(max(len(seq) for seq in input_sequences), max(len(seq) for seq in target_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_length, padding='post')
target_sequences = pad_sequences(target_sequences, maxlen=max_seq_length, padding='post')



In [27]:
# Create an embedding matrix using GloVe
#Create an empty (vocab_size × embedding_dim) matrix
embedding_matrix = np.zeros((vocab_size, embedding_dim))
#Loop through each word in your tokenizer’s vocabulary
for word, i in tokenizer.word_index.items():
    #Check if GloVe has a pretrained vector for this word
    if word in glove_embeddings:
        #Copy the GloVe vector into the correct row of the matrix
        embedding_matrix[i] = glove_embeddings[word]



In [28]:
# Create encoder-decoder model
embed_dim = embedding_dim
hidden_units = 128

# Encoder
encoder_inputs = Input(shape=(max_seq_length,))
embedding_layer = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(encoder_inputs)
encoder_lstm = LSTM(hidden_units, return_state=True)
#Hidden state (state_h) → Summarizes what the LSTM has seen so far. Cell state (state_c) → Stores long-term memory.
encoder_outputs, state_h, state_c = encoder_lstm(embedding_layer)



In [29]:
# Decoder
decoder_inputs = Input(shape=(max_seq_length,))
decoder_embedding_layer = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(decoder_inputs)
decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding_layer, initial_state=[state_h, state_c])
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)



In [30]:
# Model compilation
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',metrics=['accuracy'])



In [31]:
# Convert target sequences to expected output format
target_sequences_output = np.expand_dims(target_sequences, -1)



In [32]:
model.summary()

In [33]:
# Train the model
model.fit([input_sequences, target_sequences], target_sequences_output, epochs=20, verbose=1)


Epoch 1/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.5528 - loss: 6.1105
Epoch 2/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.6225 - loss: 2.7212
Epoch 3/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.6480 - loss: 2.4979
Epoch 4/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.6526 - loss: 2.3446
Epoch 5/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.6895 - loss: 2.1735
Epoch 6/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.7117 - loss: 1.9875
Epoch 7/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.7188 - loss: 1.8938
Epoch 8/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.7442 - loss: 1.7446
Epoch 9/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7bb4cda644d0>

In [34]:
# Function to correct grammar in a sentence
def correct_sentence(sentence):
    seq = tokenizer.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=max_seq_length, padding='post')
    pred = model.predict([seq, seq])
    predicted_seq = np.argmax(pred, axis=-1)[0]
    corrected_words = [word for index in predicted_seq if index > 0 for word, idx in tokenizer.word_index.items() if idx == index]
    return " ".join(corrected_words)


In [35]:
# Test predictions
test_sentences = ["he go to market", "she like the movie"]
for sentence in test_sentences:
    print(f"Original: {sentence} | Corrected: {correct_sentence(sentence)}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 289ms/step
Original: he go to market | Corrected: he go to company
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
Original: she like the movie | Corrected: she like the movie


In [36]:
# Test predictions
test_sentences = ["he go to market", "she like the movie"]
for sentence in test_sentences:
    print(f"Original: {sentence} | Corrected: {correct_sentence(sentence)}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
Original: he go to market | Corrected: he go to company
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
Original: she like the movie | Corrected: she like the movie


In [37]:
# Test predictions
test_sentences = ['Teh quick brown fox jumps over teh lazy dog.']
for sentence in test_sentences:
    print(f"Original: {sentence} | Corrected: {correct_sentence(sentence)}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
Original: Teh quick brown fox jumps over teh lazy dog. | Corrected: quick the on dog
