In [1]:
import string

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

# Training and testing data.
# Each pair is (grammatical sentence, same sentence with one misused word).
# The two parallel lists below are derived from the pairs, so entry i of one
# list always corresponds to entry i of the other.
sentence_pairs = [
    (
        "There they saw something extraordinary, far exceeding what they knew.",
        "They’re they saw something extraordinary, far exceeding what they knew.",
    ),
    (
        "Their knowledge of those facts was incomplete.",
        "There knowledge of those facts was incomplete.",
    ),
    (
        "Their knowledge of those data was incomplete.",
        "There knowledge of those data was incomplete.",
    ),
    (
        "They’re going to learn something new from the ML course.",
        "Their going to learn something new from the ML course.",
    ),
    (
        "He is driving the car to the store.",
        "He are driving the car to the store.",
    ),
    (
        "He is driving the car to the picnic.",
        "He are driving the car to the picnic.",
    ),
    (
        "She walked to the park with her friends.",
        "She walk to the park with her friends.",
    ),
    (
        "She walked to the park with her parents.",
        "She walk to the park with her parents.",
    ),
    (
        "The cat chased the mouse around the house.",
        "The cat chase the mouse around the house.",
    ),
    (
        "The cat chased the mouse around the car.",
        "The cat chase the mouse around the car.",
    ),
    (
        "He walked to the mountain.",
        "He walk to the mountain.",
    ),
    (
        "He walked to the park.",
        "He walk to the park.",
    ),
]

correct_sentences = [good for good, bad in sentence_pairs]
incorrect_sentences = [bad for good, bad in sentence_pairs]

# Mapping of misused words to correct alternatives (lookup is case-sensitive)
word_corrections = dict(
    [
        ("They’re", "There"),
        ("There", "Their"),
        ("Their", "They’re"),
        ("walk", "walked"),
        ("are", "is"),
        ("chase", "chased"),
    ]
)

In [2]:
# Merge both sentence lists into a single labelled dataset:
# label 0 marks a correct sentence, label 1 an incorrect one.
all_sentences = [*correct_sentences, *incorrect_sentences]
all_labels = [0 for _ in correct_sentences] + [1 for _ in incorrect_sentences]

# Hold out 20% of the sentences for testing; the fixed random_state keeps
# the split identical from run to run.
(
    train_sentences,
    test_sentences,
    train_labels,
    test_labels,
) = train_test_split(
    all_sentences,
    all_labels,
    test_size=0.2,
    random_state=100,
)

In [3]:
# Build the vocabulary from whitespace tokens of every sentence.
# Index 0 is reserved for a dedicated padding entry: sequences are padded
# with 0, and without this reservation the first real token would share its
# index (and therefore its embedding) with the padding.
PAD_TOKEN = "<pad>"
word_to_ix = {PAD_TOKEN: 0}
for sentence in all_sentences:
    for word in sentence.split():
        # setdefault only inserts unseen words; the next free index is the
        # current vocabulary size.
        word_to_ix.setdefault(word, len(word_to_ix))

# Tokenize sentences with handling OOV words
def convert_to_sequences(sentences, word_to_ix):
    """Turn sentences into a right-padded tensor of token indices.

    Args:
        sentences: iterable of sentence strings; each is split on whitespace.
        word_to_ix: dict mapping a token to its integer index.

    Returns:
        A ``torch.long`` tensor with one row per sentence. Rows are padded on
        the right with 0 up to the length of the longest sentence in this
        call. An empty ``sentences`` input yields an empty ``(0, 0)`` tensor
        instead of raising.

    Note:
        Tokens missing from ``word_to_ix`` are mapped to ``len(word_to_ix)``.
        Any embedding table indexed with the result therefore needs at least
        ``len(word_to_ix) + 1`` rows if unknown words can occur.
    """
    unk_token = len(word_to_ix)  # shared index for every out-of-vocabulary token
    sentence_sequences = [
        [word_to_ix.get(word, unk_token) for word in sentence.split()]
        for sentence in sentences
    ]

    # max() over an empty collection raises ValueError; return an empty batch.
    if not sentence_sequences:
        return torch.empty((0, 0), dtype=torch.long)

    # Padding sequences so every row has the same length
    max_len = max(len(seq) for seq in sentence_sequences)
    padded_sequences = [seq + [0] * (max_len - len(seq)) for seq in sentence_sequences]
    return torch.tensor(padded_sequences, dtype=torch.long)

# Encode the sentences of each split as padded index tensors.
train_inputs = convert_to_sequences(train_sentences, word_to_ix)
test_inputs = convert_to_sequences(test_sentences, word_to_ix)

# The label names are rebound from Python lists to tensors here. as_tensor
# accepts either form, so re-running this cell does not re-wrap an existing
# tensor (torch.tensor(tensor) would emit a copy-construct warning).
train_labels = torch.as_tensor(train_labels, dtype=torch.long)
test_labels = torch.as_tensor(test_labels, dtype=torch.long)

In [4]:
# Define the model
class LSTMModel(nn.Module):
    """Sentence classifier: embedding -> stacked LSTM -> linear layer.

    The linear layer is applied to the LSTM output at the last time step of
    each (batch-first) input sequence.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size, num_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_size)

    def forward(self, x):
        """Return one row of ``output_size`` scores per input sequence in ``x``."""
        token_vectors = self.embedding(x)
        hidden_states, _ = self.lstm(token_vectors)
        # Keep only the final time step of every sequence in the batch.
        final_step = hidden_states[:, -1, :]
        return self.fc(final_step)

# Initialize model, loss function, and optimizer

# Seed torch's global RNG so weight initialization and dropout masks are the
# same on every Restart & Run All (same value as the split's random_state).
torch.manual_seed(100)

# convert_to_sequences maps out-of-vocabulary words to index len(word_to_ix),
# so the embedding table needs one row beyond the known vocabulary; without
# it, any unseen word would be an out-of-range embedding lookup.
vocab_size = len(word_to_ix) + 1
embedding_dim = 100
hidden_dim = 64
output_size = 2  # Binary classification: 0 for correct, 1 for incorrect
num_layers = 2
dropout = 0.2  # applied between the stacked LSTM layers
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_size, num_layers, dropout)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [5]:
# Training the model (full-batch: every step uses the whole training set)
num_epochs = 100

# Set training mode explicitly so dropout is active even if this cell is
# re-run after the model has been switched to eval mode.
model.train()

for epoch in range(num_epochs):
    optimizer.zero_grad()  # clear gradients left over from the previous step
    output = model(train_inputs)
    loss = criterion(output, train_labels)
    loss.backward()
    optimizer.step()

    # Report the loss every 10th epoch
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [10/100], Loss: 0.6832
Epoch [20/100], Loss: 0.6046
Epoch [30/100], Loss: 0.2425
Epoch [40/100], Loss: 0.0581
Epoch [50/100], Loss: 0.0045
Epoch [60/100], Loss: 0.0027
Epoch [70/100], Loss: 0.0039
Epoch [80/100], Loss: 0.0032
Epoch [90/100], Loss: 0.0028
Epoch [100/100], Loss: 0.0024


In [6]:
# Define a function to detect misused words and propose corrections
def detect_misused_words(sentence, word_corrections):
    """Find tokens of ``sentence`` that have an entry in ``word_corrections``.

    The sentence is split on whitespace. Each token is looked up as-is first;
    if that fails, leading/trailing ASCII punctuation is stripped and the
    lookup is retried, so a misused word directly followed by ``.`` or ``,``
    is still detected. Matching is case-sensitive.

    Args:
        sentence: the sentence string to scan.
        word_corrections: dict mapping a misused word to its replacement.

    Returns:
        A list of ``(matched_word, correction)`` tuples in order of
        appearance; empty if nothing matched.
    """
    misused_words = []
    for token in sentence.split():
        if token in word_corrections:
            misused_words.append((token, word_corrections[token]))
            continue
        # Retry without surrounding punctuation, e.g. "walk." -> "walk".
        bare = token.strip(string.punctuation)
        if bare in word_corrections:
            misused_words.append((bare, word_corrections[bare]))
    return misused_words

In [7]:
# Evaluate the model on the test set and detect misused words

# Switch to evaluation mode: otherwise the LSTM's inter-layer dropout stays
# active and the predictions below vary randomly between runs.
model.eval()

with torch.no_grad():  # inference only, no gradient tracking needed
    output = model(test_inputs)
    _, predicted = torch.max(output, 1)  # index of the higher-scoring class per sentence
    correct = (predicted == test_labels).sum().item()
    total = test_labels.size(0)
    accuracy = correct / total * 100
    print(f'Accuracy on Test Set: {accuracy:.2f}% \n')

    print('****Incorrect Sentences**** \n')

    # Detect misused words in test sentences
    for i, sentence in enumerate(test_sentences):
        if predicted[i] == 1:  # If the sentence is predicted as incorrect
            print(f'Sentence: {sentence}')
            misused_words = detect_misused_words(sentence, word_corrections)
            if misused_words:
                for word, correction in misused_words:
                    print(f'Misused words detected: {word}')
                    print(f'Correct alternative word: {correction} \n')
            else:
                # Flagged by the model, but no token has a known correction
                print('No misused words detected. \n')

Accuracy on Test Set: 100.00% 

****Incorrect Sentences**** 

Sentence: She walk to the park with her friends.
Misused words detected: walk
Correct alternative word: walked 

Sentence: The cat chase the mouse around the car.
Misused words detected: chase
Correct alternative word: chased 

Sentence: There knowledge of those facts was incomplete.
Misused words detected: There
Correct alternative word: Their 

