In [67]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional
from sklearn.model_selection import train_test_split

# Load the dataset
file_path = "C:\\Users\\hp\\Desktop\\homophones_dataset.csv"
df = pd.read_csv(file_path)

# Preprocess the dataset
sentences = df['Sentence'].values
homophones = df['Homophone'].values
correct_words = df['Correct Word'].values

# Concatenate sentences and homophones for the input
inputs = [f"{sentence} {homophone}" for sentence, homophone in zip(sentences, homophones)]

# Tokenize the inputs
input_tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
input_tokenizer.fit_on_texts(inputs)
input_sequences = input_tokenizer.texts_to_sequences(inputs)
padded_input_sequences = pad_sequences(input_sequences, maxlen=100, padding='post', truncating='post')

# Tokenize the outputs
output_tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
output_tokenizer.fit_on_texts(correct_words)
output_sequences = output_tokenizer.texts_to_sequences(correct_words)

# Pad the output sequences to have the same length
padded_output_sequences = pad_sequences(output_sequences, maxlen=1, padding='post', truncating='post')

# Convert the outputs to a numpy array (since they are already sequences of integers)
padded_output_sequences = np.array(padded_output_sequences)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(padded_input_sequences, padded_output_sequences, test_size=0.2, random_state=42)

# Load the GloVe embeddings
embedding_index = {}
embedding_dim = 100
glove_file = "C:\\Users\\hp\\Desktop\\New folder\\glove.6B.100d.txt"
with open(glove_file, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Create the embedding matrix
embedding_matrix = np.zeros((10000, embedding_dim))
for word, i in input_tokenizer.word_index.items():
    if i < 10000:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Create the model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=embedding_dim, input_length=100, weights=[embedding_matrix], trainable=False))
model.add(SpatialDropout1D(0.2))
model.add(Bidirectional(LSTM(256, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(10000, activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test), batch_size=64)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

# Save the model
model.save('homophone_correction_model.h5')

# Predict the correct word for new sentences
def predict_correct_word(sentence, homophone):
    input_sequence = input_tokenizer.texts_to_sequences([f"{sentence} {homophone}"])
    padded_input_sequence = pad_sequences(input_sequence, maxlen=100, padding='post', truncating='post')
    prediction = model.predict(padded_input_sequence)
    predicted_word_index = np.argmax(prediction[0])
    predicted_word = output_tokenizer.index_word.get(predicted_word_index, "Unknown")
    return predicted_word


Epoch 1/20




[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 787ms/step - accuracy: 0.3340 - loss: 8.8145 - val_accuracy: 0.4545 - val_loss: 6.1490
Epoch 2/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 682ms/step - accuracy: 0.4729 - loss: 4.9726 - val_accuracy: 0.4545 - val_loss: 1.4390
Epoch 3/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 738ms/step - accuracy: 0.4501 - loss: 1.1496 - val_accuracy: 0.5152 - val_loss: 0.8297
Epoch 4/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 702ms/step - accuracy: 0.5039 - loss: 0.7960 - val_accuracy: 0.5152 - val_loss: 0.8127
Epoch 5/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 713ms/step - accuracy: 0.5236 - loss: 0.7975 - val_accuracy: 0.4545 - val_loss: 0.8136
Epoch 6/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 701ms/step - accuracy: 0.4617 - loss: 0.7960 - val_accuracy: 0.5152 - val_loss: 0.8077
Epoch 7/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[



Test Accuracy: 83.33%


In [83]:
# Example prediction
example_sentence = "Dont go here and their"
example_homophone = "Their", "there", "Your", "They re"
print(predict_correct_word(example_sentence, example_homophone))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
there
