# NLP Language model for text generation involves train a neural network to predict the next word in a sequence of words.

In [17]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense


In [18]:
# Sample Data
text_data = [
    "The sun sets behind the mountains, casting a warm glow across the valley.",
    "Gardeners plant seeds in the fertile soil, hoping for a bountiful harvest.",
    "Soft whispers of the wind rustle through the leaves of the ancient oak tree.",
    "Children giggle and play in the vibrant meadow, surrounded by wildflowers.",
    "A mysterious figure emerges from the shadows, capturing everyone's attention.",
    "Raindrops dance on the windowpane, creating a soothing melody.",
    "The old bookstore is filled with the scent of aging paper and forgotten tales.",
    "In the bustling city, neon lights illuminate the streets as people hurry by.",
    "As the moon rises, nocturnal creatures awaken, beginning their nightly rituals.",
    "Ocean waves gently kiss the shore, carrying secrets from distant lands."
]

# Tokenization
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(text_data)
total_words = len(tokenizer.word_index) + 1


In [19]:
# Create input sequences and targets
input_sequences = []
for line in text_data:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

max_sequence_length = max([len(seq) for seq in input_sequences])
input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, maxlen=max_sequence_length)

x, y = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)




In [20]:
# Build the model
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_length-1))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(x, y, epochs=100, verbose=2)



Epoch 1/100
4/4 - 3s - loss: 4.5322 - accuracy: 0.0093 - 3s/epoch - 693ms/step
Epoch 2/100
4/4 - 0s - loss: 4.5140 - accuracy: 0.1296 - 60ms/epoch - 15ms/step
Epoch 3/100
4/4 - 0s - loss: 4.4908 - accuracy: 0.1296 - 44ms/epoch - 11ms/step
Epoch 4/100
4/4 - 0s - loss: 4.4507 - accuracy: 0.1296 - 42ms/epoch - 10ms/step
Epoch 5/100
4/4 - 0s - loss: 4.3526 - accuracy: 0.1296 - 40ms/epoch - 10ms/step
Epoch 6/100
4/4 - 0s - loss: 4.2765 - accuracy: 0.1296 - 41ms/epoch - 10ms/step
Epoch 7/100
4/4 - 0s - loss: 4.2576 - accuracy: 0.1296 - 46ms/epoch - 11ms/step
Epoch 8/100
4/4 - 0s - loss: 4.2133 - accuracy: 0.1296 - 50ms/epoch - 13ms/step
Epoch 9/100
4/4 - 0s - loss: 4.2001 - accuracy: 0.1296 - 50ms/epoch - 12ms/step
Epoch 10/100
4/4 - 0s - loss: 4.1868 - accuracy: 0.1296 - 56ms/epoch - 14ms/step
Epoch 11/100
4/4 - 0s - loss: 4.1475 - accuracy: 0.1296 - 44ms/epoch - 11ms/step
Epoch 12/100
4/4 - 0s - loss: 4.1243 - accuracy: 0.1296 - 41ms/epoch - 10ms/step
Epoch 13/100
4/4 - 0s - loss: 4.0856 -

<keras.src.callbacks.History at 0x247059bec10>

In [21]:
# Generate text
seed_text = "Raindrops dance on"
next_words = 5
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = tf.keras.preprocessing.sequence.pad_sequences([token_list], maxlen=max_sequence_length-1)
    predicted = np.argmax(model.predict(token_list, verbose=0))
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word

print(seed_text)

Raindrops dance on the windowpane creating a soothing


# Speech to Text model.

In [16]:
def speech_to_text(file):
    samp=sr.AudioFile(file)
    with samp as source:
        audio=recog.record(samp)
    return recog.recognize_google(audio)


In [65]:
print(speech_to_text("I Was Broken.wav"))

I was broken from my young age the message from the beginning from the brain in the beauty


In [10]:
import speech_recognition as sr
recog=sr.Recognizer()
mic=sr.Microphone()
mic.list_microphone_names()
with mic as source:
    audio=recog.listen(source)

In [12]:
recog.recognize_google(audio)

# Text to Speech model.

In [15]:
from gtts import gTTS
import os


def text_to_speech(text,language='en',filename='output.mp3'):
    tts=gTTS(text=text,lang=language,slow=False)
    tts.save(filename)
    os.system(f"start {filename}")

if __name__=="__main__":
    input_text=input('User text pl >>:')
    text_to_speech(input_text)


User text pl >>:Hai How are you,i'm waiting for you since 143 days


# NLP Language model to detect the sentence/word error in the text corpus.

In [45]:
# Sample dataset with correct and incorrect sentences
correct_sentences = ['hello', 'world', 'python', 'spell', 'language', 'model', 'check']
incorrect_sentences = ['helo', 'worl', 'pythoon', 'spl', 'langage', 'moel', 'chek']


In [46]:
# Combine correct and incorrect sentences into a single dataset
all_sentences = correct_sentences + incorrect_sentences

# Labels: 1 for correct sentences, 0 for incorrect sentences
labels = [1] * len(correct_sentences) + [0] * len(incorrect_sentences)



In [47]:
# Split the dataset into train and test
x_train, x_test, y_train, y_test = train_test_split(all_sentences, labels, test_size=0.2, random_state=42)

# Load spaCy English language model
nlp = spacy.load("en_core_web_sm")


In [48]:
# Test the error detection
def detect_error(test_sentence):
    doc = nlp(test_sentence)

    # Check if the sentence contains any parsing errors
    if doc.is_parsed:
        print(f"The sentence '{test_sentence}' is correct.")
    else:
        print(f"The sentence '{test_sentence}' may contain errors.")

# Test the error detection function
detect_error("Hello, this is a sample text with som misspelled words. Can you find the error?")



The sentence 'Hello, this is a sample text with som misspelled words. Can you find the error?' is correct.


  if doc.is_parsed:


In [49]:
# Evaluate the model
y_pred = [nlp(sentence).is_parsed for sentence in x_test]
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 33.33%


  y_pred = [nlp(sentence).is_parsed for sentence in x_test]


# Language model to correct the error in the text

In [61]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

def correct_errors(input_text):
    # Load pre-trained GPT-2 model and tokenizer
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    # Tokenize input text
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # Generate corrected text with explicit attention_mask
    attention_mask = torch.ones_like(input_ids)  # Create attention mask with all 1s
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=100,
            num_beams=5,
            no_repeat_ngram_size=2,
            top_k=50,
            top_p=0.95,
            do_sample=True,
            attention_mask=attention_mask
        )

    corrected_text = tokenizer.decode(output[0], skip_special_tokens=True)

    return corrected_text

# Example usage
input_text = "This s is an examplee of incorrectt textt."
corrected_text = correct_errors(input_text)

print(f"\nOriginal text: {input_text}")
print(f"\nCorrected text: {corrected_text}")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Original text: This s is an examplee of incorrectt textt.

Corrected text: This s is an examplee of incorrectt textt.

If you want to see more examples, check out this page.
