<a href="https://colab.research.google.com/github/syedsaadali11/Text_Generator_using_LSTM/blob/main/Copy_of_Text_gen_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import re

# Step 1: Install Dependencies and Upload Files

# Install required libraries
!pip install tensorflow

# Upload dataset and GloVe embeddings through the Colab file-upload helper.
# NOTE(review): `uploaded` is not read again in the code visible here, and
# main() opens fixed paths ('/content/cleaned_text.txt', 'glove.6B.100d.txt').
# The recorded cell output shows a re-upload being stored as
# 'cleaned_text (3).txt', so a re-uploaded dataset is presumably NOT the file
# main() ends up reading -- confirm before relying on a fresh upload.
from google.colab import files
uploaded = files.upload()

# Step 2: Preprocess Dataset

def load_dataset(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file contents as a single string.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        return source.read()

# Clean and format text

def clean_text(text):
    """Normalise raw text for tokenisation.

    The steps run in this order:
      1. surround each of ``. , ! ?`` with spaces so it becomes its own token;
      2. replace every run of characters other than ASCII letters, ``. , ! ?``
         and the apostrophe with a single space (this also collapses
         whitespace runs);
      3. lowercase the result.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, lowercased string.
    """
    spaced = re.sub(r"([.,!?])", r" \1 ", text)
    filtered = re.sub(r"[^a-zA-Z.,!?']+", " ", spaced)
    return filtered.lower()

# Step 3: Preprocessing Text

def preprocess_text(text, sequence_length=50):
    """Tokenise ``text`` and build next-word training pairs.

    A word-level ``Tokenizer`` is fitted on ``text``. Every window of
    ``sequence_length + 1`` consecutive tokens becomes one sample: the first
    ``sequence_length`` tokens are the input and the last token is the target.

    Args:
        text: The (already cleaned) training corpus as one string.
        sequence_length: Number of context tokens per training sample.

    Returns:
        A tuple ``(tokenizer, X, y, total_words, sequence_length)`` where
        ``X`` has one row of ``sequence_length`` token ids per sample, ``y`` is
        the one-hot encoded target with ``total_words`` classes, and
        ``total_words`` is the vocabulary size plus one.

    Raises:
        ValueError: If ``text`` has no more than ``sequence_length`` tokens,
            so that not even one training window can be formed.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([text])
    # word_index starts at 1, so one extra slot is needed for index 0.
    total_words = len(tokenizer.word_index) + 1

    token_list = tokenizer.texts_to_sequences([text])[0]
    if len(token_list) <= sequence_length:
        # Without this guard the array below is empty and 1-D, and the 2-D
        # slicing fails with an unhelpful IndexError.
        raise ValueError(
            f"Text has {len(token_list)} tokens but at least "
            f"{sequence_length + 1} are required for sequence_length="
            f"{sequence_length}."
        )

    # Sliding windows of sequence_length inputs plus one target token.
    window = sequence_length + 1
    sequences = np.array([
        token_list[start:start + window]
        for start in range(len(token_list) - sequence_length)
    ])

    X, y = sequences[:, :-1], sequences[:, -1]
    y = to_categorical(y, num_classes=total_words)

    return tokenizer, X, y, total_words, sequence_length

# Step 4: Load GloVe Embeddings

def load_glove_embeddings(glove_file, word_index, embedding_dim=100):
    """Build an embedding matrix for ``word_index`` from a GloVe text file.

    Each non-blank line of ``glove_file`` is read as a word followed by its
    whitespace-separated vector components. Only vectors for words present in
    ``word_index`` are kept, so memory use scales with the model vocabulary
    rather than with the size of the GloVe file.

    Args:
        glove_file: Path to the UTF-8 encoded GloVe text file.
        word_index: Mapping from word to its integer row index.
        embedding_dim: Expected number of components per vector.

    Returns:
        A ``(len(word_index) + 1, embedding_dim)`` array. Row ``i`` holds the
        vector of the word mapped to ``i``; rows of words without a GloVe
        vector (and row 0) stay all-zero. A message is printed for every
        vocabulary word that has no vector.

    Raises:
        ValueError: If a vocabulary word's vector in the file does not have
            exactly ``embedding_dim`` components.
    """
    embeddings_index = {}
    with open(glove_file, encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            values = line.split()
            if not values:
                # Blank lines (e.g. a trailing newline) carry no vector.
                continue
            word = values[0]
            if word not in word_index:
                # Skip parsing vectors that would never be looked up.
                continue
            coefs = np.asarray(values[1:], dtype='float32')
            if coefs.shape[0] != embedding_dim:
                raise ValueError(
                    f"Line {line_number} of {glove_file!r}: vector for "
                    f"{word!r} has {coefs.shape[0]} components, expected "
                    f"{embedding_dim}."
                )
            embeddings_index[word] = coefs

    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            print(f"Word '{word}' not found in GloVe embeddings.")

    return embedding_matrix

# Step 5: Create Model with Pre-trained GloVe Embeddings

def create_model_with_glove(total_words, sequence_length, embedding_matrix):
    """Build and compile the two-layer LSTM next-word model.

    The architecture is: an embedding layer initialised from
    ``embedding_matrix`` (and left trainable), an LSTM with 150 units that
    returns sequences, dropout 0.2, an LSTM with 100 units, dropout 0.2, and a
    softmax ``Dense`` layer over ``total_words`` classes. The model is
    compiled with categorical cross-entropy, the Adam optimiser and an
    accuracy metric.

    Args:
        total_words: Number of rows in the embedding / output classes.
        sequence_length: Number of token ids per input sample.
        embedding_matrix: 2-D array of shape ``(total_words, embedding_dim)``.

    Returns:
        The compiled ``Sequential`` model.

    Raises:
        ValueError: If ``embedding_matrix`` is not 2-D or its row count does
            not equal ``total_words``.
    """
    embedding_matrix = np.asarray(embedding_matrix)
    if embedding_matrix.ndim != 2 or embedding_matrix.shape[0] != total_words:
        raise ValueError(
            f"embedding_matrix must have shape (total_words, dim) with "
            f"total_words={total_words}; got {embedding_matrix.shape}."
        )
    # Take the embedding width from the matrix instead of assuming 100-d GloVe.
    embedding_dim = embedding_matrix.shape[1]

    model = Sequential()
    # NOTE(review): recent Keras releases reportedly deprecate `input_length`;
    # it is kept here unchanged -- confirm against the installed version.
    model.add(Embedding(input_dim=total_words,
                        output_dim=embedding_dim,
                        input_length=sequence_length,
                        weights=[embedding_matrix],
                        trainable=True))
    model.add(LSTM(150, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(100))
    model.add(Dropout(0.2))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Step 6: Beam Search Predictions

def beam_search_predictions(model, tokenizer, seed_text, sequence_length, k=3, max_words=50):
    """Extend ``seed_text`` by up to ``max_words`` words using beam search.

    At every step each kept sentence is tokenised, left-padded / truncated to
    ``sequence_length`` ids and scored by ``model``. The ``k`` most probable
    next words of each sentence form the candidates, and the ``k`` candidates
    with the lowest accumulated negative log-likelihood survive.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one
            probability vector per input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        seed_text: Text the generation starts from; returned as the prefix of
            the result.
        sequence_length: Number of token ids fed to the model per sample.
        k: Beam width (must be at least 1).
        max_words: Maximum number of words to append.

    Returns:
        The sentence with the lowest accumulated negative log-likelihood.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        # With k == 0 the slice [-0:] would select the whole vocabulary and
        # the beam would then become empty.
        raise ValueError("k must be at least 1")

    min_probability = 1e-12  # keeps log() finite when the model outputs 0.0
    sequences = [(seed_text, 0.0)]  # Tuples of (sentence, negative log-likelihood)

    for _ in range(max_words):
        # Score every surviving sentence with a single predict() call.
        sentences = [sentence for sentence, _score in sequences]
        batch = pad_sequences(tokenizer.texts_to_sequences(sentences),
                              maxlen=sequence_length, padding='pre')
        batch_predictions = model.predict(batch, verbose=0)

        all_candidates = []
        for (sentence, score), predictions in zip(sequences, batch_predictions):
            taken = 0
            # Walk indices from most to least probable until k real words are
            # found; indices without a word (e.g. padding index 0) are skipped.
            for index in np.argsort(predictions)[::-1]:
                word = tokenizer.index_word.get(int(index))
                if word is None:
                    continue
                probability = max(float(predictions[index]), min_probability)
                all_candidates.append(
                    (sentence + ' ' + word, score - np.log(probability))
                )
                taken += 1
                if taken == k:
                    break

        if not all_candidates:
            # No index maps to a word; keep what has been generated so far.
            break

        # Lower negative log-likelihood means a more probable sentence.
        sequences = sorted(all_candidates, key=lambda candidate: candidate[1])[:k]

    return sequences[0][0]  # Return the best sequence

# Step 7: Main Function

def main():
    """Run the full training pipeline and return the trained artefacts.

    Reads and cleans the dataset, builds the training pairs, loads the GloVe
    vectors, creates the model, trains it with early stopping on the training
    loss, and saves it to ``text_generator_model.h5``.

    Returns:
        A tuple ``(model, tokenizer, sequence_length)``.
    """
    # Input locations
    dataset_file = '/content/cleaned_text.txt'  # Replace with your dataset file name
    glove_file = 'glove.6B.100d.txt'

    # Corpus -> cleaned text -> training arrays
    corpus = clean_text(load_dataset(dataset_file))
    tokenizer, X, y, total_words, sequence_length = preprocess_text(corpus)

    # Pre-trained vectors feed the embedding layer of the model
    embedding_matrix = load_glove_embeddings(glove_file, tokenizer.word_index, embedding_dim=100)
    model = create_model_with_glove(total_words, sequence_length, embedding_matrix)

    # Stop once the training loss has not improved for 10 epochs
    from tensorflow.keras.callbacks import EarlyStopping
    stopper = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)
    model.fit(X, y, epochs=300, batch_size=64, callbacks=[stopper])

    # Persist the trained model
    model.save('text_generator_model.h5')
    return model, tokenizer, sequence_length

if __name__ == "__main__":
    # Bind the results at module level: the later notebook cells call
    # model.save(...) and beam_search_predictions(model, tokenizer, ...,
    # sequence_length, ...) using exactly these names.
    model, tokenizer, sequence_length = main()

    # Disabled example; an equivalent generation call lives in a later cell.
    # # Generate text using beam search
    # seed_text = "i shall be late when she thought it over"
    # generated_text = beam_search_predictions(model, tokenizer, seed_text, sequence_length, k=3, max_words=50)
    # print(generated_text)




Saving cleaned_text.txt to cleaned_text (3).txt
Word 'hadn' not found in GloVe embeddings.
Word 'comfits' not found in GloVe embeddings.
Word 'curtseying' not found in GloVe embeddings.
Word 'hearthrug' not found in GloVe embeddings.
Word 'skurried' not found in GloVe embeddings.
Word 'inquisitively' not found in GloVe embeddings.
Word 'draggled' not found in GloVe embeddings.
Word 'crossly' not found in GloVe embeddings.
Word 'snappishly' not found in GloVe embeddings.
Epoch 1/300
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.0363 - loss: 6.4471
Epoch 2/300
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.0418 - loss: 5.8404
Epoch 3/300
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.0527 - loss: 5.7781
Epoch 4/300
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.0509 - loss: 5.6608
Epoch 5/300
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━



i shall be late when she thought it over afterwards it best of a shiver i beg your pardon said the mouse frowning but very politely did you speak not i said the lory hastily i thought you did said the mouse i proceed edwin and morcar the earls of mercia and northumbria declared for him and even stigand


In [None]:
# Save the trained model a second time under a `.keras` file name. main() has
# already written 'text_generator_model.h5'; note that this name has no
# underscore before "model", so the two files are not named consistently.
model.save('text_generatormodel.keras')

In [None]:
# Generate text using beam search
# beam_search_predictions builds each candidate as `sentence + ' ' + word`, so
# the seed (including its leading and trailing spaces) is returned verbatim at
# the start of the generated text.
seed_text = " but at the time it all seemed quite natural but when  "
# k=3 is the beam width; at most 50 words are appended to the seed.
generated_text = beam_search_predictions(model, tokenizer, seed_text, sequence_length, k=3, max_words=50)
print(generated_text)

 but at the time it all seemed quite natural but when   the rabbit actually took a watch out of its waistcoat pocket and looked at it and then hurried on alice started to her feet for it flashed across her mind that she had never before seen a rabbit with either a waistcoat pocket or a watch to take out of
