In [1]:
# TASK-4 (Natural Language Processing for Text Classification): next-word prediction and text generation with an LSTM

In [2]:
!pip install tensorflow numpy



In [3]:
#Data Preparation
#Importing libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import LSTM, Dense, Embedding

In [4]:
# Training corpus: a single short English passage held in one string.
# It is the only data the tokenizer and the model are fitted on.
# Note: the passage contains exactly one period (the final character before
# the trailing newline); every other clause is joined by commas, colons, etc.
text = """Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do:
once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it,
'and what is the use of a book,' thought Alice 'without pictures or conversations?' So she was considering in her own mind
(as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain
would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.
"""

In [5]:
# Learn the word -> integer-id vocabulary from the training passage.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Size of the model's output layer: one slot per distinct word, plus one
# extra slot for the id used as padding.
total_words = 1 + len(tokenizer.word_index)

# Container that will hold the n-gram training sequences.
input_sequences = list()

In [6]:
# Build the training samples: for every period-delimited chunk of the text,
# emit each prefix of its token sequence that has at least two tokens
# ([w1, w2], [w1, w2, w3], ...). The last token of a prefix later serves as
# the label and the preceding tokens as the input.
#
# The list is (re)created here so the cell is idempotent: re-running it does
# not append duplicate samples, and it still works after `input_sequences`
# has been replaced by a padded array further down the notebook.
input_sequences = []
for line in text.split('.'):
    token_list = tokenizer.texts_to_sequences([line])[0]  # word ids for this chunk
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The longest n-gram fixes the common length of every training row.
max_sequence_len = max(len(seq) for seq in input_sequences)

# Left-pad ('pre') every sequence to that length so the rows stack into a
# single 2-D integer array.
input_sequences = np.array(
    pad_sequences(
        input_sequences,
        maxlen=max_sequence_len,
        padding='pre',
    )
)

In [8]:
# Inputs are every column except the last; the last column is the word id to
# predict, one-hot encoded over the whole vocabulary.
X = input_sequences[:, :-1]
y = to_categorical(input_sequences[:, -1], num_classes=total_words)

In [9]:
from tensorflow.keras.layers import Dropout, LSTM, Embedding, Dense, Input

# Define the model: word ids -> embeddings -> LSTM -> dropout -> softmax over
# the vocabulary (probability of each word being the next one).
#
# The input length is declared with an explicit Input layer instead of
# Embedding(input_length=...): that argument is deprecated in Keras 3 (and is
# rejected by some Keras 3 releases), whereas an Input layer works in both
# Keras 2 and Keras 3.
model = Sequential([
    Input(shape=(max_sequence_len - 1,)),          # one row of X: the padded prefix
    Embedding(total_words, 100),                   # 100-dimensional word embeddings
    LSTM(150),                                     # LSTM layer with 150 units
    Dropout(0.2),                                  # Dropout for regularization
    Dense(total_words, activation='softmax'),      # Output layer with softmax activation
])



In [10]:
# Configure training: Adam optimizer, cross-entropy against the one-hot
# labels, and accuracy reported alongside the loss.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for a fixed number of passes over the data, keeping the per-epoch
# metrics in `history`.
epochs = 50
history = model.fit(
    X,
    y,
    epochs=epochs,
    verbose=1,
)

Epoch 1/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 304ms/step - accuracy: 0.0056 - loss: 4.3701
Epoch 2/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 381ms/step - accuracy: 0.1007 - loss: 4.3565
Epoch 3/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 176ms/step - accuracy: 0.0805 - loss: 4.3433
Epoch 4/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 151ms/step - accuracy: 0.0402 - loss: 4.3164
Epoch 5/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 148ms/step - accuracy: 0.0621 - loss: 4.2492
Epoch 6/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 147ms/step - accuracy: 0.0496 - loss: 4.1863
Epoch 7/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 143ms/step - accuracy: 0.0781 - loss: 4.1448
Epoch 8/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 147ms/step - accuracy: 0.0760 - loss: 4.0602
Epoch 9/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [11]:
# Defining a function to generate new text based on a seed text and desired length
def generate_text(seed_text, next_words, max_sequence_len):
    """Extend ``seed_text`` by repeatedly appending the model's most likely next word.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        seed_text: Text to start from.
        next_words: Maximum number of words to append.
        max_sequence_len: Length of the padded training sequences; the model
            is fed ``max_sequence_len - 1`` token ids per prediction.

    Returns:
        The seed text followed by the generated words, separated by single
        spaces. Fewer than ``next_words`` words are appended if the model
        predicts an id that has no word (e.g. the padding id).
    """
    for _ in range(next_words):
        # Encode the text generated so far and pad it to the model's input length.
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        padded = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

        # Greedy decoding: take the highest-probability word id.
        predicted = model.predict(padded, verbose=0)
        predicted_word_index = int(np.argmax(predicted, axis=1)[0])

        # Direct id -> word lookup instead of scanning word_index on every step.
        word = tokenizer.index_word.get(predicted_word_index)
        if word is None:
            # No word for this id: the text would not change, so every further
            # iteration would repeat the exact same prediction. Stop here.
            break
        seed_text += " " + word
    return seed_text

In [13]:
# Generating and printing text starting with a seed text
seed_text = "There was a boy named kevin"
generated_text = generate_text(seed_text, 10, max_sequence_len)
print(f"Generated text: {generated_text}")

# Save the trained model and tokenizer.
# The tokenizer holds the word <-> id mapping; without it the saved model
# cannot be used to encode new seed text or decode predictions.
# NOTE(review): newer Keras versions recommend the native `.keras` format over
# HDF5; the `.h5` path is kept so existing consumers of this file keep working.
model.save('text_generation_lstm_model.h5')
with open('text_generation_tokenizer.json', 'w', encoding='utf-8') as tokenizer_file:
    tokenizer_file.write(tokenizer.to_json())




Generated text: There was a boy named kevin was beginning get tired tired tired sitting by her sister
