In [30]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [31]:
# Load the training corpus: one entry per line of corpus.txt, with surrounding
# whitespace trimmed and blank lines dropped.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
# NOTE(review): assumes corpus.txt is UTF-8 encoded — confirm.
with open('corpus.txt', 'r', encoding='utf-8') as file:
    # Iterate the file lazily instead of readlines() + a second filtering pass.
    corpus = [line.strip() for line in file if line.strip()]

In [39]:
# Fit a word-level tokenizer on the corpus and encode every line as a list of
# integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)

# One more than the number of distinct words (the Keras Tokenizer documents
# id 0 as reserved, so word ids start at 1).
vocab_size = 1 + len(tokenizer.word_index)
# Longest encoded line; every training prefix is padded to this length.
max_len = max(map(len, sequences))

# For every line, every proper prefix becomes an input and the word that
# follows the prefix becomes its target.
prefix_target_pairs = [
    (encoded[:cut], encoded[cut])
    for encoded in sequences
    for cut in range(1, len(encoded))
]

# Left-pad the prefixes to a common length.
X = pad_sequences(
    [prefix for prefix, _ in prefix_target_pairs],
    maxlen=max_len,
    padding='pre',
)
y = np.array([target for _, target in prefix_target_pairs])

In [27]:
# Next-word prediction network.
# Layers are stacked in data-flow order: integer word ids -> embedding vectors
# -> LSTM summary of the sequence -> dropout -> probability for each word id.
# The Embedding layer has to come first because it is the only layer that
# accepts integer ids, and the softmax Dense layer has to come last so the
# output has one probability per vocabulary entry, which is what
# sparse_categorical_crossentropy compares against the integer targets in y.
model = Sequential()
# An explicit Input fixes the sequence length (replaces the deprecated
# `input_length` argument of Embedding).
model.add(tf.keras.Input(shape=(max_len,)))
model.add(Embedding(vocab_size, 50))
model.add(LSTM(100, return_sequences=False))
# Dropout regularises the LSTM output; it must not follow the softmax layer,
# where it would distort the predicted probabilities during training.
model.add(Dropout(0.2))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [32]:
# Train on the padded prefixes in X with the following word ids in y.
# The call is the last expression of the cell, so the returned History object
# is what the cell displays.
model.fit(
    x=X,
    y=y,
    batch_size=64,
    epochs=500,
    verbose=1,
)

Epoch 1/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.8615 - loss: 0.3063
Epoch 2/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.8604 - loss: 0.3214
Epoch 3/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.8417 - loss: 0.3512
Epoch 4/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.8678 - loss: 0.3196
Epoch 5/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.8601 - loss: 0.3064
Epoch 6/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.8602 - loss: 0.3212
Epoch 7/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.8378 - loss: 0.3595
Epoch 8/500
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.8471 - loss: 0.3465
Epoch 9/500
[1m11/11[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x21b829a8c20>

In [34]:
def generate_text(model,tokenizer,seed_text,max_len,n_words):
    """Extend ``seed_text`` by sampling up to ``n_words`` words from ``model``.

    Parameters
    ----------
    model : object with ``predict(batch, verbose=0)`` returning one row of
        per-word-id probabilities for each input row.
    tokenizer : object providing ``texts_to_sequences`` and ``index_word``.
    seed_text : str
        Text the generation starts from; it is returned as the prefix of the
        result.
    max_len : int
        Length the context window is padded/truncated to before each
        prediction.
    n_words : int
        Maximum number of words to append.

    Returns
    -------
    str
        ``seed_text`` followed by the sampled words, separated by single
        spaces.

    Notes
    -----
    Word id 0 is never sampled: it is the value ``pad_sequences`` pads with by
    default and the Keras ``Tokenizer`` does not assign it to any word, so
    sampling it would append an empty word and feed padding back into the
    context. Sampling uses NumPy's global random state, so ``np.random.seed``
    makes the output reproducible.
    """
    input_seq = tokenizer.texts_to_sequences([seed_text])
    input_seq = pad_sequences(input_seq, maxlen=max_len, padding='pre')

    generated_text = seed_text
    for _ in range(n_words):
        # np.array copies, so masking below never mutates the model's output.
        # float64 keeps the renormalised vector summing to 1 within the
        # tolerance np.random.choice enforces.
        probs = np.array(model.predict(input_seq, verbose=0)[0], dtype=np.float64)
        probs[:1] = 0.0  # exclude the padding id (slice form tolerates empty output)
        total = probs.sum()
        if not total > 0:
            # No usable probability mass (all on padding, empty, or NaN): stop early.
            break
        probs /= total

        predicted_word_index = int(np.random.choice(len(probs), p=probs))
        predicted_word = tokenizer.index_word.get(predicted_word_index, '')

        # Skip ids without a word so the output never contains double spaces.
        if predicted_word:
            generated_text += ' ' + predicted_word

        # Slide the context window: drop the oldest id, append the new one.
        input_seq = np.append(input_seq[0][1:], predicted_word_index).reshape(1, -1)
    return generated_text
    

In [36]:
# Demo: continue a short prompt with 20 sampled words and print the result.
seed_text = "LSTMs are good"
generated_text = generate_text(
    model=model,
    tokenizer=tokenizer,
    seed_text=seed_text,
    max_len=max_len,
    n_words=20,
)
print("Generated Text:", generated_text)

Generated Text: LSTMs are good good time series analysis sequences data include advancements analysis is many modeling ai advancements advancements has learning computing we preprocessing
