In [19]:
import pandas as pd
import tensorflow as tf
import nltk
import os
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dropout



In [20]:
seq_len = 50
train_size = 0.8
embedding_vector_len = 100
max_words = 5000
epochs = 10
batch_size=50
source_folder = "gutenberg_texts"
files_to_read = 4
temperature = 1.0

In [21]:
#reads data from one file
#with open("gutenberg_texts\pg21687.txt", "r", encoding="utf-8") as f:
#   data = f.read().lower()

data_list = []
count = 0
for file_name in os.listdir(source_folder):
    if count == files_to_read:
        break
    else:
        count+=1
    try:
        file_path = os.path.join(source_folder, file_name)
        with open(file_path, "r", encoding="utf-8") as f:
            data = f.read().lower()
        data_list.append(data)

    except Exception as e:
        print(f"Error with reading {file_name}: {e}")

data_str = " ".join(data_list)
tokenizer = Tokenizer(char_level=False, filters="!\"#$%&()*+-/:;<=>?@[\\]^_`{|}~", num_words=max_words)
tokenizer.fit_on_texts([data_str])
#tokenized_data = word_tokenize(data.lower())
sequences = tokenizer.texts_to_sequences([data_str])[0]
#print(sequences[:20])

vocab_size = len(tokenizer.word_index) + 1
vocab = list(tokenizer.word_index.keys())

#print(sequences[:10])

In [22]:
#splitting up text
seq_arr = []

for i in range(len(sequences) - seq_len):
    seq_arr.append(sequences[i:i + seq_len])

padded_seq = pad_sequences(seq_arr, maxlen = seq_len)

print(padded_seq.shape)

(315158, 50)


In [23]:
#prepare data
train_partition = int(len(padded_seq) * train_size)

X_train, X_test = padded_seq[:train_partition], padded_seq[train_partition:]
y_train, y_test = sequences[seq_len: train_partition + seq_len], sequences[train_partition + seq_len:]
#y_train = to_categorical(y_train, num_classes=vocab_size)
#y_test = to_categorical(y_test, num_classes=vocab_size)
y_train = np.array(y_train)
y_test = np.array(y_test)

print(f"X_train: {len(X_train)} X_test: {len(X_test)}")
print(y_train.shape)


X_train: 252126 X_test: 63032
(252126,)


In [24]:
#define model
model = Sequential([
    Embedding(vocab_size, embedding_vector_len, input_length = seq_len),
    LSTM(150, return_sequences=True),
    LSTM(150),
    Dropout(0.4),
    Dense(vocab_size, activation="softmax"),
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

print(model.summary())

None


In [25]:
#train model
model.fit(X_train, y_train, epochs=epochs, verbose=1)
model.save("lstm_model.h5")


Epoch 1/10
[1m7879/7879[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2133s[0m 270ms/step - accuracy: 0.0529 - loss: 6.7251
Epoch 2/10
[1m7879/7879[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2166s[0m 275ms/step - accuracy: 0.0827 - loss: 5.9963
Epoch 3/10
[1m7879/7879[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2219s[0m 282ms/step - accuracy: 0.1087 - loss: 5.6852
Epoch 4/10
[1m7879/7879[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2327s[0m 295ms/step - accuracy: 0.1291 - loss: 5.4644
Epoch 5/10
[1m7879/7879[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1972s[0m 250ms/step - accuracy: 0.1405 - loss: 5.2995
Epoch 6/10
[1m7879/7879[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2010s[0m 255ms/step - accuracy: 0.1494 - loss: 5.1827
Epoch 7/10
[1m7879/7879[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2170s[0m 275ms/step - accuracy: 0.1542 - loss: 5.0839
Epoch 8/10
[1m7879/7879[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2107s[0m 267ms/step - accuracy: 0.1608



In [26]:
"""#generating story
story_len = 0
#while loop for starting input
while True:
    start_text = input("How do you want to start your story?")
    if isinstance(start_text, str):
        break
    elif len(list(start_text.split())) > 50:
        print("The beginning of your story should be 50 words or less")
    else:
        print("Enter a valid beginning")

#while loop for story length
while True:
    len_choice = input("How many words do you want in your story?")
    if not len_choice.isdigit():
        print("Please enter a valid number")
    elif int(len_choice) < 0:
        print("Please enter a number greater than 0")
    else:
        story_len = int(len_choice)
        break

print("Proceeding to your story...")
print("Start text:", start_text)
for word in start_text.split():
    if word not in tokenizer.word_index:
        print(f"Word '{word}' is not in the vocabulary.")

generated_story = start_text

#preprocessing starting input
tokenized_start = tokenizer.texts_to_sequences([start_text.lower()])
print("Tokenized start:", tokenized_start)

if not tokenized_start or len(tokenized_start[0]) == 0:
    print("No valid tokens in start text.")
else:
    start_sequence = pad_sequences(tokenized_start, maxlen=seq_len, padding="pre")

start_sequence = pad_sequences(tokenized_start, maxlen=seq_len, padding="pre")

print(type(start_sequence))
print(f"Start sequence shape: {start_sequence.shape}")
print(f"Start sequence (first entry): {start_sequence[0]}")

#try statement for generating text
for i in range(story_len):
    try:
        pred_text = model.predict(start_sequence, verbose=0)
        #print(type(text))
        #print(text[0])

        preds = np.asarray(pred_text).astype("float64")
        preds = np.log(preds + 1e-7) / temperature

        #pred_text_index = np.argmax(preds, axis=-1)[0]
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds)

        print("Predicted probabilities:", preds)


        pred_text_index = np.random.choice(len(preds[0]), p=preds[0])

        #print(f"Predictions: {pred_text[0]}")

        pred_word = tokenizer.index_word.get(pred_text_index, "Unknown")
        generated_story+=" " + pred_word

        print(pred_word)

        start_sequence = pad_sequences([start_sequence[0].tolist() + [pred_text_index]], maxlen=seq_len, padding="pre")
    except Exception as e:
        print(f"Error with generating text: {e}")"""

'#generating story\nstory_len = 0\n#while loop for starting input\nwhile True:\n    start_text = input("How do you want to start your story?")\n    if isinstance(start_text, str):\n        break\n    elif len(list(start_text.split())) > 50:\n        print("The beginning of your story should be 50 words or less")\n    else:\n        print("Enter a valid beginning")\n\n#while loop for story length\nwhile True:\n    len_choice = input("How many words do you want in your story?")\n    if not len_choice.isdigit():\n        print("Please enter a valid number")\n    elif int(len_choice) < 0:\n        print("Please enter a number greater than 0")\n    else:\n        story_len = int(len_choice)\n        break\n\nprint("Proceeding to your story...")\nprint("Start text:", start_text)\nfor word in start_text.split():\n    if word not in tokenizer.word_index:\n        print(f"Word \'{word}\' is not in the vocabulary.")\n\ngenerated_story = start_text\n\n#preprocessing starting input\ntokenized_star

In [27]:
#print(generated_story)
