In [129]:
import numpy as np
import pandas as pd
import re

from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.callbacks import EarlyStopping

In [111]:
dataset = load_dataset("512duncanl/wh40k_novels")
data = dataset['train']['text']

Found cached dataset json (C:/Users/90530/.cache/huggingface/datasets/512duncanl___json/512duncanl--wh40k_novels-3308fde660ed265e/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

In [112]:
def preprocess_data(sentence):
    sentence = sentence.lower()
    sentence = re.sub(r'[^a-z0-9]', ' ', sentence)  
    sentence = re.sub(' +', ' ', sentence)

    return sentence

def tokenize_data(total_words):
    cleaned_sentences = list(map(preprocess_data, data))
    cleaned_sentences = ' '.join(cleaned_sentences)[:total_words]

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([cleaned_sentences])
    tokenized_data = tokenizer.texts_to_sequences([cleaned_sentences])[0]
    index_to_word = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))

    return tokenized_data, index_to_word, tokenizer

In [113]:
tokenized_data, index_to_word, tokenizer = tokenize_data(500000)

In [114]:
num_words = len(tokenizer.word_index) + 1
sentence_length = 5

input_data = []
output_data = []

for i in range(0, len(tokenized_data)-sentence_length):
    input_data.append(tokenized_data[i:i+sentence_length])
    output_data.append(tokenized_data[i+sentence_length])

output_data = to_categorical(output_data, num_classes=num_words)

input_data = np.array(input_data)
output_data = np.array(output_data)

In [116]:
print("input_data shape : {}".format(input_data.shape))
print("output_data shape : {}".format(output_data.shape))

input_data shape : (92057, 5)
output_data shape : (92057, 9431)


In [13]:
def rnn_model(optimizer, epochs):
    model = Sequential()

    model.add(Embedding(input_dim=num_words, output_dim=300, input_length=sentence_length))
    model.add(LSTM(256, return_sequences=True))
    model.add(LSTM(128, return_sequences=True))
    model.add(LSTM(128))
    model.add(Dense(num_words, activation='softmax'))

    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    es = EarlyStopping(monitor='loss', patience=5)
    model.fit(input_data, output_data, epochs=epochs, callbacks=[es])

    return model

In [16]:
adam_model = rnn_model('adam', 300)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300


In [54]:
adam_model = load_model('./model/adam_model.h5')

In [194]:
def text_generation(input_text, len_text):
    word = ''
    generated_text = input_text.lower()

    for _ in range(len_text):
        text = tokenizer.texts_to_sequences([generated_text])[0][-5:]
        text = pad_sequences([text], maxlen=5, padding='post')
        predict = adam_model.predict(text)
        word = tokenizer.index_word[np.argmax(predict)]
        generated_text += ' ' + word
        generated_text = re.sub(' +', ' ', generated_text)

    return generated_text

In [204]:
samples = [
    'The ships of the speartip', 
    'Surface batteries smashed them out', 
    'Just a week or two',
    "But there's something I must", 
    'With a sound like the', 
    'I really like to eat'
    ]

output_texts = []

for sample in samples:
    output_texts.append(text_generation(sample, 10))



In [205]:
output_texts

['the ships of the speartip slipped forward through his study lucius knew again i m',
 'surface batteries smashed them out of the heavens as the burning scads of debris from',
 'just a week or two before a sozzled second engineer had explained to karkasy that',
 "but there's something i must for this right sir loken asked i don t know",
 'with a sound like the warmaster seemed to draw together the lonely storms what momus',
 'i really like to eat men there were precious active ring in jubal auto sliding']