In [1]:
import numpy as np 
import pandas as pd 
import json
import re

from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.callbacks import ModelCheckpoint

In [2]:
class LoadData:
    def download_data():
        dataset = load_dataset("512duncanl/wh40k_novels")
        data = dataset['train']['text']

In [3]:
dataset = load_dataset("512duncanl/wh40k_novels")

Found cached dataset json (C:/Users/90530/.cache/huggingface/datasets/512duncanl___json/512duncanl--wh40k_novels-3308fde660ed265e/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
data = dataset['train']['text']
data[:4]

['PART ONE. THE DECEIVED\n\nONE\n\nBlood from misunderstanding\n\nOur brethren in ignorance\n\nThe Emperor dies\n\n\'I WAS THERE.\' he would say afterwards, until afterwards became a time quite devoid of laughter. \'I was there, the day Horus slew the Emperor.\' It was a delicious conceit, and his comrades would chuckle at the sheer treason of it.\n\nThe story was a good one. Torgaddon would usually be the one to cajole him into telling it, for Torgaddon was the joker, a man of mighty laughter and idiot tricks. And Loken would tell it again, a tale rehearsed through so many retellings, it almost told itself.\n\nLoken was always careful to make sure his audience properly understood the irony in his story. It was likely that he felt some shame about his complicity in the matter itself, for it was a case of blood spilled from misunderstanding. There was a great tragedy implicit in the tale of the Emperor\'s murder, a tragedy that Loken always wanted his listeners to appreciate. But the de

In [8]:
def preprocess_data(sentence):
    sentence = sentence.lower()
    sentence = re.sub(r'[^a-z0-9]', ' ', sentence)
    sentence = re.sub(' +', ' ', sentence)

    return sentence

def tokenize_data(total_words):
    cleaned_sentences = list(map(preprocess_data, data))
    cleaned_sentences = ' '.join(cleaned_sentences)[:total_words]

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([cleaned_sentences])
    tokenized_data = tokenizer.texts_to_sequences([cleaned_sentences])[0]
    index_to_word = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))

    return tokenized_data, index_to_word, tokenizer

In [9]:
tokenized_data, index_to_word, tokenizer = tokenize_data(1000000)

In [10]:
num_words = len(tokenizer.word_index) + 1
sentence_length = 5

input_data = []
output_data = []

for i in range(0, len(tokenized_data)-sentence_length):
    input_data.append(tokenized_data[i:i+sentence_length])
    output_data.append(tokenized_data[i+sentence_length])

input_data = np.array(input_data)
output_data = np.array(output_data)

output_data = to_categorical(output_data, num_classes=num_words)

In [11]:
print("input_data shape : {}".format(input_data.shape))
print("output_data shape : {}".format(output_data.shape))

input_data shape : (183651, 5)
output_data shape : (183651, 13322)


In [12]:
def rnn_model(optimizer, epochs):
    model = Sequential()

    model.add(Embedding(input_dim=num_words, output_dim=300, input_length=sentence_length))
    model.add(LSTM(256, return_sequences=True))
    model.add(LSTM(128, return_sequences=True))
    model.add(LSTM(128))
    model.add(Dense(num_words, activation='softmax'))

    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    model.fit(input_data, output_data, epochs=epochs)

    return model

In [None]:
adam_model = rnn_model('adam', 30)

In [207]:
def text_generation(input_text):
    word = ''
    text = input_text.lower()
    text = tokenizer.texts_to_sequences([text])[0]
    text = np.array(text).reshape(-1, 1)

    predict = model.predict(text)
    predict = np.argmax(predict, axis=-1)

    for p in predict:
        word += index_to_word[p] + ' '
    
    output_text = input_text + ' ' + word
    output_text = re.sub(' +', ' ', output_text)

    return output_text

In [217]:
samples = [
    'The cat sat on the ', 'The old house creaked with ', 'The cat chased the playful ', 
    'The sun sets behind the ', 'I think it once contained', 'unsurprised when he saw that'
    ]

output_texts = []

for sample in samples:
    output_texts.append(text_generation(sample))



In [220]:
output_texts

['The cat sat on the revealing yourself pour revealing ',
 'The old house creaked with revealing custodian units turn ',
 'The cat chased the playful revealing revealing ',
 'The sun sets behind the revealing dealing injured revealing ',
 'I think it once contained fade uneasily decks rother ',
 'unsurprised when he saw that sentient shadows somehow underlit ']