### Import the required libraries

In [1]:
%pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import numpy as np
import pandas as pd
import re

# datasets is a library that contains a lot of datasets that are useful and easy to use.
from datasets import load_dataset

# Tokenizer from tensorflow is used to convert the text into tokens.
from tensorflow.keras.preprocessing.text import Tokenizer
# pad_sequences is used to make the length of all the sentences equal.
from tensorflow.keras.preprocessing.sequence import pad_sequences
# to_categorical is used to convert the labels into one-hot encoding.
from tensorflow.keras.utils import to_categorical

# Required libraries for the model.
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.callbacks import EarlyStopping

### Load Dataset

In [3]:
# Load the dataset from huggingface. https://huggingface.co/datasets/512duncanl/wh40k_novels
dataset = load_dataset("512duncanl/wh40k_novels")
data = dataset['train']['text']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/327 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/38.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.91M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

### Pre-Process The Data

In [4]:
def preprocess_data(sentence):
    """
    # Preprocess the data by removing special characters and converting to lowercase.
    Args:
        sentence (str): The sentence to be preprocessed.
    Returns:
        str: The preprocessed sentence.
    """
    sentence = sentence.lower()
    sentence = re.sub(r'[^a-z0-9]', ' ', sentence)
    sentence = re.sub(' +', ' ', sentence)

    return sentence

def tokenize_data(total_words):
    '''
    # Tokenize the data
    Args:
        total_words (int): The total number of words to be used.
    Returns:
        tokenized_data (list): The tokenized data.
        index_to_word (dict): The index to word mapping.
        tokenizer (Tokenizer): The tokenizer object.
    '''
    cleaned_sentences = list(map(preprocess_data, data))
    cleaned_sentences = ' '.join(cleaned_sentences)[:total_words]

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([cleaned_sentences])
    tokenized_data = tokenizer.texts_to_sequences([cleaned_sentences])[0]
    index_to_word = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))

    return tokenized_data, index_to_word, tokenizer

In [6]:
tokenized_data, index_to_word, tokenizer = tokenize_data(500000)

### Generate Data

In [7]:
def generate_data():
    '''
    # Generate the data
    Returns:
        input_data (numpy array): The input data.
        output_data (numpy array): The output data.

    # Formula for generating the data

    sample_data = [2, 5, 6, 1, 9, 11, 2, 3, 43]

    sentence_length = 3
    num_words = 10

    | input_data  | output_data |
    | ----------  | ----------- |
    | 2, 5, 6     |      1      |
    | 5, 6, 1     |      9      |
    | 6, 1, 9     |      11     |
    | 1, 9, 11    |      2      |
    | 9, 11, 2    |      3      |
    | 11, 2, 3    |      43     |
    '''
    input_data = []
    output_data = []

    for i in range(0, len(tokenized_data) - sentence_length):
        input_data.append(tokenized_data[i:i+sentence_length])
        output_data.append(tokenized_data[i+sentence_length])

    output_data = to_categorical(output_data, num_classes=num_words)

    input_data = np.array(input_data)
    output_data = np.array(output_data)

    return input_data, output_data

In [8]:
# num_words is the total number of unique words in the dataset.
num_words = len(tokenizer.word_index) + 1

# sentence_length is the length of the input data.
sentence_length = 5

input_data, output_data = generate_data()

In [9]:
print(f"input_data shape : {input_data.shape}")
print(f"output_data shape : {output_data.shape}")

input_data shape : (92057, 5)
output_data shape : (92057, 9431)


### Train the Model

In [11]:
def rnn_model(optimizer, epochs):
    '''
    # Build the RNN model
    Args:
        optimizer (str): The optimizer to be used.
        epochs (int): The number of epochs to train the model.
    Returns:
        model (Sequential): The RNN model.
    '''
    # Initialize the model
    model = Sequential()

    # Add the Embedding layer
    model.add(Embedding(input_dim=num_words, output_dim=300, input_length=sentence_length))

    # Add LSTM layers
    model.add(LSTM(256, return_sequences=True))
    model.add(LSTM(128, return_sequences=True))
    model.add(LSTM(128))

    # Add the Dense layer for output
    model.add(Dense(num_words, activation='softmax'))

    # Compile the model
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    # Early Stopping with
    es = EarlyStopping(monitor='loss', patience=5)
    model.fit(input_data, output_data, epochs=epochs, callbacks=[es])

    return model

In [12]:
model = rnn_model('adam', 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100


In [13]:
# Save the model.
# model.save('./model/best_model.h5')

  saving_api.save_model(


In [14]:
# Load the model.
# model = load_model('./model/best_model.h5')

In [15]:
def text_generation(input_text, len_text):
    word = ''
    generated_text = input_text.lower()
    
    # Generate the text.
    for _ in range(len_text):
        # Tokenize the text.
        text = tokenizer.texts_to_sequences([generated_text])[0][-sentence_length:]

        # Pad the text.
        text = pad_sequences([text], maxlen=5, padding='post')

        # Predict the next word.
        predict = adam_model.predict(text)

        # Get the prediction with the highest probability and convert it to a word.
        word = tokenizer.index_word[np.argmax(predict)]

        # Append the word to the generated text.
        generated_text += ' ' + word

        # Remove extra spaces.
        generated_text = re.sub(' +', ' ', generated_text)

    return generated_text

In [None]:
samples = [
    'The ships of the speartip',
    'Surface batteries smashed them out',
    'Just a week or two',
    "But there's something I must",
    'With a sound like the',
    'I really like to eat'
    ]

output_texts = []

for sample in samples:
    output_texts.append(text_generation(sample, 10))

In [17]:
output_texts

['the ships of the speartip slipped forward running under obscurement on board stormbirds were hauled',
 'surface batteries smashed them out of the heavens as the burning scads of debris from',
 'just a week or two before a sozzled second engineer had explained to karkasy that',
 "but there's something i must the deal of all or still worn deck his face",
 'with a sound like the legion i believe that if he does even old he',
 'i really like to eat the time they knew he had switched out as a']