<a href="https://colab.research.google.com/github/thedatadj/natural-language-processing/blob/main/text_generation_shakespeare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data

In [1]:
# Download the corpus file from Google Drive by its file id.
# The recorded output of this cell shows it being saved as /content/sonnets.txt.
!gdown 108jAePKK4R3BVYBbYJZ32JWUwxeMg20K

Downloading...
From: https://drive.google.com/uc?id=108jAePKK4R3BVYBbYJZ32JWUwxeMg20K
To: /content/sonnets.txt
  0% 0.00/93.6k [00:00<?, ?B/s]100% 93.6k/93.6k [00:00<00:00, 68.1MB/s]


In [3]:
# Load the data
# Read the whole corpus into a single string. The encoding is pinned so the
# result does not depend on the platform's default locale encoding.
with open("/content/sonnets.txt", encoding="utf-8") as file:
    data = file.read()

# Preview the first 100 characters
data[:100]

"FROM fairest creatures we desire increase,\nThat thereby beauty's rose might never die,\nBut as the ri"

# Data preprocessing

In [4]:
# Prepare the data: lowercase the full text, then cut it into one string per line
corpus = data.lower().split("\n")
corpus[:2]

['from fairest creatures we desire increase,',
 "that thereby beauty's rose might never die,"]

## Tokenization

In [5]:
# Import tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer

In [6]:
# Fit the tokenizer
# Tokenizer() is created with its default settings; fit_on_texts then builds
# the word-to-id vocabulary from the lines of `corpus`.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

In [7]:
# Word-to-index dictionary: maps each word to its integer id
dic = tokenizer.word_index
# Look up the id assigned to the word "the"
dic['the']

2

In [8]:
# Constant: number of distinct token ids, counting the padding slot.
# len(dic) is the number of unique words in the corpus; the extra 1 covers
# id 0, which the tokenizer assigns to no word and which is the fill value
# that appears in the padded sequences.
TOTAL_WORDS = len(dic) + 1
TOTAL_WORDS

3211

In [11]:
# Using the tokenizer: turn one sentence into its list of word ids.
# texts_to_sequences takes a list of texts, so the sentence is wrapped in a
# list and the first (only) result is taken.
text = "I love to sing"
tokenizer.texts_to_sequences([text])[0]

[6, 14, 3, 323]

## N-grams generator
Maps a line of text to a list of n-grams: every prefix of its token sequence that contains at least two tokens.

In [14]:
def seq_ngrams(sequence, tokenizer):
    """Return every leading slice (two tokens or longer) of a tokenized text.

    Args:
        sequence: Text to tokenize.
        tokenizer: Object exposing ``texts_to_sequences``; the first element
            of its result for ``[sequence]`` is used as the token list.

    Returns:
        A list of token lists ordered from shortest to longest. The list is
        empty when the text produces fewer than two tokens.
    """
    tokens = tokenizer.texts_to_sequences([sequence])[0]
    # Prefix lengths run from 2 up to the full token count.
    return [tokens[:end] for end in range(2, len(tokens) + 1)]

In [15]:
text

'I love to sing'

In [17]:
seq_ngrams(text, tokenizer)

[[6, 14], [6, 14, 3], [6, 14, 3, 323]]

Maps every line in the corpus to its n-grams and collects them all in a single flat list.

In [22]:
def n_gram_seqs(corpus, tokenizer):
    """Gather the n-gram sequences of every line of ``corpus`` into one flat list.

    Args:
        corpus: Iterable of text lines.
        tokenizer: Tokenizer passed through to ``seq_ngrams`` for each line.

    Returns:
        A single list holding the n-gram sequences of all lines, in corpus
        order. Lines for which ``seq_ngrams`` returns nothing add no entries.
    """
    collected = []
    for text_line in corpus:
        collected.extend(seq_ngrams(text_line, tokenizer))
    return collected

In [23]:
# Two sample sentences for trying out n_gram_seqs on more than one line
texts = [text, "I want to eat some bread"]
texts

['I love to sing', 'I want to eat some bread']

In [24]:
n_gram_seqs(texts, tokenizer)

[[6, 14],
 [6, 14, 3],
 [6, 14, 3, 323],
 [6, 566],
 [6, 566, 3],
 [6, 566, 3, 637],
 [6, 566, 3, 637, 82]]

## Transform corpus
Transform the corpus such that each line becomes a sequence of n-grams.

In [26]:
# Flat list of n-gram sequences for the whole corpus (one entry per n-gram, not per line)
input_sequences = n_gram_seqs(corpus, tokenizer)
# Preview the first seven n-grams
input_sequences[:7]

[[34, 417],
 [34, 417, 877],
 [34, 417, 877, 166],
 [34, 417, 877, 166, 213],
 [34, 417, 877, 166, 213, 517],
 [8, 878],
 [8, 878, 134]]

Get the constant `MAX_SEQUENCE_LEN`: the length of the longest n-gram sequence.

In [27]:
# Length of each sequence of n-grams
len_ngrams = [len(sequence) for sequence in input_sequences]
MAX_SEQUENCE_LEN = max(len_ngrams)
MAX_SEQUENCE_LEN

11

## Pad sequences

In [28]:
# Import padder
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [29]:
# Left-pad ('pre') every n-gram sequence with zeros up to MAX_SEQUENCE_LEN,
# so the final token of each sequence always sits in the last column.
padded_sequences = pad_sequences(input_sequences, maxlen=MAX_SEQUENCE_LEN, padding='pre')

In [30]:
input_sequences[:5]

[[34, 417],
 [34, 417, 877],
 [34, 417, 877, 166],
 [34, 417, 877, 166, 213],
 [34, 417, 877, 166, 213, 517]]

In [31]:
padded_sequences[:5]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,  34, 417],
       [  0,   0,   0,   0,   0,   0,   0,   0,  34, 417, 877],
       [  0,   0,   0,   0,   0,   0,   0,  34, 417, 877, 166],
       [  0,   0,   0,   0,   0,   0,  34, 417, 877, 166, 213],
       [  0,   0,   0,   0,   0,  34, 417, 877, 166, 213, 517]],
      dtype=int32)

In [32]:
padded_sequences.shape

(15462, 11)

## Data split

In [33]:
# Inputs are all columns but the last; the target is the last token of each row
features, labels = padded_sequences[:, :-1], padded_sequences[:, -1]

In [34]:
features[:5]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,  34],
       [  0,   0,   0,   0,   0,   0,   0,   0,  34, 417],
       [  0,   0,   0,   0,   0,   0,   0,  34, 417, 877],
       [  0,   0,   0,   0,   0,   0,  34, 417, 877, 166],
       [  0,   0,   0,   0,   0,  34, 417, 877, 166, 213]], dtype=int32)

In [35]:
labels[:5]

array([417, 877, 166, 213, 517], dtype=int32)

In [40]:
features.shape

(15462, 10)

Convert the labels into one-hot encoded vectors.

In [36]:
from tensorflow.keras.utils import to_categorical

In [37]:
# One-hot encode the target token ids.
# The ids are read straight from the last column of `padded_sequences` instead
# of from `labels` itself, so re-running this cell cannot one-hot encode an
# already encoded matrix a second time.
labels = to_categorical(padded_sequences[:, -1], num_classes=TOTAL_WORDS)
labels[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [38]:
labels.shape

(15462, 3211)

# Modeling

In [42]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

In [43]:
# Seed Python, NumPy and TensorFlow before any weights are created, so weight
# initialisation and batch shuffling start from the same state on every run.
# NOTE(review): some GPU kernels may still be non-deterministic - confirm if
# bit-exact reruns are required.
from tensorflow.keras.utils import set_random_seed

set_random_seed(42)

# Next-word classifier: token ids -> 100-dimensional embeddings ->
# bidirectional LSTM (150 units per direction) -> softmax over all token ids.
model = Sequential([
    Embedding(TOTAL_WORDS, 100, input_length=MAX_SEQUENCE_LEN-1),
    Bidirectional(LSTM(150)),
    Dense(TOTAL_WORDS, activation='softmax')
])

# The targets are one-hot vectors, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [45]:
# Training
history = model.fit(features, labels, epochs=50, verbose=0)

# Evaluation

In [47]:
# Training accuracy
history.history['accuracy'][-1]

0.8487905859947205

# Demonstration

In [48]:
import numpy as np

In [49]:
# Starting text and the number of words to append to it
seed = "I love to eat bread with nutella"
next_words = 50

# Greedy generation: at each step predict the most likely next word and
# append it to the text.
for _ in range(next_words):
    # Re-tokenize the whole text; pad_sequences then left-pads or, for long
    # texts, keeps only the last MAX_SEQUENCE_LEN-1 ids to match the model input.
    token_list = tokenizer.texts_to_sequences([seed])[0]
    token_list_padded = pad_sequences([token_list], maxlen=MAX_SEQUENCE_LEN-1, padding='pre')
    probabilities = model.predict(token_list_padded, verbose=0)
    predicted = np.argmax(probabilities, axis=-1).item()
    # Id 0 is the padding slot and has no entry in index_word; stop generating
    # instead of raising KeyError if the model ever predicts it.
    output_word = tokenizer.index_word.get(predicted)
    if output_word is None:
        break
    seed += " " + output_word

seed

"I love to eat bread with nutella your sweet image add be die to more more worth worth me well transferr'd so dearer free done mine own best worth be the done sweet treasure cheeks treasure cheeks so long right back of nought ' behavior tongue worth old time dost lie to me are you so dearer"

<table>
    <tr>
        <td>
            Based on
        </td>
        <td>
            Assignment from the TensorFlow Specialization
        </td>
    </tr>
</table>