<a href="https://colab.research.google.com/github/whtan88/Python-Projects/blob/master/Google_Tensorflow_Certificate_Force_Prep_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Key Concepts:


*   Build natural language processing systems
*   Prepare text to use the models
*   Use word embeddings
*   Train LSTMs or GRUs on existing text to generate text (such as songs and poetry)





In [1]:
#import libraries to use
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, GRU
import tensorflow_datasets as tfds

#important to use for bypassing cert error issues
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
#Downloading the dataset. Using Shakespeare's sonnet
!pip install gdown==5.1.0
!gdown --id 108jAePKK4R3BVYBbYJZ32JWUwxeMg20K

Collecting gdown==5.1.0
  Downloading gdown-5.1.0-py3-none-any.whl (17 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.7.3
    Uninstalling gdown-4.7.3:
      Successfully uninstalled gdown-4.7.3
Successfully installed gdown-5.1.0
Downloading...
From: https://drive.google.com/uc?id=108jAePKK4R3BVYBbYJZ32JWUwxeMg20K
To: /content/sonnets.txt
100% 93.6k/93.6k [00:00<00:00, 5.17MB/s]


In [3]:
#Define path for file with sonnets
SONNETS_FILE = './sonnets.txt'

#Read the data
with open('./sonnets.txt') as f:
    data = f.read()

#Convert to lower case and save as a list
corpus = data.lower().split("\n")

print(f"There are {len(corpus)} lines of sonnets\n")

#Check the average number of words per line
avg_words_line = []
for line in corpus:
  line = str(line)
  count_words = [word for word in line.split()]
  avg_words_line.append(len(count_words))

avg_words_line = int(np.average(avg_words_line))
print(f'\nThe average number of words per line is : {avg_words_line}\n')

#Inspect the first sonnet
print(f"The first sonnet look like this:\n")
for i in range(14):
  print(corpus[i])

There are 2159 lines of sonnets


The average number of words per line is : 8

The first sonnet look like this:

from fairest creatures we desire increase,
that thereby beauty's rose might never die,
but as the riper should by time decease,
his tender heir might bear his memory:
but thou, contracted to thine own bright eyes,
feed'st thy light'st flame with self-substantial fuel,
making a famine where abundance lies,
thyself thy foe, to thy sweet self too cruel.
thou that art now the world's fresh ornament
and only herald to the gaudy spring,
within thine own bud buriest thy content
and, tender churl, makest waste in niggarding.
pity the world, or else this glutton be,
to eat the world's due, by the grave and thee.


In [4]:
#Tokenizing the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

#Inspecting the sequence of the first line
tokenizer.texts_to_sequences([corpus[0]])[0]

[34, 417, 877, 166, 213, 517]

In [5]:
#Function to generate n-gram sequence
def n_gram_seqs(corpus, tokenizer):
    input_sequences = []

    for line in corpus:
      token_list = tokenizer.texts_to_sequences([line])[0]
      for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)
    return input_sequences

#Inspect the results of the n-grams sequence function
first_example_sequence = n_gram_seqs([corpus[0]], tokenizer)
print("n_gram sequences for first example look like this:\n")
first_example_sequence

n_gram sequences for first example look like this:



[[34, 417],
 [34, 417, 877],
 [34, 417, 877, 166],
 [34, 417, 877, 166, 213],
 [34, 417, 877, 166, 213, 517]]

In [6]:
#Apply the n-grams sequence function to the whole corpus
input_sequences = n_gram_seqs(corpus, tokenizer)
max_sequence_len = max([len(x) for x in input_sequences])
print(f"n_grams of input_sequences have length: {len(input_sequences)}")
print(f"maximum length of sequences is: {max_sequence_len}")

n_grams of input_sequences have length: 15462
maximum length of sequences is: 11


In [7]:
#Create padding sequence function
def pad_seqs(input_sequences, maxlen):
    padded_sequences = np.array(pad_sequences(input_sequences, maxlen=maxlen, padding='pre'))
    return padded_sequences

#Inspect the results of the padding sequence function
# Test your function with the n_grams_seq of the first example
first_padded_seq = pad_seqs(first_example_sequence, max([len(x) for x in first_example_sequence]))
first_padded_seq

array([[  0,   0,   0,   0,  34, 417],
       [  0,   0,   0,  34, 417, 877],
       [  0,   0,  34, 417, 877, 166],
       [  0,  34, 417, 877, 166, 213],
       [ 34, 417, 877, 166, 213, 517]], dtype=int32)

In [8]:
#Pad the entire corpus
input_sequences = pad_seqs(input_sequences, max_sequence_len)
print(f"padded corpus has shape: {input_sequences.shape}")

padded corpus has shape: (15462, 11)


In [9]:
#Create a function to split the data into features and labels
def features_and_labels(input_sequences, total_words):
    features = input_sequences[:,:-1]
    labels = input_sequences[:,-1]
    one_hot_labels = to_categorical(labels, num_classes=total_words)
    return features, one_hot_labels

#Test the function with a sample padded n_grams_seq
first_features, first_labels = features_and_labels(first_padded_seq, total_words)

print(f"labels have shape: {first_labels.shape}")
print("\nfeatures look like this:\n")
first_features

labels have shape: (5, 3211)

features look like this:



array([[  0,   0,   0,   0,  34],
       [  0,   0,   0,  34, 417],
       [  0,   0,  34, 417, 877],
       [  0,  34, 417, 877, 166],
       [ 34, 417, 877, 166, 213]], dtype=int32)

In [10]:
#Split the entire corpus
features, labels = features_and_labels(input_sequences, total_words)
print(f"features have shape: {features.shape}")
print(f"labels have shape: {labels.shape}")

features have shape: (15462, 10)
labels have shape: (15462, 3211)


In [14]:
#Create the model and train it
def create_model(total_words, max_sequence_len):
    model = Sequential()
    model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
    model.add(Bidirectional(LSTM(2000,return_sequences=True)))
    model.add(GRU(1000))
    model.add(Dense(1000, activation='relu'))
    model.add(Dense(total_words, activation='softmax'))

    # Compile the model
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

model = create_model(total_words, max_sequence_len)

# Train the model
history = model.fit(features, labels, epochs=30, verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [15]:
seed_text = " Thy warm loving embrace i seek,\n"
sonnet_line = 13
next_words = avg_words_line

for i in range(sonnet_line):
  for j in range(next_words):
      # Convert the text into sequences
      token_list = tokenizer.texts_to_sequences([seed_text])[0]
      # Pad the sequences
      token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
      # Get the probabilities of predicting a word
      predicted = model.predict(token_list, verbose=0)
      # Choose the next word based on the maximum probability
      predicted = np.argmax(predicted, axis=-1).item()
      # Get the actual word from the word index
      output_word = tokenizer.index_word[predicted]
      # Append to the current text
      seed_text += " " + output_word
      if i ==sonnet_line-1:
        seed_text = seed_text + "."
      elif j == next_words-1:
        seed_text = seed_text + ",\n"
seed_text = str(seed_text)
print(seed_text)

 Thy warm loving embrace i seek,
 which doth lie to my friend one happy,
 grow sad glory be old burn'd hold nature's,
 woe tell o'er store me disgrace and some,
 winters cold decay out even even in hue,
 night grow mother woe now lend me held,
 quite back again assured eyes can see such,
 dearest spite of thine eyes dote women's pleasure,
 had stol'n of both men prove scorn to,
 change my desire of her scope can sing,
 even by away eyes see not some stand,
 can be broken told i none buried age,
 would cover both one night grow have done,
 so. deem'd. another. pleasure. change. you. be. a.
