In [1]:
import tensorflow as tf
import tensorflow.keras.backend as K
import numpy as np
from tensorflow.keras import layers
from tensorflow.keras import layers, Model
import os
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import re

In [2]:
# Mini-batch size; reused when adapting the vectorizer and when batching the training set.
batch_size = 64
# Line-oriented dataset over the corpus file: each element is one line of text.
raw_data_ds = tf.data.TextLineDataset(["en_corpus.txt"])

In [3]:
# Collect the whole corpus into a single Python string.
# "".join builds the result in one pass instead of re-copying the accumulated
# text on every loop iteration.
# Lines are joined with no separator, so the last word of one line runs
# straight into the first word of the next.
# NOTE(review): if en_corpus.txt holds one sentence per line, a " " separator
# is probably wanted here — confirm against the corpus before changing.
text = "".join(elem.numpy().decode('utf-8') for elem in raw_data_ds)

In [4]:
maxlen = 24  # number of characters in each input window
step = 3  # stride between the starts of consecutive windows
input_chars = []  # input windows of `maxlen` characters
next_char = []  # the single character that follows each window (prediction target)

In [5]:
# Slide a window of `maxlen` characters over the corpus, advancing `step`
# characters at a time; the target is the single character right after the
# window. Both lists are rebuilt from scratch (rather than appended to) so
# re-running this cell does not duplicate the training examples.
window_starts = range(0, len(text) - maxlen, step)
input_chars = [text[start : start + maxlen] for start in window_starts]
next_char = [text[start + maxlen] for start in window_starts]

In [6]:
# Report how many (window, target) pairs exist and preview the first five.
print("Number of sequences:", len(input_chars))
print("input X  (input_chars)  --->   output y (next_char) ")

for idx in range(5):
    window, target = input_chars[idx], next_char[idx]
    print(window, "   --->  ", target)

Number of sequences: 636413
input X  (input_chars)  --->   output y (next_char) 
He sees me. Charlie drop    --->   p
sees me. Charlie dropped    --->    
s me. Charlie dropped to    --->    
e. Charlie dropped to he    --->   r
Charlie dropped to her h    --->   a


In [7]:
# Wrap the Python lists as tf.data datasets: one string element per window / per target.
X_train_ds_raw=tf.data.Dataset.from_tensor_slices(input_chars)
y_train_ds_raw=tf.data.Dataset.from_tensor_slices(next_char)

In [8]:
def custom_standardization(input_data):
    """Normalize raw text before it is split into tokens.

    Steps, in order: lowercase the text, replace the literal ``<br />`` with a
    space, replace every digit and hyphen with a space, then delete all
    characters in ``string.punctuation``.

    Args:
        input_data: String tensor (supplied by the ``TextVectorization`` layer
            that uses this function as its ``standardize`` callable).

    Returns:
        The string tensor with the substitutions applied.
    """
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    # Raw string: "\d" inside a normal literal is an invalid escape sequence
    # (a warning on current Python versions, slated to become an error).
    # The regex itself is unchanged.
    stripped_num = tf.strings.regex_replace(stripped_html, r"[\d-]", " ")
    stripped_punc = tf.strings.regex_replace(
        stripped_num, "[%s]" % re.escape(string.punctuation), ""
    )
    return stripped_punc

def char_split(input_data):
    """Split each input string into its individual UTF-8 characters."""
    characters = tf.strings.unicode_split(input_data, 'UTF-8')
    return characters

def word_split(input_data):
    """Split each input string into tokens using ``tf.strings.split`` defaults."""
    words = tf.strings.split(input_data)
    return words

In [9]:
# Hyperparameters shared by the vectorization layer and the model.
max_features = 96           # Upper bound on vocabulary size (max_tokens); also the embedding input size and output layer width
embedding_dim = 16             # Embedding layer output dimension
sequence_length = maxlen       # Input sequence size (same as the window length)

In [10]:
# Text-to-integer layer: applies custom_standardization, splits the result
# into characters, and emits integer ids with a fixed output length of
# sequence_length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    split=char_split, # word_split or char_split
    output_mode="int",
    output_sequence_length=sequence_length,
)

In [11]:
# Build the layer's vocabulary from the training input windows, fed in batches.
vectorize_layer.adapt(X_train_ds_raw.batch(batch_size))

In [12]:
# Report how many entries the adapted vocabulary contains.
print("The size of the vocabulary (number of distinct characters): ", len(vectorize_layer.get_vocabulary()))

The size of the vocabulary (number of distinct characters):  42


In [13]:
def vectorize_text(text):
    """Convert raw text into integer token ids with ``vectorize_layer``.

    A trailing axis is added before the layer is called, and every size-1
    axis is removed from the layer's output before it is returned.
    """
    column = tf.expand_dims(text, -1)
    token_ids = vectorize_layer(column)
    return tf.squeeze(token_ids)

In [14]:
# Vectorize the data.
X_train_ds = X_train_ds_raw.map(vectorize_text)
y_train_ds = y_train_ds_raw.map(vectorize_text)

X_train_ds.element_spec, y_train_ds.element_spec

(TensorSpec(shape=(24,), dtype=tf.int64, name=None),
 TensorSpec(shape=(24,), dtype=tf.int64, name=None))

In [15]:
# Each target was vectorized into a full-length id vector; only its first id
# is the actual next character. Derive the result from y_train_ds_raw (not
# from y_train_ds itself) so re-running this cell does not try to index an
# already-reduced scalar.
y_train_ds = y_train_ds_raw.map(vectorize_text).map(lambda token_ids: token_ids[0])

In [16]:
# Pair each vectorized input window with its target id as (X, y) elements.
train_ds =  tf.data.Dataset.zip((X_train_ds,y_train_ds))

In [17]:
AUTOTUNE = tf.data.AUTOTUNE
# Order matters: cache() must come before shuffle(). With cache() placed after
# shuffle()/batch(), the first epoch's shuffled batches are stored and then
# replayed unchanged in every later epoch, so the data is shuffled only once.
# Caching the unshuffled elements keeps the vectorization work cached while
# letting shuffle() draw a new order each epoch.
# The pipeline is rebuilt from the zipped (X, y) datasets so that re-running
# this cell does not batch an already-batched dataset.
train_ds = (
    tf.data.Dataset.zip((X_train_ds, y_train_ds))
    .cache()
    .shuffle(buffer_size=512)
    .batch(batch_size, drop_remainder=True)
    .prefetch(buffer_size=AUTOTUNE)
)

In [18]:
# Peek at one batch to confirm the batched input / target shapes.
for X_batch, y_batch in train_ds.take(1):
    x_shape = X_batch.numpy().shape
    y_shape = y_batch.numpy().shape
    print("input (X) dimension: ", x_shape, "\noutput (y) dimension: ", y_shape)

input (X) dimension:  (64, 24) 
output (y) dimension:  (64,)


In [19]:
# Show the first encoded (input, target) pair of one batch as raw token ids.
for X_batch, y_batch in train_ds.take(1):
    first_input = X_batch[0].numpy()
    first_target = y_batch[0].numpy()
    print("input (sequence of chars): ", first_input, "\noutput (next char to complete the input): ", first_target)

input (sequence of chars):  [ 9  7  3 12  2  5 16  5  8 10  9  4  2  4  7  3  2 17  5 13 13  2  5  0] 
output (next char to complete the input):  10


In [20]:
def decode_sequence (encoded_sequence):
    """Turn a sequence of token ids back into text, print it, and return it.

    Args:
        encoded_sequence: Iterable of integer token ids that index into
            ``vectorize_layer.get_vocabulary()``.

    Returns:
        The decoded string. The same string is also printed, preceded by a
        tab, because callers rely on this function for display.
    """
    # Look the vocabulary up once instead of calling get_vocabulary() again
    # for every token in the sequence.
    vocabulary = vectorize_layer.get_vocabulary()
    sequence = ''.join(vocabulary[token] for token in encoded_sequence)
    print("\t", sequence)
    return sequence

In [21]:
# Show the first (input, target) pair of one batch decoded back to characters.
for X_batch, y_batch in train_ds.take(1):
    decoded_input = decode_sequence (X_batch[0].numpy())
    decoded_target = vectorize_layer.get_vocabulary()[y_batch[0].numpy()]
    print("input (sequence of chars): ", decoded_input, "\noutput (next char to complete the input): ", decoded_target)

	 farthest console rocked 
input (sequence of chars):  farthest console rocked  
output (next char to complete the input):  b


In [22]:
def softmax(z):
    """Numerically stable softmax along the first axis.

    Args:
        z: Array-like of scores (logits). For a 1-D input this is the usual
            softmax over all entries.

    Returns:
        ``numpy.ndarray`` with the same shape as ``z`` whose entries along
        the first axis are non-negative and sum to 1.
    """
    z = np.asarray(z)
    # Subtracting the maximum leaves the result mathematically unchanged but
    # keeps exp() from overflowing to inf (and the ratio from becoming NaN)
    # when the scores are large.
    shifted = z - np.max(z, axis=0, keepdims=True)
    exp_z = np.exp(shifted)
    return exp_z / np.sum(exp_z, axis=0, keepdims=True)

In [23]:
def temperature_sampling (conditional_probability, temperature=1.0):
    """Sample a token index from a temperature-reweighted distribution.

    The probabilities are reweighted as ``softmax(log(p) / temperature)`` and
    one index is drawn from the result.

    Args:
        conditional_probability: 1-D array-like of next-token probabilities.
        temperature: Reweighting factor. Values below 1 sharpen the
            distribution and values above 1 flatten it. ``0`` is treated as
            greedy decoding and returns the most probable index.

    Returns:
        Integer index of the selected token.
    """
    conditional_probability = np.asarray(conditional_probability).astype("float64")
    if temperature == 0:
        # log(p) / 0 would produce inf/NaN weights and make the multinomial
        # draw fail; the limit of the reweighting as temperature -> 0 is the
        # argmax, so return it directly.
        return np.argmax(conditional_probability)
    # log(0) = -inf is intended here (a zero probability stays zero after the
    # softmax), so silence the divide-by-zero warning it would raise.
    with np.errstate(divide="ignore"):
        scaled_log_probability = np.log(conditional_probability) / temperature
    reweighted_conditional_probability = softmax(scaled_log_probability)
    # One multinomial trial yields a one-hot row; its argmax is the drawn index.
    probas = np.random.multinomial(1, reweighted_conditional_probability, 1)
    return np.argmax(probas)

In [24]:
# Character-level next-token model: token ids -> embedding -> three stacked
# LSTMs -> flatten -> softmax over max_features token ids.
# `shape` must be a tuple: `(sequence_length)` is just the integer itself,
# which only works where Keras happens to accept a bare int.
inputs = tf.keras.Input(shape=(sequence_length,), dtype="int64")
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.1)(x)
# NOTE(review): only this first LSTM disables its bias; confirm that is intentional.
x = layers.LSTM(128, use_bias=False, return_sequences=True)(x)
x = layers.Dropout(0.1)(x)
x = layers.LSTM(128, return_sequences=True)(x)
# The last LSTM still returns the full sequence; Flatten concatenates every
# time step's output before the classifier.
x = layers.LSTM(64, return_sequences=True)(x)
x = layers.Flatten()(x)
predictions=  layers.Dense(max_features, activation='softmax')(x)
model = tf.keras.Model(inputs, predictions,name="model")

In [25]:
# Targets are integer token ids, hence the sparse categorical cross-entropy loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='RMSprop', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" to the output.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 24)]              0         
                                                                 
 embedding (Embedding)       (None, 24, 16)            1536      
                                                                 
 dropout (Dropout)           (None, 24, 16)            0         
                                                                 
 lstm (LSTM)                 (None, 24, 128)           73728     
                                                                 


 dropout_1 (Dropout)         (None, 24, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 24, 128)           131584    
                                                                 
 lstm_2 (LSTM)               (None, 24, 64)            49408     
                                                                 
 flatten (Flatten)           (None, 1536)              0         
                                                                 
 dense (Dense)               (None, 96)                147552    
                                                                 
Total params: 403808 (1.54 MB)
Trainable params: 403808 (1.54 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [26]:
# Keep the returned History object so the per-epoch loss/accuracy can be
# inspected or plotted afterwards; assigning it also stops the bare
# `<History ...>` repr from being shown as the cell output.
history = model.fit(train_ds, epochs=25)

Epoch 1/25


Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.src.callbacks.History at 0x1d493e1b390>

In [27]:
# Persist the trained model to the file model.keras.
model.save('model.keras')

In [28]:
def generate_text(model, seed_original, step, temperatures=None):
    """Print text generated from a prompt, once per sampling temperature.

    Args:
        model: Trained model. ``model.predict`` is called on an array of
            shape ``(1, sequence_length)`` holding token ids, and its output
            is used as the next-token probabilities.
        seed_original: Prompt string.
        step: Number of characters to generate for each temperature.
        temperatures: Iterable of sampling temperatures. ``None`` (the
            default) means no temperatures, so only the prompt is printed.

    Returns:
        None. The prompt and each generated text are printed through
        ``decode_sequence``.
    """
    # A None default avoids sharing one mutable list across calls.
    if temperatures is None:
        temperatures = ()

    # Vectorize the prompt once; the result is identical on every call, so
    # there is no need to recompute it for each temperature.
    # NOTE(review): a prompt shorter than sequence_length is padded by the
    # vectorizer, so the first windows fed to the model contain padding ids
    # between the prompt and the generated characters. That likely explains
    # noisy characters right after the prompt — consider aligning the prompt
    # to the end of the window; confirm before changing.
    prompt_ids = vectorize_text(seed_original).numpy().reshape(-1)
    print("The prompt is:",end='')
    decode_sequence(prompt_ids)

    #Text Generated by Temperature Sampling
    print("Text Generated by Temperature Sampling:")
    for temperature in temperatures:
        print("\ttemperature: ", temperature)
        generated_temperature = prompt_ids.copy()
        for _ in range(step):
            # The model always sees the most recent sequence_length ids.
            seed = generated_temperature[-sequence_length:].reshape(1, sequence_length)
            # verbose=0: otherwise predict() prints a progress bar for every
            # single generated character.
            predictions = model.predict(seed, verbose=0)
            next_index = temperature_sampling(predictions.squeeze(), temperature)
            generated_temperature = np.append(generated_temperature, next_index)
        print("Output :",end="")
        decode_sequence(generated_temperature)

In [29]:
# Generate 100 characters from the prompt at a low (conservative) temperature.
generate_text(model, "charlie ", step=100, temperatures=[0.2])

The prompt is:	 charlie 
Text Generated by Temperature Sampling:
	temperature:  0.2


Output :	 charlie xxjjjbpllovvvv    voaav’vybccarcov’kx just in the party from the door the side of
