In [1]:
###-----------------
### Import Libraries
###-----------------;

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from collections.abc import Callable
from typing import Literal

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler


import warnings as w
w.filterwarnings('ignore')
%matplotlib inline

2023-12-02 11:56:52.995359: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-02 11:56:53.046724: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-02 11:56:53.046758: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-02 11:56:53.047856: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-02 11:56:53.055257: I tensorflow/core/platform/cpu_feature_guar

In [2]:
###----------------
### Some parameters
###----------------

inpDir = '../input'
outDir = '../output'
modelDir = '../model'
subDir = '../subdir'

RANDOM_STATE = 24 # REMEMBER: to remove at the time of promotion to production
np.random.seed(RANDOM_STATE) # Set Random Seed for reproducible  results

EPOCHS = 10 # number of epochs
ALPHA = 0.01 # learning rate
NUM_SAMPLES = 1280 # How many samples we want to generate
NOISE = 0.2 # Noise to be introduced in the data
TEST_SIZE = 0.3
BATCH_SIZE = 32
TRAIN_SIZE = 14496

# parameters for Matplotlib
params = {'legend.fontsize': 'x-large',
          'figure.figsize': (15, 8),
          'axes.labelsize': 'x-large',
          'axes.titlesize':'x-large',
          'xtick.labelsize':'x-large',
          'ytick.labelsize':'x-large'
         }

CMAP = 'coolwarm' # plt.cm.Spectral

plt.rcParams.update(params)

In [3]:
text = open('../DNN/DNN_Sep2023/input/shakespeare.txt','rb').read().decode(encoding='utf-8')
len(text)

1115395

In [4]:
text[:300] # character in sentence 

"First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us"

In [5]:
## To Generate Vocanulary of Unique Characters 

In [6]:
vocab = sorted(set(text))
len(vocab)

65

In [7]:
vocab[:10]

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3']

In [8]:
char2idx = {u:i for i,u in enumerate(vocab)}
char2idx

{'\n': 0,
 ' ': 1,
 '!': 2,
 '$': 3,
 '&': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '3': 9,
 ':': 10,
 ';': 11,
 '?': 12,
 'A': 13,
 'B': 14,
 'C': 15,
 'D': 16,
 'E': 17,
 'F': 18,
 'G': 19,
 'H': 20,
 'I': 21,
 'J': 22,
 'K': 23,
 'L': 24,
 'M': 25,
 'N': 26,
 'O': 27,
 'P': 28,
 'Q': 29,
 'R': 30,
 'S': 31,
 'T': 32,
 'U': 33,
 'V': 34,
 'W': 35,
 'X': 36,
 'Y': 37,
 'Z': 38,
 'a': 39,
 'b': 40,
 'c': 41,
 'd': 42,
 'e': 43,
 'f': 44,
 'g': 45,
 'h': 46,
 'i': 47,
 'j': 48,
 'k': 49,
 'l': 50,
 'm': 51,
 'n': 52,
 'o': 53,
 'p': 54,
 'q': 55,
 'r': 56,
 's': 57,
 't': 58,
 'u': 59,
 'v': 60,
 'w': 61,
 'x': 62,
 'y': 63,
 'z': 64}

In [9]:
idx2char = np.array(vocab)
idx2char

array(['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?',
       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'],
      dtype='<U1')

In [10]:
# using made dictionary above to convert char to int 

text_as_int = np.array([char2idx[c] for c in text])
text_as_int.shape

(1115395,)

In [11]:
text_as_int

array([18, 47, 56, ...,  8,  0,  0])

In [12]:
text[:10]

'First Citi'

In [13]:
idx2char[18],idx2char[47],

('F', 'i')

In [14]:
# converting input_int_data to tensor 
dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
print(list(dataset.as_numpy_iterator()))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [15]:
seq_length = 100
example_per_epoch = len(text) // (seq_length + 1)
example_per_epoch

11043

In [16]:
for i in dataset.take(10):
    print(i.numpy(),'|',idx2char[i.numpy()])

18 | F
47 | i
56 | r
57 | s
58 | t
1 |  
15 | C
47 | i
58 | t
47 | i


In [17]:
sequences = dataset.batch(seq_length+1,drop_remainder=True) # convert to batch 
for item in sequences.take(2):
    print(item)
    print('\n')
    print(repr(''.join(idx2char[item.numpy()] ) ) ) # text index values and convert to char 
    print('\n')

tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1], shape=(101,), dtype=int64)


'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


tf.Tensor(
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1 49], shape=(101,), dtype=int64)


'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'




In [18]:
def fn_split_X_y(seq): # bring in sequence of length 101 
    
    input_text = seq[:-1] # input is first 100 char 
    output_text = seq[1:] # output is last 100 char 
    
    return input_text,output_text

dataset = sequences.map(fn_split_X_y)
dataset

<_MapDataset element_spec=(TensorSpec(shape=(100,), dtype=tf.int64, name=None), TensorSpec(shape=(100,), dtype=tf.int64, name=None))>

In [19]:
for X,y in dataset.take (2):
    print(repr(''.join(idx2char[X.numpy()] ) ) ) # X data 
    print(repr(''.join(idx2char[y.numpy()] ) ) ) # y data
    print('_'*100)
    
    # input is strating with first char 
    # output is starting with second char 
    #------------------
    # input is strating with first char 
    # output is starting with second char 
    

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
____________________________________________________________________________________________________
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you '
're all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
____________________________________________________________________________________________________


In [20]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64 
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE,drop_remainder=True)
dataset  

<_BatchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [21]:
vocab_size = len(vocab)

embedding_dim = 256

rnn_units = 1024 

# Model Building 

"""

1.build model using tf.keras 
2.Embedding to reduce diamensionality of input 
3.2nd hidden is GPU Unit 

"""


def build_model(vocab_size,
                embedding_dim,
               rnn_unit,
               batch_size = BATCH_SIZE):
    
    model = tf.keras.Sequential([
        
        # Embedding will reduce vocab size to 256 
        
        tf.keras.layers.Embedding(vocab_size,
                                  embedding_dim,
                                 batch_input_shape = (batch_size,None)),
        
        # GRU unit are used to sequence prediction problem 
        
        tf.keras.layers.GRU(rnn_units,
                            return_sequences=True,
                           stateful=True,
                           recurrent_initializer='glorot_uniform'),
        
        
        
        tf.keras.layers.Dense(vocab_size)
        
    ])
    
    return model 
    
model = build_model(vocab_size,
                   embedding_dim,
                   rnn_units)


In [22]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           16640     
                                                                 
 gru (GRU)                   (64, None, 1024)          3938304   
                                                                 
 dense (Dense)               (64, None, 65)            66625     
                                                                 
Total params: 4021569 (15.34 MB)
Trainable params: 4021569 (15.34 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [23]:
# prediction without training (i.e we haven't used tf.compile )

for X,y in dataset.take(2):
    y_pred = model(X)

In [24]:
y_pred.shape

TensorShape([64, 100, 65])

In [28]:
loss_fn = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam',loss=loss_fn)

In [29]:
cktPtPath = os.path.join(modelDir,subDir)

chkPtPrefix = os.path.join(cktPtPath,'chkpt_(epoch)')

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=chkPtPrefix,
                                                         save_weights_only=True)

In [30]:
hist = model.fit(dataset,epochs=EPOCHS,callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [34]:
tf.train.latest_checkpoint(cktPtPath)

'../model/../subdir/chkpt_(epoch)'

In [38]:
model = build_model(vocab_size,
                    embedding_dim,
                    rnn_units,
                    batch_size=1)


model.load_weights(tf.train.latest_checkpoint(cktPtPath))
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (1, None, 256)            16640     
                                                                 
 gru_3 (GRU)                 (1, None, 1024)           3938304   
                                                                 
 dense_3 (Dense)             (1, None, 65)             66625     
                                                                 
Total params: 4021569 (15.34 MB)
Trainable params: 4021569 (15.34 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [39]:
model.build(tf.TensorShape([1,None] ) ) 
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (1, None, 256)            16640     
                                                                 
 gru_3 (GRU)                 (1, None, 1024)           3938304   
                                                                 
 dense_3 (Dense)             (1, None, 65)             66625     
                                                                 
Total params: 4021569 (15.34 MB)
Trainable params: 4021569 (15.34 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [49]:
def gen_text(model,start_string):
    num_generate  = 10000
    
    input_eval = [char2idx[c] for c in start_string]
    
    print(f'Input :{start_string} | {input_eval}')
    
    input_eval = tf.expand_dims(input_eval,0)
    
    text_generated = []
    
    temperature = 1,0
    model.reset_states()
    
    for i in range(num_generate):
        
        prediction = model(input_eval)
        
        prediction = tf.squeeze(prediction,0)
        
        prediction_td = tf.random.categorical(prediction,
                                              num_samples=1)[-1,0].numpy()
        
        input_eval= tf.expand_dims([prediction_td],0)
        text_generated.append(idx2char[prediction_td])
        
    return start_string + ''.join(text_generated) 
        

In [50]:
print(gen_text(model,start_string='ROMEO:'))

Input :ROMEO: | [30, 27, 25, 17, 27, 10]
ROMEO:
Hardy and I know no furnish'd king in good temporan.
Meethought the queen than twn to his men:
Harry! Believe me I could sell that
I not thanks. But! what art thou unst?
My prince, Climase! and he look'd ask'd, how or we pick?

BUCKINGHAM:
Be record in sight;
But shall we so apoar him with unhorimany,
You had been been, that did leave him whither
About with our saffle sentenced with the hose would have heard
Than be together, and Edward be long a name
To Norfold.

TYRRELL:
'Twould you never seem and lies up;
Set, would have here so do due love to Rome.

LUCETHBY BOLINGBROKE:
Nay, give as me to me.
Were forced, Turn lanus, whose vile ways free arrivy with an end,
Thought's away from which he childers knew;
Do't artations by me that would
But every up pale can bale part, whose guilty hands. Buy, as I will full of idle wings:
Commend me not a king!

ANGELO:
Nay, come, gentlemen, alone.
Thou shouldst slisy, and leave up this joy
How the lady 