In [6]:
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import SimpleRNN
from keras.layers import Embedding

In [7]:
# source text
data="""The Avengers aren't available at the moment, 
but your call is very important to them. 
Instead, this summer, there will be a new superhero team-up – 
between Bucky Barnes, Yelena Belova, Red Guardian, Ghost, Taskmaster, and John Walker – to fight a new-to-us supervillain who hasn’t yet been
established in the MCU (but who is well known to comic-book readers)."""

In [8]:
# integer encode text
tokenizer=Tokenizer()
tokenizer.fit_on_texts([data])
encoded_data=tokenizer.texts_to_sequences([data])[0]
encoded_data

[2,
 9,
 10,
 11,
 12,
 2,
 13,
 3,
 14,
 15,
 4,
 16,
 17,
 1,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 5,
 6,
 25,
 26,
 27,
 7,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 7,
 1,
 40,
 5,
 6,
 1,
 41,
 42,
 8,
 43,
 44,
 45,
 46,
 47,
 2,
 48,
 3,
 8,
 4,
 49,
 50,
 1,
 51,
 52,
 53]

In [9]:
# determine the vocabulary size
vocab_size= len(tokenizer.word_index)+1 # 0 is reserved for padding so that's why we added 1

print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 54


# Next, we need to create sequences of words to fit the model with one word as input and one word as output

In [10]:
# create word -> word sequences
sequences = list()
for i in range(1, len(encoded_data)):
    sequence=encoded_data[i-1:i+1]
    sequences.append(sequence)
print("Total Sequences: %d" % len(sequences))
# Split into X and y elements


Total Sequences: 63


# Running this piece shows that we have a total of 34 input-output pairs to train the network

In [11]:
sequences
#sequences[:5] # [input, output]

[[2, 9],
 [9, 10],
 [10, 11],
 [11, 12],
 [12, 2],
 [2, 13],
 [13, 3],
 [3, 14],
 [14, 15],
 [15, 4],
 [4, 16],
 [16, 17],
 [17, 1],
 [1, 18],
 [18, 19],
 [19, 20],
 [20, 21],
 [21, 22],
 [22, 23],
 [23, 24],
 [24, 5],
 [5, 6],
 [6, 25],
 [25, 26],
 [26, 27],
 [27, 7],
 [7, 28],
 [28, 29],
 [29, 30],
 [30, 31],
 [31, 32],
 [32, 33],
 [33, 34],
 [34, 35],
 [35, 36],
 [36, 37],
 [37, 38],
 [38, 39],
 [39, 7],
 [7, 1],
 [1, 40],
 [40, 5],
 [5, 6],
 [6, 1],
 [1, 41],
 [41, 42],
 [42, 8],
 [8, 43],
 [43, 44],
 [44, 45],
 [45, 46],
 [46, 47],
 [47, 2],
 [2, 48],
 [48, 3],
 [3, 8],
 [8, 4],
 [4, 49],
 [49, 50],
 [50, 1],
 [1, 51],
 [51, 52],
 [52, 53]]

# We can split the sequences into input (X) and output elements (y)

In [12]:
sequences=array(sequences)
X,y =sequences[:,0],sequences[:,1]

In [13]:
# one hot encode outputs
X= to_categorical(y, num_classes=vocab_size)
# define model
X[:5]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
    

In [14]:
# one hot encode outputs
y= to_categorical(y, num_classes=vocab_size)
# define model
y[:5]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
    

In [20]:
model=Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(SimpleRNN(50))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

None


In [22]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [23]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [24]:
# compile netrwork
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [25]:
# fit network
model.fit(X,y,epochs=10)

Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - accuracy: 0.0106 - loss: 3.9956  
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.0316 - loss: 3.9798
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.0843 - loss: 3.9716
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.1262 - loss: 3.9512
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.0630 - loss: 3.9298
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.0420 - loss: 3.9072
Epoch 7/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.0949 - loss: 3.8868
Epoch 8/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.1263 - loss: 3.8359
Epoch 9/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<keras.src.callbacks.history.History at 0x295ba1bcad0>

In [26]:
# generate a sequence from the model
def generate_seq(model, tokenizer, enter_text, n_pred):
    in_text, result=enter_text, enter_text
    # generate a fixed number of words
    for _ in range(n_pred):
        
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        encoded=array(encoded)
        
        encoded = encoded.reshape(1, -1)
        
        # predict a word in the vocabulary
        yhat=model.predict(encoded)
        yhat = yhat.argmax(axis=-1)[0]
        
        # map predicted word index to word
        out_word=''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word=word
                break
        # append to input
        in_text, result=out_word, result + ' ' + out_word
    return result

In [29]:
# evaluate
print(generate_seq(model,tokenizer, 'bucky',6))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
bucky but but but but but but


In [28]:
print(generate_seq(model,tokenizer, "The",6))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
The this – – – – –
