In [2]:
# Importações necessárias
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Embedding, Bidirectional, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np

In [3]:
tokenizer = Tokenizer() # Instanciando o tokenizer
# Carregamento do arquivo teste
with open("dancing_queen.txt", "r") as f:
    texto = f.read()
texto = texto.lower().split("\n") # Separa pelas quebras de linha e deixa as letras maiusculas minusculas
texto

['you can dance, you can jive',
 'having the time of your life',
 'ooh, see that girl, watch that scene',
 'digging the dancing queen',
 '',
 'friday night and the lights are low',
 'looking out for a place to go',
 'where they play the right music',
 'getting in the swing',
 'you come to look for a king',
 '',
 'anybody could be that guy',
 "night is young and the music's high",
 'with a bit of rock music',
 'everything is fine',
 "you're in the mood for a dance",
 'and when you get the chance',
 '',
 'you are the dancing queen',
 'young and sweet, only seventeen',
 'dancing queen',
 'feel the beat from the tambourine, oh, yeah',
 '',
 'you can dance, you can jive',
 'having the time of your life',
 'ooh, see that girl, watch that scene',
 'digging the dancing queen',
 '',
 "you're a teaser, you turn them on",
 'leave them burning and then you are gone',
 'looking out for another, anyone will do',
 "you're in the mood for a dance",
 'and when you get the chance',
 '',
 'you are the da

In [4]:
tokenizer.fit_on_texts(texto) # Aplica o tokenizer ao texto
sequences = []
for line in texto:
    token = tokenizer.texts_to_sequences([line])[0]# Cria uma lista de tokens para cada linha
    for i in range(1, len(token)):
        sequences.append(token[:i:+1])# Pega os valores até i+1
sequences


[[2],
 [2, 7],
 [2, 7, 9],
 [2, 7, 9, 2],
 [2, 7, 9, 2, 7],
 [15],
 [15, 1],
 [15, 1, 16],
 [15, 1, 16, 11],
 [15, 1, 16, 11, 17],
 [19],
 [19, 20],
 [19, 20, 5],
 [19, 20, 5, 21],
 [19, 20, 5, 21, 22],
 [19, 20, 5, 21, 22, 5],
 [12],
 [12, 1],
 [12, 1, 3],
 [47],
 [47, 27],
 [47, 27, 6],
 [47, 27, 6, 1],
 [47, 27, 6, 1, 48],
 [47, 27, 6, 1, 48, 13],
 [28],
 [28, 29],
 [28, 29, 10],
 [28, 29, 10, 8],
 [28, 29, 10, 8, 50],
 [28, 29, 10, 8, 50, 30],
 [52],
 [52, 53],
 [52, 53, 54],
 [52, 53, 54, 1],
 [52, 53, 54, 1, 55],
 [56],
 [56, 24],
 [56, 24, 1],
 [2],
 [2, 58],
 [2, 58, 30],
 [2, 58, 30, 59],
 [2, 58, 30, 59, 10],
 [2, 58, 30, 59, 10, 8],
 [61],
 [61, 62],
 [61, 62, 63],
 [61, 62, 63, 5],
 [27],
 [27, 32],
 [27, 32, 25],
 [27, 32, 25, 6],
 [27, 32, 25, 6, 1],
 [27, 32, 25, 6, 1, 65],
 [67],
 [67, 8],
 [67, 8, 68],
 [67, 8, 68, 11],
 [67, 8, 68, 11, 69],
 [70],
 [70, 32],
 [26],
 [26, 24],
 [26, 24, 1],
 [26, 24, 1, 33],
 [26, 24, 1, 33, 10],
 [26, 24, 1, 33, 10, 8],
 [6],
 [6, 34]

In [5]:
max_len = max([len(x) for x in sequences]) # Pega a maior sequencia 
sequences = np.array(pad_sequences(sequences, maxlen = max_len, padding='pre')) # Adiciona zeros as esqueradas para todos ficarem com o mesmo tamanho

In [6]:
npalavras = len(tokenizer.word_index) +  1
xs = sequences[:, :-1]
labels = sequences[:, -1]
ys = tf.keras.utils.to_categorical(labels, num_classes=npalavras)

In [10]:
model = Sequential() # Cria um modelo sequencial
model.add(Embedding(npalavras, 240, input_length=max_len-1)) # Camada de embedding
model.add(Bidirectional(LSTM(150))) # camada capaz de aprender dependencias a longo prazo
model.add(Dense(npalavras, activation='softmax')) # Camada densa 
adam = Adam(learning_rate=0.01) # Otimizado com a taxa de apredizado personalizada
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy']) # Compilação
history = model.fit(xs, ys, epochs=100, verbose=1) # treinamento

Epoch 1/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 27ms/step - accuracy: 0.0764 - loss: 4.2737
Epoch 2/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.1545 - loss: 3.7050
Epoch 3/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.3163 - loss: 2.9045
Epoch 4/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.3475 - loss: 2.3571
Epoch 5/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.5799 - loss: 1.4779
Epoch 6/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.6870 - loss: 1.2297
Epoch 7/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.7230 - loss: 0.9970
Epoch 8/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.7323 - loss: 0.9212
Epoch 9/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

In [11]:
seed_text = "Just a test" # Frase inicial
next_words = 20 # Número de palavras a serem escolhidas
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0) # Lista com probabilidades de cada palavra
    predicted_class = predicted.argmax(axis=-1) # Pega a palavra com maior probabilidade
    output_word = ''
    for word, index in tokenizer.word_index.items():
        if index == predicted_class:
            output_word = word
            break
    seed_text+=" " + output_word # Monta a frase
print(seed_text)

Just a test they play the right right right tambourine oh oh them music's music's        
