In [1]:
import tensorflow as tf
import numpy as np
import os
import time

In [2]:
# Fetch the Shakespeare dataset from the URL and keep the returned local file path
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [3]:
# Read the file as raw bytes and decode the content to a str using UTF-8.
# The `with` block closes the file handle as soon as the read finishes instead
# of leaving an open handle behind for the garbage collector.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# Report the number of characters in the text
print(f'Length of text: {len(text)} characters')

Length of text: 1115394 characters


In [4]:
# Print the first 250 characters of the text
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [5]:
# Find the distinct characters that occur in the text
vocab = sorted(set(text)) # set() removes duplicates; sorted() returns them as an ordered list
print(f'{len(vocab)} unique characters') # report how many distinct characters were found

65 unique characters


In [6]:
example_texts = ['abcdefg', 'xyz'] # sample strings used to demonstrate the processing steps
chars = tf.strings.unicode_split (example_texts, input_encoding='UTF-8') # split each string in example_texts into its characters
chars # display the result of the split

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [7]:
# Create a StringLookup layer whose vocabulary is given by `vocab` (characters -> integer IDs)
ids_from_chars = tf.keras.layers.StringLookup(vocabulary=list(vocab), mask_token=None) 

ids = ids_from_chars(chars) # convert chars into integer IDs using ids_from_chars
ids # display the integer IDs that represent chars

<tf.RaggedTensor [[40, 41, 42, 43, 44, 45, 46], [63, 64, 65]]>

In [8]:
# Create the inverse StringLookup layer (integer IDs -> characters), reusing the vocabulary of ids_from_chars
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

chars = chars_from_ids(ids) # convert ids back into characters using chars_from_ids
chars # display the characters recovered from ids

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [9]:
tf.strings.reduce_join(chars, axis=-1).numpy() # join the characters in chars back into strings along the last axis

array([b'abcdefg', b'xyz'], dtype=object)

In [10]:
# Helper that turns integer IDs back into a string
def text_from_ids(ids):
    """Map integer IDs to characters and join them along the last axis.

    Args:
        ids: integer IDs accepted by the `chars_from_ids` lookup layer.

    Returns:
        The result of `tf.strings.reduce_join(..., axis=-1)` applied to the
        looked-up characters.
    """
    decoded_chars = chars_from_ids(ids)
    return tf.strings.reduce_join(decoded_chars, axis=-1)

# Split the whole text into characters, convert them to IDs, and store the result in all_ids
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
all_ids

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([19, 48, 57, ..., 46,  9,  1])>

In [11]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids) # build a tf.data.Dataset from all_ids

# Show the characters for the first 10 IDs.
# The loop variable is deliberately not named `ids`, so the `ids` tensor defined
# in an earlier cell is not overwritten when this cell runs.
for char_id in ids_dataset.take(10):
    print(chars_from_ids(char_id).numpy().decode('utf-8'))

F
i
r
s
t
 
C
i
t
i


In [12]:
# Desired sequence length
seq_length = 100

# Group the ID stream into chunks of seq_length+1 IDs, dropping an incomplete final chunk.
# The extra ID leaves seq_length items once a chunk is split into input ([:-1]) and target ([1:]).
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

# Show the characters of the first chunk
for seq in sequences.take(1):
  print(chars_from_ids(seq))

tf.Tensor(
[b'F' b'i' b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':'
 b'\n' b'B' b'e' b'f' b'o' b'r' b'e' b' ' b'w' b'e' b' ' b'p' b'r' b'o'
 b'c' b'e' b'e' b'd' b' ' b'a' b'n' b'y' b' ' b'f' b'u' b'r' b't' b'h'
 b'e' b'r' b',' b' ' b'h' b'e' b'a' b'r' b' ' b'm' b'e' b' ' b's' b'p'
 b'e' b'a' b'k' b'.' b'\n' b'\n' b'A' b'l' b'l' b':' b'\n' b'S' b'p' b'e'
 b'a' b'k' b',' b' ' b's' b'p' b'e' b'a' b'k' b'.' b'\n' b'\n' b'F' b'i'
 b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':' b'\n' b'Y'
 b'o' b'u' b' '], shape=(101,), dtype=string)


In [13]:
# Show the text of the first five sequences
for seq in sequences.take(5):
    print(text_from_ids(seq).numpy())

b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
b'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
b"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
b"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
b'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


In [14]:
# Helper that derives the (input, target) pair from one sequence
def split_input_target(sequence):
  """Split a sequence into an input and a target shifted by one position.

  Args:
    sequence: any object that supports slicing.

  Returns:
    A tuple `(sequence[:-1], sequence[1:])`: everything except the last
    item, and everything except the first item.
  """
  return sequence[:-1], sequence[1:]

# Demonstrate the split on a plain Python list. The result is printed explicitly
# because a bare expression in the middle of a cell is evaluated but not displayed.
print(split_input_target(list("Tensorflow")))

dataset = sequences.map(split_input_target) # dataset of (input, target) pairs built from the sequences

# Show one example input and its target from the dataset
for input_example, target_example in dataset.take(1):
  print("Input :", text_from_ids(input_example).numpy())
  print("Target:", text_from_ids(target_example).numpy())

Input : b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target: b'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [15]:
# Batch size
BATCH_SIZE = 64

# Buffer size used when shuffling the dataset
BUFFER_SIZE = 10000

# Shuffle, batch and prefetch the (input, target) pairs.
# The pipeline is rebuilt from `sequences` instead of from `dataset` itself, so
# re-running this cell does not batch an already-batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    # tf.data.AUTOTUNE replaces the deprecated tf.data.experimental.AUTOTUNE alias.
    .prefetch(tf.data.AUTOTUNE))

dataset # display the dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [16]:
# Size of the vocabulary held by the StringLookup layer
# (66 in the recorded run: one more than the 65 unique characters, since [UNK] is included)
vocab_size = len(ids_from_chars.get_vocabulary())

# Dimensionality of the character embedding vectors
embedding_dim = 256

# Number of units in the GRU layer
rnn_units = 1024

# Model class derived from tf.keras.Model
class MyModel(tf.keras.Model):
  """Character-level model: Embedding -> GRU -> Dense.

  The Dense layer has `vocab_size` outputs and no activation, so the model
  returns unnormalized scores (logits) for every position in the input.

  Args:
    vocab_size: number of entries in the vocabulary; used as the embedding
      input size and as the width of the output layer.
    embedding_dim: size of each embedding vector.
    rnn_units: number of units in the GRU layer.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    # The base-class initializer must not receive the instance again:
    # super() already binds `self`, so passing it would hand Model.__init__
    # a stray positional argument.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # return_sequences gives an output per time step; return_state also
    # returns the final state so generation can carry it between calls.
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of token IDs.

    Args:
      inputs: token IDs fed to the embedding layer.
      states: optional GRU state to start from; when None the GRU's own
        initial state is used.
      return_state: when True, also return the GRU state after the call.
      training: forwarded to each layer.

    Returns:
      The Dense output, or a tuple `(output, states)` when `return_state`
      is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      # NOTE(review): this call form matches the Keras version the notebook
      # was run with; newer Keras releases may expect a batch size here -
      # confirm against the installed version.
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

# Create the model object from the class defined above, using the hyperparameters set earlier in this cell
model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [17]:
# Run the model on one example batch from the dataset and check the output shape
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

model.summary() # print a summary of the model architecture

(64, 100, 66) # (batch_size, sequence_length, vocab_size)
Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  16896     
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  67650     
                                                                 
Total params: 4022850 (15.35 MB)
Trainable params: 4022850 (15.35 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [18]:
# Sample one index per time step from the predicted distribution of the first example in the batch
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis and convert to a NumPy array
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

# (A bare `sampled_indices` expression used to sit here; mid-cell it displayed
# nothing, and the decoded predictions are printed below.)

# Show the example input and the sampled next-character predictions
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())

Input:
 b'ding\nThat changeth thus his manners.\n\nCAMILLO:\nI dare not know, my lord.\n\nPOLIXENES:\nHow! dare not! '

Next Char Predictions:
 b"wIDUIH'sOJvM\nr.RVeeJiLUl'3\ns[UNK]AqvZZ:EsJpTNZWS:HdtnlIABv'bwhhcaYKUornRGZ;NgsxnaR[UNK]-d[UNK]PdXO; pamFY--\nGPuY"


In [19]:
# Use SparseCategoricalCrossentropy as the loss function (from_logits=True: the model output is passed in as logits)
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

# Compute the mean loss on the example batch
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)

Prediction shape:  (64, 100, 66)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(4.1894326, shape=(), dtype=float32)


In [20]:
# Exponential of the mean loss (in the recorded run it is close to the vocabulary size of 66)
tf.exp(example_batch_mean_loss).numpy()

65.985344

In [21]:
# Compile the model, specifying the optimizer and the loss function
model.compile(optimizer='adam', loss=loss)

In [24]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'

# Prefix for the checkpoint file names; it contains an {epoch} placeholder
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# ModelCheckpoint callback that saves only the model weights to the checkpoint path
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

EPOCHS = 20 # number of training epochs

# Train the model on the dataset for EPOCHS epochs with the checkpoint callback attached
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [25]:
class OneStep(tf.keras.Model):
  """Wraps a model so text can be generated one character per call.

  Args:
    model: model called with `inputs`, `states` and `return_state=True`.
    chars_from_ids: lookup layer mapping token IDs to characters.
    ids_from_chars: lookup layer mapping characters to token IDs.
    temperature: divisor applied to the logits before sampling.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Build an additive mask that keeps "[UNK]" from ever being sampled.
    # The [:, None] adds the index column that tf.SparseTensor expects.
    unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
    vocabulary_size = len(ids_from_chars.get_vocabulary())
    blocked = tf.SparseTensor(
        indices=unk_ids,
        # One -inf per blocked index.
        values=[-float('inf')] * len(unk_ids),
        # One slot per vocabulary entry.
        dense_shape=[vocabulary_size])
    self.prediction_mask = tf.sparse.to_dense(blocked)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Args:
      inputs: string tensor holding the text generated so far (or a prompt).
      states: model state from the previous step, or None on the first step.

    Returns:
      A tuple `(predicted_chars, states)`: the sampled characters and the
      model state to pass into the next call.
    """
    # Strings -> characters -> token IDs, converted to a dense tensor.
    token_ids = self.ids_from_chars(
        tf.strings.unicode_split(inputs, 'UTF-8')).to_tensor()

    # logits has shape [batch, char, next_char_logits]
    logits, states = self.model(
        inputs=token_ids, states=states, return_state=True)

    # Keep only the last time step and scale it by the temperature.
    last_logits = logits[:, -1, :] / self.temperature
    # Add the mask so "[UNK]" cannot be generated.
    last_logits = last_logits + self.prediction_mask

    # Draw one token ID per batch entry and drop the num_samples axis.
    sampled_ids = tf.squeeze(
        tf.random.categorical(last_logits, num_samples=1), axis=-1)

    # Token IDs -> characters, returned together with the model state.
    return self.chars_from_ids(sampled_ids), states

# Wrap the model and both lookup layers for single-step generation (temperature keeps its default of 1.0)
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [26]:
# Generate 1000 characters from a single prompt and time the run
start = time.time()
states = None # no model state is passed on the first step
next_char = tf.constant(['ROMEO:']) # prompt that seeds the generation
result = [next_char]

# Feed each sampled character and the returned state back into the next step
for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result) # concatenate the prompt and every generated character
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

ROMEO:
The utterance of the luminable and cheques
Rome to me than one coats and beat two more stones
In unewering name myself exactly,
It shall be freely pompetual wit
demanded; what with our ready sorrow?

ANGELO:
You say your days o' the conquerors.

MENENIUS:
The worthy leggards, this endured men sir: you may attend our guests,
Lozed takes, the city begins to die.

KING RICHARD II:
An if what horse is our confess restruity.
First would you have broken from him, dear sin's presence
Through't with that word 'banishe'l'd in one light,--

QUEEN MARGARET:
Go, fool! why, An if she be opentry,' then,
By deep in cellowing were true. Take my meaning
The country's friend. You shall my words
With words that cannot be corression with
with the heavens to the last.

ESCALUS:
ETward with me: beseech you, tell: be calm
His glad to vessel those bastards.

GLOUCESTER:
This island goes had been many-gentle Lad Anny
The childress when the boar will pray their fortune soundly.

Volsce:
I am a quarrel so

In [27]:
# Same generation loop, run on a batch of five identical prompts at once
start = time.time()
states = None # no model state is passed on the first step
next_char = tf.constant(['ROMEO:', 'ROMEO:', 'ROMEO:', 'ROMEO:', 'ROMEO:'])
result = [next_char]

# Feed each batch of sampled characters and the returned state back into the next step
for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result) # element-wise concatenation: one string per prompt
end = time.time()
print(result, '\n\n' + '_'*80) # prints the whole string tensor rather than one decoded entry
print('\nRun time:', end - start)

tf.Tensor(
[b"ROMEO:\nThe own particular shame with slain:\nAnd yet I warrant her; Duly obedience,\nHaving no more of thif; and it had so dear,\nAnd I forefore we speak no more. I'll be miled.\n\nYORK:\nWhat is the news whom thou wert sworn too well?\nI mean, in Vienna, sir.\n\nSEBASTIAN:\nAy, what a miserable hure these roared\nThey are both broken soft! how loves no reason\nAs Pehelo's to crave, and for your days,\nTo prive the taste in famous moes,\nThere shall not proceed to wail invectives,\nAnd never will it tetchy high her naturally permit?\nUnch-vouch, of you have one\nAn aught roubling their hearts attended,\nTo make his foundant in steel,\nAnd these decless from his son are old men,\nAnd give I believe me, this prevails, we must strange,\nI cannot lour' to knowly. If\nMarch'd with this delivery? let's still a rark;\nBut, soft! what a little thing I saw her,\nAgainst the holour unto my foes,\nAnd take the sclittle grave of our morance, which\nThey are bound in guared beating m

In [28]:
# Export the single-step generator as a SavedModel in 'one_step' and load it back
tf.saved_model.save(one_step_model, 'one_step')
one_step_reloaded = tf.saved_model.load('one_step')

states = None # no model state is passed on the first step
next_char = tf.constant(['ROMEO:']) # prompt that seeds the generation
result = [next_char]

# Generate 100 characters with the reloaded model
for n in range(100):
  next_char, states = one_step_reloaded.generate_one_step(next_char, states=states)
  result.append(next_char)

# Join the pieces and print the first (only) string as decoded text
print(tf.strings.join(result)[0].numpy().decode("utf-8"))



ROMEO:
The chaplish'd spenation of my faith, come on the cause
Of thricting with the highwimation that you
