In [None]:
!pip install tokenizers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Target size of the BPE vocabulary; BPE_token.bpe_train passes it to the trainer as vocab_size.
vocabulary_size = 25000

In [None]:
#This piece of code is taken from Dr. Scannell's notebook
import os
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

class BPE_token(object):
    """Small wrapper that builds, trains and saves a byte-level BPE tokenizer.

    The underlying ``tokenizers.Tokenizer`` is exposed as ``self.tokenizer``.
    """

    # Special tokens handed to the trainer, in this order.
    SPECIAL_TOKENS = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

    def __init__(self):
        """Create an untrained BPE tokenizer with NFC normalisation and
        byte-level pre-tokenisation and decoding."""
        self.tokenizer = Tokenizer(BPE())
        self.tokenizer.normalizer = Sequence([
            NFC()
        ])
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()

    def bpe_train(self, paths, vocab_size=None):
        """Train the wrapped tokenizer on the given text files.

        Args:
            paths: list of file paths passed straight to ``Tokenizer.train``.
            vocab_size: target vocabulary size. When omitted, the notebook-level
                ``vocabulary_size`` constant is used, as before.
        """
        if vocab_size is None:
            vocab_size = vocabulary_size
        trainer = BpeTrainer(
            vocab_size=vocab_size,
            show_progress=True,
            # The keyword used to be misspelled ("inital_alphabet"), so the
            # byte-level alphabet never reached the trainer's
            # `initial_alphabet` option.
            initial_alphabet=ByteLevel.alphabet(),
            special_tokens=list(self.SPECIAL_TOKENS),
        )
        self.tokenizer.train(paths, trainer)

    def save_tokenizer(self, location, prefix=None):
        """Save the trained BPE model files into ``location``.

        Args:
            location: output directory; created (including parents) if missing.
            prefix: optional file-name prefix forwarded to ``model.save``.
        """
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(location, exist_ok=True)
        self.tokenizer.model.save(location, prefix)

In [None]:
# Text files used to train the tokenizer and, further down, to build the training token stream.
paths = ['sw-train.txt']

In [None]:
# Build a fresh BPE tokenizer, train it on the files in `paths`,
# and save the trained model files into the current directory.
tokenizer = BPE_token()
tokenizer.bpe_train(paths)
tokenizer.save_tokenizer('.')

In [None]:
import tensorflow as tf
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer
# Load the tokenizer from the files saved in the current directory.
tokenizer = GPT2Tokenizer.from_pretrained('.')
# Tell the tokenizer which strings play the special-token roles.
tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})
# Model configuration: a deliberately small GPT-2 (2 layers, 4 heads, 128-dim embeddings).
config = GPT2Config(
  # len(tokenizer) also counts tokens appended by add_special_tokens, so the
  # embedding table always covers every id the tokenizer can emit;
  # tokenizer.vocab_size only counts the base vocabulary.
  vocab_size=len(tokenizer),
  bos_token_id=tokenizer.bos_token_id,
  eos_token_id=tokenizer.eos_token_id,
  # Record the pad id so generation does not have to fall back to the eos id.
  pad_token_id=tokenizer.pad_token_id,
  n_embd=128,
  n_layer=2,
  n_head=4
)
# Instantiate the model with random weights from the configuration.
model = TFGPT2LMHeadModel(config)
# Run the model once on its dummy inputs so the weights exist before summary().
model(model.dummy_inputs)
model.summary()

Model: "tfgpt2lm_head_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLaye  multiple                 3727872   
 r)                                                              
                                                                 
Total params: 3,727,872
Trainable params: 3,727,872
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Read every training file, append the end-of-sequence token to each one,
# and join the pieces into a single string that is tokenised in one pass.
corpus_parts = []
for filename in paths:
  with open(filename, mode="r", encoding='utf-8') as handle:
    corpus_parts.append(handle.read() + tokenizer.eos_token)
single_string = ''.join(corpus_parts)
string_tokenized = tokenizer.encode(single_string)

In [None]:
# Sanity check: show the first ten token ids of the encoded training text.
string_tokenized[:10]

[86, 466, 3305, 439, 65, 3637, 10, 65, 679, 768]

In [None]:
# Windowing and input-pipeline settings.
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 1000
# Cut the token stream into back-to-back windows of block_size tokens;
# a trailing window shorter than block_size is dropped.
examples = [
  string_tokenized[start:start + block_size]
  for start in range(0, len(string_tokenized) - block_size + 1, block_size)
]
# Next-token prediction: each label sequence is its input shifted left by one token.
inputs = [window[:-1] for window in examples]
labels = [window[1:] for window in examples]
dataset = (
  tf.data.Dataset.from_tensor_slices((inputs, labels))
  .shuffle(BUFFER_SIZE)
  .batch(BATCH_SIZE, drop_remainder=True)
)

In [None]:
# Optimizer: Adam with gradient-norm clipping at 1.0.
# NOTE(review): epsilon=0.9 is far larger than the tiny stability constant Adam
# normally uses — confirm this value is intentional.
optimizer = tf.keras.optimizers.Adam(
  learning_rate=0.01,
  epsilon=0.9,
  clipnorm=1.0,
)
# Loss: sparse categorical cross-entropy computed on raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Metric reported during training.
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# One loss for the first model output, then a None per transformer layer —
# presumably so the remaining outputs carry no loss; TODO confirm.
per_output_losses = [loss] + [None] * model.config.n_layer
model.compile(optimizer=optimizer, loss=per_output_losses, metrics=[metric])

In [None]:
# Number of passes over the training dataset.
num_epoch = 1
# Train the model; the object returned by fit() is kept in `history`.
history = model.fit(dataset, epochs=num_epoch)



In [None]:
# Text files that make up the test split.
paths_test= ['sw-test.txt']

In [None]:
# Build the prompt from real text: the first few words of the first non-empty
# line of the test file. The previous code encoded the *list of file names*
# itself, which is why the recorded output starts with <unk>.
PROMPT_WORDS = 10
MAX_PROMPT_TOKENS = 25  # keeps the prompt well below max_length used in generate()
with open(paths_test[0], "r", encoding='utf-8') as f:
  first_line = next((line for line in f if line.strip()), '')
text = ' '.join(first_line.split()[:PROMPT_WORDS])
if not text:
  raise ValueError("no prompt text found in " + paths_test[0])
# encoding the input text (also yields the attention mask generate() asks for)
encoded = tokenizer(text, return_tensors='tf', truncation=True, max_length=MAX_PROMPT_TOKENS)
input_ids = encoded['input_ids']
# getting out output
beam_output = model.generate(
  input_ids,
  attention_mask=encoded['attention_mask'],
  pad_token_id=tokenizer.pad_token_id,
  max_length = 50,
  num_beams = 5,
  temperature = 0.4,
  no_repeat_ngram_size=2,
  num_return_sequences=5
)
# Only the first of the returned sequences is printed.
print(tokenizer.decode(beam_output[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


<unk> wa mwaka wa kwanza, na watu wa mkoa wa wakazi wa nchi yake. kwa sababu ya nchi ya kwanza. mwaka mwaka huu. ni mji wa serikali ya serikali, kwa nchi hiyo. lakini ni mwaka mkuu wa tanzania. kuna watu, lakini kwa


In [None]:
# Build a second BPE tokenizer and train it on the test files.
# NOTE(review): this saves into '.', the same directory the training tokenizer
# was saved to earlier — presumably overwriting those files; confirm this is intended.
tokenizer_test = BPE_token()
tokenizer_test.bpe_train(paths_test)
tokenizer_test.save_tokenizer('.')

In [None]:
import tensorflow as tf
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer
# Load the tokenizer from the files saved in the current directory.
tokenizer_test = GPT2Tokenizer.from_pretrained('.')
# Tell the tokenizer which strings play the special-token roles.
tokenizer_test.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})
# Model configuration: a deliberately small GPT-2 (2 layers, 4 heads, 128-dim embeddings).
config = GPT2Config(
  # len(tokenizer_test) also counts tokens appended by add_special_tokens, so
  # the embedding table always covers every id the tokenizer can emit;
  # tokenizer_test.vocab_size only counts the base vocabulary.
  vocab_size=len(tokenizer_test),
  bos_token_id=tokenizer_test.bos_token_id,
  eos_token_id=tokenizer_test.eos_token_id,
  # Record the pad id so generation does not have to fall back to the eos id.
  pad_token_id=tokenizer_test.pad_token_id,
  n_embd=128,
  n_layer=2,
  n_head=4
)
# Instantiate the second model with random weights from the configuration.
model_test = TFGPT2LMHeadModel(config)
# Run the model once on its dummy inputs so the weights exist before summary().
model_test(model_test.dummy_inputs)
model_test.summary()

Model: "tfgpt2lm_head_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLaye  multiple                 3727872   
 r)                                                              
                                                                 
Total params: 3,727,872
Trainable params: 3,727,872
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Read every test file, append the end-of-sequence token to each one,
# and join the pieces into a single string that is tokenised in one pass.
test_parts = []
for filename in paths_test:
  with open(filename, mode="r", encoding='utf-8') as handle:
    test_parts.append(handle.read() + tokenizer_test.eos_token)
single_string_test = ''.join(test_parts)
string_tokenized_test = tokenizer_test.encode(single_string_test)

In [None]:
# Sanity check: show the first ten token ids of the encoded test text.
string_tokenized_test[:10]

[86, 470, 3265, 436, 65, 4062, 10, 65, 684, 787]

In [None]:
# Windowing and input-pipeline settings (same values as for the training split).
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 1000
# Cut the test token stream into back-to-back windows of block_size tokens;
# a trailing window shorter than block_size is dropped.
examples = [
  string_tokenized_test[start:start + block_size]
  for start in range(0, len(string_tokenized_test) - block_size + 1, block_size)
]
# Next-token prediction: each label sequence is its input shifted left by one token.
inputs = [window[:-1] for window in examples]
labels = [window[1:] for window in examples]
dataset_test = (
  tf.data.Dataset.from_tensor_slices((inputs, labels))
  .shuffle(BUFFER_SIZE)
  .batch(BATCH_SIZE, drop_remainder=True)
)

In [None]:
# Optimizer: Adam with gradient-norm clipping at 1.0.
# NOTE(review): epsilon=0.8 is far larger than the tiny stability constant Adam
# normally uses — confirm this value is intentional.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.1, epsilon=0.8, clipnorm=1.0)
# Loss: sparse categorical cross-entropy computed on raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Metric reported during training.
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# Compile the model. The loss list is sized from model_test's own config; it
# previously read model.config, i.e. the other model, and only worked because
# both models happen to have the same number of layers.
model_test.compile(optimizer=optimizer, loss=[loss, *[None] * model_test.config.n_layer], metrics=[metric])

In [None]:
# Number of passes over the test dataset.
num_epoch = 1
# Fit model_test on dataset_test; the object returned by fit() is kept in `history_test`.
# NOTE(review): this trains a second model on the test file rather than evaluating
# the first model on it — confirm that is the intent.
history_test = model_test.fit(dataset_test, epochs=num_epoch)

