In [None]:
!pip install datasets evaluate transformers[sentencepiece]

# Only install when running locally. Colab V100 Compute Engine already has CUDA, keras and tensorflow installed.
#!pip install tf-keras tensorflow tensorflow[and-cuda]

In [None]:
from datasets import load_dataset

# Load the dataset published as "juancopi81/mutopia_guitar_dataset".
# Later cells index the result by split name ("train" and "test").
dataset = load_dataset("juancopi81/mutopia_guitar_dataset")

In [None]:
# Show the loaded dataset object so its contents can be checked before tokenizing.
print(dataset)

In [None]:
import random

# Pick one training example at random and preview the first 200 characters of
# its "text" field.
# random.randrange(n) yields 0..n-1. random.randint(0, n) would also include n
# itself, which is one past the last valid index and raises IndexError.
sample_num = random.randrange(len(dataset["train"]))
print(f"{dataset['train'][sample_num]['text'][:200]}")

In [None]:
from transformers import AutoTokenizer

# Maximum number of tokens per chunk; the same value is reused by later cells.
context_length = 256

tokenizer = AutoTokenizer.from_pretrained("juancopi81/mutopia_guitar_dataset_tokenizer")

# Try the tokenizer on the first two test examples, using the same truncation /
# overflow settings that the `tokenize` function applies to the full dataset.
demo_texts = dataset["test"][:2]["text"]
demo_tokenizer_kwargs = dict(
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)
outputs = tokenizer(demo_texts, **demo_tokenizer_kwargs)

# Report how many chunks came back, the length of each chunk, and the
# chunk-to-sample mapping returned by the tokenizer.
print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

In [None]:
def tokenize(element):
    """Tokenize a batch of texts and keep only the full-length chunks.

    The module-level ``tokenizer`` is called on ``element["text"]`` with
    truncation at ``context_length`` and with overflowing tokens and chunk
    lengths requested. Only chunks whose reported length equals
    ``context_length`` are kept; the number of dropped chunks is printed.

    Args:
        element: Mapping with a "text" entry holding the texts to tokenize.

    Returns:
        dict: ``{"input_ids": [...]}`` containing the kept chunks, in the
        order the tokenizer returned them.
    """
    encoded = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    # Pair each chunk with its reported length once, so the kept list and the
    # dropped count are both derived from the same sequence.
    sized_chunks = list(zip(encoded["length"], encoded["input_ids"]))
    input_batch = [ids for n_tokens, ids in sized_chunks if n_tokens == context_length]
    removed_elements_counter = len(sized_chunks) - len(input_batch)
    print(f"Removed chunks with size less than context_size: {removed_elements_counter}")
    return {"input_ids": input_batch}

# Apply `tokenize` to the dataset in batches. `tokenize` returns only an
# "input_ids" column whose row count can differ from the incoming batch, so the
# original columns (taken from the train split's column names) are removed.
tokenized_datasets = dataset.map(
    tokenize, batched=True, remove_columns=dataset["train"].column_names
)

# Display the tokenized result as the cell output.
tokenized_datasets

In [None]:
from transformers import AutoTokenizer, TFGPT2LMHeadModel, AutoConfig

# Start from the "gpt2" configuration and override the fields that depend on
# this notebook's tokenizer: the vocabulary size and the BOS/EOS token ids.
# NOTE(review): `n_ctx` is passed as the context-size override. Confirm that the
# installed transformers version still reads `n_ctx` for GPT-2; the position
# limit may be controlled by `n_positions` instead.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [None]:
# Instantiate the model from `config` alone (no checkpoint name is passed here).
model = TFGPT2LMHeadModel(config)
# Call the model once on its own dummy inputs before requesting the summary.
# NOTE(review): presumably this forward pass is what builds the weights so that
# summary() can report them — confirm.
model(model.dummy_inputs)
model.summary()

In [None]:
from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False turns off the collator's masked-language-modeling mode;
# return_tensors="tf" asks for TensorFlow tensors in the collated batch.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf")

In [None]:
# Collate the first five tokenized test rows into a single batch and print the
# shape of every entry the collator produced.
test_split = tokenized_datasets["test"]
first_five_rows = [test_split[row] for row in range(5)]
out = data_collator(first_five_rows)

for key in out:
    print(f"{key} shape: {out[key].shape}")

In [None]:
# Decode the first row of the collated batch back to text as a sanity check.
print(tokenizer.decode(out["input_ids"][0]))

In [None]:
def _as_tf_dataset(split_name, shuffle):
    """Convert one tokenized split with the settings shared by train and eval.

    Both splits use the same columns, the same collator and a batch size of 8;
    only the split name and the shuffle flag differ.
    """
    return tokenized_datasets[split_name].to_tf_dataset(
        columns=["input_ids", "attention_mask", "labels"],
        collate_fn=data_collator,
        shuffle=shuffle,
        batch_size=8
    )


# Training batches are shuffled; evaluation batches keep their order.
tf_train_dataset = _as_tf_dataset("train", True)
tf_eval_dataset = _as_tf_dataset("test", False)

# Display the training dataset object as the cell output.
tf_train_dataset

In [None]:
from transformers import create_optimizer
import tensorflow as tf

num_epochs = 10
# Total optimizer steps: len() of the batched training dataset (presumably the
# number of batches per epoch — confirm) multiplied by the number of epochs.
num_train_steps = len(tf_train_dataset) * num_epochs

# create_optimizer returns the optimizer together with its learning-rate
# schedule; only `optimizer` is used in this cell.
# NOTE(review): num_warmup_steps is fixed at 1,000 — confirm that
# num_train_steps is larger than that for this dataset and batch size.
optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)

# No loss argument is passed to compile().
# NOTE(review): presumably the model falls back to its built-in language-modeling
# loss — confirm against the installed transformers version.
model.compile(optimizer=optimizer)

# NOTE(review): the mixed-precision policy is set only after `model` has been
# created and compiled. Keras documents the global policy as applying to layers
# created afterwards, so confirm this call affects the already-built model;
# if not, it would need to run before the model is instantiated.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

In [None]:
# Train for `num_epochs` epochs on the training batches, passing the
# test-split batches as validation data.
model.fit(tf_train_dataset, validation_data=tf_eval_dataset, epochs=num_epochs)