In [None]:
# Show the GPUs (and driver status) visible to this runtime before training.
!nvidia-smi

In [None]:
!pip install --upgrade transformers datasets accelerate

**Dataset**

Prerequisites

1. Download the conversation data and preprocess it into the required format
2. Create a "dataset" directory and copy all the .txt files into it

In [None]:
import glob
from datasets import load_dataset

# Load the text files into a dataset.
# glob.glob() returns paths in arbitrary (filesystem-dependent) order, so sort them:
# a stable example order keeps the later seeded train/test split reproducible.
data_files = sorted(glob.glob("dataset/*.txt"))
if not data_files:
    # Fail early with an actionable message instead of an obscure loader error.
    raise FileNotFoundError(
        "No .txt files found in 'dataset/'; create the directory and copy the "
        "preprocessed conversation files into it first."
    )

dataset = load_dataset("text", data_files={"train": data_files})
print(dataset)
# Index the row first so only one example is read, not the entire "text" column.
print(dataset["train"][0]["text"])

**Tokenizer**

In [None]:
# Load the pretrained checkpoint: the tokenizer and the causal language model
# are both fetched under the same model name.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "flax-community/papuGaPT2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Display the tokenizer configuration as the cell output.
tokenizer

In [None]:
# Register the two speaker markers as additional special tokens of the tokenizer.
SPEAKER_TOKENS = dict(additional_special_tokens=["<speaker1>", "<speaker2>"])
tokenizer.add_special_tokens(SPEAKER_TOKENS)
# The vocabulary may have grown, so resize the model's embeddings to match it.
model.resize_token_embeddings(len(tokenizer))
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Display the updated tokenizer as the cell output.
tokenizer

In [None]:
def tokenize_func(examples):
    """Tokenize the "text" column of a batch of examples.

    Args:
        examples: batch mapping that contains a "text" entry.

    Returns:
        Whatever the notebook-level ``tokenizer`` returns for those texts.
    """
    batch_texts = examples["text"]
    return tokenizer(batch_texts)


# Tokenize the dataset in batches using 4 worker processes; the raw "text"
# column is removed from the result.
tokenized_dataset = dataset.map(
    tokenize_func,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)
tokenized_dataset

In [None]:
# Compare the first raw example with its tokenized form.
# Each row is fetched once by index; the column-first form
# (dataset["train"]["text"][0]) goes through the whole column on every access.
first_text = dataset["train"][0]["text"]
first_input_ids = tokenized_dataset["train"][0]["input_ids"]

print(first_text)
print(f"length of example (number of words): {len(first_text.split())}")
print()
print(first_input_ids)
print(f"length of tokenized example: {len(first_input_ids)}")
print()
print("Decoded input ids by tokenizer")
print(tokenizer.decode(first_input_ids))

**Examples grouping**

In [None]:
# Concatenate the tokenized examples and cut them into fixed-size blocks of
# `block_size` tokens (512 here) to feed the model.
block_size = 512


def group_texts(examples):
    """Concatenate a batch of tokenized examples and split it into fixed-size blocks.

    Args:
        examples: mapping from column name (must include "input_ids") to a list
            of per-example lists. All columns are expected to have matching
            lengths; the first column determines the total length.

    Returns:
        dict with the same keys, where every value is a list of chunks of
        exactly ``block_size`` items, plus a "labels" entry that is a copy of
        the "input_ids" chunk list. The trailing remainder shorter than
        ``block_size`` is dropped.
    """
    # Local import keeps the function self-contained.
    from itertools import chain

    # Flatten each column in linear time; sum(list_of_lists, []) is quadratic
    # because it rebuilds the accumulated list on every addition.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the small remainder so every chunk has exactly block_size items.
    total_length = (total_length // block_size) * block_size
    # Split every column into consecutive chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Labels start as a copy of the inputs.
    result["labels"] = result["input_ids"].copy()
    return result


# Regroup the tokenized examples into blocks, 1000 examples per batch,
# using 4 worker processes.
language_model_dataset = tokenized_dataset.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)
language_model_dataset

In [None]:
# Report the size of one block and the number of blocks.
# Row-first indexing and len() on the split avoid fetching the whole
# "input_ids" column just to print two numbers.
train_blocks = language_model_dataset["train"]
print(f"length of example: {len(train_blocks[0]['input_ids'])}")
print(f"num of examples: {len(train_blocks)}")

In [None]:
# Hold out 10% of the blocks as the "test" split, which serves as the
# validation set during training; the seed is fixed so the split is repeatable.
language_model_dataset_train_test = language_model_dataset["train"].train_test_split(
    seed=666,
    test_size=0.1,
)

In [None]:
from transformers import DataCollatorForLanguageModeling

# mlm=False disables masked-language-model mode in the collator.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
data_collator

**Training**

In [None]:
# training config
import inspect

from transformers import Trainer, TrainingArguments

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# (the old keyword is rejected once removed), while older releases only know the
# old name. Pick whichever keyword the installed version accepts so the notebook
# runs regardless of what the unpinned install resolved to.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    learning_rate=1e-5,
    weight_decay=0.01,
    auto_find_batch_size=True,  # requires accelerate
    logging_steps=10,
    **{eval_strategy_arg: "epoch"},  # evaluate at the end of every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=language_model_dataset_train_test["train"],
    eval_dataset=language_model_dataset_train_test["test"],
    data_collator=data_collator,
)

In [None]:
# List the callbacks currently registered on the trainer's callback handler.
trainer.callback_handler.callbacks

In [None]:
# Run training with the configured trainer.
trainer.train()

In [None]:
# Evaluate the trained model with the trainer and display the returned metrics.
trainer.evaluate()

In [None]:
# Build a text-generation pipeline for trying out custom prompts.
from transformers import pipeline, set_seed

# Fix the random seed so sampled generations are repeatable.
set_seed(93)

# Put the model into eval mode and move it to the CPU before wrapping it.
cpu_model = model.eval().cpu()
generator = pipeline("text-generation", model=cpu_model, tokenizer=tokenizer)

In [None]:
# Continue a short two-speaker dialogue; the prompt ends with <speaker1>.
generator("<speaker1> hej<speaker2> czesc, co robisz w weekend?<speaker1>")

In [None]:
# advanced playing with custom inputs
input_text = "<speaker2> hej, pójdziemy na piwko w czwartek?<speaker1>"

# Call the tokenizer directly (instead of .encode) to also get the attention mask,
# and move the tensors to the model's device so this cell works whether the model
# currently lives on the CPU or on a GPU.
encoded_input = tokenizer(input_text, return_tensors="pt").to(model.device)
input_ids = encoded_input["input_ids"]

sample_outputs = model.generate(
    input_ids,
    # The pad token equals the EOS token here, so the mask cannot be inferred
    # from the ids; pass it explicitly together with the pad token id.
    attention_mask=encoded_input["attention_mask"],
    pad_token_id=tokenizer.pad_token_id,
    do_sample=True,
    max_length=100,
    top_k=50,
    top_p=0.95,
    num_return_sequences=4,
)

print(f"INPUT: {input_text}")
print("OUTPUT")
for i, sample_output in enumerate(sample_outputs):
    print(f"{i}: {tokenizer.decode(sample_output)}")
    print()
    print()