# Training model
We will use our dataset to train a causal language model... not from scratch, but to continue training it from its checkpoint.



In [94]:
from datasets import load_dataset
from dotenv import load_dotenv
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling
import os

load_dotenv()

True

In [95]:
# !pip install -r requirements.txt

In [96]:
dataset = load_dataset(f"Roverto/ultimate-articles", token = os.environ["HF_TOKEN"])
dataset

Downloading readme:   0%|          | 0.00/387 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/773 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'length'],
        num_rows: 773
    })
})

In [97]:
from transformers import AutoTokenizer, TFGPT2LMHeadModel, AutoConfig, AutoModelForCausalLM

model_name = "openai-community/gpt2"
# model_name = "google/gemma-7b"
# model_name = "bert-base-cased"

print(model_name)

context_length = 128

tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token

config = AutoConfig.from_pretrained(
    model_name,
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    # use_auth_token=True,
    token=os.environ["HF_TOKEN"]
)

model = TFGPT2LMHeadModel.from_pretrained(model_name)
model.summary()

openai-community/gpt2


All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Model: "tfgpt2lm_head_model_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLay  multiple                  124439808 
 er)                                                             
                                                                 
Total params: 124439808 (474.70 MB)
Trainable params: 124439808 (474.70 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [98]:
dataset["train"].column_names

['text', 'length']

In [99]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf")

In [100]:
def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_dataset = dataset.map(
    tokenize, 
    batched=True, 
    remove_columns=dataset["train"].column_names
)
tokenized_datasets

Map:   0%|          | 0/773 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 9599
    })
})

In [109]:
max_length = 0
total_length = 0
for input in tokenized_dataset["train"]["input_ids"]:
    total_length += len(input)
    if len(input)>max_length:
        max_length = len(input)
max_length, total_length/len(tokenized_dataset["train"]["input_ids"])

(128, 128.0)

In [112]:
out = data_collator([tokenized_dataset["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: (5, 128)
attention_mask shape: (5, 128)
labels shape: (5, 128)


In [113]:
tf_train_dataset = model.prepare_tf_dataset(
    tokenized_dataset["train"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)

In [114]:
from transformers import create_optimizer
import tensorflow as tf

num_train_steps = len(tf_train_dataset)
optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


In [115]:
model.fit(tf_train_dataset, verbose=1)


