# Training base model
We will use our ultiworld article dataset to train a causal language model... not from scratch, but to continue training it from its checkpoint. This will be used as the base model for various fine-tuning tasks later on.

The training is done in the cloud using runpod then the model is pushed to Huggingface hub.



In [1]:
from datasets import load_dataset
from dotenv import load_dotenv
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling
import os

load_dotenv()
# load_dotenv("env.txt")




True

In [2]:
# !pip install -r requirements.txt

In [3]:
dataset = load_dataset(f"Roverto/ultimate-articles", token = os.environ["HF_TOKEN"])
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'length'],
        num_rows: 773
    })
})

In [4]:
from transformers import AutoTokenizer, TFGPT2LMHeadModel, AutoConfig, AutoModelForCausalLM

model_name = "openai-community/gpt2"
# model_name = "bert-base-cased"

print(model_name)

context_length = 128

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

config = AutoConfig.from_pretrained(
    model_name,
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    # use_auth_token=True,
    token=os.environ["HF_TOKEN"]
)

# model = TFGPT2LMHeadModel.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, from_tf = True, is_decoder=True)
model

openai-community/gpt2


tf_model.h5:   0%|          | 0.00/498M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [31]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="pt")

In [32]:
def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_dataset = dataset.map(
    tokenize, 
    batched=True, 
    remove_columns=dataset["train"].column_names
)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 10012
    })
})

In [25]:
max_length = 0
total_length = 0
for input in tokenized_dataset["train"]["input_ids"]:
    total_length += len(input)
    if len(input)>max_length:
        max_length = len(input)
max_length, total_length/len(tokenized_dataset["train"]["input_ids"])

(128, 128.0)

In [33]:
tokenizer.pad_token,tokenizer.eos_token

(None, None)

In [27]:
tokenizer.pad_token = tokenizer.eos_token

out = data_collator([tokenized_dataset["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [20]:
tf_train_dataset = model.prepare_tf_dataset(
    tokenized_dataset["train"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)

AttributeError: 'BertLMHeadModel' object has no attribute 'prepare_tf_dataset'

In [114]:
from transformers import create_optimizer
import tensorflow as tf

num_train_steps = len(tf_train_dataset)
optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


In [115]:
model.fit(tf_train_dataset, verbose=1)




In [None]:
model.push_to_hub(
    repo_id = "Roverto/gpt2-retrained-ultimate",
    private = True,
    token = os.environ["HF_TOKEN"]
)