In [None]:
!pip install transformers datasets

# Create the AutoTokenizer for the dataset

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset

def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    The function reads the notebook-level ``tokenizer`` global at call time,
    so ``tokenizer`` must be defined before this function is invoked.

    Args:
        examples: Mapping-like batch that exposes the raw texts under the
            ``"text"`` key.

    Returns:
        Whatever ``tokenizer`` returns for those texts when asked to pad to
        ``"max_length"`` and to truncate.
    """
    texts = examples["text"]
    encoding_options = {"padding": "max_length", "truncation": True}
    return tokenizer(texts, **encoding_options)


# Load the "yelp_review_full" dataset through `datasets.load_dataset`.
dataset = load_dataset("yelp_review_full")
# Show the training example at index 100 as this cell's output.
dataset["train"][100]

In [None]:
# Load the pretrained tokenizer for the "dbmdz/bert-base-italian-cased" checkpoint;
# the tokenizer should be the same one used to train the original model.
# NOTE(review): the checkpoint name suggests an Italian model, while the dataset
# loaded above is "yelp_review_full" — confirm this pairing is intentional.
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-italian-cased")
# Run tokenize_function over the dataset in batches; tokenize_function reads the
# `tokenizer` global assigned on the line above.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

## Encode the text with the tokenizer

In [None]:
# Tokenize a single sentence and display the encoding as the cell output.
# `encoded_text` is reused by the decoding cell that follows.
text = "How are you?"
encoded_text = tokenizer(text)
encoded_text

In [None]:
# Turn the "input_ids" of the previous encoding back into a string and display it.
# NOTE(review): the decoded string presumably includes any special tokens the
# tokenizer inserted — confirm against the rendered output.
decoded_text = tokenizer.decode(encoded_text["input_ids"])
decoded_text

## Padding and truncation

### Adjust padding and truncation

In [None]:
# Tokenize a small batch of sentences without passing any padding or
# truncation arguments; `several_sentences` is reused in the next cell.
several_sentences = ["How are you?", "Another text", "Yes"]
tokenizer(several_sentences)

In [None]:
# Switch the tokenizer to left-side truncation.
# NOTE: this assigns an attribute on the shared `tokenizer` object, so the
# setting stays in place for every cell executed afterwards in this kernel.
tokenizer.truncation_side = "left"
# Tokenize the same batch again, now with padding enabled and truncation
# requested with max_length=6.
tokenizer(several_sentences, max_length=6, padding=True, truncation=True)

In [None]:
# Supervised fine-tuning with TRL's SFTTrainer, logging to Weights & Biases.
# NOTE(review): `model`, `train_dataset`, `eval_dataset` and `formatting_func`
# are not defined in the cells above — they must exist before this cell runs.
import os  # needed for the os.environ assignments below

# SFTTrainer is exported by `trl`; TrainingArguments lives in `transformers`
# (there is no `trl.train` module to import them from).
from transformers import TrainingArguments
from trl import SFTTrainer


# Configure W&B through environment variables before the trainer is created.
os.environ["WANDB_PROJECT"] = "alpaca_ft"  # name your W&B project
os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints


# NOTE(review): some transformers releases require an explicit `output_dir`
# argument here — confirm against the installed version.
training_args = TrainingArguments(
    report_to="wandb",  # send training logs to W&B
    per_device_train_batch_size=16,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    num_train_epochs=3,
    gradient_accumulation_steps=2,  # simulate larger batch sizes (16 * 2 per device)
)


trainer = SFTTrainer(
    model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    packing=True,  # pack samples together for efficient training
    max_seq_length=1024,  # maximum packed length
    args=training_args,
    formatting_func=formatting_func,  # format samples with a model schema
)
trainer.train()
