# Fine-tune DistilBERT on the CVPR abstracts

In [None]:
import numpy as np

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForMaskedLM
from transformers import TrainingArguments

# Run on the first CUDA GPU when torch can see one; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Ask torch to release any cached CUDA memory before the model is loaded.
torch.cuda.empty_cache()

# Echo the selected device as the cell output.
device

Load Abstracts

In [None]:
# Load the abstracts from a CSV file; later cells read the resulting "train"
# split.  The path uses a forward slash so it resolves on Linux/macOS as well
# as on Windows (a backslash-separated path only works on Windows).
abstracts = load_dataset("csv", data_files="data/cvpr_data.csv")
abstracts

Load the pretrained DistilBERT model

In [None]:
# Hugging Face checkpoint used as the starting point for fine-tuning.
model_checkpoint = "distilbert-base-uncased"

# Instantiate the masked-language-model variant of that checkpoint, then move
# it onto the device selected at the top of the notebook.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
model = model.to(device)


Get the tokenizer

In [None]:
# Load the tokenizer from the same checkpoint name as the model so that the
# vocabulary and the model's embeddings match.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Tokenize the abstracts (pad every abstract to a fixed length of 256 tokens and truncate anything longer)

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of abstracts into fixed-length encodings.

    Args:
        examples: A batch of rows; only its "abstract" entry is read and
            passed to the module-level ``tokenizer``.

    Returns:
        The tokenizer output, with every sequence padded or truncated to
        256 tokens. When the tokenizer is a "fast" one, a "word_ids" entry
        is added holding ``word_ids(i)`` for each row ``i`` of the batch.
    """
    encoded = tokenizer(
        examples["abstract"],
        truncation=True,
        max_length=256,
        padding="max_length",
    )
    # Only fast tokenizers expose word_ids(); leave the encoding untouched otherwise.
    if not tokenizer.is_fast:
        return encoded

    row_count = len(encoded["input_ids"])
    encoded["word_ids"] = list(map(encoded.word_ids, range(row_count)))
    return encoded


# batched=True hands tokenize_function many rows per call, so the tokenizer
# works on whole batches instead of one abstract at a time.
# The original columns are dropped so only the tokenizer outputs remain;
# column_names is a plain list of strings, which is the type remove_columns
# is documented to accept (rather than a dict keys view).
tokenized_datasets = abstracts.map(
    tokenize_function,
    batched=True,
    remove_columns=abstracts["train"].column_names,
)
tokenized_datasets

In [None]:
# Sanity check: turn the token ids of the row at index 1 back into text to
# inspect what tokenization, truncation and padding produced.
tokenizer.decode(tokenized_datasets["train"][1]["input_ids"])

Create the data collator for masked language modeling

In [None]:
# Collator that prepares batches for masked-language-model training;
# mlm_probability=0.15 is the masking probability handed to the collator.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

Train/test split

In [None]:
# Number of tokenized abstracts to place in the training split; the rows that
# are left over form the "test" split used for evaluation.
train_size = 9_000

# A fixed seed keeps the split reproducible between runs.
downsampled_dataset = tokenized_datasets["train"].train_test_split(seed=123456, train_size=train_size)
downsampled_dataset

In [None]:
batch_size = 16
# Log the training loss roughly once per epoch (about len(train) // batch_size
# steps).  Clamp to 1 so that a training split smaller than one batch cannot
# yield logging_steps=0.
logging_steps = max(1, len(downsampled_dataset["train"]) // batch_size)
# Keep only the part after the last "/" of the checkpoint id for the folder name.
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-cvpr",
    overwrite_output_dir=True,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=5e-6,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
    # Mixed precision requires a CUDA device; enabling it only when one is
    # available keeps the notebook runnable on the CPU fallback chosen above.
    fp16=torch.cuda.is_available(),
    logging_steps=logging_steps,
    num_train_epochs=15,
)

In [None]:
from transformers import Trainer

# Bundle the model, the training configuration, both dataset splits and the
# masking collator into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=downsampled_dataset["test"],
    train_dataset=downsampled_dataset["train"],
)

Calculate model perplexity before training

In [None]:
# Baseline: evaluate the model on the held-out split before any fine-tuning.
eval_results = trainer.evaluate()
# Perplexity is reported as the exponential of the evaluation loss.
eval_loss = eval_results["eval_loss"]
print(f"Perplexity: {np.exp(eval_loss):.2f}")

Train model and check perplexity

In [None]:
# Run the fine-tuning loop with the settings in training_args.
trainer.train()

In [None]:
# Re-evaluate on the held-out split now that training has finished.
eval_results = trainer.evaluate()
# Same metric as before training: exp(eval_loss), printed with two decimals.
perplexity = np.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity:.2f}")

Save the model and the tokenizer

In [None]:
# Write the fine-tuned weights and the tokenizer files into the same folder
# so that both can be reloaded from one path.
save_dir = f"{model_name}-finetuned-cvpr"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)