# 05_train_transformer

Fine-tune DistilBERT (or RoBERTa) using Hugging Face Trainer.

In [1]:
import transformers, accelerate, datasets
print("transformers:", transformers.__version__)
print("accelerate:", accelerate.__version__)
print("datasets:", datasets.__version__)


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


transformers: 4.36.2
accelerate: 0.25.0
datasets: 2.14.5


In [5]:
# FAST + CLEAN TRAINING CODE FOR DISTILBERT

# Required packages (already installed):
# transformers==4.36.2
# datasets==2.14.5
# accelerate
# evaluate

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import os

model_name = "distilbert-base-uncased"
out = "artifacts/transformer"
os.makedirs(out, exist_ok=True)

# 1) Load IMDB dataset
ds = load_dataset("imdb")

# 2) Load tokenizer
tok = AutoTokenizer.from_pretrained(model_name)

# 3) Preprocess function
def preprocess(x):
    return tok(
        x["text"],
        truncation=True,
        padding="max_length",
        max_length=128    # shorter = faster
    )

# Apply tokenizer
tds = ds.map(preprocess, batched=True)

# Cleanup columns and format
tds = (
    tds.remove_columns(["text"])
    .rename_column("label", "labels")
    .with_format("torch")
)

# ----------------------------------------------------
# ðŸ”¥ REDUCED TRAINING SET (MUCH FASTER)
# ----------------------------------------------------
train_small = tds["train"].select(range(800))   # reduced from 25,000
eval_small  = tds["test"].select(range(200))

# 4) Load pretrained DistilBERT model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# 5) Training arguments
training_args = TrainingArguments(
    output_dir=out,
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    num_train_epochs=1,
    logging_steps=20,
)

# 6) Metric function
def compute_metrics(p):
    import evaluate
    metric = evaluate.load("accuracy")
    preds = np.argmax(p.predictions, axis=1)
    return metric.compute(predictions=preds, references=p.label_ids)

# 7) Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_small,
    eval_dataset=eval_small,
    tokenizer=tok,
    compute_metrics=compute_metrics,
)

# 8) Train
trainer.train()

# 9) Save model
trainer.save_model(out)

print("\nðŸŽ‰ Training complete! Model saved to:", out)


Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 25000/25000 [00:19<00:00, 1292.23 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 25000/25000 [00:22<00:00, 1106.81 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 50000/50000 [00:46<00:00, 1080.93 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                                  
 20%|â–ˆâ–‰        | 99/500 [14:11<1:03:57,  9.57s/it]

{'loss': 0.1053, 'learning_rate': 4e-05, 'epoch': 0.2}


                                                  
 20%|â–ˆâ–‰        | 99/500 [15:10<1:03:57,  9.57s/it]

{'loss': 0.002, 'learning_rate': 3e-05, 'epoch': 0.4}


                                                  
 20%|â–ˆâ–‰        | 99/500 [16:09<1:03:57,  9.57s/it]

{'loss': 0.0009, 'learning_rate': 2e-05, 'epoch': 0.6}


                                                  
 20%|â–ˆâ–‰        | 99/500 [17:15<1:03:57,  9.57s/it]

{'loss': 0.0007, 'learning_rate': 1e-05, 'epoch': 0.8}


                                                  
 20%|â–ˆâ–‰        | 99/500 [18:36<1:03:57,  9.57s/it]

{'loss': 0.0006, 'learning_rate': 0.0, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

Downloading builder script: 4.20kB [00:00, 1.37MB/s]
                                                  

[A[A                                         
 20%|â–ˆâ–‰        | 99/500 [19:02<1:03:57,  9.57s/it]
[A
                                                  
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 100/100 [05:51<00:00,  3.51s/it]]


{'eval_loss': 0.0004960223450325429, 'eval_accuracy': 1.0, 'eval_runtime': 26.2221, 'eval_samples_per_second': 7.627, 'eval_steps_per_second': 0.953, 'epoch': 1.0}
{'train_runtime': 351.1819, 'train_samples_per_second': 2.278, 'train_steps_per_second': 0.285, 'train_loss': 0.021903112158179285, 'epoch': 1.0}

ðŸŽ‰ Training complete! Model saved to: artifacts/transformer
