In [None]:
#!pip -q install -U transformers datasets evaluate accelerate

import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding, TrainingArguments, Trainer
)
import evaluate


In [None]:
from datasets import load_dataset


In [None]:
ds = load_dataset("sh0416/ag_news")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
ds
print(ds["train"][0])
print(ds["train"].features)


{'label': 3, 'title': 'Wall St. Bears Claw Back Into the Black (Reuters)', 'description': "Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."}
{'label': Value('int64'), 'title': Value('string'), 'description': Value('string')}


In [None]:
ds["train"].features["label"]


Value('int64')

In [None]:
import pandas as pd

df = pd.DataFrame(ds["train"][:2000])
df["label"].value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
4,777
1,477
3,408
2,338


In [None]:
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_fn(batch):
    # Combine 'title' and 'description' for each item in the batch
    texts = [t + " " + d for t, d in zip(batch["title"], batch["description"])]
    return tokenizer(texts, truncation=True)

tokenized = ds.map(tokenize_fn, batched=True, remove_columns=[col for col in ds["train"].column_names if col != "label"])
tokenized

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 7600
    })
})

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=preds, references=labels)

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, f1_score, classification_report, confusion_matrix

In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    accuracy = accuracy_score(labels, preds)
    recall = recall_score(labels, preds, average="macro")
    f1 = f1_score(labels, preds, average="macro")

    return {
        "accuracy": accuracy,
        "recall": recall,
        "f1": f1,
    }

In [None]:
num_labels = 4
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

args = TrainingArguments(
    output_dir="agnews_distilbert",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_steps=50,
)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch

# Clear CUDA cache before Trainer initialization to help with potential GPU state corruption
if torch.cuda.is_available():
    torch.cuda.empty_cache()

def map_labels_to_zero_indexed(example):
    example["label"] = example["label"] - 1
    return example

# Apply the mapping to the train and evaluation datasets
zero_indexed_train_dataset = tokenized["train"].map(map_labels_to_zero_indexed)
zero_indexed_eval_dataset = tokenized["test"].map(map_labels_to_zero_indexed) if "test" in tokenized else tokenized["validation"].map(map_labels_to_zero_indexed)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=zero_indexed_train_dataset,
    eval_dataset=zero_indexed_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Recall,F1
1,0.1805,0.177975,0.943026,0.943026,0.943089
2,0.1268,0.196057,0.942105,0.942105,0.942025
3,0.0776,0.249247,0.943553,0.943553,0.943493
4,0.0709,0.270165,0.945132,0.945132,0.94518
5,0.0105,0.348525,0.943684,0.943684,0.943725
6,0.0632,0.364148,0.940789,0.940789,0.940779
7,0.0284,0.420189,0.944079,0.944079,0.944078
8,0.0157,0.452722,0.945132,0.945132,0.945095
9,0.0094,0.48652,0.944737,0.944737,0.94477
10,0.0088,0.506271,0.944605,0.944605,0.944661


{'eval_loss': 0.27016469836235046,
 'eval_accuracy': 0.9451315789473684,
 'eval_recall': 0.9451315789473684,
 'eval_f1': 0.9451798325836654,
 'eval_runtime': 20.1004,
 'eval_samples_per_second': 378.103,
 'eval_steps_per_second': 23.631,
 'epoch': 10.0}

In [None]:
import os
os.listdir("agnews_distilbert")

['checkpoint-75000',
 'checkpoint-45000',
 'checkpoint-37500',
 'checkpoint-7500',
 'checkpoint-60000',
 'runs',
 'checkpoint-67500',
 'checkpoint-30000',
 'checkpoint-15000',
 'checkpoint-52500',
 'checkpoint-22500']

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

ckpt = "agnews_distilbert/checkpoint-75000"

model = AutoModelForSequenceClassification.from_pretrained(ckpt)
tokenizer = AutoTokenizer.from_pretrained(ckpt)

In [None]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
from transformers import pipeline
import torch

clf = pipeline(
    "text-classification",
    model=ckpt,
    tokenizer=ckpt,
    device=0 if torch.cuda.is_available() else -1
)

clf("Apple releases a new AI-powered chip for MacBooks")

Device set to use cuda:0


[{'label': 'LABEL_3', 'score': 0.9999936819076538}]

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

checkpoint_path = "agnews_distilbert/checkpoint-75000"

model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)

eval_args = TrainingArguments(
    output_dir="tmp_eval",
    per_device_eval_batch_size=16,
    do_train=False,
    do_eval=True,
    no_cuda=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=eval_args,
)



In [None]:
from datasets import load_dataset
ds = load_dataset("ag_news")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
def tokenize_fn(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

tokenized_test = ds["test"].map(tokenize_fn, batched=True)

In [None]:
def map_labels_to_zero_indexed(example):
    example["label"] = example["label"]
    return example

zero_indexed_eval_dataset = tokenized_test.map(map_labels_to_zero_indexed)

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]