In [1]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")  
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU device count: {torch.cuda.device_count()}")
else:
    print("No GPU available, using CPU instead.")
    device = torch.device("cpu")

Using GPU: Tesla T4
GPU device count: 2


In [2]:
import os
from transformers import AutoTokenizer

os.environ["TOKENIZERS_PARALLELISM"] = "FALSE"

model_name = "roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/381 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.01M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [3]:
from datasets import load_dataset

dataset = load_dataset("go_emotions", "simplified").shuffle(seed=42)
emotions = dataset["train"].features["labels"].feature.names
n_emotion = len(emotions)
id2label = {i: emotion for i, emotion in enumerate(emotions)}
label2id = {emotion: i for i, emotion in enumerate(emotions)}

Downloading builder script:   0%|          | 0.00/2.02k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

Downloading and preparing dataset go_emotions/simplified (download: 4.19 MiB, generated: 5.03 MiB, post-processed: Unknown size, total: 9.22 MiB) to /root/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d...


Downloading data:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/203k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/201k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

Dataset go_emotions downloaded and prepared to /root/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
import numpy as np


def tokenize(batch):
    return tokenizer(
        batch["text"], padding="max_length", truncation=True, max_length=64
    )


def one_hot_encode(batch):
    one_hot_labels = np.zeros((len(batch["labels"]), n_emotion))

    for i, label in enumerate(batch["labels"]):
        one_hot_labels[i][label] = 1

    return {"one_hot_labels": one_hot_labels}

dataset_encoded = dataset.map(one_hot_encode, batched=True, batch_size=5000)
dataset_encoded = dataset_encoded.map(tokenize, batched=True, batch_size=5000)
dataset_encoded.set_format(
    "torch", columns=["input_ids", "attention_mask", "one_hot_labels"]
)
dataset_encoded

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id', 'one_hot_labels', 'input_ids', 'attention_mask'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id', 'one_hot_labels', 'input_ids', 'attention_mask'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id', 'one_hot_labels', 'input_ids', 'attention_mask'],
        num_rows: 5427
    })
})

In [5]:
def clean_dataset(ds): 
    return (
        ds.remove_columns(["id", 'labels', 'text'])
        .rename_column("one_hot_labels", "labels")
    )

train_clean = clean_dataset(dataset_encoded["train"])
validation_clean = clean_dataset(dataset_encoded["validation"])
test_clean = clean_dataset(dataset_encoded["test"])

In [6]:
from transformers import AutoModelForSequenceClassification, AutoConfig

config = AutoConfig.from_pretrained(
    model_name, 
    num_labels=n_emotion, 
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id,
)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    config=config
).to(device)

Downloading:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

In [7]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch.nn.functional as F

threshold = 0.15


def compute_metrics(pred):
    labels = pred.label_ids    
    preds = torch.tensor(pred.predictions)
    preds = F.softmax(preds, dim=1)
    preds[preds > threshold] = 1
    preds[preds <= threshold] = 0

    result = {"accuracy": accuracy_score(labels, preds)}
    (
        result["precision"],
        result["recall"],
        result["f1"],
        _,
    ) = precision_recall_fscore_support(labels, preds, average="macro")
    
    return result

In [8]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
from transformers import Trainer, TrainingArguments

batch_size = 64
logging_steps = len(train_clean) // batch_size
training_args = TrainingArguments(
    output_dir="roberta-large-go-emotions",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    optim="adamw_torch",
    disable_tqdm=False,
    push_to_hub=True,
    learning_rate=5e-5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=logging_steps,
    log_level="error",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

trainer = Trainer(
    model=model, 
    args=training_args, 
    train_dataset=train_clean,
    eval_dataset=validation_clean,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

Cloning https://huggingface.co/tasinhoque/roberta-large-go-emotions-3 into local empty directory.


In [10]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.086199,0.42923,0.501185,0.555821,0.520822
2,0.059700,0.092388,0.432916,0.516358,0.536239,0.515113
3,0.059700,0.095626,0.444526,0.524148,0.53278,0.516055


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=1020, training_loss=0.053331214306401274, metrics={'train_runtime': 2931.9466, 'train_samples_per_second': 44.418, 'train_steps_per_second': 0.348, 'total_flos': 1.517201040250368e+16, 'train_loss': 0.053331214306401274, 'epoch': 3.0})

In [11]:
test_result = trainer.predict(test_clean).metrics
valid_result = trainer.predict(validation_clean).metrics
print(test_result)
valid_result



  _warn_prf(average, modifier, msg_start, len(result))


{'test_loss': 0.0844530463218689, 'test_accuracy': 0.4363368343467846, 'test_precision': 0.49554689305921557, 'test_recall': 0.5654775888506688, 'test_f1': 0.5204270499531086, 'test_runtime': 37.7808, 'test_samples_per_second': 143.644, 'test_steps_per_second': 1.138}


  _warn_prf(average, modifier, msg_start, len(result))


{'test_loss': 0.08619901537895203,
 'test_accuracy': 0.4292296350903059,
 'test_precision': 0.5011850995128874,
 'test_recall': 0.5558214808209453,
 'test_f1': 0.5208218686505952,
 'test_runtime': 36.9716,
 'test_samples_per_second': 146.761,
 'test_steps_per_second': 1.163}

In [12]:
trainer.push_to_hub(commit_message="Training completed!")

Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 32.0k/1.32G [00:00<?, ?B/s]

Upload file runs/Feb23_11-13-07_92b39eb7c599/events.out.tfevents.1677150812.92b39eb7c599.23.0: 100%|##########…

remote: Scanning LFS files for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/tasinhoque/roberta-large-go-emotions-3
   2f1d1b3..6166c3e  main -> main

To https://huggingface.co/tasinhoque/roberta-large-go-emotions-3
   6166c3e..ea3aa2c  main -> main



'https://huggingface.co/tasinhoque/roberta-large-go-emotions-3/commit/6166c3e35a04eed4e05543c9a62b2bb942344ea2'