In [1]:
import torch

# Pick the compute device once: CUDA when a GPU is visible, otherwise the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

# Report what was selected so the rest of the notebook's timings make sense.
if not use_gpu:
    print("No GPU available, using CPU instead.")
else:
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU device count: {torch.cuda.device_count()}")

Using GPU: Tesla T4
GPU device count: 2


In [6]:
import os
from transformers import AutoTokenizer

# Checkpoint identifier; the same name is reused later when loading the model.
model_name = "tasinhoque/roberta-large-go-emotions"

# Set before the tokenizer is created so the setting is in place on first use.
os.environ["TOKENIZERS_PARALLELISM"] = "FALSE"

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
from datasets import load_dataset

# Load the "simplified" configuration of GoEmotions (train/validation/test splits).
dataset = load_dataset("go_emotions", "simplified")

# The number of emotion classes is read from the label feature's name list.
label_feature = dataset["train"].features["labels"].feature
n_emotion = len(label_feature.names)

Downloading builder script:   0%|          | 0.00/2.02k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

Downloading and preparing dataset go_emotions/simplified (download: 4.19 MiB, generated: 5.03 MiB, post-processed: Unknown size, total: 9.22 MiB) to /root/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d...


Downloading data:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/203k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/201k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

Dataset go_emotions downloaded and prepared to /root/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
import numpy as np


def tokenize(batch):
    """Tokenize the ``"text"`` column of a batch with the notebook's tokenizer.

    Every sequence is truncated and padded to exactly 64 tokens, so all
    encoded rows have the same length.

    Args:
        batch: Mapping with a ``"text"`` entry holding the raw strings.

    Returns:
        Whatever the tokenizer call returns for the batch of texts.
    """
    texts = batch["text"]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=64)


def one_hot_encode(batch, num_labels=None):
    """Convert per-example label-id lists into multi-hot row vectors.

    Args:
        batch: Mapping with a ``"labels"`` entry; each element is the
            collection of integer class ids assigned to one example.
        num_labels: Width of the encoded vectors. When omitted, the
            notebook-level ``n_emotion`` is used, so existing
            ``dataset.map(one_hot_encode, ...)`` calls keep working.

    Returns:
        ``{"one_hot_labels": array}`` where ``array`` is a float NumPy array
        of shape ``(len(batch["labels"]), num_labels)`` containing 1 at every
        assigned class id and 0 elsewhere.
    """
    if num_labels is None:
        # Fall back to the global only at call time, so the function can also
        # be used (and tested) without the dataset cell having been run.
        num_labels = n_emotion

    label_lists = batch["labels"]
    one_hot_labels = np.zeros((len(label_lists), num_labels))

    # Iterating a 2-D array yields row views, so the assignment writes in place.
    for row, label_ids in zip(one_hot_labels, label_lists):
        row[label_ids] = 1

    return {"one_hot_labels": one_hot_labels}

# Add the multi-hot label column first, then the tokenizer outputs, processing
# 5000 rows per batch in both passes.
dataset_encoded = (
    dataset
    .map(one_hot_encode, batched=True, batch_size=5000)
    .map(tokenize, batched=True, batch_size=5000)
)

# Only these three columns are formatted as torch tensors.
tensor_columns = ["input_ids", "attention_mask", "one_hot_labels"]
dataset_encoded.set_format("torch", columns=tensor_columns)
dataset_encoded

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id', 'one_hot_labels', 'input_ids', 'attention_mask'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id', 'one_hot_labels', 'input_ids', 'attention_mask'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id', 'one_hot_labels', 'input_ids', 'attention_mask'],
        num_rows: 5427
    })
})

In [11]:
def clean_dataset(ds):
    """Drop the raw columns and promote the multi-hot labels to ``"labels"``.

    Removes ``"id"``, the original ``"labels"`` and ``"text"``, then renames
    ``"one_hot_labels"`` to ``"labels"``.

    Args:
        ds: Object exposing ``remove_columns`` and ``rename_column``.

    Returns:
        The result of the rename call on the column-reduced dataset.
    """
    trimmed = ds.remove_columns(["id", 'labels', 'text'])
    return trimmed.rename_column("one_hot_labels", "labels")

# Apply the same clean-up to each split, in train / validation / test order.
train_clean, validation_clean, test_clean = (
    clean_dataset(dataset_encoded[split])
    for split in ("train", "validation", "test")
)

In [12]:
from transformers import AutoModelForSequenceClassification

# Load the fine-tuned checkpoint with one output per emotion, configured as a
# multi-label classifier.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    problem_type="multi_label_classification",
    num_labels=n_emotion,
)

# Place the weights on the device selected at the top of the notebook.
model = model.to(device)

Downloading:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

In [13]:
import torch
import torch.nn.functional as F


def forward_pass(batch):
    """Run the model on one batch and return per-class scores.

    Only the entries of ``batch`` whose keys are listed in
    ``tokenizer.model_input_names`` are forwarded to the model; everything
    else (e.g. the label column) is ignored.

    Args:
        batch: Mapping from column name to a tensor for the current batch.

    Returns:
        ``{"probabilities": array}`` where ``array`` is a NumPy array holding
        the softmax of the model logits along dimension 1.
    """
    # Follow the model's actual device instead of hard-coding "cuda", so the
    # function also works when the notebook fell back to CPU.
    target_device = next(model.parameters()).device

    inputs = {
        name: tensor.to(target_device)
        for name, tensor in batch.items()
        if name in tokenizer.model_input_names
    }

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits
        # NOTE(review): softmax makes each row sum to 1. The model is loaded
        # with problem_type="multi_label_classification", for which a
        # per-label sigmoid is the usual choice -- confirm this is intended.
        # Left unchanged because the downstream threshold may have been chosen
        # against softmax-scaled scores.
        probabilities = F.softmax(logits, dim=1).cpu().numpy()

    return {"probabilities": probabilities}

In [14]:
# Scores above this value are counted as a positive prediction for that class.
threshold = 0.15

chosen_dataset = test_clean
labels = chosen_dataset["labels"]

# Score every row, then pull the per-class scores out as a NumPy array.
scored_dataset = chosen_dataset.map(forward_pass, batched=True)
preds = scored_dataset["probabilities"].numpy()

# Binarise in place; the order matters only in that values already set to 1
# must stay above the threshold, which holds for threshold = 0.15.
preds[preds > threshold] = 1
preds[preds <= threshold] = 0
preds

  0%|          | 0/6 [00:00<?, ?ba/s]

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

In [15]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# For multi-label indicator inputs, accuracy_score is the exact-match (subset)
# accuracy: a row counts as correct only if every label matches.
result = {"accuracy": accuracy_score(labels, preds)}

# Macro-average over classes. zero_division=0 scores an undefined per-class
# metric (e.g. a class that is never predicted) as 0 -- the same value the
# default produces -- without emitting the undefined-metric warning. The
# trailing support entry is not needed, so only the first three values are kept.
precision, recall, f1 = precision_recall_fscore_support(
    labels, preds, average="macro", zero_division=0
)[:3]
result["precision"] = precision
result["recall"] = recall
result["f1"] = f1

result

  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.4271236410539893,
 'precision': 0.5101494353184485,
 'recall': 0.5763722014150806,
 'f1': 0.5297380709491947}