In [1]:
  pip install transformers datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [None]:
# SECURITY: never embed API keys in notebook source. A W&B key was previously
# hard-coded in this cell and must be treated as leaked: revoke it in the W&B
# settings and issue a new one.
import getpass
import json
import os
from pathlib import Path

# Same location the shell version wrote to (~/.kaggle/secrets.json).
_secrets_path = Path.home() / ".kaggle" / "secrets.json"
_secrets_path.parent.mkdir(parents=True, exist_ok=True)

# Prefer the WANDB_API_KEY environment variable; otherwise prompt without echo.
_api_key = os.environ.get("WANDB_API_KEY") or getpass.getpass("W&B API key: ")

# Restrict permissions before the key is written so it is never world-readable.
_secrets_path.touch(mode=0o600, exist_ok=True)
_secrets_path.chmod(0o600)
_secrets_path.write_text(json.dumps({"wandb_api_key": _api_key}))


In [3]:
import json
import os

import wandb

# Load the API key from the Kaggle secrets file. Expanding "~" keeps this in
# sync with the cell that writes ~/.kaggle/secrets.json instead of assuming the
# home directory is /root.
with open(os.path.expanduser("~/.kaggle/secrets.json"), "r") as f:
    secrets = json.load(f)
wandb_api_key = secrets["wandb_api_key"]

# Log in to W&B
wandb.login(key=wandb_api_key)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mtruongminhphuc08102005[0m ([33mtruongminhphuc08102005-hanoi-university-of-science-and-t[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [29]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments
)
from sklearn.metrics import precision_recall_fscore_support

# Seed torch's global RNG before the model is built: the classification head is
# newly initialised (it is not part of the bert-base-uncased checkpoint), so
# without a fixed seed every kernel restart would start from different weights.
SEED = 42
torch.manual_seed(SEED)

# 1. Load the GoEmotions simplified dataset
dataset = load_dataset("google-research-datasets/go_emotions", "simplified")
# The class count is read from the inner feature of the "labels" column.
num_labels = dataset["train"].features["labels"].feature.num_classes

# 2. Load the BERT tokenizer and model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    # Multi-label setup: each example may carry several labels at once, which
    # is why labels are later encoded as multi-hot float vectors.
    problem_type="multi_label_classification",
)

# 3. Preprocessing: tokenize the text and turn each label list into a float multi-hot vector
def preprocess_function(examples):
    """Tokenize a batch of examples and attach multi-hot float labels.

    Args:
        examples: Batch mapping with a "text" entry (the texts to tokenize) and
            a "labels" entry (one list of integer class ids per example).

    Returns:
        The tokenizer output for the batch, with "labels" set to one float
        vector of length ``num_labels`` per example (1.0 at each listed class
        id, 0.0 everywhere else).
    """
    def to_multi_hot(class_ids):
        # Start from all zeros and switch on the slot of every listed class.
        row = [0.0] * num_labels
        for class_id in class_ids:
            row[class_id] = 1.0
        return row

    # Every sequence is truncated/padded to exactly 128 tokens.
    encoded = tokenizer(
        examples["text"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    encoded["labels"] = [to_multi_hot(ids) for ids in examples["labels"]]
    return encoded

# Run the preprocessing in batches. The raw columns are dropped; the "labels"
# column that remains is the multi-hot one returned by preprocess_function.
raw_columns = ["text", "labels", "id"]
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)

# 4. Return torch tensors for the columns used during training.
tensor_columns = ["input_ids", "attention_mask", "labels"]
tokenized_datasets.set_format("torch", columns=tensor_columns)

# 5. Custom data collator that stacks per-example features and forces float labels
def custom_collator(features):
    """Stack a list of per-example feature dicts into one batch of tensors.

    Args:
        features: Non-empty list of dicts that all share the keys of the first
            element. Values may be tensors or anything ``torch.as_tensor``
            accepts (lists, numpy arrays); per key they must have equal shapes.

    Returns:
        Dict mapping each key to a tensor with a new leading batch dimension.
        "labels" is always returned as a float tensor, which the multi-label
        setup used in this notebook relies on; other keys keep their dtype.
    """
    batch = {}
    for key in features[0].keys():
        # Only the labels get a forced dtype; None lets torch keep/infer it.
        dtype = torch.float if key == "labels" else None
        # torch.stack allocates a fresh tensor, so no per-item clone is needed;
        # detach() keeps the batch out of any autograd graph, as before.
        batch[key] = torch.stack(
            [torch.as_tensor(f[key], dtype=dtype).detach() for f in features]
        )
    return batch


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 6. Metrics callback used by the Trainer during evaluation
def compute_metrics(eval_pred):
    """Compute multi-label metrics from raw logits and multi-hot labels.

    Args:
        eval_pred: Pair ``(logits, labels)``; both are array-like with one row
            per sample and one column per class.

    Returns:
        Dict with "accuracy" (fraction of samples whose every class decision
        matches the labels) and micro-averaged "precision", "recall" and "f1".
    """
    raw_scores, gold = eval_pred
    # Sigmoid maps each logit to an independent per-class probability.
    probabilities = torch.tensor(raw_scores).sigmoid().numpy()
    # A class is predicted when its probability exceeds 0.5.
    predicted = (probabilities > 0.5).astype(int)
    # scikit-learn expects integer indicator matrices.
    gold = gold.astype(int)
    prec, rec, f_score, _support = precision_recall_fscore_support(
        gold, predicted, average="micro", zero_division=0
    )
    # Exact match: a sample only counts when all of its classes are right.
    row_is_exact = (predicted == gold).all(axis=1)
    return {
        "accuracy": row_is_exact.mean(),
        "precision": prec,
        "recall": rec,
        "f1": f_score,
    }

# 7. Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # `evaluation_strategy` is the deprecated spelling of this argument;
    # `eval_strategy` is its current name (available since transformers 4.41).
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# 8. Initialize the Trainer with the custom data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `tokenizer=` is deprecated for Trainer.__init__ (it raises a FutureWarning)
    # in favour of `processing_class=`; this requires transformers >= 4.46.
    processing_class=tokenizer,
    data_collator=custom_collator,
    compute_metrics=compute_metrics,
)

# 9. Fine-tune the model
trainer.train()

# 10. Evaluate on the held-out test split. The "test" prefix keeps these metrics
# from being reported under the same "eval_*" names as the per-epoch validation
# metrics.
test_results = trainer.evaluate(tokenized_datasets["test"], metric_key_prefix="test")
print("Test set evaluation results:")
print(test_results)

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
