In [None]:
!pip install --upgrade transformers datasets evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from datasets import Dataset, DatasetDict, ClassLabel
from google.colab import drive
import os
import evaluate
# Mount Google Drive; the MELD CSVs and the model output directory used below live under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

# ─── Configuration ────────────────────────────────────────────────────────────
# Hub identifier of the pretrained checkpoint (public alias for Facebook's RoBERTa).
MODEL_CHECKPOINT = "roberta-base"

# Emotion class names; NUM_LABELS is derived from this list so the two cannot drift apart.
EMOTIONS = [
    "anger",
    "disgust",
    "fear",
    "joy",
    "neutral",
    "sadness",
    "surprise",
]
NUM_LABELS = len(EMOTIONS)

# Tokenisation and optimisation hyperparameters.
MAX_LEN = 128      # maximum number of tokens per example
BATCH_SIZE = 16    # per-device batch size for both training and evaluation
EPOCHS = 3         # number of training epochs
LR = 2e-5          # learning rate

# ─── 1) Load CSVs into pandas ─────────────────────────────────────────────────
# One DataFrame per split, read in train / dev / test order.
df_train, df_dev, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/MELD/train_with_context.csv",
        "/content/drive/MyDrive/MELD/dev_with_context.csv",
        "/content/drive/MyDrive/MELD/test_with_context.csv",
    )
)

# ─── 2) Convert pandas DataFrames to Hugging Face Datasets ────────────────────
# preserve_index=False stops the pandas index from being carried over as an
# extra dataset column. The previous clean-up loop was a no-op: it only rebound
# its loop variable (`ds = ds.remove_columns(...)`) and therefore never changed
# train_ds / dev_ds / test_ds; it also looked for a "_pandas_index" column,
# whereas a preserved index is stored as "__index_level_0__".
train_ds = Dataset.from_pandas(df_train, preserve_index=False)
dev_ds   = Dataset.from_pandas(df_dev, preserve_index=False)
test_ds  = Dataset.from_pandas(df_test, preserve_index=False)

# Group the three splits under the names used by the rest of the notebook.
dataset = DatasetDict({"train": train_ds, "validation": dev_ds, "test": test_ds})

# ─── 3) Encode labels as ClassLabel ───────────────────────────────────────────
# ClassLabel built from EMOTIONS provides the name -> integer id lookup.
label_feature = ClassLabel(names=EMOTIONS)


def add_labels(example):
    """Add an integer ``labels`` field to one example.

    The value is the id that ``label_feature`` assigns to the string stored
    in the example's ``Emotion`` field. The same example is returned after
    being updated.
    """
    emotion_name = example["Emotion"]
    example["labels"] = label_feature.str2int(emotion_name)
    return example


# Applied one example at a time (non-batched map) to every split.
dataset = dataset.map(add_labels)

# ─── 4) Tokenize bert_input column ────────────────────────────────────────────
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)


def tokenize_batch(examples):
    """Tokenize the ``bert_input`` texts of a batch.

    Sequences are truncated to ``MAX_LEN`` tokens and padded to exactly that
    length (``padding="max_length"``). Returns the tokenizer output unchanged.
    """
    texts = examples["bert_input"]
    encoded = tokenizer(
        texts,
        max_length=MAX_LEN,
        truncation=True,
        padding="max_length",
    )
    return encoded


# batched=True hands the function a batch of examples per call.
dataset = dataset.map(tokenize_batch, batched=True)

# ─── 5) Prepare for PyTorch ───────────────────────────────────────────────────
# Request torch-formatted output for the token ids, attention mask and labels.
model_input_columns = ["input_ids", "attention_mask", "labels"]
dataset.set_format(type="torch", columns=model_input_columns)

# ─── 6) Load the RoBERTa sequence classification model ────────────────────────
# Record the id <-> emotion-name mapping in the model config so the saved
# checkpoint reports emotion names instead of generic "LABEL_i" placeholders.
# Ids follow the position of each name in EMOTIONS, which is also the list
# passed to ClassLabel(names=EMOTIONS) for encoding the training labels.
ID2LABEL = dict(enumerate(EMOTIONS))
LABEL2ID = {name: idx for idx, name in ID2LABEL.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS,
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)

# ─── 7) Define evaluation metric ──────────────────────────────────────────────
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` pair with the loaded accuracy metric.

    The predicted class for each row is the index of the largest logit along
    the last axis. Returns whatever ``metric.compute`` returns.
    """
    logits, labels = eval_pred
    predicted_ids = logits.argmax(axis=-1)
    scores = metric.compute(predictions=predicted_ids, references=labels)
    return scores

# ─── 8) Set up training arguments ──────────────────────────────────────────────
# NOTE(review): report_to is not set here, and the recorded output of this cell
# shows an interactive W&B login prompt during training. If unattended runs
# are needed, consider setting it explicitly — confirm the intended logging backend.
training_args = TrainingArguments(
    # Where checkpoints are written (on the mounted Drive).
    output_dir="/content/drive/MyDrive/MELD/teacher_roberta_erc",
    # Optimisation settings, taken from the configuration constants.
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluation and checkpointing use the same "epoch" schedule, which
    # load_best_model_at_end requires in order to pick a checkpoint.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# ─── 9) Initialize Trainer and fine-tune ─────────────────────────────────────
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer is deprecated, and because this notebook installs transformers with
# --upgrade, a fresh run may get a release that no longer accepts it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# Save the final model into the directory configured as output_dir, reusing
# the value from training_args rather than repeating the path literal.
trainer.save_model(training_args.output_dir)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Map:   0%|          | 0/9988 [00:00<?, ? examples/s]

Map:   0%|          | 0/1108 [00:00<?, ? examples/s]

Map:   0%|          | 0/2610 [00:00<?, ? examples/s]

Map:   0%|          | 0/9988 [00:00<?, ? examples/s]

Map:   0%|          | 0/1108 [00:00<?, ? examples/s]

Map:   0%|          | 0/2610 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkonthodores[0m ([33mkonthodores-university-of-patras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2446,1.201751,0.612816
2,1.0302,1.13927,0.629061
3,0.901,1.133685,0.620036


In [None]:
import inspect
from transformers import TrainingArguments

# Introspection aid: shows which keyword arguments the installed
# transformers version accepts for TrainingArguments.

# Print the constructor signature
init_signature = inspect.signature(TrainingArguments.__init__)
print(init_signature)

# Full help text for the class
help(TrainingArguments)


Help on class TrainingArguments in module transformers.training_args:

class TrainingArguments(builtins.object)
 |  
 |  TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop
 |  itself**.
 |  
 |  Using [`HfArgumentParser`] we can turn this class into
 |  [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
 |  command line.
 |  
 |  Parameters:
 |      output_dir (`str`, *optional*, defaults to `"trainer_output"`):
 |          The output directory where the model predictions and checkpoints will be written.
 |      overwrite_output_dir (`bool`, *optional*, defaults to `False`):
 |          If `True`, overwrite the content of the output directory. Use this to continue training if `output_dir`
 |          points to a checkpoint directory.
 |      do_train (`bool`, *optional*, defaults to `False`):
 |          Whether to run training or not. This argument is not directly