In [1]:
%pip install transformers datasets evaluate accelerate sentencepiece -q
!pip install --upgrade transformers datasets evaluate accelerate -q


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\akhil\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\akhil\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline
)
import evaluate
import numpy as np
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
dataset = load_dataset(
    "json",
    data_files={
        "train": "train_v2.jsonl",
        "valid": "valid_v2.jsonl"
    }
)
dataset

Generating train split: 1800 examples [00:00, 112526.60 examples/s]
Generating valid split: 96 examples [00:00, 10669.42 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1800
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 96
    })
})

In [6]:
BASE_MODEL = "SamLowe/roberta-base-go_emotions"

label2id = {
    "positive_high_energy": 0,
    "positive_low_energy": 1,
    "negative_high_stress": 2,
    "negative_low_energy": 3,
    "anxious": 4,
    "neutral": 5
}
id2label = {v: k for k, v in label2id.items()}
num_labels = len(label2id)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)


In [7]:
def preprocess(example):
    enc = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )
    enc["label"] = label2id[example["label"]]
    return enc


In [8]:
tokenized = dataset.map(preprocess, batched=False)

Map: 100%|██████████| 1800/1800 [00:00<00:00, 4598.68 examples/s]
Map: 100%|██████████| 96/96 [00:00<00:00, 2836.28 examples/s]


In [9]:
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    BASE_MODEL,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

config.problem_type = "single_label_classification"

model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    config=config,
    ignore_mismatched_sizes=True
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at SamLowe/roberta-base-go_emotions and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([28]) in the checkpoint and torch.Size([6]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([28, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
args = TrainingArguments(
    output_dir="./journal_model",
    eval_strategy="epoch",     # old transformers syntax
    save_strategy="epoch",
    logging_steps=50,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)


In [11]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds, references=labels)


In [12]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0808,0.633497,0.84375
2,0.0056,0.710274,0.864583
3,0.0033,0.754976,0.864583
4,0.0023,0.777758,0.875
5,0.0021,0.785981,0.875


TrainOutput(global_step=565, training_loss=0.10205989637931363, metrics={'train_runtime': 168.2057, 'train_samples_per_second': 53.506, 'train_steps_per_second': 3.359, 'total_flos': 592021135872000.0, 'train_loss': 0.10205989637931363, 'epoch': 5.0})

In [17]:
model.save_pretrained("emotion_model")
tokenizer.save_pretrained("emotion_model")


('emotion_model\\tokenizer_config.json',
 'emotion_model\\special_tokens_map.json',
 'emotion_model\\vocab.json',
 'emotion_model\\merges.txt',
 'emotion_model\\added_tokens.json',
 'emotion_model\\tokenizer.json')

In [18]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("emotion_model")
tokenizer = AutoTokenizer.from_pretrained("emotion_model")
model.eval()


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         