In [1]:
%pip install transformers datasets evaluate accelerate sentencepiece -q
!pip install --upgrade transformers datasets evaluate accelerate -q


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\akhil\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\akhil\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    AutoConfig
)
import evaluate
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [10]:

# 2. Load Dataset (V1 + V2 combined)
print("Loading datasets...")
dataset = load_dataset(
"json",
    data_files={
    "train": ["train.jsonl","train_v2.jsonl", "train_updated.jsonl", "idioms.jsonl"],
    "valid": ["valid.jsonl", "valid_v2.jsonl", "idiomsvalid.jsonl"]
}
)
print(f"Train size: {len(dataset['train'])}")
print(f"Valid size: {len(dataset['valid'])}")

Loading datasets...


Generating train split: 4135 examples [00:00, 165405.68 examples/s]
Generating valid split: 490 examples [00:00, 27939.60 examples/s]

Train size: 4135
Valid size: 490





In [11]:
BASE_MODEL = "svkapoor/emotion_model_RoBERTa"
    
label2id = {
    "positive_high_energy": 0,
    "positive_low_energy": 1,
    "negative_high_stress": 2,
    "negative_low_energy": 3,
    "neutral": 4
}
id2label = {v: k for k, v in label2id.items()}
num_labels = len(label2id)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

def preprocess(example):
    enc = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )
    enc["label"] = label2id[example["label"]]
    return enc

In [12]:
print("Tokenizing datasets...")
tokenized = dataset.map(preprocess, batched=False)


Tokenizing datasets...


Map:   0%|          | 0/4135 [00:00<?, ? examples/s]

Map: 100%|██████████| 4135/4135 [00:00<00:00, 5175.41 examples/s]
Map: 100%|██████████| 490/490 [00:00<00:00, 4409.53 examples/s]


In [13]:
# 4. Initialize Model
config = AutoConfig.from_pretrained(
    BASE_MODEL,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)
config.problem_type = "single_label_classification"

model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    config=config,
    ignore_mismatched_sizes=True
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at svkapoor/emotion_model_RoBERTa and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([6]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([6, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# 5. Metrics
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds, references=labels)

In [15]:
# 6. Training Arguments (Optimized)
args = TrainingArguments(
    output_dir="./journal_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    learning_rate=2e-5,              # Optimal for RoBERTa
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,             # Increased since we use early stopping
    weight_decay=0.01,
    load_best_model_at_end=True,     # Load best model, not last
    metric_for_best_model="accuracy",
    save_total_limit=2,              # Save space
    report_to="none"
)


In [16]:
# 7. Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [17]:
print("Starting training...")
trainer.train()

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3276,0.423866,0.867347
2,0.1878,0.8593,0.814286
3,0.1201,0.69586,0.861224
4,0.0714,0.767028,0.859184
5,0.0628,0.932795,0.84898
6,0.0473,0.838488,0.863265
7,0.0023,0.967283,0.861224
8,0.0203,0.878758,0.879592
9,0.0018,0.970727,0.869388
10,0.0008,0.943494,0.867347


TrainOutput(global_step=2590, training_loss=0.09288183510893215, metrics={'train_runtime': 765.9592, 'train_samples_per_second': 53.985, 'train_steps_per_second': 3.381, 'total_flos': 2719983797721600.0, 'train_loss': 0.09288183510893215, 'epoch': 10.0})

In [18]:
print("Saving model to ./4emotion_model...")
model.save_pretrained("4emotion_model")
tokenizer.save_pretrained("4emotion_model")
print("Done!")

Saving model to ./4emotion_model...
Done!
