In [1]:
%pip install transformers datasets evaluate accelerate sentencepiece -q
!pip install --upgrade transformers datasets evaluate accelerate -q


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\akhil\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\akhil\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    AutoConfig
)
import evaluate
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [4]:

# 2. Load Dataset (V1 + V2 combined)
print("Loading datasets...")
dataset = load_dataset(
"json",
    data_files={
    "train": ["train_updated.jsonl","train_v2.jsonl"],
    "valid": ["idiomsvalid.jsonl", "valid.jsonl", "valid_v2.jsonl"]
}
)
print(f"Train size: {len(dataset['train'])}")
print(f"Valid size: {len(dataset['valid'])}")

Loading datasets...


Generating train split: 2862 examples [00:00, 183945.48 examples/s]
Generating valid split: 490 examples [00:00, 23245.55 examples/s]

Train size: 2862
Valid size: 490





In [5]:
BASE_MODEL = "svkapoor/emotion_model_RoBERTa"
    
label2id = {
    "positive_high_energy": 0,
    "positive_low_energy": 1,
    "negative_high_stress": 2,
    "negative_low_energy": 3,
    "anxious": 4,
    "neutral": 5
}
id2label = {v: k for k, v in label2id.items()}
num_labels = len(label2id)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

def preprocess(example):
    enc = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )
    enc["label"] = label2id[example["label"]]
    return enc

In [6]:
print("Tokenizing datasets...")
tokenized = dataset.map(preprocess, batched=False)


Tokenizing datasets...


Map: 100%|██████████| 2862/2862 [00:00<00:00, 5210.51 examples/s]
Map: 100%|██████████| 490/490 [00:00<00:00, 3602.85 examples/s]


In [7]:
# 4. Initialize Model
config = AutoConfig.from_pretrained(
    BASE_MODEL,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)
config.problem_type = "single_label_classification"

model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    config=config,
    ignore_mismatched_sizes=True
)


In [8]:
# 5. Metrics
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds, references=labels)

In [9]:
# 6. Training Arguments (Optimized)
args = TrainingArguments(
    output_dir="./journal_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    learning_rate=2e-5,              # Optimal for RoBERTa
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,             # Increased since we use early stopping
    weight_decay=0.01,
    load_best_model_at_end=True,     # Load best model, not last
    metric_for_best_model="accuracy",
    save_total_limit=2,              # Save space
    report_to="none"
)


In [10]:
# 7. Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [11]:
print("Starting training...")
trainer.train()

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2934,0.876914,0.783673
2,0.1959,0.737161,0.826531
3,0.1793,1.315118,0.789796
4,0.1128,0.994894,0.85102
5,0.0393,1.246568,0.810204
6,0.0087,1.410012,0.797959
7,0.0151,1.350893,0.820408
8,0.0007,1.525812,0.806122
9,0.0014,1.480514,0.804082
10,0.0102,1.477537,0.810204


TrainOutput(global_step=1790, training_loss=0.09439540356137482, metrics={'train_runtime': 582.987, 'train_samples_per_second': 49.092, 'train_steps_per_second': 3.07, 'total_flos': 1882627212072960.0, 'train_loss': 0.09439540356137482, 'epoch': 10.0})

In [13]:
print("Saving model to ./idiom_emotion_model...")
model.save_pretrained("idiom_emotion_model")
tokenizer.save_pretrained("idiom_emotion_model")
print("Done!")

Saving model to ./idiom_emotion_model...
Done!
