In [1]:
# Install scikit-learn via the %pip magic.
# NOTE(review): no visible cell imports sklearn directly — presumably it is a
# dependency of the `evaluate` accuracy metric loaded later; confirm before removing.
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\akhil\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
%pip install transformers datasets evaluate accelerate sentencepiece -q
!pip install --upgrade transformers datasets evaluate accelerate -q


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\akhil\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\akhil\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    AutoConfig
)
import evaluate
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Report which accelerator torch can see: "cuda" when a GPU is available,
# otherwise "cpu". The bare name on the last line displays the choice.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [5]:

# Load Dataset
# Each split is assembled from several JSON-lines files; listing them by name
# keeps the load_dataset call itself short.
train_files = [
    "data/train.jsonl",
    "data/train_v2.jsonl",
    "data/train_updated.jsonl",
    "data/idioms.jsonl",
    "data/greetings.jsonl",
]
valid_files = [
    "data/valid.jsonl",
    "data/valid_v2.jsonl",
    "data/idiomsvalid.jsonl",
    "data/valid-greetings.jsonl",
]

print("Loading datasets...")
dataset = load_dataset(
    "json",
    data_files={"train": train_files, "valid": valid_files},
)

# Quick sanity check on how many examples ended up in each split.
print(f"Train size: {len(dataset['train'])}")
print(f"Valid size: {len(dataset['valid'])}")

Loading datasets...


Generating train split: 4374 examples [00:00, 111854.24 examples/s]
Generating valid split: 540 examples [00:00, 17364.39 examples/s]

Train size: 4374
Valid size: 540





In [6]:
# Checkpoint that is loaded for both the tokenizer and the classifier.
BASE_MODEL = "svkapoor/5EmoteModelRoBERTa"

# Define categories
# The integer id of each class is its position in this tuple, so the two
# lookup tables below are derived from a single ordered source.
_label_names = (
    "positive_high_energy",
    "positive_low_energy",
    "negative_high_stress",
    "negative_low_energy",
    "neutral",
)
label2id = {name: index for index, name in enumerate(_label_names)}
id2label = dict(enumerate(_label_names))
num_labels = len(_label_names)

# Tokenize text stream
# The tokenizer is loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)


def preprocess(example):
    """Turn one raw record into model-ready features.

    Args:
        example: a single dataset row; its "text" field is tokenized and its
            "label" field (a class name) is looked up in ``label2id``.

    Returns:
        The tokenizer output for the text, truncated/padded to 128 tokens,
        with an added integer "label" entry.

    Raises:
        KeyError: if the row's label is not one of the keys of ``label2id``.
    """
    features = tokenizer(
        example["text"],
        max_length=128,
        padding="max_length",  # pad every example to the full 128 tokens
        truncation=True,
    )
    class_name = example["label"]
    features["label"] = label2id[class_name]
    return features


print("Tokenizing datasets...")
# batched=False: preprocess receives one row at a time, matching its signature.
tokenized = dataset.map(preprocess, batched=False)

Tokenizing datasets...


Map: 100%|██████████| 4374/4374 [00:00<00:00, 5684.46 examples/s]
Map: 100%|██████████| 540/540 [00:00<00:00, 5001.32 examples/s]


In [7]:
# Initialize Model
# Start from the checkpoint's own configuration and overlay this notebook's
# label vocabulary so predictions are reported with the class names used here.
label_settings = {
    "num_labels": num_labels,
    "label2id": label2id,
    "id2label": id2label,
}
config = AutoConfig.from_pretrained(BASE_MODEL, **label_settings)
# Each example carries exactly one class id, so declare the task as
# single-label classification explicitly.
config.problem_type = "single_label_classification"

# ignore_mismatched_sizes=True lets loading proceed even if a weight in the
# checkpoint has a different shape than this config asks for.
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    ignore_mismatched_sizes=True,
    config=config,
)


In [8]:
# Metrics
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: a pair that unpacks into (logits, labels), as handed over
            by the Trainer during evaluation.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        against the reference labels.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(scores, axis=-1)
    result = metric.compute(predictions=predicted, references=gold)
    return result

In [9]:
# Training Arguments
args = TrainingArguments(
    # Checkpoint location; only the two most recent/best checkpoints are kept.
    output_dir="./journal_model",
    save_total_limit=2,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint
    # with the highest validation accuracy when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Optimisation settings.
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Log the training loss every 50 steps; no external experiment tracker.
    logging_steps=50,
    report_to="none",
)


In [10]:
# Trainer
# Wires the model, hyper-parameters, tokenized splits and metric function together.
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword
# is deprecated in Trainer.__init__ (it emits a FutureWarning and is scheduled
# for removal), and `processing_class` is its documented replacement.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["valid"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [12]:
# Train
print("Starting training...")
# Keep the returned summary in a variable and echo it as the cell's result.
train_output = trainer.train()
train_output

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0445,0.890061,0.887037
2,0.0275,1.018155,0.866667
3,0.047,1.352888,0.82963
4,0.0107,1.143029,0.855556
5,0.0168,1.040326,0.874074
6,0.0342,1.048527,0.872222
7,0.0205,1.044027,0.883333
8,0.0001,1.076054,0.864815
9,0.0083,1.09967,0.87037
10,0.0001,1.115135,0.872222


TrainOutput(global_step=2740, training_loss=0.023414225467518108, metrics={'train_runtime': 804.9378, 'train_samples_per_second': 54.34, 'train_steps_per_second': 3.404, 'total_flos': 2910876010079232.0, 'train_loss': 0.023414225467518108, 'epoch': 10.0})

In [13]:
# Save model
# Write the fine-tuned weights and the tokenizer side by side so the directory
# can be reloaded with `from_pretrained`. A single variable drives both the log
# message and the save calls; previously the message named "./greeting_model"
# while the files were actually written to "finetuned_model".
save_dir = "finetuned_model"
print(f"Saving model to ./{save_dir}...")
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print("Done!")

Saving model to ./greeting_model...
Done!
