In [1]:
!pip install optuna
!pip install datasets

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.1-py3-none-any.whl.metadata (7.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.1-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.8/231.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Mak

In [5]:
# Import the required libraries and frameworks

import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import numpy as np
from datasets import Dataset

In [3]:
# Read the tone dataset, then shuffle all rows with a fixed seed so the
# row order (and the positional split made from it) is reproducible.
raw_df = pd.read_csv("/content/tone_dataset.csv")
shuffled_df = raw_df.sample(frac=1, random_state=42)
df = shuffled_df.reset_index(drop=True)
df.head(n=5)

Unnamed: 0,original_email,Tone,passive-aggressiv
0,Your absence from the meeting was noted.,negative,"Your absence from the meeting was noted, not t..."
1,Thank you for your valuable input on this proj...,positive,"Thank you, I guess, for your 'valuable' input ..."
2,You didn’t prepare adequately for the discussion.,negative,You didn’t prepare adequately for the discussi...
3,Please confirm your attendance.,neutral,"Please confirm your attendance, if it's not to..."
4,It's frustrating to have to remind you about d...,negative,It's frustrating to have to remind you about d...


In [8]:

# Wrap the shuffled DataFrame in a Hugging Face Dataset so it can be mapped over.
tone_dataset = Dataset.from_pandas(df=df)

# Tokenizer: the uncased BERT vocabulary, loaded by checkpoint name.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)
def tokenize_function(examples):
    """Tokenize the ``original_email`` field of a batch of examples.

    Uses the module-level ``tokenizer``. Sequences are truncated and padded
    to a fixed ``max_length`` of 128.
    """
    emails = examples['original_email']
    encoded = tokenizer(
        emails,
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    return encoded
# Run the tokenizer over the whole dataset in batches.
tokenized_dataset = tone_dataset.map(function=tokenize_function, batched=True)

# Map tones to integer class labels.
tone_map = dict(negative=0, neutral=1, positive=2)


def _attach_label(example):
    """Return the integer ``labels`` value for one example's ``Tone`` string."""
    return {"labels": tone_map[example["Tone"]]}


tokenized_dataset = tokenized_dataset.map(_attach_label)

# Split dataset (80% train, 20% eval) with a positional cut:
# the first train_size rows go to training, the remainder to evaluation.
n_examples = len(tokenized_dataset)
train_size = int(0.8 * n_examples)
train_indices = range(0, train_size)
eval_indices = range(train_size, n_examples)
train_dataset = tokenized_dataset.select(train_indices)
eval_dataset = tokenized_dataset.select(eval_indices)

# Load model
# The classifier size and the id <-> tone-name mappings are derived from
# tone_map so they cannot drift apart, and so the saved config carries
# readable class names instead of the generic LABEL_0/1/2.
id2label = {label_id: tone for tone, label_id in tone_map.items()}
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(tone_map),
    id2label=id2label,
    label2id=dict(tone_map),
)

# Training arguments
# Evaluate and checkpoint once per epoch; log the training loss every 10 steps.
# `eval_strategy` is the current name of the former `evaluation_strategy`
# argument (renamed in transformers 4.41; the old spelling is deprecated).
training_args = TrainingArguments(
    output_dir="./bert_tone_results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    report_to=[],  # empty list: no external experiment trackers
)

from sklearn.metrics import precision_recall_fscore_support
# Define compute metrics
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``. ``logits`` holds per-class
            scores along its last axis; ``labels`` holds the true class ids.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``,
        each a plain Python float.
    """
    logits, labels = eval_pred
    labels = np.asarray(labels)
    # Predicted class = index of the highest score on the last axis.
    predictions = np.argmax(logits, axis=-1)
    # zero_division=0 scores a class with no predictions/labels as 0.0 (the
    # same value the default produces) without emitting UndefinedMetricWarning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )
    accuracy = (predictions == labels).mean()
    return {
        "accuracy": float(accuracy),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
    }


# Trainer: bundles the model, its training arguments, both data splits
# and the metric function used at evaluation time.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

# Fine-tune the classifier on the training split.
trainer.train()

# Save the model weights and the tokenizer files side by side in one directory.
save_dir = "fine_tuned_bert_tone"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0382,0.634479,0.833333,0.872593,0.833333,0.82006
2,0.4437,0.33538,0.966667,0.969444,0.966667,0.966598
3,0.2601,0.22643,0.933333,0.94359,0.933333,0.934206


('fine_tuned_bert_tone/tokenizer_config.json',
 'fine_tuned_bert_tone/special_tokens_map.json',
 'fine_tuned_bert_tone/vocab.txt',
 'fine_tuned_bert_tone/added_tokens.json')

In [9]:
# Display the evaluation split's summary (its columns and row count).
eval_dataset

Dataset({
    features: ['original_email', 'Tone', 'passive-aggressiv', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 30
})

In [10]:
# 9. Evaluate on the held-out split
# NOTE: eval_dataset is the same 20% split the Trainer already scored after
# every epoch, so these are validation metrics, not results on an untouched
# test set. The variable name is kept for any later cells that read it.
test_results = trainer.evaluate(eval_dataset)
print("Validation set results:", test_results)

Test set results: {'eval_loss': 0.22643016278743744, 'eval_accuracy': 0.9333333333333333, 'eval_precision': 0.9435897435897436, 'eval_recall': 0.9333333333333333, 'eval_f1': 0.934206349206349, 'eval_runtime': 15.6094, 'eval_samples_per_second': 1.922, 'eval_steps_per_second': 0.256, 'epoch': 3.0}
