In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.1


In [3]:
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset
import torch
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, GPT2Model
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from bayes_opt import BayesianOptimization
import random
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict


2024-08-11 22:23:40.304616: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-11 22:23:40.304752: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-11 22:23:40.438636: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# set the random seed
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

In [5]:
# Load the dataset
dataset = load_dataset("dair-ai/emotion", "split")

# Load each split
train_dataset = dataset['train']
val_dataset = dataset['validation']
test_dataset = dataset['test']

def stratified_sample(dataset, fraction):
    label_counts = defaultdict(list)
    
    # Group indices by label
    for i, example in enumerate(dataset):
        label_counts[example['label']].append(i)
    
    sampled_indices = []
    for label, indices in label_counts.items():
        # Calculate the number of samples to draw for each label
        sample_size = int(len(indices) * fraction)
        sampled_indices.extend(random.sample(indices, min(sample_size, len(indices))))
    
    return dataset.select(sampled_indices)

# Calculate the fraction size (one fifth) for each dataset
fraction = 1 / 2

# Perform stratified sampling
sampled_train_dataset = stratified_sample(train_dataset, fraction)
sampled_val_dataset = stratified_sample(val_dataset, fraction)
sampled_test_dataset = stratified_sample(test_dataset, fraction)


Downloading readme:   0%|          | 0.00/9.05k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/127k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/129k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [6]:
# Print the sizes of the sampled datasets
print(f"Sampled training set size: {len(sampled_train_dataset)}")
print(f"Sampled validation set size: {len(sampled_val_dataset)}")
print(f"Sampled test set size: {len(sampled_test_dataset)}")

# Check the label distribution in the sampled datasets
def print_label_distribution(dataset, name):
    label_counts = defaultdict(int)
    for example in dataset:
        label_counts[example['label']] += 1
    print(f"Label distribution in {name}: {dict(label_counts)}")

print_label_distribution(sampled_train_dataset, "sampled training set")
print_label_distribution(sampled_val_dataset, "sampled validation set")
print_label_distribution(sampled_test_dataset, "sampled test set")

Sampled training set size: 7999
Sampled validation set size: 999
Sampled test set size: 998
Label distribution in sampled training set: {0: 2333, 3: 1079, 2: 652, 5: 286, 4: 968, 1: 2681}
Label distribution in sampled validation set: {0: 275, 2: 89, 3: 137, 1: 352, 4: 106, 5: 40}
Label distribution in sampled test set: {0: 290, 1: 347, 4: 112, 3: 137, 2: 79, 5: 33}


In [7]:
# Load GPT2Tokenizer and Model
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')



# Add pad_token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = GPT2ForSequenceClassification.from_pretrained('distilgpt2', num_labels=6)

# Resize token embeddings to match the tokenizer length
model.resize_token_embeddings(len(tokenizer))

# Set the padding token ID in the model configuration
model.config.pad_token_id = tokenizer.pad_token_id

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=512)

train_data1 = sampled_train_dataset.map(tokenize_function, batched=True)
val_data1 = sampled_val_dataset.map(tokenize_function, batched=True)
test_data1 = sampled_test_dataset.map(tokenize_function, batched=True)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define compute metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
    report_to="none",
    seed=seed,
)

# Define Trainer
trainer1 = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data1,
    eval_dataset=val_data1,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model
trainer1.train()

# Evaluate the best model on the test set
eval_result = trainer1.evaluate(test_data1)
print(f"Final evaluation results on test set: {eval_result}")

Map:   0%|          | 0/7999 [00:00<?, ? examples/s]

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

Map:   0%|          | 0/998 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.1578,0.462062,0.853854,0.849943,0.86045,0.853854
2,0.3819,0.294717,0.88989,0.890062,0.899521,0.88989
3,0.2265,0.243283,0.915916,0.915168,0.917267,0.915916
4,0.1745,0.214634,0.920921,0.921336,0.923617,0.920921
5,0.1462,0.177273,0.930931,0.930558,0.930416,0.930931
6,0.1198,0.205011,0.925926,0.926045,0.927325,0.925926
7,0.1153,0.210357,0.922923,0.923112,0.924585,0.922923
8,0.1037,0.217582,0.930931,0.930624,0.93124,0.930931
9,0.0841,0.224087,0.921922,0.92167,0.922685,0.921922
10,0.0847,0.222884,0.923924,0.923696,0.924469,0.923924


Final evaluation results on test set: {'eval_loss': 0.20079690217971802, 'eval_accuracy': 0.9198396793587175, 'eval_f1': 0.9185627468693977, 'eval_precision': 0.9185335884638598, 'eval_recall': 0.9198396793587175, 'eval_runtime': 1.3174, 'eval_samples_per_second': 757.554, 'eval_steps_per_second': 47.822, 'epoch': 10.0}


In [9]:
enhanced_prompts = ["What mood does the author convey in the message?",
"What would be your feelings towards the message?",
"How would you describe the author's feeling in the message?",
"What feeling does the author communicate in the message?",
"What feeling does the author express in the message?",
"How might you react emotionally to the news?",
"How are the feelings conveyed in this message?",
"How does the author express feelings in the message?",
"What feeling is communicated in this message?",
"How is emotion shown in this message?",
"What would be your feelings about the message?",
"How is emotion conveyed in this message?",
"What sentiment does the author communicate in the message?",
"What is your emotional response to the message?",
"How is emotion expressed in this message?",
"What emotion is the writer communicating in the message?",
"How does this message express emotion?",
"How would you describe the emotions in this message?",
"What emotion is conveyed in this message?",
"What would your feelings be towards the message?",
"How is the feeling expressed in this message?",
"How would you describe the emotion contained in this message?",
"What feelings are conveyed in this message?",
"How would you describe the author’s feelings in his message?",
"In what way is the emotion shown in this message?",
"What emotion does the writer show in the message?",
"How does this message express feelings?",
"How is the feeling conveyed in this message?",
"How are the emotions portrayed in this message?",
"How is the emotion portrayed in this message?",
"How do you feel when you read the message?",
"What is your anticipated reaction to the message?",
"How do you think you will feel about the message?",
"What emotions does the author express in his message?",
"How would you describe the writer's feeling in the message?",
"What sentiment is conveyed in this message?",
"How are the feelings portrayed in this message?",
"What emotion does the author express in the message?",
"What reaction do you expect to the message?",
"How is the emotion expressed in this message?",
"What feeling does the author convey in the message?",
"What feeling is conveyed in this message?",
"How are the emotions shown in this message?",
"What emotion is the writer showing in the message?",
"How would you describe the feelings expressed by the author in the message?",
"How would you describe the feelings shown in this message?",
"How are you likely to react to the news?",
"How would you feel about this message?",
"How would you describe your reaction to the news?",
"What emotion does this message communicate?",
"What sentiment does the author convey in the message?",
"How would you describe the emotion demonstrated in this message?",
"How does this message express emotions?",
"What feeling does the writer convey in his message?",
"What emotions does the writer show in the message?",
"What feelings does the author convey in the message?",
"How do you react emotionally to the news?",
"How is the emotion conveyed in this message?",
"How would you describe the emotion expressed in this message?",
"What sentiment is expressed in this message?",
"What emotions are conveyed in this message?",
"How would you feel about this news?",
"How is emotion portrayed in this message?",
"How does the writer express emotions in the message?",
"What would your feelings be for the message?",
"What would your feelings be regarding the message?",
"What feeling does the author convey in his message?",
"How would you describe the emotion shown in this message?",
"What feeling does the writer express in the message?",
"What emotion is the writer demonstrating in the message?",
"How are the emotions conveyed in this message?",
"How do you think you will react to the news?",
"What feeling does the writer convey in the message?",
"What feeling is expressed in this message?",
"How does the author express his feelings in his message?",
"What emotion does the author convey in the message?",
"What emotion does this message convey?",
"What emotion does the author convey in his message?",
"What sentiment does the writer express in the message?",
"What emotion does the writer express in the message?",
"How would you describe the feelings in this message?",
"What feeling is being communicated in this message?",
"What feeling does the author express in his message?",
"How does the writer express emotion in the message?",
"What is your expected reaction to the message?",
"How is emotion represented in this message?",
"What feeling does this message communicate?",
"How would you describe your reaction to the message?",
"How would you describe the emotion in this message?",
"What emotions does the author convey in his message?",
"How would you describe the feeling of this message?",
"What emotion is communicated in this message?",
"How would you react emotionally to the message?",
"What emotion is expressed in this message?",
"How is the emotion shown in this post?",
"How is emotion demonstrated in this message?",
"What feeling does this message convey?",
"How might you react emotionally to the message?",
"What emotion does the writer convey in the message?",
"How would you describe the emotion of this message?",
"How is the feeling portrayed in this message?"]




In [10]:
# load Sentence Transformer model
embedmodel = SentenceTransformer('all-MiniLM-L6-v2')

# generate embeddings for each prompt
prompt_embeddings = embedmodel.encode(enhanced_prompts)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [11]:
# classify function
def classify_article_with_prompt_embedding(prompt_embedding, article_text):
    prompt_text = " ".join(map(str, prompt_embedding))
    input_text = f"{prompt_text} {article_text}"
    inputs = tokenizer(input_text, truncation=True, max_length=512, padding='max_length', return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    label_id = outputs.logits.argmax(dim=1).item()
    return label_id

def evaluate_prompt_embedding(prompt_embedding):
    predictions = [classify_article_with_prompt_embedding(prompt_embedding, article_text) for article_text in sampled_train_dataset['text']]
    return accuracy_score(sampled_train_dataset['label'], predictions)

# Bayesian optimization function
def black_box_function(prompt_idx):
    prompt_embedding = prompt_embeddings[int(prompt_idx)]
    return evaluate_prompt_embedding(prompt_embedding)

pbounds = {'prompt_idx': (0, len(prompt_embeddings) - 1)}

optimizer = BayesianOptimization(
    f=black_box_function,
    pbounds=pbounds,
    random_state=42,
)

optimizer.maximize(
    init_points=5,
    n_iter=15,
)

|   iter    |  target   | prompt... |
-------------------------------------
| [30m1         | [30m0.3352    | [30m37.45     |
| [30m2         | [30m0.121     | [30m95.07     |
| [30m3         | [30m0.3352    | [30m73.2      |
| [30m4         | [30m0.2917    | [30m59.87     |
| [30m5         | [30m0.121     | [30m15.6      |
| [30m6         | [30m0.121     | [30m46.61     |
| [30m7         | [30m0.3352    | [30m73.2      |
| [30m8         | [30m0.121     | [30m32.54     |
| [30m9         | [30m0.121     | [30m40.14     |
| [30m10        | [30m0.2917    | [30m35.97     |
| [30m11        | [30m0.121     | [30m70.37     |
| [30m12        | [30m0.121     | [30m75.43     |
| [30m13        | [30m0.2917    | [30m58.25     |
| [30m14        | [30m0.3352    | [30m56.26     |
| [30m15        | [30m0.121     | [30m54.6      |
| [30m16        | [30m0.121     | [30m57.13     |
| [30m17        | [30m0.121     | [30m36.86     |
| [30m18        | [30

In [12]:
# Step 1: Find the maximum target value
max_target_value = max(res['target'] for res in optimizer.res)

# Step 2: Collect all unique prompt indices with the maximum target value
best_prompt_indices = [int(res['params']['prompt_idx']) for res in optimizer.res if res['target'] == max_target_value]
unique_best_prompt_indices = list(set(best_prompt_indices))

# Step 3: Get the unique embeddings
best_prompt_embeddings = [prompt_embeddings[idx] for idx in unique_best_prompt_indices]

# Step 4: Find the top one most similar prompt for each of the best embeddings
all_top_similar_prompts_dict = {}

for idx, best_prompt_embedding in enumerate(best_prompt_embeddings):
    similarities = cosine_similarity(best_prompt_embedding[np.newaxis, :], prompt_embeddings)[0]
    # Get the index of the most similar prompt for each best embedding
    top_index = np.argmax(similarities)
    # Collect the most similar prompt
    top_similar_prompt = enhanced_prompts[top_index]
    all_top_similar_prompts_dict[unique_best_prompt_indices[idx]] = top_similar_prompt

# Output best prompt embedding indices and their corresponding top one similar prompts
print("Best prompt embedding indices and their top one similar prompts:")
for index, prompt in all_top_similar_prompts_dict.items():
    print(f"Index: {index}, Top Similar Prompt: {prompt}")

Best prompt embedding indices and their top one similar prompts:
Index: 56, Top Similar Prompt: How do you react emotionally to the news?
Index: 73, Top Similar Prompt: What feeling is expressed in this message?
Index: 37, Top Similar Prompt: What emotion does the author express in the message?


In [13]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
    report_to="none",
    seed= seed,
)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Iterate over each index and its most similar prompt
for idx, prompts in all_top_similar_prompts_dict.items():
    best_prompt = prompts  # Since we now store only the top prompt per index
    print(f"Training and evaluating for best prompt embedding index: {idx}")
    print(f"Most similar prompt: {best_prompt}")

    # Function to tokenize text with the given prompt
    def tokenize_with_prompt(examples):
        inputs = [f"{best_prompt} {text}" for text in examples['text']]
        return tokenizer(inputs, truncation=True, padding='max_length', max_length=512)

    # Apply the tokenize function to datasets
    train_data_with_prompt = sampled_train_dataset.map(tokenize_with_prompt, batched=True)
    val_data_with_prompt = sampled_val_dataset.map(tokenize_with_prompt, batched=True)
    test_data_with_prompt = sampled_test_dataset.map(tokenize_with_prompt, batched=True)

    # Initialize the Trainer
    trainer2 = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data_with_prompt,
        eval_dataset= val_data_with_prompt,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer2.train()

    # Evaluate the model on the test set
    eval_result = trainer2.evaluate(test_data_with_prompt)
    print(f"Final evaluation results for prompt index {idx} on test set: {eval_result}")

Training and evaluating for best prompt embedding index: 56
Most similar prompt: How do you react emotionally to the news?




Map:   0%|          | 0/7999 [00:00<?, ? examples/s]

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

Map:   0%|          | 0/998 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1642,0.208584,0.928929,0.928593,0.93101,0.928929
2,0.1326,0.264085,0.926927,0.926587,0.928544,0.926927
3,0.1116,0.253454,0.925926,0.924828,0.925654,0.925926
4,0.1002,0.266765,0.92993,0.929688,0.932196,0.92993
5,0.0845,0.24849,0.931932,0.931672,0.932174,0.931932
6,0.0763,0.27213,0.931932,0.932207,0.933425,0.931932
7,0.0617,0.290767,0.925926,0.925814,0.926613,0.925926
8,0.0581,0.30021,0.92993,0.929096,0.929974,0.92993
9,0.0458,0.307187,0.928929,0.928142,0.929013,0.928929
10,0.0431,0.298696,0.930931,0.930389,0.930826,0.930931


Final evaluation results for prompt index 56 on test set: {'eval_loss': 0.19828422367572784, 'eval_accuracy': 0.9178356713426854, 'eval_f1': 0.9161227477466253, 'eval_precision': 0.9172542735900773, 'eval_recall': 0.9178356713426854, 'eval_runtime': 10.1972, 'eval_samples_per_second': 97.87, 'eval_steps_per_second': 6.178, 'epoch': 10.0}
Training and evaluating for best prompt embedding index: 73
Most similar prompt: What feeling is expressed in this message?


Map:   0%|          | 0/7999 [00:00<?, ? examples/s]

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

Map:   0%|          | 0/998 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1336,0.27125,0.918919,0.919106,0.921172,0.918919
2,0.1181,0.288233,0.923924,0.922888,0.924427,0.923924
3,0.0959,0.286206,0.92993,0.928986,0.929952,0.92993
4,0.0866,0.28848,0.926927,0.927113,0.930435,0.926927
5,0.0706,0.276311,0.92993,0.92968,0.930131,0.92993
6,0.0634,0.288074,0.928929,0.929018,0.929606,0.928929
7,0.06,0.291224,0.928929,0.928649,0.92862,0.928929
8,0.0441,0.312407,0.926927,0.926708,0.926896,0.926927
9,0.0427,0.323263,0.927928,0.927511,0.927836,0.927928
10,0.0376,0.321416,0.928929,0.928714,0.929017,0.928929


Final evaluation results for prompt index 73 on test set: {'eval_loss': 0.23099927604198456, 'eval_accuracy': 0.9248496993987976, 'eval_f1': 0.9238621592367912, 'eval_precision': 0.9250700293004582, 'eval_recall': 0.9248496993987976, 'eval_runtime': 10.134, 'eval_samples_per_second': 98.48, 'eval_steps_per_second': 6.217, 'epoch': 10.0}
Training and evaluating for best prompt embedding index: 37
Most similar prompt: What emotion does the author express in the message?


Map:   0%|          | 0/7999 [00:00<?, ? examples/s]

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

Map:   0%|          | 0/998 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1231,0.252381,0.926927,0.926804,0.928129,0.926927
2,0.1095,0.253866,0.928929,0.928261,0.929128,0.928929
3,0.0907,0.266465,0.925926,0.925432,0.925512,0.925926
4,0.0779,0.272647,0.934935,0.934922,0.935898,0.934935
5,0.0629,0.254136,0.934935,0.934755,0.934825,0.934935
6,0.0526,0.274086,0.936937,0.937019,0.937249,0.936937
7,0.0448,0.313235,0.933934,0.933615,0.933694,0.933934
8,0.0404,0.310546,0.935936,0.935444,0.935642,0.935936
9,0.0369,0.317429,0.932933,0.932159,0.932832,0.932933
10,0.032,0.312066,0.935936,0.935341,0.935771,0.935936


Final evaluation results for prompt index 37 on test set: {'eval_loss': 0.2306322157382965, 'eval_accuracy': 0.9268537074148296, 'eval_f1': 0.9256715974727546, 'eval_precision': 0.9278889951552929, 'eval_recall': 0.9268537074148296, 'eval_runtime': 10.207, 'eval_samples_per_second': 97.776, 'eval_steps_per_second': 6.172, 'epoch': 10.0}


In [14]:
# classify function
def classify_article_with_prompt(prompt, article_text):
    input_text = f"{prompt} {article_text}"
    inputs = tokenizer(input_text, truncation=True, max_length=512, padding='max_length', return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    label_id = outputs.logits.argmax(dim=1).item()
    return label_id

# evaluation function
def evaluate_prompt(prompt):
    predictions = [classify_article_with_prompt(prompt, article_text) for article_text in sampled_train_dataset['text']]
    return accuracy_score(sampled_train_dataset['label'], predictions)

# Bayesian optimization function
def black_box_function(prompt_idx):
    prompt = enhanced_prompts[int(prompt_idx)]
    return evaluate_prompt(prompt)

pbounds = {'prompt_idx': (0, len(enhanced_prompts) - 1)}

optimizer = BayesianOptimization(
    f=black_box_function,
    pbounds=pbounds,
    random_state=42,
)

optimizer.maximize(
    init_points=5,
    n_iter=15,
)

|   iter    |  target   | prompt... |
-------------------------------------
| [30m1         | [30m0.9619    | [30m37.45     |
| [35m2         | [35m0.962     | [35m95.07     |
| [30m3         | [30m0.962     | [30m73.2      |
| [35m4         | [35m0.9625    | [35m59.87     |
| [30m5         | [30m0.961     | [30m15.6      |
| [30m6         | [30m0.9617    | [30m50.95     |
| [30m7         | [30m0.9625    | [30m59.87     |
| [30m8         | [30m0.9615    | [30m65.16     |
| [30m9         | [30m0.9607    | [30m56.78     |
| [30m10        | [30m0.9591    | [30m61.71     |
| [30m11        | [30m0.9607    | [30m96.37     |
| [30m12        | [30m0.9616    | [30m94.0      |
| [30m13        | [30m0.9612    | [30m58.75     |
| [30m14        | [30m0.961     | [30m74.28     |
| [30m15        | [30m0.9611    | [30m72.26     |
| [30m16        | [30m0.9614    | [30m36.49     |
| [30m17        | [30m0.9616    | [30m38.4      |
| [30m18        | [30

In [15]:
max_target_value = max(res['target'] for res in optimizer.res)

best_prompts_dict = {}


for res in optimizer.res:
    if res['target'] == max_target_value:
        idx = int(res['params']['prompt_idx'])  
        prompt = enhanced_prompts[idx]          
        best_prompts_dict[idx] = prompt         


print("Best prompt indices and their prompts:", best_prompts_dict)

Best prompt indices and their prompts: {59: 'What sentiment is expressed in this message?'}


In [16]:
# Define compute metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Training arguments setup
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
    report_to="none",
    seed=42,  # Ensure reproducibility
)

# Data collator for padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Iterate over each of the best prompts
for idx, best_prompt in best_prompts_dict.items():
    print(f"Training and evaluating for best prompt index: {idx}")
    print(f"Best prompt: {best_prompt}")

    # Function to tokenize text with the given prompt
    def tokenize_with_prompt(examples):
        inputs = [f"{best_prompt} {text}" for text in examples['text']]
        return tokenizer(inputs, truncation=True, padding='max_length', max_length=512)

    # Apply the tokenize function to datasets
    train_data_with_prompt = sampled_train_dataset.map(tokenize_with_prompt, batched=True)
    val_data_with_prompt = sampled_val_dataset.map(tokenize_with_prompt, batched=True)
    test_data_with_prompt = sampled_test_dataset.map(tokenize_with_prompt, batched=True)

    # Initialize the Trainer
    trainer3 = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data_with_prompt,
        eval_dataset=val_data_with_prompt,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer3.train()

    # Evaluate the model on the test set
    eval_result = trainer3.evaluate(test_data_with_prompt)
    print(f"Final evaluation results for prompt index {idx} on test set: {eval_result}")

Training and evaluating for best prompt index: 59
Best prompt: What sentiment is expressed in this message?




Map:   0%|          | 0/7999 [00:00<?, ? examples/s]

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

Map:   0%|          | 0/998 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0687,0.315819,0.928929,0.929492,0.931264,0.928929
2,0.1072,0.277204,0.926927,0.92655,0.927865,0.926927
3,0.0808,0.307815,0.930931,0.930423,0.930974,0.930931
4,0.0726,0.313917,0.935936,0.935923,0.937038,0.935936
5,0.0584,0.277438,0.92993,0.9297,0.929859,0.92993
6,0.0471,0.330539,0.935936,0.935973,0.936958,0.935936
7,0.0426,0.327963,0.928929,0.928673,0.928662,0.928929
8,0.0304,0.319174,0.92993,0.929555,0.929556,0.92993
9,0.0259,0.329155,0.933934,0.933641,0.933881,0.933934
10,0.0235,0.338713,0.937938,0.937605,0.937699,0.937938


Final evaluation results for prompt index 59 on test set: {'eval_loss': 0.2768343687057495, 'eval_accuracy': 0.9168336673346693, 'eval_f1': 0.9170560655085109, 'eval_precision': 0.9189793345774923, 'eval_recall': 0.9168336673346693, 'eval_runtime': 10.1858, 'eval_samples_per_second': 97.979, 'eval_steps_per_second': 6.185, 'epoch': 10.0}
