In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.1


In [3]:
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset
import torch
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, GPT2Model
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from bayes_opt import BayesianOptimization
import random
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

2024-08-11 20:09:54.334560: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-11 20:09:54.334680: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-11 20:09:54.473851: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# set the random seed
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

In [5]:
# Load the dataset
dataset = load_dataset("dair-ai/emotion", "split")

# Load each split
train_dataset = dataset['train']
val_dataset = dataset['validation']
test_dataset = dataset['test']

def stratified_sample(dataset, fraction):
    label_counts = defaultdict(list)
    
    # Group indices by label
    for i, example in enumerate(dataset):
        label_counts[example['label']].append(i)
    
    sampled_indices = []
    for label, indices in label_counts.items():
        # Calculate the number of samples to draw for each label
        sample_size = int(len(indices) * fraction)
        sampled_indices.extend(random.sample(indices, min(sample_size, len(indices))))
    
    return dataset.select(sampled_indices)

# Calculate the fraction size (one fifth) for each dataset
fraction = 1 / 2

# Perform stratified sampling
sampled_train_dataset = stratified_sample(train_dataset, fraction)
sampled_val_dataset = stratified_sample(val_dataset, fraction)
sampled_test_dataset = stratified_sample(test_dataset, fraction)

Downloading readme:   0%|          | 0.00/9.05k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/127k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/129k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [6]:
# Load GPT2Tokenizer and Model
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')



# Add pad_token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = GPT2ForSequenceClassification.from_pretrained('distilgpt2', num_labels=6)

# Resize token embeddings to match the tokenizer length
model.resize_token_embeddings(len(tokenizer))

# Set the padding token ID in the model configuration
model.config.pad_token_id = tokenizer.pad_token_id

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
enhanced_prompts1 = ["What is the emotion expressed in this message?",
"What emotion does this message express?",
"How will you feel about the message?",
"What emotion does the writer express for the message?",
"How is the emotion conveyed in this message?",
"What feeling is communicated in this message?",
"How would you describe the emotion shown in this message?",
"What sentiment is conveyed in this message?",
"How is the feeling expressed in this message?",
"How is the emotion portrayed in this message?",
"In what way is the emotion shown in this message?",
"What feeling is expressed in this message?",
"What feeling does this message convey?",
"How does this message express emotion?",
"How would you describe the emotion in this message?",
"What emotion is conveyed in this message?",
"What feeling does this message communicate?",
"What sentiment is expressed in this message?",
"How would you describe your reaction to the message?",
"What is your emotional response to the message?",
"How might you react emotionally to the message?",
"What would your feelings be towards the message?",
"How do you think you will feel about the message?",
"What is your anticipated reaction to the message?",
"How do you feel when you read the message?",
"What feeling does the writer convey in the message?",
"What sentiment does the writer express in the message?",
"How does the writer express emotion in the message?",
"What emotion does the author convey in the message?",
"How would you describe the writer's feeling in the message?",
"What emotion is the writer showing in the message?",
"What sentiment does the author communicate in the message?",
"What is the emotion expressed in this message: joy, sadness, anger, fear, love or surprise?",
"What emotion does this message express: joy, sadness, anger, fear, love or surprise?",
"How will you feel about the message: joy, sadness, anger, fear, love or surprise?",
"What emotion does the writer express for the message: joy, sadness, anger, fear, love or surprise?",
"How is the emotion conveyed in this message: joy, sadness, anger, fear, love or surprise?",
"What feeling is communicated in this message: joy, sadness, anger, fear, love or surprise?",
"How would you describe the emotion shown in this message: joy, sadness, anger, fear, love or surprise?",
"What sentiment is conveyed in this message: joy, sadness, anger, fear, love or surprise?",
"How is the feeling expressed in this message: joy, sadness, anger, fear, love or surprise?",
"How is the emotion portrayed in this message: joy, sadness, anger, fear, love or surprise?",
"In what way is the emotion shown in this message: joy, sadness, anger, fear, love or surprise?",
"What feeling is expressed in this message: joy, sadness, anger, fear, love or surprise?",
"What feeling does this message convey: joy, sadness, anger, fear, love or surprise?",
"How does this message express emotion: joy, sadness, anger, fear, love or surprise?",
"How would you describe the emotion in this message: joy, sadness, anger, fear, love or surprise?",
"What emotion is conveyed in this message: joy, sadness, anger, fear, love or surprise?",
"What feeling does this message communicate: joy, sadness, anger, fear, love or surprise?",
"What sentiment is expressed in this message: joy, sadness, anger, fear, love or surprise?",
"How would you describe your reaction to the message: joy, sadness, anger, fear, love or surprise?",
"What is your emotional response to the message: joy, sadness, anger, fear, love or surprise?",
"How might you react emotionally to the message: joy, sadness, anger, fear, love or surprise?",
"What would your feelings be towards the message: joy, sadness, anger, fear, love or surprise?",
"How do you think you will feel about the message: joy, sadness, anger, fear, love or surprise?",
"What is your anticipated reaction to the message: joy, sadness, anger, fear, love or surprise?",
"How do you feel when you read the message: joy, sadness, anger, fear, love or surprise?",
"What feeling does the writer convey in the message: joy, sadness, anger, fear, love or surprise?",
"What sentiment does the writer express in the message: joy, sadness, anger, fear, love or surprise?",
"How does the writer express emotion in the message: joy, sadness, anger, fear, love or surprise?",
"What emotion does the author convey in the message: joy, sadness, anger, fear, love or surprise?",
"How would you describe the writer's feeling in the message: joy, sadness, anger, fear, love or surprise?",
"What emotion is the writer showing in the message: joy, sadness, anger, fear, love or surprise?",
"What sentiment does the author communicate in the message: joy, sadness, anger, fear, love or surprise?",
"What is the emotion expressed in this message: joy, sadness, anger, fear, love or surprise? Example: 'I am ever feeling nostalgic about the fireplace I will know that it is still on the property' is love.",
"What emotion does this message express: joy, sadness, anger, fear, love or surprise? Example: 'I have been with Petronas for years I feel that Petronas has performed well and made a huge profit' is joy.",
"How will you feel about the message: joy, sadness, anger, fear, love or surprise? Example: 'I've been taking or milligrams or times recommended amount and I've fallen asleep a lot faster but I also feel like so funny' is surprise.",
"What emotion does the writer express for the message: joy, sadness, anger, fear, love or surprise? Example: 'I am ever feeling nostalgic about the fireplace I will know that it is still on the property' is love.",
"How is the emotion conveyed in this message: joy, sadness, anger, fear, love or surprise? Example: 'I didn't feel humiliated' is sadness.",
"What feeling is communicated in this message: joy, sadness, anger, fear, love or surprise? Example: 'I didn't feel humiliated' is sadness.",
"How would you describe the emotion shown in this message: joy, sadness, anger, fear, love or surprise? Example: 'I am ever feeling nostalgic about the fireplace I will know that it is still on the property' is love.",
"What sentiment is conveyed in this message: joy, sadness, anger, fear, love or surprise? Example: 'I didn't feel humiliated' is sadness.",
"How is the feeling expressed in this message: joy, sadness, anger, fear, love or surprise? Example: 'I didn't feel humiliated' is sadness.",
"How is the emotion portrayed in this message: joy, sadness, anger, fear, love or surprise? Example: 'I didn't feel humiliated' is sadness.",
"In what way is the emotion shown in this message: joy, sadness, anger, fear, love or surprise? Example: 'I didn't feel humiliated' is sadness.",
"What feeling is expressed in this message: joy, sadness, anger, fear, love or surprise? Example: 'I feel as confused about life as a teenager or as jaded as a year old man' is fear.",
"What feeling does this message convey: joy, sadness, anger, fear, love or surprise? Example: 'I feel as confused about life as a teenager or as jaded as a year old man' is fear.",
"How does this message express emotion: joy, sadness, anger, fear, love or surprise? Example: 'I have been with Petronas for years I feel that Petronas has performed well and made a huge profit' is joy.",
"How would you describe the emotion in this message: joy, sadness, anger, fear, love or surprise? Example: 'I'm grabbing a minute to post I feel greedy wrong' is anger.",
"What emotion is conveyed in this message: joy, sadness, anger, fear, love or surprise? Example: 'I've been taking or milligrams or times recommended amount and I've fallen asleep a lot faster but I also feel like so funny' is surprise.",
"What feeling does this message communicate: joy, sadness, anger, fear, love or surprise? Example: 'I didn't feel humiliated' is sadness.",
"What sentiment is expressed in this message: joy, sadness, anger, fear, love or surprise? Example: 'I have been with Petronas for years I feel that Petronas has performed well and made a huge profit' is joy.",
"How would you describe your reaction to the message: joy, sadness, anger, fear, love or surprise? Example: 'I have been with Petronas for years I feel that Petronas has performed well and made a huge profit' is joy.",
"What is your emotional response to the message: joy, sadness, anger, fear, love or surprise? Example: 'I am ever feeling nostalgic about the fireplace I will know that it is still on the property' is love.",
"How might you react emotionally to the message: joy, sadness, anger, fear, love or surprise? Example: 'I have been with Petronas for years I feel that Petronas has performed well and made a huge profit' is joy.",
"What would your feelings be towards the message: joy, sadness, anger, fear, love or surprise? Example: 'I am ever feeling nostalgic about the fireplace I will know that it is still on the property' is love.",
"How do you think you will feel about the message: joy, sadness, anger, fear, love or surprise? Example: 'I've been taking or milligrams or times recommended amount and I've fallen asleep a lot faster but I also feel like so funny' is surprise.",
"What is your anticipated reaction to the message: joy, sadness, anger, fear, love or surprise? Example: 'I didn't feel humiliated' is sadness.",
"How do you feel when you read the message: joy, sadness, anger, fear, love or surprise? Example: 'I didn't feel humiliated' is sadness.",
"What feeling does the writer convey in the message: joy, sadness, anger, fear, love or surprise? Example: 'I'm grabbing a minute to post I feel greedy wrong' is anger.",
"What sentiment does the writer express in the message: joy, sadness, anger, fear, love or surprise? Example: 'I feel as confused about life as a teenager or as jaded as a year old man' is fear.",
"How does the writer express emotion in the message: joy, sadness, anger, fear, love or surprise? Example: 'I have been with Petronas for years I feel that Petronas has performed well and made a huge profit' is joy.",
"What emotion does the author convey in the message: joy, sadness, anger, fear, love or surprise? Example: 'I have been with Petronas for years I feel that Petronas has performed well and made a huge profit' is joy.",
"How would you describe the writer's feeling in the message: joy, sadness, anger, fear, love or surprise? Example: 'I didn't feel humiliated' is sadness.",
"What emotion is the writer showing in the message: joy, sadness, anger, fear, love or surprise? Example: 'I feel as confused about life as a teenager or as jaded as a year old man' is fear.",
"What sentiment does the author communicate in the message: joy, sadness, anger, fear, love or surprise? Example: 'I feel as confused about life as a teenager or as jaded as a year old man' is fear."]

In [8]:
# load Sentence Transformer model
embedmodel = SentenceTransformer('all-MiniLM-L6-v2')

# generate embeddings for each prompt
prompt_embeddings = embedmodel.encode(enhanced_prompts1)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
# classify function
def classify_article_with_prompt_embedding(prompt_embedding, article_text):
    prompt_text = " ".join(map(str, prompt_embedding))
    input_text = f"{prompt_text} {article_text}"
    inputs = tokenizer(input_text, truncation=True, max_length=512, padding='max_length', return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    label_id = outputs.logits.argmax(dim=1).item()
    return label_id

def evaluate_prompt_embedding(prompt_embedding):
    predictions = [classify_article_with_prompt_embedding(prompt_embedding, article_text) for article_text in sampled_train_dataset['text']]
    return accuracy_score(sampled_train_dataset['label'], predictions)

# Bayesian optimization function
def black_box_function(prompt_idx):
    prompt_embedding = prompt_embeddings[int(prompt_idx)]
    return evaluate_prompt_embedding(prompt_embedding)

pbounds = {'prompt_idx': (0, len(prompt_embeddings) - 1)}

optimizer = BayesianOptimization(
    f=black_box_function,
    pbounds=pbounds,
    random_state=42,
)

optimizer.maximize(
    init_points=5,
    n_iter=15,
)

|   iter    |  target   | prompt... |
-------------------------------------
| [30m1         | [30m0.121     | [30m35.58     |
| [30m2         | [30m0.121     | [30m90.32     |
| [30m3         | [30m0.121     | [30m69.54     |
| [30m4         | [30m0.121     | [30m56.87     |
| [30m5         | [30m0.121     | [30m14.82     |
| [35m6         | [35m0.2917    | [35m0.001105  |
| [30m7         | [30m0.2917    | [30m0.7833    |
| [30m8         | [30m0.121     | [30m4.558     |
| [30m9         | [30m0.121     | [30m46.23     |
| [30m10        | [30m0.121     | [30m79.93     |
| [30m11        | [30m0.121     | [30m25.2      |
| [30m12        | [30m0.121     | [30m63.2      |
| [30m13        | [30m0.121     | [30m1.768     |
| [30m14        | [30m0.2917    | [30m0.3968    |
| [30m15        | [30m0.121     | [30m51.56     |
| [30m16        | [30m0.121     | [30m40.9      |
| [30m17        | [30m0.121     | [30m74.74     |
| [30m18        | [30

In [10]:
# Step 1: Find the maximum target value
max_target_value = max(res['target'] for res in optimizer.res)

# Step 2: Collect all unique prompt indices with the maximum target value
best_prompt_indices = [int(res['params']['prompt_idx']) for res in optimizer.res if res['target'] == max_target_value]
unique_best_prompt_indices = list(set(best_prompt_indices))

# Step 3: Get the unique embeddings
best_prompt_embeddings = [prompt_embeddings[idx] for idx in unique_best_prompt_indices]

# Step 4: Find the top one most similar prompt for each of the best embeddings
all_top_similar_prompts_dict = {}

for idx, best_prompt_embedding in enumerate(best_prompt_embeddings):
    similarities = cosine_similarity(best_prompt_embedding[np.newaxis, :], prompt_embeddings)[0]
    # Get the index of the most similar prompt for each best embedding
    top_index = np.argmax(similarities)
    # Collect the most similar prompt
    top_similar_prompt = enhanced_prompts1[top_index]
    all_top_similar_prompts_dict[unique_best_prompt_indices[idx]] = top_similar_prompt

# Output best prompt embedding indices and their corresponding top one similar prompts
print("Best prompt embedding indices and their top one similar prompts:")
for index, prompt in all_top_similar_prompts_dict.items():
    print(f"Index: {index}, Top Similar Prompt: {prompt}")

Best prompt embedding indices and their top one similar prompts:
Index: 0, Top Similar Prompt: What is the emotion expressed in this message?


In [12]:
# Define compute metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
    report_to="none",
    seed= seed,
)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Iterate over each index and its most similar prompt
for idx, prompts in all_top_similar_prompts_dict.items():
    best_prompt = prompts  # Since we now store only the top prompt per index
    print(f"Training and evaluating for best prompt embedding index: {idx}")
    print(f"Most similar prompt: {best_prompt}")

    # Function to tokenize text with the given prompt
    def tokenize_with_prompt(examples):
        inputs = [f"{best_prompt} {text}" for text in examples['text']]
        return tokenizer(inputs, truncation=True, padding='max_length', max_length=512)

    # Apply the tokenize function to datasets
    train_data_with_prompt = sampled_train_dataset.map(tokenize_with_prompt, batched=True)
    val_data_with_prompt = sampled_val_dataset.map(tokenize_with_prompt, batched=True)
    test_data_with_prompt = sampled_test_dataset.map(tokenize_with_prompt, batched=True)

    # Initialize the Trainer
    trainer2 = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data_with_prompt,
        eval_dataset= val_data_with_prompt,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer2.train()

    # Evaluate the model on the test set
    eval_result = trainer2.evaluate(test_data_with_prompt)
    print(f"Final evaluation results for prompt index {idx} on test set: {eval_result}")



Training and evaluating for best prompt embedding index: 0
Most similar prompt: What is the emotion expressed in this message?


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.1735,0.483395,0.855856,0.850598,0.86154,0.855856
2,0.4011,0.265568,0.911912,0.911823,0.912975,0.911912
3,0.2329,0.214836,0.916917,0.916305,0.917325,0.916917
4,0.1749,0.18918,0.927928,0.928181,0.929241,0.927928
5,0.1457,0.189355,0.931932,0.931585,0.931671,0.931932
6,0.1292,0.216408,0.930931,0.930687,0.931284,0.930931
7,0.1119,0.21044,0.922923,0.923387,0.925085,0.922923
8,0.1045,0.215408,0.924925,0.924539,0.925436,0.924925
9,0.0838,0.206583,0.928929,0.928061,0.928512,0.928929
10,0.0858,0.204452,0.922923,0.922579,0.922876,0.922923


Final evaluation results for prompt index 0 on test set: {'eval_loss': 0.21594026684761047, 'eval_accuracy': 0.9248496993987976, 'eval_f1': 0.9247668686374074, 'eval_precision': 0.9248156699709753, 'eval_recall': 0.9248496993987976, 'eval_runtime': 10.3113, 'eval_samples_per_second': 96.787, 'eval_steps_per_second': 6.11, 'epoch': 10.0}
