In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.1


In [3]:
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset
import torch
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, GPT2Model
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from bayes_opt import BayesianOptimization
import random
from sklearn.metrics.pairwise import cosine_similarity

2024-07-29 16:26:34.093369: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-29 16:26:34.093474: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-29 16:26:34.202327: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# set the random seed
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    
    
# Load the dataset
dataset = load_dataset("SetFit/bbc-news")

# Convert the training and test datasets to DataFrames
train_df = dataset['train'].to_pandas()
test_df = dataset['test'].to_pandas()

# Combine the training and test DataFrames
all_df = pd.concat([train_df, test_df])

# 30 % data are used for test
train_df, test_df = train_test_split(all_df, test_size=0.2, random_state=seed)

#  20% of training set are divided to be validation set
train_df, val_df = train_test_split(train_df, test_size=0.125, random_state=seed)

# turn DataFrames into Dataset object
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)



Downloading readme:   0%|          | 0.00/880 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.87M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.28M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1225 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [5]:
print("Number of train data: ", len(train_dataset))
print("Number of val data: ", len(val_dataset))
print("Number of test data: ", len(test_dataset))

Number of train data:  1557
Number of val data:  223
Number of test data:  445


In [6]:


# Load GPT2Tokenizer and Model
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')



# Add pad_token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = GPT2ForSequenceClassification.from_pretrained('distilgpt2', num_labels=5)

# Resize token embeddings to match the tokenizer length
model.resize_token_embeddings(len(tokenizer))

# Set the padding token ID in the model configuration
model.config.pad_token_id = tokenizer.pad_token_id

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=512)

train_data1 = train_dataset.map(tokenize_function, batched=True)
val_data1 = val_dataset.map(tokenize_function, batched=True)
test_data1 = test_dataset.map(tokenize_function, batched=True)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define compute metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
    report_to="none",
    seed=seed,
)

# Define Trainer
trainer1 = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data1,
    eval_dataset=val_data1,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model
trainer1.train()

# Evaluate the best model on the test set
eval_result = trainer1.evaluate(test_data1)
print(f"Final evaluation results on test set: {eval_result}")

Map:   0%|          | 0/1557 [00:00<?, ? examples/s]

Map:   0%|          | 0/223 [00:00<?, ? examples/s]

Map:   0%|          | 0/445 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.357079,0.932735,0.932773,0.941962,0.932735
2,No log,0.164561,0.973094,0.972913,0.973114,0.973094
3,No log,0.202698,0.973094,0.972906,0.973465,0.973094
4,No log,0.134134,0.973094,0.973002,0.973184,0.973094
5,No log,0.172868,0.973094,0.973006,0.973118,0.973094
6,0.274400,0.162212,0.973094,0.973002,0.973184,0.973094
7,0.274400,0.195228,0.959641,0.959616,0.960731,0.959641
8,0.274400,0.18916,0.973094,0.972998,0.973309,0.973094
9,0.274400,0.196428,0.973094,0.972998,0.973309,0.973094
10,0.274400,0.194779,0.973094,0.972998,0.973309,0.973094


Final evaluation results on test set: {'eval_loss': 0.15737980604171753, 'eval_accuracy': 0.9707865168539326, 'eval_f1': 0.9708435085090811, 'eval_precision': 0.9715983068651995, 'eval_recall': 0.9707865168539326, 'eval_runtime': 4.5225, 'eval_samples_per_second': 98.396, 'eval_steps_per_second': 6.191, 'epoch': 10.0}


In [8]:
enhanced_prompts = [
"What is the central focus of this newspaper article?",
"How would you formulate the main theme of this message?",
"Where is this story most likely to be found in the newspaper?",
"How would you define the primary subject of this news?",
"What is the main topic of this news?",
"Which category is best suited for this news?",
"What is the main topic of this news article?",
"What is central to this news article?",
"Which part of the newspaper is most likely to contain this news?",
"Where in the newspaper is this news most likely to be located?",
"How would you explain the main topic of this news?",
"Which section of the newspaper is this news most likely to be found in?",
"In what section of the newspaper is this news most likely to be found?",
"Which category is the best match for this news article?",
"In which category does this news article best fit?",
"Which category does this news fall into?",
"Which category is most appropriate for this news?",
"What category does this news belong to?",
"Which section of the newspaper will most likely contain this news?",
"In which section of the newspaper is this news story most likely to appear?",
"What is the central purpose of this news article?",
"In which section of the newspaper would one likely find this news item?",
"How would you summarize the central theme of this message?",
"In which section of the newspaper is this news most likely?",
"Which part of the newspaper is likely to contain this news?",
"Which category does this news best fit into?",
"How would you define the main topic of this news?",
"Which category best matches this news article?",
"What is the main emphasis of this news article?",
"How would you describe the main topic of this news?",
"In which section of the newspaper would this news item most likely be found?",
"How would you characterize the main theme of this message?",
"Where in the newspaper is this news story most likely to be found?",
"What is your description of the main theme of this news?",
"Which category is best for this news article?",
"What is the main point of this press article?",
"In your opinion, what is the main objective of this news?",
"In which section of the newspaper is this news most likely to appear?",
"What category does this news article fall under?",
"In what section of the newspaper is this news most likely to appear?",
"Under which category does this news article fit best?",
"What category does this news article belong to?",
"Where in the newspaper is this news most likely to appear?",
"What is your description of the main topic of this news story?",
"In which section of the newspaper would this news most likely be found?",
"How would you describe the main theme of this message?",
"Which section of the newspaper is most likely to contain this news?",
"Where in the newspaper is this news most likely to be found?",
"In which section of the newspaper is this news most likely to be found?",
"In which section of the newspaper are you most likely to find this news?",
"What is this news article focused on?",
"In what part of the newspaper would this news most likely appear?",
"How would you articulate the primary subject of this news?",
"What do you think is the main focus of this message?",
"What is the main emphasis of this news?",
"How would you define the main theme of this message?",
"Which part of the newspaper would most likely contain this news?",
"Which category best fits this news?",
"How would you explain the main theme of this news?",
"Which category is best suited for this news article?",
"What do you think is the main focus of this news?",
"Which category does this news article best fit into?",
"How would you describe the main topic of this news story?",
"What is the main focus of this news?",
"In which part of the newspaper is this news likely to be found?",
"In which category does this news article belong?",
"Which category best suits this news article?",
"What do you think is the main objective of this news?",
"How would you summarize the central theme of this news?",
"What would you say is the primary focus of this news?",
"How do you describe the main topic of this news story?",
"How would you characterize the main topic of this news story?",
"How would you describe the main theme of this news story?",
"What is the primary focus of this news article?",
"How would you define the primary topic of this news item?",
"Which section of the newspaper is most likely to publish this story?",
"What is the main focus of this news article?",
"What is the main theme of this news?",
"In which category does this news article fit best?",
"What is your description of the main subject of this news?",
"What is your description of the main topic of this news?",
"How would you articulate the main subject of this news?",
"How would you explain the main subject of this news?",
"What category does this newspaper article belong to?",
"Which section of the newspaper is most likely to contain this news story?",
"How would you phrase the main subject of this news story?",
"Where in the newspaper might this news be found?",
"What is the central point of this news article?",
"Which category best fits this news article?",
"What is the main purpose of this press article?",
"What is the main subject of this news article?",
"How would you characterize the main subject of this news?",
"Which section of the newspaper is most likely to feature this news?",
"In which part of the newspaper is this news most likely to be found?",
"In which section of the newspaper is it most likely to find this news?",
"In which part of the newspaper is this news most likely to appear?",
"What is the main topic of this article?",
"What category does this news article fall into?",
"Into which category does this news article best fit?",
"Which category is most suitable for this news article?",
"How would you phrase the primary topic of this news story?",
"In which section of the newspaper would this news likely be found?",
"What would you say is the main focus of this news?",
"How would you explain the main theme of this message?",
"In which part of the newspaper would this news most likely appear?",
"Which category does this news article fall under?",
"How would you describe the main theme of this news?",
"What is the main point of this news?",
"What is the key focus of this news article?",
"In which section of the newspaper would this news story likely be found?",
"How would you articulate the main point of this news story?",
"Which part of the newspaper would most likely contain this news story?",
"What is the central focus of this news article?",
"What is the main point of this news article?",
"What category is this news article in?",
"How would you characterize the main topic of this news?",
"Under which category does this news article best fit?",
"What category best fits this news article?",
"Which category is the best match for this news story?",
"Which section of the newspaper is this news most likely to be in?",
"How would you define the main subject of this news?",
"Which part of the newspaper would most likely feature this news?",
"In which section of the newspaper is this news story most likely to be found?",
"Which category does this news article best fit under?",
"How would you summarize the central theme of this news story?",
"What is the focus of this news article?",
"How would you articulate the main theme of this news story?",
"Which category is most appropriate for this news article?",
"How would you explain the main topic of this news story?",
"Which category does this news article belong to?",
"In which section of the newspaper is this news most likely to be published?",
"How would you describe the main subject of this news?",
"How would you express the main topic of this news?",
"What is the central focus of this news?",
"How would you articulate the main topic of this news?"
]

enhanced_prompts1= [ 
"What category best fits this news article?",
"How would you describe the main topic of this news?",
"In which section of the newspaper would this news likely be found?",
"What is the primary focus of this news article?",
"Which category does this news article fall under?",
"In which category does this news article belong?",
"Which category is most appropriate for this news article?",
"Under which category does this news article best fit?",
"Which category is the best match for this news article?",
"Into which category does this news article best fit?",
"Which category is most suitable for this news article?",
"How would you define the primary subject of this news?",
"How would you summarize the central theme of this news?",
"What would you say is the primary focus of this news?",
"How would you explain the main subject of this news?",
"How would you characterize the main topic of this news?",
"What is your description of the main topic of this news?",
"How would you articulate the primary subject of this news?",
"Which section of the newspaper is most likely to feature this news?",
"In what section of the newspaper is this news most likely to appear?",
"Which section of the newspaper is this news most likely to be found in?",
"Which part of the newspaper would most likely feature this news?",
"Which section of the newspaper is most likely to contain this news?",
"In what section of the newspaper is this news most likely to be found?",
"Where in the newspaper is this news most likely to be located?",
"What is the main emphasis of this news article?",
"What is the central focus of this news article?",
"What is the main point of this news article?",
"What is the main subject of this news article?",
"What is the main focus of this news article?",
"What is the key focus of this news article?",
"What is the main topic of this news article?",
"What category best fits this news article: business, entertainment, politics, sport or tech?",
"How would you describe the main topic of this news: business, entertainment, politics, sport or tech?",
"In which section of the newspaper would this news likely be found: business, entertainment, politics, sport or tech?",
"What is the primary focus of this news article: business, entertainment, politics, sport or tech?",
"Which category does this news article fall under: business, entertainment, politics, sport or tech?",
"In which category does this news article belong: business, entertainment, politics, sport or tech?",
"Which category is most appropriate for this news article: business, entertainment, politics, sport or tech?",
"Under which category does this news article best fit: business, entertainment, politics, sport or tech?",
"Which category is the best match for this news article: business, entertainment, politics, sport or tech?",
"Into which category does this news article best fit: business, entertainment, politics, sport or tech?",
"Which category is most suitable for this news article: business, entertainment, politics, sport or tech?",
"How would you define the primary subject of this news: business, entertainment, politics, sport or tech?",
"How would you summarize the central theme of this news: business, entertainment, politics, sport or tech?",
"What would you say is the primary focus of this news: business, entertainment, politics, sport or tech?",
"How would you explain the main subject of this news: business, entertainment, politics, sport or tech?",
"How would you characterize the main topic of this news: business, entertainment, politics, sport or tech?",
"What is your description of the main topic of this news: business, entertainment, politics, sport or tech?",
"How would you articulate the primary subject of this news: business, entertainment, politics, sport or tech?",
"Which section of the newspaper is most likely to feature this news: business, entertainment, politics, sport or tech?",
"In what section of the newspaper is this news most likely to appear: business, entertainment, politics, sport or tech?",
"Which section of the newspaper is this news most likely to be found in: business, entertainment, politics, sport or tech?",
"Which part of the newspaper would most likely feature this news: business, entertainment, politics, sport or tech?",
"Which section of the newspaper is most likely to contain this news: business, entertainment, politics, sport or tech?",
"In what section of the newspaper is this news most likely to be found: business, entertainment, politics, sport or tech?",
"Where in the newspaper is this news most likely to be located: business, entertainment, politics, sport or tech?",
"What is the main emphasis of this news article: business, entertainment, politics, sport or tech?",
"What is the central focus of this news article: business, entertainment, politics, sport or tech?",
"What is the main point of this news article: business, entertainment, politics, sport or tech?",
"What is the main subject of this news article: business, entertainment, politics, sport or tech?",
"What is the main focus of this news article: business, entertainment, politics, sport or tech?",
"What is the key focus of this news article: business, entertainment, politics, sport or tech?",
"What is the main topic of this news article: business, entertainment, politics, sport or tech?",
"What category best fits this news article: business, entertainment, politics, sport or tech? Example: 'wales want rugby league training wales could follow england s lead by training with a rugby league club' is sport.",
"How would you describe the main topic of this news: business, entertainment, politics, sport or tech? Example: 'markets signal brazilian recovery the brazilian stock market has risen to a record high as investors display growing confidence in the durability of the country s economic recovery' is business.",
"In which section of the newspaper would this news likely be found: business, entertainment, politics, sport or tech? Example: 'wales want rugby league training wales could follow england s lead by training with a rugby league club' is sport.",
"What is the primary focus of this news article: business, entertainment, politics, sport or tech? Example: 'markets signal brazilian recovery the brazilian stock market has risen to a record high as investors display growing confidence in the durability of the country s economic recovery' is business.",
"Which category does this news article fall under: business, entertainment, politics, sport or tech? Example: 'wales want rugby league training wales could follow england s lead by training with a rugby league club' is sport.",
"In which category does this news article belong: business, entertainment, politics, sport or tech? Example: 'wales want rugby league training wales could follow england s lead by training with a rugby league club' is sport.",
"Which category is most appropriate for this news article: business, entertainment, politics, sport or tech? Example: 'tough rules for ringtone sellers firms that flout rules on how ringtones and other mobile extras are sold could be cut off from all uk phone networks' is tech.",
"Under which category does this news article best fit: business, entertainment, politics, sport or tech? Example: 'wales want rugby league training wales could follow england s lead by training with a rugby league club' is sport.",
"Which category is the best match for this news article: business, entertainment, politics, sport or tech? Example: 'tough rules for ringtone sellers firms that flout rules on how ringtones and other mobile extras are sold could be cut off from all uk phone networks' is tech.",
"Into which category does this news article best fit: business, entertainment, politics, sport or tech? Example: 'rock band u2 break ticket record u2 have smashed irish box office records with ticket sales for their dublin concerts after more than 150 000 were sold within 50 minutes' is entertainment.",
"Which category is most suitable for this news article: business, entertainment, politics, sport or tech? Example: 'markets signal brazilian recovery the brazilian stock market has risen to a record high as investors display growing confidence in the durability of the country s economic recovery' is business.",
"How would you define the primary subject of this news: business, entertainment, politics, sport or tech? Example: 'iraq advice claim sparks new row the tories say ministers must respond in parliament to claims that the legal advice used to justify the iraq war was drawn up at number 10' is politics.",
"How would you summarize the central theme of this news: business, entertainment, politics, sport or tech? Example: 'rock band u2 break ticket record u2 have smashed irish box office records with ticket sales for their dublin concerts after more than 150 000 were sold within 50 minutes' is entertainment.",
"What would you say is the primary focus of this news: business, entertainment, politics, sport or tech? Example: 'wales want rugby league training wales could follow england s lead by training with a rugby league club' is sport.",
"How would you explain the main subject of this news: business, entertainment, politics, sport or tech? Example: 'iraq advice claim sparks new row the tories say ministers must respond in parliament to claims that the legal advice used to justify the iraq war was drawn up at number 10' is politics.",
"How would you characterize the main topic of this news: business, entertainment, politics, sport or tech? Example: 'tough rules for ringtone sellers firms that flout rules on how ringtones and other mobile extras are sold could be cut off from all uk phone networks' is tech.",
"What is your description of the main topic of this news: business, entertainment, politics, sport or tech? Example: 'rock band u2 break ticket record u2 have smashed irish box office records with ticket sales for their dublin concerts after more than 150 000 were sold within 50 minutes' is entertainment.",
"How would you articulate the primary subject of this news: business, entertainment, politics, sport or tech? Example: 'iraq advice claim sparks new row the tories say ministers must respond in parliament to claims that the legal advice used to justify the iraq war was drawn up at number 10' is politics.",
"Which section of the newspaper is most likely to feature this news: business, entertainment, politics, sport or tech? Example: 'tough rules for ringtone sellers firms that flout rules on how ringtones and other mobile extras are sold could be cut off from all uk phone networks' is tech.",
"In what section of the newspaper is this news most likely to appear: business, entertainment, politics, sport or tech? Example: 'wales want rugby league training wales could follow england s lead by training with a rugby league club' is sport.",
"Which section of the newspaper is this news most likely to be found in: business, entertainment, politics, sport or tech? Example: 'iraq advice claim sparks new row the tories say ministers must respond in parliament to claims that the legal advice used to justify the iraq war was drawn up at number 10' is politics.",
"Which part of the newspaper would most likely feature this news: business, entertainment, politics, sport or tech? Example: 'tough rules for ringtone sellers firms that flout rules on how ringtones and other mobile extras are sold could be cut off from all uk phone networks' is tech.",
"Which section of the newspaper is most likely to contain this news: business, entertainment, politics, sport or tech? Example: 'markets signal brazilian recovery the brazilian stock market has risen to a record high as investors display growing confidence in the durability of the country s economic recovery' is business.",
"In what section of the newspaper is this news most likely to be found: business, entertainment, politics, sport or tech? Example: 'iraq advice claim sparks new row the tories say ministers must respond in parliament to claims that the legal advice used to justify the iraq war was drawn up at number 10' is politics.",
"Where in the newspaper is this news most likely to be located: business, entertainment, politics, sport or tech? Example: 'wales want rugby league training wales could follow england s lead by training with a rugby league club' is sport.",
"What is the main emphasis of this news article: business, entertainment, politics, sport or tech? Example: 'wales want rugby league training wales could follow england s lead by training with a rugby league club' is sport.",
"What is the central focus of this news article: business, entertainment, politics, sport or tech? Example: 'rock band u2 break ticket record u2 have smashed irish box office records with ticket sales for their dublin concerts after more than 150 000 were sold within 50 minutes' is entertainment.",
"What is the main point of this news article: business, entertainment, politics, sport or tech? Example: 'wales want rugby league training wales could follow england s lead by training with a rugby league club' is sport.",
"What is the main subject of this news article: business, entertainment, politics, sport or tech? Example: 'iraq advice claim sparks new row the tories say ministers must respond in parliament to claims that the legal advice used to justify the iraq war was drawn up at number 10' is politics.",
"What is the main focus of this news article: business, entertainment, politics, sport or tech? Example: 'wales want rugby league training wales could follow england s lead by training with a rugby league club' is sport.",
"What is the key focus of this news article: business, entertainment, politics, sport or tech? Example: 'iraq advice claim sparks new row the tories say ministers must respond in parliament to claims that the legal advice used to justify the iraq war was drawn up at number 10' is politics.",
"What is the main topic of this news article: business, entertainment, politics, sport or tech? Example: 'wales want rugby league training wales could follow england s lead by training with a rugby league club' is sport." ]

In [9]:
# load Sentence Transformer model
embedmodel = SentenceTransformer('all-MiniLM-L6-v2')

# generate embeddings for each prompt
prompt_embeddings = embedmodel.encode(enhanced_prompts)
prompt_embeddings1 = embedmodel.encode(enhanced_prompts1)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
# classify function
def classify_article_with_prompt_embedding(prompt_embedding, article_text):
    prompt_text = " ".join(map(str, prompt_embedding))
    input_text = f"{prompt_text} {article_text}"
    inputs = tokenizer(input_text, truncation=True, max_length=512, padding='max_length', return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    label_id = outputs.logits.argmax(dim=1).item()
    return label_id

def evaluate_prompt_embedding(prompt_embedding):
    predictions = [classify_article_with_prompt_embedding(prompt_embedding, article_text) for article_text in train_dataset['text']]
    return accuracy_score(train_dataset['label'], predictions)

# Bayesian optimization function
def black_box_function(prompt_idx):
    prompt_embedding = prompt_embeddings[int(prompt_idx)]
    return evaluate_prompt_embedding(prompt_embedding)

pbounds = {'prompt_idx': (0, len(prompt_embeddings) - 1)}

optimizer = BayesianOptimization(
    f=black_box_function,
    pbounds=pbounds,
    random_state=42,
)

optimizer.maximize(
    init_points=5,
    n_iter=15,
)

|   iter    |  target   | prompt... |
-------------------------------------
| [30m1         | [30m0.1843    | [30m50.19     |
| [30m2         | [30m0.1747    | [30m127.4     |
| [35m3         | [35m0.2306    | [35m98.09     |
| [30m4         | [30m0.1747    | [30m80.22     |
| [30m5         | [30m0.1747    | [30m20.91     |
| [30m6         | [30m0.2306    | [30m98.04     |
| [30m7         | [30m0.1747    | [30m109.6     |
| [35m8         | [35m0.2351    | [35m0.0       |
| [30m9         | [30m0.2306    | [30m6.37      |
| [30m10        | [30m0.1843    | [30m64.38     |
| [30m11        | [30m0.1747    | [30m36.59     |
| [30m12        | [30m0.1843    | [30m2.929     |
| [30m13        | [30m0.1747    | [30m8.105     |
| [30m14        | [30m0.1843    | [30m5.393     |
| [30m15        | [30m0.2351    | [30m0.6095    |
| [30m16        | [30m0.2351    | [30m99.39     |
| [30m17        | [30m0.1747    | [30m100.5     |
| [30m18        | [30

In [11]:
# Step 1: Find the maximum target value
max_target_value = max(res['target'] for res in optimizer.res)

# Step 2: Collect all unique prompt indices with the maximum target value
best_prompt_indices = [int(res['params']['prompt_idx']) for res in optimizer.res if res['target'] == max_target_value]
unique_best_prompt_indices = list(set(best_prompt_indices))

# Step 3: Get the unique embeddings
best_prompt_embeddings = [prompt_embeddings[idx] for idx in unique_best_prompt_indices]

# Step 4: Find the top one most similar prompt for each of the best embeddings
all_top_similar_prompts_dict = {}

for idx, best_prompt_embedding in enumerate(best_prompt_embeddings):
    similarities = cosine_similarity(best_prompt_embedding[np.newaxis, :], prompt_embeddings)[0]
    # Get the index of the most similar prompt for each best embedding
    top_index = np.argmax(similarities)
    # Collect the most similar prompt
    top_similar_prompt = enhanced_prompts[top_index]
    all_top_similar_prompts_dict[unique_best_prompt_indices[idx]] = top_similar_prompt

# Output best prompt embedding indices and their corresponding top one similar prompts
print("Best prompt embedding indices and their top one similar prompts:")
for index, prompt in all_top_similar_prompts_dict.items():
    print(f"Index: {index}, Top Similar Prompt: {prompt}")

Best prompt embedding indices and their top one similar prompts:
Index: 0, Top Similar Prompt: What is the central focus of this newspaper article?
Index: 99, Top Similar Prompt: Which category is most suitable for this news article?


In [12]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
    report_to="none",
    seed= seed,
)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Iterate over each index and its most similar prompt
for idx, prompts in all_top_similar_prompts_dict.items():
    best_prompt = prompts  # Since we now store only the top prompt per index
    print(f"Training and evaluating for best prompt embedding index: {idx}")
    print(f"Most similar prompt: {best_prompt}")

    # Function to tokenize text with the given prompt
    def tokenize_with_prompt(examples):
        inputs = [f"{best_prompt} {text}" for text in examples['text']]
        return tokenizer(inputs, truncation=True, padding='max_length', max_length=512)

    # Apply the tokenize function to datasets
    train_data_with_prompt = train_dataset.map(tokenize_with_prompt, batched=True)
    val_data_with_prompt = val_dataset.map(tokenize_with_prompt, batched=True)
    test_data_with_prompt = test_dataset.map(tokenize_with_prompt, batched=True)

    # Initialize the Trainer
    trainer2 = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data_with_prompt,
        eval_dataset= val_data_with_prompt,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer2.train()

    # Evaluate the model on the test set
    eval_result = trainer2.evaluate(test_data_with_prompt)
    print(f"Final evaluation results for prompt index {idx} on test set: {eval_result}")

Training and evaluating for best prompt embedding index: 0
Most similar prompt: What is the central focus of this newspaper article?




Map:   0%|          | 0/1557 [00:00<?, ? examples/s]

Map:   0%|          | 0/223 [00:00<?, ? examples/s]

Map:   0%|          | 0/445 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.173473,0.973094,0.972917,0.973025,0.973094
2,No log,0.211726,0.977578,0.97754,0.977704,0.977578
3,No log,0.199031,0.977578,0.97754,0.977704,0.977578
4,No log,0.225477,0.964126,0.964025,0.964016,0.964126
5,No log,0.254776,0.973094,0.973006,0.973118,0.973094
6,0.024600,0.232339,0.973094,0.972866,0.973203,0.973094
7,0.024600,0.279718,0.959641,0.959687,0.960532,0.959641
8,0.024600,0.252082,0.96861,0.968383,0.968439,0.96861
9,0.024600,0.261466,0.973094,0.973006,0.973118,0.973094
10,0.024600,0.258927,0.973094,0.973006,0.973118,0.973094


Final evaluation results for prompt index 0 on test set: {'eval_loss': 0.1057116836309433, 'eval_accuracy': 0.9685393258426966, 'eval_f1': 0.9687662092358764, 'eval_precision': 0.9695136324349808, 'eval_recall': 0.9685393258426966, 'eval_runtime': 4.5491, 'eval_samples_per_second': 97.822, 'eval_steps_per_second': 6.155, 'epoch': 10.0}
Training and evaluating for best prompt embedding index: 99
Most similar prompt: Which category is most suitable for this news article?


Map:   0%|          | 0/1557 [00:00<?, ? examples/s]

Map:   0%|          | 0/223 [00:00<?, ? examples/s]

Map:   0%|          | 0/445 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.354597,0.941704,0.942078,0.943757,0.941704
2,No log,0.291746,0.973094,0.972914,0.973391,0.973094
3,No log,0.298831,0.959641,0.959543,0.960065,0.959641
4,No log,0.263637,0.964126,0.963894,0.963857,0.964126
5,No log,0.447882,0.93722,0.937672,0.942715,0.93722
6,0.013800,0.291979,0.964126,0.963945,0.964037,0.964126
7,0.013800,0.373214,0.950673,0.950518,0.951479,0.950673
8,0.013800,0.368375,0.950673,0.950518,0.951479,0.950673
9,0.013800,0.352266,0.959641,0.959481,0.959633,0.959641
10,0.013800,0.348289,0.959641,0.959481,0.959633,0.959641


Final evaluation results for prompt index 99 on test set: {'eval_loss': 0.1525474637746811, 'eval_accuracy': 0.9865168539325843, 'eval_f1': 0.9865610120512225, 'eval_precision': 0.987067728343116, 'eval_recall': 0.9865168539325843, 'eval_runtime': 4.5692, 'eval_samples_per_second': 97.391, 'eval_steps_per_second': 6.128, 'epoch': 10.0}


In [13]:
def classify_article_with_prompt_embedding(prompt_embedding, article_text):
    prompt_text = " ".join(map(str, prompt_embedding))
    input_text = f"{prompt_text} {article_text}"
    inputs = tokenizer(input_text, truncation=True, max_length=512, padding='max_length', return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    label_id = outputs.logits.argmax(dim=1).item()
    return label_id

def evaluate_prompt_embedding(prompt_embedding):
    predictions = [classify_article_with_prompt_embedding(prompt_embedding, article_text) for article_text in train_dataset['text']]
    return accuracy_score(train_dataset['label'], predictions)

# Bayesian optimization function
def black_box_function(prompt_idx):
    prompt_embedding = prompt_embeddings1[int(prompt_idx)]
    return evaluate_prompt_embedding(prompt_embedding)

pbounds = {'prompt_idx': (0, len(prompt_embeddings1) - 1)}

optimizer = BayesianOptimization(
    f=black_box_function,
    pbounds=pbounds,
    random_state=42,
)

optimizer.maximize(
    init_points=5,
    n_iter=15,
)

|   iter    |  target   | prompt... |
-------------------------------------
| [30m1         | [30m0.1747    | [30m35.58     |
| [35m2         | [35m0.1753    | [35m90.32     |
| [30m3         | [30m0.1747    | [30m69.54     |
| [30m4         | [30m0.1747    | [30m56.87     |
| [35m5         | [35m0.2351    | [35m14.82     |
| [30m6         | [30m0.2306    | [30m5.559     |
| [30m7         | [30m0.1843    | [30m13.78     |
| [30m8         | [30m0.1753    | [30m34.66     |
| [30m9         | [30m0.1747    | [30m15.18     |
| [30m10        | [30m0.2351    | [30m89.57     |
| [30m11        | [30m0.1747    | [30m88.27     |
| [30m12        | [30m0.2306    | [30m31.02     |
| [30m13        | [30m0.2306    | [30m5.559     |
| [30m14        | [30m0.2351    | [30m14.65     |
| [30m15        | [30m0.2351    | [30m89.29     |
| [30m16        | [30m0.2351    | [30m30.63     |
| [30m17        | [30m0.2351    | [30m30.18     |
| [30m18        | [30

In [14]:
# Step 1: Find the maximum target value
max_target_value = max(res['target'] for res in optimizer.res)

# Step 2: Collect all unique prompt indices with the maximum target value
best_prompt_indices = [int(res['params']['prompt_idx']) for res in optimizer.res if res['target'] == max_target_value]
unique_best_prompt_indices = list(set(best_prompt_indices))

# Step 3: Get the unique embeddings
best_prompt_embeddings = [prompt_embeddings1[idx] for idx in unique_best_prompt_indices]

# Step 4: Find the top one most similar prompt for each of the best embeddings
all_top_similar_prompts_dict = {}

for idx, best_prompt_embedding in enumerate(best_prompt_embeddings):
    similarities = cosine_similarity(best_prompt_embedding[np.newaxis, :], prompt_embeddings1)[0]
    # Get the index of the most similar prompt for each best embedding
    top_index = np.argmax(similarities)
    # Collect the most similar prompt
    top_similar_prompt = enhanced_prompts1[top_index]
    all_top_similar_prompts_dict[unique_best_prompt_indices[idx]] = top_similar_prompt

# Output best prompt embedding indices and their corresponding top one similar prompts
print("Best prompt embedding indices and their top one similar prompts:")
for index, prompt in all_top_similar_prompts_dict.items():
    print(f"Index: {index}, Top Similar Prompt: {prompt}")

Best prompt embedding indices and their top one similar prompts:
Index: 89, Top Similar Prompt: What is the main emphasis of this news article: business, entertainment, politics, sport or tech? Example: 'wales want rugby league training wales could follow england s lead by training with a rugby league club' is sport.
Index: 14, Top Similar Prompt: How would you explain the main subject of this news?
Index: 30, Top Similar Prompt: What is the key focus of this news article?


In [15]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
    report_to="none",
    seed= seed,
)
# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Iterate over each index and its most similar prompt
for idx, prompts in all_top_similar_prompts_dict.items():
    best_prompt = prompts  # Since we now store only the top prompt per index
    print(f"Training and evaluating for best prompt embedding index: {idx}")
    print(f"Most similar prompt: {best_prompt}")

    # Function to tokenize text with the given prompt
    def tokenize_with_prompt(examples):
        inputs = [f"{best_prompt} {text}" for text in examples['text']]
        return tokenizer(inputs, truncation=True, padding='max_length', max_length=512)

    # Apply the tokenize function to datasets
    train_data_with_prompt1 = train_dataset.map(tokenize_with_prompt, batched=True)
    val_data_with_prompt1 = val_dataset.map(tokenize_with_prompt, batched=True)
    test_data_with_prompt1 = test_dataset.map(tokenize_with_prompt, batched=True)

    # Initialize the Trainer
    trainer3 = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data_with_prompt1,
        eval_dataset=val_data_with_prompt1,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer3.train()

    # Evaluate the model on the test set
    eval_result = trainer3.evaluate(test_data_with_prompt1)
    print(f"Final evaluation results for prompt index {idx} on test set: {eval_result}")

Training and evaluating for best prompt embedding index: 89
Most similar prompt: What is the main emphasis of this news article: business, entertainment, politics, sport or tech? Example: 'wales want rugby league training wales could follow england s lead by training with a rugby league club' is sport.




Map:   0%|          | 0/1557 [00:00<?, ? examples/s]

Map:   0%|          | 0/223 [00:00<?, ? examples/s]

Map:   0%|          | 0/445 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.304824,0.964126,0.96398,0.964287,0.964126
2,No log,0.36403,0.959641,0.959543,0.960065,0.959641
3,No log,0.36714,0.96861,0.96838,0.968806,0.96861
4,No log,0.400005,0.955157,0.955113,0.955359,0.955157
5,No log,0.432474,0.950673,0.950579,0.950773,0.950673
6,0.009700,0.398105,0.950673,0.950467,0.950567,0.950673
7,0.009700,0.403762,0.955157,0.955113,0.955359,0.955157
8,0.009700,0.420143,0.955157,0.955001,0.955153,0.955157
9,0.009700,0.413287,0.959641,0.95943,0.959968,0.959641
10,0.009700,0.415734,0.959641,0.959543,0.960065,0.959641


Final evaluation results for prompt index 89 on test set: {'eval_loss': 0.22809657454490662, 'eval_accuracy': 0.9640449438202248, 'eval_f1': 0.9643265983890499, 'eval_precision': 0.9657569223033565, 'eval_recall': 0.9640449438202248, 'eval_runtime': 4.5635, 'eval_samples_per_second': 97.513, 'eval_steps_per_second': 6.136, 'epoch': 10.0}
Training and evaluating for best prompt embedding index: 14
Most similar prompt: How would you explain the main subject of this news?


Map:   0%|          | 0/1557 [00:00<?, ? examples/s]

Map:   0%|          | 0/223 [00:00<?, ? examples/s]

Map:   0%|          | 0/445 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.310658,0.96861,0.968477,0.968725,0.96861
2,No log,0.336375,0.959641,0.959481,0.959633,0.959641
3,No log,0.29924,0.973094,0.972819,0.973115,0.973094
4,No log,0.327463,0.96861,0.968383,0.968439,0.96861
5,No log,0.340861,0.96861,0.9684,0.968879,0.96861
6,0.008300,0.419665,0.955157,0.954998,0.956533,0.955157
7,0.008300,0.30924,0.973094,0.973006,0.973118,0.973094
8,0.008300,0.323489,0.964126,0.964094,0.964695,0.964126
9,0.008300,0.326392,0.964126,0.964094,0.964695,0.964126
10,0.008300,0.326461,0.964126,0.964094,0.964695,0.964126


Final evaluation results for prompt index 14 on test set: {'eval_loss': 0.14792011678218842, 'eval_accuracy': 0.9865168539325843, 'eval_f1': 0.9865791239263973, 'eval_precision': 0.9868860426997316, 'eval_recall': 0.9865168539325843, 'eval_runtime': 4.5393, 'eval_samples_per_second': 98.033, 'eval_steps_per_second': 6.168, 'epoch': 10.0}
Training and evaluating for best prompt embedding index: 30
Most similar prompt: What is the key focus of this news article?


Map:   0%|          | 0/1557 [00:00<?, ? examples/s]

Map:   0%|          | 0/223 [00:00<?, ? examples/s]

Map:   0%|          | 0/445 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.290344,0.977578,0.977245,0.978625,0.977578
2,No log,0.306624,0.96861,0.968517,0.968513,0.96861
3,No log,0.335015,0.96861,0.96838,0.968806,0.96861
4,No log,0.315152,0.973094,0.973006,0.973118,0.973094
5,No log,0.310483,0.977578,0.97754,0.977704,0.977578
6,0.002600,0.27086,0.982063,0.981833,0.98266,0.982063
7,0.002600,0.282673,0.973094,0.973006,0.973118,0.973094
8,0.002600,0.279937,0.973094,0.973006,0.973118,0.973094
9,0.002600,0.298925,0.973094,0.973006,0.973118,0.973094
10,0.002600,0.299175,0.973094,0.973006,0.973118,0.973094


Final evaluation results for prompt index 30 on test set: {'eval_loss': 0.175344318151474, 'eval_accuracy': 0.9775280898876404, 'eval_f1': 0.9775556584251022, 'eval_precision': 0.9777769666320002, 'eval_recall': 0.9775280898876404, 'eval_runtime': 4.5507, 'eval_samples_per_second': 97.787, 'eval_steps_per_second': 6.153, 'epoch': 10.0}
