In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset
import torch
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, GPT2Model
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from bayes_opt import BayesianOptimization
import random
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

2024-08-12 14:46:48.817083: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-12 14:46:48.817186: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-12 14:46:48.948649: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# set the random seed
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

In [4]:
# Load the dataset
dataset = load_dataset("dair-ai/emotion", "split")

# Load each split
train_dataset = dataset['train']
val_dataset = dataset['validation']
test_dataset = dataset['test']

def stratified_sample(dataset, fraction):
    label_counts = defaultdict(list)
    
    # Group indices by label
    for i, example in enumerate(dataset):
        label_counts[example['label']].append(i)
    
    sampled_indices = []
    for label, indices in label_counts.items():
        # Calculate the number of samples to draw for each label
        sample_size = int(len(indices) * fraction)
        sampled_indices.extend(random.sample(indices, min(sample_size, len(indices))))
    
    return dataset.select(sampled_indices)

# Calculate the fraction size (one fifth) for each dataset
fraction = 1 / 2

# Perform stratified sampling
sampled_train_dataset = stratified_sample(train_dataset, fraction)
sampled_val_dataset = stratified_sample(val_dataset, fraction)
sampled_test_dataset = stratified_sample(test_dataset, fraction)

Downloading readme:   0%|          | 0.00/9.05k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/127k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/129k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [5]:
# Load GPT2Tokenizer and Model
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')



# Add pad_token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = GPT2ForSequenceClassification.from_pretrained('distilgpt2', num_labels=6)

# Resize token embeddings to match the tokenizer length
model.resize_token_embeddings(len(tokenizer))

# Set the padding token ID in the model configuration
model.config.pad_token_id = tokenizer.pad_token_id

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
compareprompts = [
"What mood does the author convey in the message?",
"What would be your feelings towards the message?",
"How would you describe the author's feeling in the message?"
]

In [7]:
# Define compute metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Training arguments setup
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
    report_to="none",  # Ensure reproducibility
    seed=42
)

# Data collator for padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Evaluate each prompt from compareprompts
for prompt in compareprompts:
    print(f"Evaluating with prompt: {prompt}")

    # Function to tokenize text with the given prompt
    def tokenize_with_prompt(examples):
        inputs = [f"{prompt} {text}" for text in examples['text']]
        return tokenizer(inputs, truncation=True, padding='max_length', max_length=512)

    # Apply the tokenize function to datasets
    train_data_with_prompt = sampled_train_dataset.map(tokenize_with_prompt, batched=True)
    val_data_with_prompt = sampled_val_dataset.map(tokenize_with_prompt, batched=True)
    test_data_with_prompt = sampled_test_dataset.map(tokenize_with_prompt, batched=True)

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data_with_prompt,
        eval_dataset=val_data_with_prompt,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer.train()

    # Evaluate the model on the test set
    eval_result = trainer.evaluate(test_data_with_prompt)
    print(f"Final evaluation results for prompt '{prompt}' on test set: {eval_result}")



Evaluating with prompt: What mood does the author convey in the message?


Map:   0%|          | 0/7999 [00:00<?, ? examples/s]

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

Map:   0%|          | 0/998 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.1819,0.468285,0.826827,0.819831,0.83064,0.826827
2,0.4033,0.272769,0.900901,0.900761,0.90524,0.900901
3,0.2323,0.227618,0.921922,0.921856,0.924037,0.921922
4,0.1809,0.222739,0.914915,0.915033,0.918052,0.914915
5,0.1588,0.202268,0.923924,0.924031,0.924667,0.923924
6,0.1293,0.230234,0.920921,0.921143,0.921865,0.920921
7,0.1163,0.216419,0.91992,0.920164,0.921838,0.91992
8,0.0992,0.234176,0.926927,0.926701,0.927672,0.926927
9,0.0908,0.227077,0.91992,0.919484,0.92006,0.91992
10,0.0867,0.230104,0.924925,0.924546,0.924792,0.924925


Final evaluation results for prompt 'What mood does the author convey in the message?' on test set: {'eval_loss': 0.19457325339317322, 'eval_accuracy': 0.9138276553106213, 'eval_f1': 0.9136414585790207, 'eval_precision': 0.9135478082654614, 'eval_recall': 0.9138276553106213, 'eval_runtime': 10.1759, 'eval_samples_per_second': 98.075, 'eval_steps_per_second': 6.191, 'epoch': 10.0}
Evaluating with prompt: What would be your feelings towards the message?


Map:   0%|          | 0/7999 [00:00<?, ? examples/s]

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

Map:   0%|          | 0/998 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1628,0.228287,0.925926,0.926155,0.928303,0.925926
2,0.1322,0.26404,0.924925,0.92435,0.927064,0.924925
3,0.115,0.269251,0.922923,0.921707,0.922314,0.922923
4,0.1012,0.268525,0.928929,0.92884,0.931748,0.928929
5,0.0875,0.230085,0.928929,0.928759,0.928859,0.928929
6,0.0776,0.25,0.928929,0.928545,0.929885,0.928929
7,0.0637,0.272647,0.928929,0.928923,0.929556,0.928929
8,0.0601,0.281865,0.926927,0.926246,0.927075,0.926927
9,0.0453,0.294624,0.921922,0.921247,0.922305,0.921922
10,0.0428,0.282255,0.92993,0.929483,0.929706,0.92993


Final evaluation results for prompt 'What would be your feelings towards the message?' on test set: {'eval_loss': 0.2177208662033081, 'eval_accuracy': 0.9158316633266533, 'eval_f1': 0.9150625156775797, 'eval_precision': 0.916904343193386, 'eval_recall': 0.9158316633266533, 'eval_runtime': 10.1349, 'eval_samples_per_second': 98.472, 'eval_steps_per_second': 6.216, 'epoch': 10.0}
Evaluating with prompt: How would you describe the author's feeling in the message?


Map:   0%|          | 0/7999 [00:00<?, ? examples/s]

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

Map:   0%|          | 0/998 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1389,0.210999,0.930931,0.931037,0.932174,0.930931
2,0.1166,0.221449,0.923924,0.923472,0.924764,0.923924
3,0.0918,0.273471,0.92993,0.928887,0.929326,0.92993
4,0.0921,0.267106,0.924925,0.92507,0.926143,0.924925
5,0.0708,0.272114,0.932933,0.932811,0.933256,0.932933
6,0.0639,0.244838,0.92993,0.929565,0.929975,0.92993
7,0.0578,0.272062,0.924925,0.92444,0.92463,0.924925
8,0.0456,0.272488,0.932933,0.932268,0.932894,0.932933
9,0.0365,0.272679,0.936937,0.936124,0.936711,0.936937
10,0.0337,0.274472,0.932933,0.932413,0.932804,0.932933


Final evaluation results for prompt 'How would you describe the author's feeling in the message?' on test set: {'eval_loss': 0.2148037552833557, 'eval_accuracy': 0.9198396793587175, 'eval_f1': 0.9188766298425116, 'eval_precision': 0.9199351286206492, 'eval_recall': 0.9198396793587175, 'eval_runtime': 10.1156, 'eval_samples_per_second': 98.659, 'eval_steps_per_second': 6.228, 'epoch': 10.0}
