In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk /kaggle/input and print the full path of every file found,
# so the available data files show up in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory currently being visited with the file name.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.1


In [3]:
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset
import torch
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, GPT2Model
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from bayes_opt import BayesianOptimization
import random
from collections import defaultdict

2024-08-08 16:40:25.036807: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-08 16:40:25.036927: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-08 16:40:25.176666: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
import spacy
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK stop-word corpus; preprocess_text reads it via stopwords.words('english').
nltk.download('stopwords')

# Load spaCy's "en_core_web_sm" pipeline; preprocess_text calls it to tokenize and lemmatize text.
nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def stratified_sample(df, fraction, random_state=None, label_col='label'):
    """Return a random subset of ``df`` that keeps the class proportions of ``label_col``.

    Args:
        df: DataFrame containing the column named by ``label_col``.
        fraction: Share of rows to keep, in the interval (0, 1].
        random_state: Optional seed (or RandomState) forwarded to the sampler.
            ``None`` (the default) keeps the previous behaviour of drawing from
            NumPy's global random state.
        label_col: Name of the column whose distribution is preserved.

    Returns:
        A DataFrame with roughly ``fraction`` of the rows of ``df``, in shuffled order.

    Raises:
        ValueError: If ``fraction`` is not in (0, 1].
    """
    if not 0 < fraction <= 1:
        raise ValueError(f"fraction must be in (0, 1], got {fraction!r}")
    if fraction == 1:
        # train_test_split rejects test_size=0, so "keep everything" is handled
        # here: return all rows, shuffled like any other sample would be.
        return df.sample(frac=1.0, random_state=random_state)
    df_sampled, _ = train_test_split(
        df,
        test_size=(1 - fraction),
        stratify=df[label_col],
        random_state=random_state,
    )
    return df_sampled

_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, reading the corpus only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Lemmatize ``text`` and drop stop words and non-alphabetic tokens.

    Args:
        text: Raw input string.

    Returns:
        The lemmas of the remaining tokens joined by single spaces.
    """
    # The stop-word set is cached: this function is applied row by row, and
    # rebuilding the set from the corpus on every call is loop-invariant work.
    stop_words = _english_stop_words()
    lemmas = [
        token.lemma_
        for token in nlp(text)
        # Lower-case before the lookup: the NLTK list is lower-case, so a
        # capitalised "The" must be filtered exactly like "the".
        if token.is_alpha and token.text.lower() not in stop_words
    ]
    return " ".join(lemmas)

In [6]:
seed = 42

# NOTE(review): trust_remote_code=True lets the dataset repository run its own loading
# script on this machine - confirm it is still required for this dataset.
dataset = load_dataset("dair-ai/emotion", "split", trust_remote_code=True)

# Convert to pandas DataFrame
train_df = pd.DataFrame(dataset['train'])
val_df = pd.DataFrame(dataset['validation'])
test_df = pd.DataFrame(dataset['test'])

# Define the fraction for sampling
fraction = 0.5

# Seed NumPy's global generator right before sampling: stratified_sample is called
# without an explicit seed, so train_test_split draws from that global state and
# every run would otherwise keep a different subset of each split.
np.random.seed(seed)

# Perform stratified sampling
sampled_train_df = stratified_sample(train_df, fraction)
sampled_val_df = stratified_sample(val_df, fraction)
sampled_test_df = stratified_sample(test_df, fraction)

# Preprocess the text (lemmatize, drop stop words and non-alphabetic tokens)
sampled_train_df['text'] = sampled_train_df['text'].apply(preprocess_text)
sampled_val_df['text'] = sampled_val_df['text'].apply(preprocess_text)
sampled_test_df['text'] = sampled_test_df['text'].apply(preprocess_text)

# Display the sampled and preprocessed datasets
print(sampled_train_df.head())
print(sampled_val_df.head())
print(sampled_test_df.head())

Downloading readme:   0%|          | 0.00/9.05k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/127k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/129k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

                                                    text  label
5196     feel dull many time headache many time insomnia      0
5655    certain situation feel neglect undeservedly harm      3
15232                                     feel unwelcome      0
14010  raphael say grasp usual eloquence feel slip fi...      3
2296               remember feel surprised option listen      5
                                                   text  label
228   begin feel shaky heart sort skip around feel l...      4
1901                  believe ground game feel superior      1
1985  start see concern pattern rush home end evenin...      3
988   constantly amazed world building maybe come ha...      2
1973              find feel bit shame defensive exclude      0
                                                   text  label
479   venture fabric amp fabric whim yesterday feel ...      4
730   feel like rich purple gold match make heaven r...      1
1826  always feel like amazing style day choose c

In [7]:
# Convert pandas DataFrame back to Hugging Face Dataset.
# preserve_index=False: the sampled frames keep the row labels of the original splits
# as a non-contiguous index; by default from_pandas would store that index as an
# extra "__index_level_0__" column next to "text" and "label".
sampled_train_dataset = Dataset.from_pandas(sampled_train_df, preserve_index=False)
sampled_val_dataset = Dataset.from_pandas(sampled_val_df, preserve_index=False)
sampled_test_dataset = Dataset.from_pandas(sampled_test_df, preserve_index=False)

In [8]:
# Seed PyTorch before the model is built: the classification head (score.weight) is
# not part of the distilgpt2 checkpoint and is randomly initialised on load, which
# happens before the Trainer gets a chance to apply its own seed.
torch.manual_seed(seed)

# Load GPT2Tokenizer and Model
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')

# Add pad_token (needed so batches of different lengths can be padded)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = GPT2ForSequenceClassification.from_pretrained('distilgpt2', num_labels=6)

# Resize token embeddings to match the tokenizer length (it grew by the new pad token)
model.resize_token_embeddings(len(tokenizer))

# Set the padding token ID in the model configuration
model.config.pad_token_id = tokenizer.pad_token_id

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of examples for the classifier.

    Args:
        examples: Batch mapping with a ``'text'`` entry, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per example and left unpadded.
    """
    # No padding here: with padding=True every example would be padded to the
    # longest text of its map batch and stored that way. The DataCollatorWithPadding
    # handed to the Trainer already pads each training/eval batch on the fly.
    return tokenizer(examples['text'], truncation=True, max_length=512)

# Tokenize the train, validation and test splits (in that order); batched=True
# hands tokenize_function many rows per call.
train_data1, val_data1, test_data1 = (
    split.map(tokenize_function, batched=True)
    for split in (sampled_train_dataset, sampled_val_dataset, sampled_test_dataset)
)

# Collator that pads the examples of a batch with this tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define compute metrics function
def compute_metrics(eval_pred):
    """Turn an evaluation prediction pair into classification metrics.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds one row
            of per-class scores per example.

    Returns:
        Dict with ``accuracy`` plus support-weighted ``f1``, ``precision`` and ``recall``.
    """
    scores, labels = eval_pred
    # The predicted class is the column with the highest score.
    predicted = scores.argmax(axis=1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predicted, average='weighted'
    )
    return {
        'accuracy': accuracy_score(labels, predicted),
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }


# Training configuration: evaluate and checkpoint once per epoch for 10 epochs.
training_args = TrainingArguments(
    output_dir='./results',  # checkpoints are written here
    evaluation_strategy="epoch",
    save_strategy="epoch",  # same cadence as evaluation_strategy
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    # NOTE(review): no metric_for_best_model is given, so "best" is whatever the
    # library default selects (presumably the validation loss) - confirm.
    load_best_model_at_end=True,
    report_to="none",  # presumably disables external experiment trackers - confirm
    seed=seed,
)

# Define Trainer: trains on the tokenized training split and evaluates on the
# tokenized validation split, scoring with compute_metrics.
trainer1 = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data1,
    eval_dataset=val_data1,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model
trainer1.train()

# Evaluate the best model on the test set (the held-out split, not used during training)
eval_result = trainer1.evaluate(test_data1)
print(f"Final evaluation results on test set: {eval_result}")

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.1796,0.548484,0.812,0.810396,0.811602,0.812
2,0.457,0.359942,0.875,0.875531,0.878352,0.875
3,0.2932,0.31291,0.895,0.894347,0.898216,0.895
4,0.2197,0.284184,0.918,0.917515,0.918392,0.918
5,0.1879,0.286402,0.909,0.909409,0.912162,0.909
6,0.1497,0.322613,0.919,0.919146,0.919908,0.919
7,0.1312,0.333217,0.912,0.911259,0.91309,0.912
8,0.1147,0.309086,0.917,0.916133,0.91656,0.917
9,0.106,0.313349,0.921,0.920792,0.921095,0.921
10,0.0993,0.315011,0.92,0.919832,0.920538,0.92


Final evaluation results on test set: {'eval_loss': 0.3025406002998352, 'eval_accuracy': 0.911, 'eval_f1': 0.9093777531196711, 'eval_precision': 0.9100897336028915, 'eval_recall': 0.911, 'eval_runtime': 0.8237, 'eval_samples_per_second': 1213.984, 'eval_steps_per_second': 76.481, 'epoch': 10.0}
