In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, Trainer, TrainingArguments, DistilBertForSequenceClassification
import torch
from torch.utils.data import Dataset
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
2024-03-06 16:16:20.961893: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class EmotionDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item
    
    def __len__(self):
        return len(self.labels)

In [3]:
def tokenize_data(X, y, max_length=128): 
    encodings = tokenizer(X, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
    labels = torch.tensor(y, dtype=torch.float)
    return EmotionDataset(encodings, labels)

In [4]:
df = pd.read_csv('text_df.csv', index_col=0)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df.iloc[:, 1:], test_size=0.3, random_state=42)

In [6]:
tokenizer = BertTokenizer.from_pretrained('distilbert-base-uncased')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [7]:
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=512)

In [8]:
train_dataset = EmotionDataset(train_encodings, y_train.values)
test_dataset = EmotionDataset(test_encodings, y_test.values)

In [9]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1, 
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8, 
    warmup_steps=0,
    weight_decay=0,
    logging_dir='./logs',
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [None]:
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions

In [22]:
probabilities = torch.sigmoid(torch.tensor(predictions.predictions)).numpy()

threshold = 0.5
pred_labels = (probabilities > threshold).astype(int)

In [23]:
y_true_flat = y_test.values.flatten()
pred_labels_flat = pred_labels.flatten()

In [24]:
precision = precision_score(y_true_flat, pred_labels_flat, average='binary')
recall = recall_score(y_true_flat, pred_labels_flat, average='binary')
f1 = f1_score(y_true_flat, pred_labels_flat, average='binary')
accuracy = accuracy_score(y_true_flat, pred_labels_flat)

In [25]:
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Accuracy: {accuracy}")

Precision: 0.9251517049258211
Recall: 0.9157125248480616
F1 Score: 0.9204079148076799
Accuracy: 0.9735214515790878
