In [1]:
import pandas as pd
import numpy as np
from pyprojroot import here

data_3 = pd.read_parquet(here("data/goemotions_3.parquet"))
data_3.columns

Index(['id', 'text', 'n_raters', 'emotions', 'author', 'subreddit', 'link_id',
       'parent_id', 'created_utc', 'emotion_positive', 'emotion_negative',
       'emotion_ambiguous'],
      dtype='object')

In [2]:
data_3.emotions

0        [[disappointment], [disappointment], [disappoi...
1                        [[curiosity], [curiosity], [joy]]
2        [[unclear], [sadness], [disapproval], [unclear...
3        [[sadness], [embarrassment, sadness], [unclear...
4        [[gratitude], [excitement, gratitude], [gratit...
                               ...                        
57344         [[disappointment], [confusion], [confusion]]
57345    [[realization], [annoyance, disappointment], [...
57346                    [[unclear], [unclear], [unclear]]
57347    [[admiration, amusement], [excitement], [admir...
57348                 [[unclear], [admiration], [unclear]]
Name: emotions, Length: 57349, dtype: object

In [None]:
import os
os.environ["HF_HOME"] = str(here("cache/huggingface"))
from transformers import AutoModelForSequenceClassification, AutoTokenizer

distill_3 = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
distill_3

In [None]:
# see tokenization in process
"""
DistilBERT tokenizer return whole tokens if they exist in the vocabulary, 
but falls back to WordPiece subwords if not
"""
tokens = tokenizer("I love programming in Python! Unbelievablingy")
print(tokenizer.convert_ids_to_tokens(tokens["input_ids"]))


['[CLS]', 'i', 'love', 'programming', 'in', 'python', '!', 'un', '##bel', '##ie', '##va', '##bling', '##y', '[SEP]']


In [None]:
import torch
from torchinfo import summary
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

with torch.no_grad():
  dummy_text = "This is a dummy text for testing the model."
  dummy_inputs = tokenizer(
    dummy_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512
  )
  out = summary(
    model_3,
    input_data={
      "input_ids": dummy_inputs["input_ids"].to(DEVICE).long(),
      "attention_mask": dummy_inputs["attention_mask"].to(DEVICE).long()
    },
    col_names=["input_size", "output_size", "num_params", "trainable"],
    device=DEVICE,
    batch_dim=64
  )
out

Layer (type:depth-idx)                                  Input Shape               Output Shape              Param #                   Trainable
DistilBertForSequenceClassification                     --                        [[1, 3, 1]]               --                        True
├─DistilBertModel: 1-1                                  --                        [[1, 12, 768, 1]]         --                        True
│    └─Embeddings: 2-1                                  [1, 12, 1]                [1, 12, 768]              --                        True
│    │    └─Embedding: 3-1                              [1, 12, 1]                [1, 12, 768]              23,440,896                True
│    │    └─Embedding: 3-2                              [1, 12, 1]                [1, 12, 768]              393,216                   True
│    │    └─LayerNorm: 3-3                              [1, 12, 768, 1]           [1, 12, 768]              1,536                     True
│    │    └─Dropout: 3

In [None]:
import torch
# try out a random example
example = data_3.iloc[0].text
with torch.no_grad():
  inputs = tokenizer(example, return_tensors="pt", truncation=True, padding=True)
  outputs = model_3(**inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0250, -0.1824,  0.1597]]), hidden_states=None, attentions=None)

In [None]:
import lightning as L
import torchmetrics
from icecream import ic
import torchmetrics.classification
import torchmetrics.regression
import torchmetrics.text

class DistilBertFinetune(L.LightningModule):
  def __init__(self, distilbert_model, tokenizer, n_emotions=3):
    super().__init__()
    self.tokenizer = tokenizer
    self.model = distilbert_model
    self.model.classifier = torch.nn.Linear(in_features=768, out_features=n_emotions, bias=True)
    self.sigmoid = torch.nn.Sigmoid()
    self.sig_loss = torch.nn.BCEWithLogitsLoss()
    self.preplexity = torchmetrics.text.Perplexity()
    self.f1 = torchmetrics.classification.F1Score(task = "multilabel", num_classes=n_emotions, average="macro") # macro is average of f1s, micro is global f1
    self.rmse = torchmetrics.regression.MeanSquaredError(squared=False)
  def training_step(self, batch):
    x, target = batch
    tokens = self.tokenizer(
      x,
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=512
    )
    logits = self.model(
      input_ids=tokens["input_ids"],
      attention_mask=tokens["attention_mask"]
    )
    loss = self.sig_loss(
      logits.logits,
      target
    )
    return loss

  def validation_step(self, batch):
    x, target = batch
    tokens = self.tokenizer(
      x,
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=512
    )
    logits = self.model(
      input_ids=tokens["input_ids"],
      attention_mask=tokens["attention_mask"]
    )
    loss = self.sig_loss(
      logits.logits,
      target
    )
    y = self.sigmoid(logits.logits)
    self.log_dict({
      "test_loss": loss, 
      "test_perplexity": self.preplexity(preds=y, target=target),
      "test_rmse": self.rmse(y, target),
    }, on_step=False, on_epoch=True, prog_bar=True, logger=False)
    return loss
  
  def test_step(self, batch):
    x, target = batch
    tokens = self.tokenizer(
      x,
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=512
    )
    logits = self.model(
      input_ids=tokens["input_ids"],
      attention_mask=tokens["attention_mask"]
    )
    loss = self.sig_loss(
      logits.logits,
      target
    )
    y = self.sigmoid(logits.logits)
    preplexity = self.preplexity(preds=y, target=target)
    rmse = self.rmse(y, target)
    # if target > 0 then 1
    # if target == 0 then 0
    y = (y > 0.5).int() # thresholding at 0.5 # TODO possibly change this later
    target = (target > 0.01).int()
    f1 = self.f1(y, target)
    self.log_dict({
      "test_loss": loss,
      "test_perplexity": preplexity,
      "test_f1": f1,
      "test_rmse": rmse
    }, on_step=False, on_epoch=True, prog_bar=True, logger=False)
    return loss
  
  def predict_step(self, batch):
    x = batch # TODO check if this is right
    tokens = self.tokenizer(
      x,
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=512
    )
    logits = self.model(
      input_ids=tokens["input_ids"],
      attention_mask=tokens["attention_mask"]
    )
    y = self.sigmoid(logits.logits)
    return y
  
  def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
    return optimizer
  
model_3 = DistilBertFinetune(
  distilbert_model=distill_3,
  tokenizer=tokenizer,
  n_emotions=3
)
model_3

transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification