In [14]:
import pandas as pd
import numpy as np
from pyprojroot import here

data_3 = pd.read_parquet(here("data/goemotions_3.parquet"))
data_3.columns

Index(['id', 'text', 'n_raters', 'emotions', 'author', 'subreddit', 'link_id',
       'parent_id', 'created_utc', 'emotion_positive', 'emotion_negative',
       'emotion_ambiguous'],
      dtype='object')

In [15]:
data_3.emotions

0        [[disappointment], [disappointment], [disappoi...
1                        [[curiosity], [curiosity], [joy]]
2        [[unclear], [sadness], [disapproval], [unclear...
3        [[sadness], [embarrassment, sadness], [unclear...
4        [[gratitude], [excitement, gratitude], [gratit...
                               ...                        
57344         [[disappointment], [confusion], [confusion]]
57345    [[realization], [annoyance, disappointment], [...
57346                    [[unclear], [unclear], [unclear]]
57347    [[admiration, amusement], [excitement], [admir...
57348                 [[unclear], [admiration], [unclear]]
Name: emotions, Length: 57349, dtype: object

In [16]:
data_3.shape

(57349, 12)

In [17]:
import os
os.environ["HF_HOME"] = str(here("cache/huggingface"))
from transformers import AutoModelForSequenceClassification, AutoTokenizer

distill_3 = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)
distill_3 = distill_3.to(DEVICE)  # Move model to device
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
distill_3

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [18]:
# see tokenization in process
"""
DistilBERT tokenizer return whole tokens if they exist in the vocabulary, 
but falls back to WordPiece subwords if not
"""
tokens = tokenizer("I love programming in Python! The ö Unbelievablingy")
print(tokenizer.convert_ids_to_tokens(tokens["input_ids"]))


['[CLS]', 'i', 'love', 'programming', 'in', 'python', '!', 'the', 'o', 'un', '##bel', '##ie', '##va', '##bling', '##y', '[SEP]']


In [19]:
import torch
from torchinfo import summary
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

with torch.no_grad():
  dummy_text = "This is a dummy text for testing the model."
  dummy_inputs = tokenizer(
    dummy_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512
  )
  dummy_inputs = {k: v.to(DEVICE) for k, v in dummy_inputs.items()}  # Move inputs to device
  out = summary(
    distill_3,
    input_data={
      "input_ids": dummy_inputs["input_ids"].long(),
      "attention_mask": dummy_inputs["attention_mask"].long()
    },
    col_names=["input_size", "output_size", "num_params", "trainable"],
    device=DEVICE,
    batch_dim=64
  )
out

Layer (type:depth-idx)                                  Input Shape               Output Shape              Param #                   Trainable
DistilBertForSequenceClassification                     --                        [[1, 3, 1]]               --                        True
├─DistilBertModel: 1-1                                  --                        [[1, 12, 768, 1]]         --                        True
│    └─Embeddings: 2-1                                  [1, 12, 1]                [1, 12, 768]              --                        True
│    │    └─Embedding: 3-1                              [1, 12, 1]                [1, 12, 768]              23,440,896                True
│    │    └─Embedding: 3-2                              [1, 12, 1]                [1, 12, 768]              393,216                   True
│    │    └─LayerNorm: 3-3                              [1, 12, 768, 1]           [1, 12, 768]              1,536                     True
│    │    └─Dropout: 3

In [57]:
import lightning as L
import torchmetrics
from icecream import ic
import torchmetrics.classification
import torchmetrics.regression
import torchmetrics.text

class DistilBertFinetune(L.LightningModule):
  def __init__(self, distilbert_model, tokenizer, n_emotions=3):
    super().__init__()
    self.tokenizer = tokenizer
    self.model = distilbert_model.to(DEVICE)  # Ensure model is on device
    self.model.train()
    self.model.classifier = torch.nn.Linear(in_features=768, out_features=n_emotions, bias=True).to(DEVICE)
    # Freeze all layers except classifier and pre-classifier
    for param in self.model.parameters():
      param.requires_grad = False
    for param in self.model.classifier.parameters():
      param.requires_grad = True
    for param in self.model.pre_classifier.parameters():
      param.requires_grad = True
    self.sigmoid = torch.nn.Sigmoid()
    self.sig_loss = torch.nn.BCEWithLogitsLoss()
    # self.preplexity = torchmetrics.text.Perplexity()
    self.f1 = torchmetrics.classification.MultilabelF1Score(num_labels=n_emotions, average="macro") # macro is average of f1s, micro is global f1
    self.rmse = torchmetrics.regression.MeanSquaredError(squared=False)
  
  def training_step(self, batch):
    x, target = batch
    tokens = self.tokenizer(
      x,
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=512
    )
    tokens = {k: v.to(DEVICE) for k, v in tokens.items()}
    target = target.to(DEVICE)
    logits = self.model(
      input_ids=tokens["input_ids"],
      attention_mask=tokens["attention_mask"]
    )
    loss = self.sig_loss(
      logits.logits,
      target
    )
    return loss

  def validation_step(self, batch):
    x, target = batch
    tokens = self.tokenizer(
      x,
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=512
    )
    tokens = {k: v.to(DEVICE) for k, v in tokens.items()}
    target = target.to(DEVICE)
    logits = self.model(
      input_ids=tokens["input_ids"],
      attention_mask=tokens["attention_mask"]
    )
    loss = self.sig_loss(
      logits.logits,
      target
    )
    y = self.sigmoid(logits.logits)
    self.log_dict({
      "val_loss": loss, 
      # "val_perplexity": self.preplexity(preds=y, target=target), # ValueError: Input tensor `preds` is expected to have 3 dimensions, [batch_size, seq_len, vocab_size], but got 2.
      "val_rmse": self.rmse(y, target),
    }, on_step=False, on_epoch=True, prog_bar=True, logger=False)
    return loss
  
  def test_step(self, batch):
    x, target = batch
    tokens = self.tokenizer(
      x,
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=512
    )
    tokens = {k: v.to(DEVICE) for k, v in tokens.items()}
    target = target.to(DEVICE)
    logits = self.model(
      input_ids=tokens["input_ids"],
      attention_mask=tokens["attention_mask"]
    )
    loss = self.sig_loss(
      logits.logits,
      target
    )
    y = self.sigmoid(logits.logits)
    # preplexity = self.preplexity(preds=y, target=target)
    rmse = self.rmse(y, target)
    # if target > 0 then 1
    # if target == 0 then 0
    y = (y > 0.5).int() # thresholding at 0.5 # TODO possibly change this later
    target = (target > 0.01).int()
    f1 = self.f1(y, target)
    self.log_dict({
      "test_loss": loss,
      # "test_perplexity": preplexity,
      "test_f1": f1,
      "test_rmse": rmse
    }, on_step=False, on_epoch=True, prog_bar=True, logger=False)
    return loss
  
  def predict_step(self, batch):
    x = batch
    tokens = self.tokenizer(
      x,
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=512
    )
    tokens = {k: v.to(DEVICE) for k, v in tokens.items()}
    logits = self.model(
      input_ids=tokens["input_ids"],
      attention_mask=tokens["attention_mask"]
    )
    y = self.sigmoid(logits.logits)
    return y
  
  def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
    return optimizer
  
  def foward(self, x):
    tokens = self.tokenizer(
      x,
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=512
    )
    tokens = {k: v.to(DEVICE) for k, v in tokens.items()}  # Move tokens to device
    logits = self.model(
      input_ids=tokens["input_ids"],
      attention_mask=tokens["attention_mask"]
    )
    y = self.sigmoid(logits.logits)
    return y
  
model_3 = DistilBertFinetune(
  distilbert_model=distill_3,
  tokenizer=tokenizer,
  n_emotions=3
)
model_3 = model_3.to(DEVICE)  # Move LightningModule to device
model_3

DistilBertFinetune(
  (model): DistilBertForSequenceClassification(
    (distilbert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0-5): 6 x TransformerBlock(
            (attention): DistilBertSdpaAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        

In [46]:
# collect eval with callbacks
from lightning.pytorch.callbacks import Callback

class MetricsCallback(Callback):
  def __init__(self):
    self.val_losses = []
    self.epochs = []
    
  def on_validation_epoch_end(self, trainer, pl_module):
    self.val_losses.append(trainer.callback_metrics["val_loss"].item())
    self.epochs.append(trainer.current_epoch)
    
metrics = MetricsCallback()
metrics

<__main__.MetricsCallback at 0x21652c59160>

In [47]:
# make dataloader using Lightning's DataLoader
from lightning.pytorch import LightningDataModule
from sklearn.model_selection import train_test_split
from lightning.pytorch import seed_everything

class GoEmotionsDataset(torch.utils.data.Dataset):
  def __init__(self, dataframe):
    self.dataframe = dataframe

  def __len__(self):
    return len(self.dataframe)

  def __getitem__(self, idx):
    text = self.dataframe.iloc[idx].text
    # Select all columns that start with "emotion_"
    emotions = self.dataframe.iloc[idx].filter(like="emotion_").values
    return text, torch.tensor(np.array(emotions, dtype=np.float32), dtype=torch.float32).to(DEVICE)

# Move the split logic into the DataModule class
class GoEmotionsDataModule(L.LightningDataModule):
  def __init__(self, dataframe, batch_size=64):
    super().__init__()
    self.dataframe = dataframe
    self.batch_size = batch_size
  
  def prepare_data(self):
    self.train_df, temp_df = train_test_split(self.dataframe, test_size=0.1)
    self.val_df, self.test_df = train_test_split(temp_df, test_size=0.8)

  def setup(self, stage=None):
    self.train_dataset = GoEmotionsDataset(self.train_df)
    self.val_dataset = GoEmotionsDataset(self.val_df)
    self.test_dataset = GoEmotionsDataset(self.test_df)

  def train_dataloader(self):
    return torch.utils.data.DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

  def val_dataloader(self):
    return torch.utils.data.DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False)

  def test_dataloader(self):
    return torch.utils.data.DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False)

data_module = GoEmotionsDataModule(data_3, batch_size=64)
data_module

<__main__.GoEmotionsDataModule at 0x2166a92b770>

In [48]:
seed_everything(42, workers=True)
# train
trainer = L.Trainer(
  max_epochs=1,
  callbacks=[
    metrics,
  ],
  deterministic=True,
  default_root_dir=here("cache/lightning"),
  enable_checkpointing=True,
)
trainer

Seed set to 42
Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


<lightning.pytorch.trainer.trainer.Trainer at 0x21652895090>

In [49]:
trainer.fit(
  model_3,
  datamodule=data_module,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                                | Params | Mode 
-------------------------------------------------------------------------
0 | model    | DistilBertForSequenceClassification | 67.0 M | train
1 | sigmoid  | Sigmoid                             | 0      | train
2 | sig_loss | BCEWithLogitsLoss                   | 0      | train
3 | f1       | MultilabelF1Score                   | 0      | train
4 | rmse     | MeanSquaredError                    | 0      | train
-------------------------------------------------------------------------
592 K     Trainable params
66.4 M    Non-trainable params
67.0 M    Total params
267.823   Total estimated model params size (MB)
100       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\Plancha\emotion-temp\.pixi\envs\gpu\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
c:\Users\Plancha\emotion-temp\.pixi\envs\gpu\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


In [69]:
# make a sample prediction using model_3

sample_text = "Hello, Mosm"
with torch.no_grad():
  out = model_3.foward(sample_text)  # Forward pass through the model
out

tensor([[0.4363, 0.3697, 0.4708]], device='cuda:0')

In [63]:
model_3.foward(sample_text)  # Forward pass through the model

tensor([[0.4363, 0.3697, 0.4708]], device='cuda:0', grad_fn=<SigmoidBackward0>)