In [1]:
import pandas as pd
import numpy as np
from pyprojroot import here

data_3 = pd.read_parquet(here("data/goemotions_3.parquet"))
data_3.columns

Index(['id', 'text', 'n_raters', 'emotions', 'author', 'subreddit', 'link_id',
       'parent_id', 'created_utc', 'emotion_positive', 'emotion_negative',
       'emotion_ambiguous'],
      dtype='object')

In [2]:
data_3.emotions

0        [[disappointment], [disappointment], [disappoi...
1                        [[curiosity], [curiosity], [joy]]
2        [[unclear], [sadness], [disapproval], [unclear...
3        [[sadness], [embarrassment, sadness], [unclear...
4        [[gratitude], [excitement, gratitude], [gratit...
                               ...                        
57344         [[disappointment], [confusion], [confusion]]
57345    [[realization], [annoyance, disappointment], [...
57346                    [[unclear], [unclear], [unclear]]
57347    [[admiration, amusement], [excitement], [admir...
57348                 [[unclear], [admiration], [unclear]]
Name: emotions, Length: 57349, dtype: object

In [3]:
# load distilbert
import os
os.environ["HF_HOME"] = str(here("cache/huggingface"))
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

model_3 = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model_3

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [34]:
import torch
from torchinfo import summary
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

with torch.no_grad():
  dummy_text = "This is a dummy text for testing the model."
  dummy_inputs = tokenizer(
    dummy_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512
  )
  out = summary(
    model_3,
    input_data={
      "input_ids": dummy_inputs["input_ids"].to(DEVICE).long(),
      "attention_mask": dummy_inputs["attention_mask"].to(DEVICE).long()
    },
    col_names=["input_size", "output_size", "num_params", "trainable"],
    device=DEVICE,
    batch_dim=64
  )
out

Layer (type:depth-idx)                                  Input Shape               Output Shape              Param #                   Trainable
DistilBertForSequenceClassification                     --                        [[1, 3, 1]]               --                        True
├─DistilBertModel: 1-1                                  --                        [[1, 12, 768, 1]]         --                        True
│    └─Embeddings: 2-1                                  [1, 12, 1]                [1, 12, 768]              --                        True
│    │    └─Embedding: 3-1                              [1, 12, 1]                [1, 12, 768]              23,440,896                True
│    │    └─Embedding: 3-2                              [1, 12, 1]                [1, 12, 768]              393,216                   True
│    │    └─LayerNorm: 3-3                              [1, 12, 768, 1]           [1, 12, 768]              1,536                     True
│    │    └─Dropout: 3

In [None]:
import torch
# try out a random example
example = data_3.iloc[0].text
with torch.no_grad():
  inputs = tokenizer(example, return_tensors="pt", truncation=True, padding=True)
  outputs = model_3(**inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0250, -0.1824,  0.1597]]), hidden_states=None, attentions=None)

In [None]:
import lightning as L

class DistilBertFinetune(L.LightningModule):
  def __init__(self, distilbert_model, tokenizer, n_emotions=3):
    super().__init__()
    self.tokenizer = tokenizer
    self.model = distilbert_model
    self.model.classifier = torch.nn.Sequential(
      torch.nn.Linear(in_features=768, out_features=n_emotions, bias=True),
      torch.nn.Sigmoid()
    )
  def training_step(self, batch, batch_idx):
    x, _ = batch
    tokens = self.tokenizer( # TODO not sure what the args do
      x,
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=512
    )
    y = self.model(
      input_ids=tokens["input_ids"],
      attention_mask=tokens["attention_mask"]
    )
    loss = torch.nn.functional.binary_cross_entropy_with_logits( # TODO see which one to use
      y.logits,
      batch[1].float() # TODO see what this does
    )
    return loss
  def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
    return optimizer
  
  def test_step(self, batch, batch_idx):
    x, _ = batch
    tokens = self.tokenizer(
      x,
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=512
    )
    y = self.model(
      input_ids=tokens["input_ids"],
      attention_mask=tokens["attention_mask"]
    )
    loss = torch.nn.functional.binary_cross_entropy_with_logits(
      y.logits,
      batch[1].float()
    )
    self.log("test_loss", loss)
    return loss
    
  def validation_step(self, batch, batch_idx):
    x, _ = batch
    tokens = self.tokenizer(
      x,
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=512
    )
    y = self.model(
      input_ids=tokens["input_ids"],
      attention_mask=tokens["attention_mask"]
    )
    loss = torch.nn.functional.binary_cross_entropy_with_logits(
      y.logits,
      batch[1].float()
    )
    self.log("val_loss", loss)
    return loss

transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification