### Imports


In [1]:
import transformers, datasets
import numpy as np
from datasets import load_dataset
transformers.__version__, datasets.__version__

('4.39.3', '2.18.0')

### WandB for logging performance metrics


In [2]:
import os 
os.environ['WANDB_API_KEY'] = '#######################'

### Dataset

In [3]:
raw_datasets = load_dataset("glue", "sst2")
raw_datasets

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 3.11M/3.11M [00:00<00:00, 5.61MB/s]
Downloading data: 100%|██████████| 72.8k/72.8k [00:00<00:00, 138kB/s]
Downloading data: 100%|██████████| 148k/148k [00:00<00:00, 321kB/s]


Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [4]:
raw_datasets['train']

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})

In [5]:
raw_datasets['train'][50000:50003]

{'sentence': ['glow ',
  'a classical dramatic animated feature ',
  'best espionage picture '],
 'label': [1, 1, 1],
 'idx': [50000, 50001, 50002]}

In [6]:
raw_datasets['train'].features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None)}

### Model and Tokenizer

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# checkpoint = "bert-base-uncased"
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
!pip install torchinfo
from torchinfo import summary
summary(model)



Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0

In [9]:
tokenized_sentences = tokenizer(raw_datasets['train'][0:3]['sentence'])
#tokenized_sentences = tokenizer(raw_datasets['train'][0:3]['sentence'], padding = 'max_length',truncation = True )

print(tokenized_sentences)

{'input_ids': [[101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102], [101, 3397, 2053, 15966, 1010, 2069, 4450, 2098, 18201, 2015, 102], [101, 2008, 7459, 2049, 3494, 1998, 10639, 2015, 2242, 2738, 3376, 2055, 2529, 3267, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [10]:
def tokenize_fn(batch):
    return tokenizer(batch['sentence'], padding = 'max_length', truncation=True)

In [11]:
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

### Training Arguments

In [12]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    'd_bert_glue_sst2',
    num_train_epochs = 5,
    learning_rate = 1e-5,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 64, 
    evaluation_strategy='steps',
    save_strategy='steps',
    logging_steps = 0.05, 
    logging_dir="./logs", 
    save_steps = 0.05, 
    load_best_model_at_end = True, 
    run_name= "d_bert_sst2_run_1", fp16 = False,  
    save_total_limit = 3, report_to="wandb")

### Trainer and performance metrics 

In [13]:
from transformers import Trainer


2024-05-18 17:15:29.880265: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-18 17:15:29.880363: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-18 17:15:30.026431: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [14]:
import torch
lang = 'eng'
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, log_loss
from torch.nn import CrossEntropyLoss
def compute_metrics(pred):
    global num_labels
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    loss_fct = CrossEntropyLoss()
    logits = torch.tensor(pred.predictions)
    labels = torch.tensor(labels)
    loss = loss_fct(logits.view(-1, 2), labels.view(-1))
    return {
        'accuracy@'+lang: acc,
        'f1@'+lang: f1,
        'precision@'+lang: precision,
        'recall@'+lang: recall,
        'loss@'+lang: loss,
    }

# def compute_metrics(logits_and_labels):
#   logits, labels = logits_and_labels
#   predictions = np.argmax(logits, axis=-1)
#   return {"accuracy" : accuracy_score(y_pred=predictions, y_true=labels)}

In [15]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


### Training

In [16]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mgautam-taaresh[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.17.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240518_171540-pgt5bm0w[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33md_bert_sst2_run_1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/gautam-taaresh/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/gautam-taaresh/huggingface/runs/pgt5bm0w[0m


Step,Training Loss,Validation Loss,Accuracy@eng,F1@eng,Precision@eng,Recall@eng,Loss@eng
527,0.3458,0.273114,0.881881,0.881877,0.881962,0.882072,0.273114
1054,0.2416,0.255092,0.895642,0.895642,0.896,0.895965,0.255092
1581,0.2131,0.251674,0.902523,0.902522,0.902681,0.902764,0.251674
2108,0.1942,0.26769,0.897936,0.897852,0.898183,0.897712,0.26769
2635,0.1516,0.287708,0.900229,0.900134,0.90059,0.899964,0.287708
3162,0.1335,0.310843,0.902523,0.902443,0.902778,0.902301,0.310843
3689,0.137,0.301836,0.895642,0.895474,0.896595,0.895207,0.301836
4216,0.1397,0.323393,0.894495,0.894201,0.896709,0.893828,0.323393
4743,0.1043,0.35907,0.901376,0.901147,0.903112,0.900796,0.35907
5270,0.1108,0.343321,0.904817,0.904771,0.904843,0.904721,0.343321


TrainOutput(global_step=10525, training_loss=0.13084612087512526, metrics={'train_runtime': 9267.7673, 'train_samples_per_second': 36.335, 'train_steps_per_second': 1.136, 'total_flos': 4.460773416041472e+16, 'train_loss': 0.13084612087512526, 'epoch': 5.0})