## Read the data

In [3]:
!pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp312-cp312-win_amd64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.24.5-py3-none-any.whl.metadata (13 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
   ---------------------------------------- 0.0/547.8 kB ? eta -:--:--
   -------------- ------------------------- 204.8/547.8 kB 6.3 MB/s eta 0:00:01
   ---------------------------------------  542.7/547.8 kB 8.6 MB/s e



In [7]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.43.3-py3-none-any.whl.metadata (43 kB)
     ---------------------------------------- 0.0/43.7 kB ? eta -:--:--
     --------- ------------------------------ 10.2/43.7 kB ? eta -:--:--
     ----------------- -------------------- 20.5/43.7 kB 131.3 kB/s eta 0:00:01
     -------------------------- ----------- 30.7/43.7 kB 163.8 kB/s eta 0:00:01
     ----------------------------------- -- 41.0/43.7 kB 217.9 kB/s eta 0:00:01
     -------------------------------------- 43.7/43.7 kB 177.7 kB/s eta 0:00:00
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.3-cp312-none-win_amd64.whl.metadata (3.9 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp312-none-win_amd64.whl.metadata (6.9 kB)
Downloading transformers-4.43.3-py3-none-any.whl (9.4 MB)
   ---------------------------------------- 0.0/9.4 M



In [9]:
from pprint import pprint

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from torch import nn
from transformers import (
    DataCollatorWithPadding, AutoTokenizer, DistilBertForSequenceClassification,
    TrainingArguments, Trainer,
    AutoModelForSequenceClassification,
)

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [11]:
# Read the training and testing CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/training_data.csv", "data/testing_data.csv")
)

In [13]:
# Peek at four randomly chosen training rows (no seed, so the rows differ per run).
train_df.sample(4)

Unnamed: 0,id,premise,hypothesis,label
4768,354d03f4fc,The great attraction of the church is the sple...,The outside of the church isn't much to look a...,2
5471,49245fe99b,that was good and Poland yeah and i've done so...,I enjoy receiving information in the shape of ...,0
2774,9a1afb6e56,A little past the small theater built for loca...,There are a number of art performances in the ...,1
3901,5e4f77f4b2,and the NIT semifinals are on tonight,The NIT semifinals take place in New York City...,1


In [15]:
# Hold out the first 20% of rows (file order, no shuffling) as the validation
# split; the remaining rows become the training split.
# NOTE: this rebinds train_df, so re-running the cell shrinks it again.
val_length = int(len(train_df) * 0.2)
val_df, train_df = train_df.iloc[:val_length], train_df.iloc[val_length:]

In [5]:
# Row counts of the validation, training and testing frames, in that order.
print(*map(len, (val_df, train_df, test_df)))

1374 5496 2945


___

## Convert the pandas DataFrames to Hugging Face Datasets

In [6]:
# Wrap each DataFrame in a Dataset, dropping the "id" column first.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame.drop(columns=["id"]))
    for frame in (train_df, val_df, test_df)
)

In [7]:
# Display the training Dataset summary (feature names and row count).
train_dataset

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 5496
})

___

## Tokenizer and model

In [8]:
# Tokenizer matching the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Same checkpoint with a sequence-classification head sized for 3 labels.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Display the model's module structure.
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

___

In [10]:
# Encode a sentence pair to token ids, then decode the ids back to text to see
# the special tokens the tokenizer inserts around and between the two segments.
example = tokenizer.encode("this is it","for now atleast")
tokenizer.decode(example)

'[CLS] this is it [SEP] for now atleast [SEP]'

In [11]:
# Calling the tokenizer directly on the same sentence pair and displaying the
# resulting encoding (input ids plus attention mask).
example_tokenizer = tokenizer("this is it","for now atleast")
example_tokenizer

{'input_ids': [101, 2023, 2003, 2009, 102, 2005, 2085, 2012, 19738, 3367, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

___

In [12]:
def tokenize_function(example):
    """Tokenize premise/hypothesis pairs as two-segment inputs.

    Args:
        example: mapping with "premise" and "hypothesis" entries; the values
            are handed to the tokenizer unchanged.

    Returns:
        The tokenizer's encoding for the pair(s), with truncation enabled.
    """
    premises = example["premise"]
    hypotheses = example["hypothesis"]
    return tokenizer(premises, hypotheses, truncation=True)

In [13]:
# Apply the tokenizer to the training and validation splits in batched mode.
tokenized_train, tokenized_val = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)

Map: 100%|██████████| 5496/5496 [00:00<00:00, 32196.51 examples/s]
Map: 100%|██████████| 1374/1374 [00:00<00:00, 29278.94 examples/s]


In [24]:
# Collator that uses the tokenizer to pad the examples of each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
# label2id = {"entailment": 0, "neutral": 1, "contradiction": 2}

In [15]:
# Display the tokenized training split to confirm the added columns.
tokenized_train

Dataset({
    features: ['premise', 'hypothesis', 'label', 'input_ids', 'attention_mask'],
    num_rows: 5496
})

In [16]:
# Display the tokenized validation split to confirm the added columns.
tokenized_val

Dataset({
    features: ['premise', 'hypothesis', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1374
})

___

In [21]:
# Training configuration: "test-trainer" is the output directory; every other
# setting is left at the library default.
training_args = TrainingArguments("test-trainer")

In [25]:
# Bundle the model, training configuration, tokenized splits, padding collator
# and tokenizer into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

In [26]:
# Fine-tune the model; the returned TrainOutput (step count, loss, runtime
# metrics) is displayed as the cell result.
trainer.train()

  0%|          | 0/2061 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 24%|██▍       | 500/2061 [00:29<01:26, 18.12it/s]

{'loss': 0.9611, 'learning_rate': 3.78699660359049e-05, 'epoch': 0.73}


 49%|████▊     | 1000/2061 [00:59<01:01, 17.38it/s]

{'loss': 0.7039, 'learning_rate': 2.57399320718098e-05, 'epoch': 1.46}


 73%|███████▎  | 1500/2061 [01:28<00:33, 16.66it/s]

{'loss': 0.5085, 'learning_rate': 1.3609898107714703e-05, 'epoch': 2.18}


 97%|█████████▋| 2000/2061 [01:58<00:03, 17.73it/s]

{'loss': 0.2987, 'learning_rate': 1.4798641436196021e-06, 'epoch': 2.91}


100%|██████████| 2061/2061 [02:02<00:00, 16.76it/s]

{'train_runtime': 122.9861, 'train_samples_per_second': 134.064, 'train_steps_per_second': 16.758, 'train_loss': 0.6108005455198477, 'epoch': 3.0}





TrainOutput(global_step=2061, training_loss=0.6108005455198477, metrics={'train_runtime': 122.9861, 'train_samples_per_second': 134.064, 'train_steps_per_second': 16.758, 'train_loss': 0.6108005455198477, 'epoch': 3.0})

___

In [27]:
# Run the fine-tuned model on the validation split and print the shapes of the
# returned prediction scores and of the reference label ids.
predictions = trainer.predict(tokenized_val)
print(predictions.predictions.shape, predictions.label_ids.shape)

100%|██████████| 172/172 [00:01<00:00, 92.64it/s]

(1374, 3) (1374,)





In [34]:
# Check what kind of object trainer.predict returned.
type(predictions)

transformers.trainer_utils.PredictionOutput

In [28]:
import numpy as np

# Predicted class id per example: index of the highest score along the last axis.
preds = predictions.predictions.argmax(axis=-1)

In [53]:
from sklearn.metrics import classification_report
# Per-class precision/recall/F1 on the validation split, comparing the
# reference label ids against the argmax predictions.
results = classification_report(y_true=predictions.label_ids, y_pred=preds)
print(results)

              precision    recall  f1-score   support

           0       0.67      0.73      0.70       474
           1       0.61      0.55      0.58       452
           2       0.66      0.65      0.66       448

    accuracy                           0.65      1374
   macro avg       0.65      0.65      0.64      1374
weighted avg       0.65      0.65      0.65      1374



In [54]:
# Display the raw report string (shown as a repr, with escaped newlines).
results

'              precision    recall  f1-score   support\n\n           0       0.67      0.73      0.70       474\n           1       0.61      0.55      0.58       452\n           2       0.66      0.65      0.66       448\n\n    accuracy                           0.65      1374\n   macro avg       0.65      0.65      0.64      1374\nweighted avg       0.65      0.65      0.65      1374\n'

___

`compute_metrics` did not work here when reporting multiple metrics for multiclass classification.

A custom `Trainer` subclass is needed.

In [None]:
class CustomTrainer(Trainer):
    """Trainer that replaces the default loss with a class-weighted cross-entropy."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute a weighted cross-entropy loss for one batch.

        Args:
            model: the model to run the forward pass with.
            inputs: batch mapping passed to the model as keyword arguments;
                its "labels" entry is used as the loss target.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: accepted for compatibility with Trainer
                versions that pass this argument; not used here.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple when
            ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss (suppose one has 3 labels with different weights);
        # the weights are created on the logits' device so the loss also works
        # when the model runs on an accelerator.
        class_weights = torch.tensor([1.0, 2.0, 3.0], device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision/recall/F1.

    Macro averaging is requested explicitly because the task has more than
    two classes.

    Args:
        eval_pred: the ``(predictions, label_ids)`` pair supplied by the
            Trainer during evaluation; predictions are per-class scores.

    Returns:
        dict mapping each metric name to its value.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


# Evaluate on the validation split at the end of every epoch.
# NOTE(review): newer transformers releases rename this argument to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
# Start again from the pretrained checkpoint with a fresh 3-label classification head.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)