## Read the data

In [2]:
# !pip install datasets

In [3]:
# !pip install transformers

In [6]:
from pprint import pprint

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from torch import nn
from transformers import (
    DataCollatorWithPadding, AutoTokenizer, DistilBertForSequenceClassification,
    TrainingArguments, Trainer
)

In [8]:
# Load the training and testing CSVs into DataFrames
# (paths are relative to the current working directory).
train_df = pd.read_csv("data/training_data.csv")
test_df = pd.read_csv("data/testing_data.csv")

In [10]:
# Peek at four random rows; no random_state is passed, so the rows differ between runs.
train_df.sample(4)

Unnamed: 0,id,premise,hypothesis,label
6478,ff42e661c2,Time 's cover package considers what makes a g...,Time's cover package is about how most college...,2
5512,0989d728ff,Total volume grew 13.,Overall volume decreased.,2
2192,d56f67ad59,"To see the desert at its best, go out at dawn ...",Go at noon to see the desert for the best view.,2
4955,a41023e637,To savour the full effect of the architect's s...,The gate to the Hippodrome is an example of th...,0


In [12]:
# Hold out the leading 20% of rows (file order, no shuffling) as the validation set.
# train_df is then rebound to the remaining rows, so re-running this cell shrinks it again.
val_length = int(len(train_df) * 0.2)
val_df = train_df.iloc[:val_length]
train_df = train_df.iloc[val_length:]

In [14]:
# Row counts of the validation, training and test frames, in that order.
print(*(len(frame) for frame in (val_df, train_df, test_df)))

1374 5496 2945


___

## Convert the pandas dataset to HF datasets

In [16]:
# Build one Hugging Face Dataset per split from the matching DataFrame,
# dropping the "id" column from each before conversion.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame.drop(columns=["id"]))
    for frame in (train_df, val_df, test_df)
)

In [18]:
# Display the training Dataset summary (feature names and row count).
train_dataset

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 5496
})

___

## Tokenizer and model

In [20]:
# Load the tokenizer and a DistilBERT sequence-classification model from the same
# "distilbert-base-uncased" checkpoint; num_labels=3 requests a three-class head.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Display the model's module structure.
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

___

In [30]:
# Encode a two-segment input to token ids, then decode the ids back to text
# to see which special tokens the tokenizer inserted.
example = tokenizer.encode("this is it","for now atleast")
tokenizer.decode(example)

'[CLS] this is it [SEP] for now atleast [SEP]'

In [28]:
# Show the token ids held in example.
example

[101, 2023, 2003, 2009, 102, 2005, 2085, 2012, 19738, 3367, 102]

In [11]:
# Call the tokenizer directly on the two text segments and display what it returns.
# Despite its name, example_tokenizer holds the tokenizer's output, not a tokenizer.
example_tokenizer = tokenizer("this is it","for now atleast")
example_tokenizer

{'input_ids': [101, 2023, 2003, 2009, 102, 2005, 2085, 2012, 19738, 3367, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

___

In [32]:
def tokenize_function(example):
    """Tokenize premise/hypothesis pairs with the notebook-level ``tokenizer``.

    Args:
        example: Mapping that provides "premise" and "hypothesis" entries.

    Returns:
        Whatever ``tokenizer`` returns for the two text inputs with
        ``truncation=True``.
    """
    premise = example["premise"]
    hypothesis = example["hypothesis"]
    return tokenizer(premise, hypothesis, truncation=True)

In [34]:
# Apply tokenize_function to the training and validation datasets in batched mode.
tokenized_train, tokenized_val = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)

Map:   0%|          | 0/5496 [00:00<?, ? examples/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

In [36]:
# Display the tokenized training Dataset summary to check the columns added by map().
tokenized_train

Dataset({
    features: ['premise', 'hypothesis', 'label', 'input_ids', 'attention_mask'],
    num_rows: 5496
})

In [58]:
# Collator built from the tokenizer; it is passed to Trainer as data_collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
# label2id = {"entailment": 0, "neutral": 1, "contradiction": 2}

In [50]:
# Sanity check: decode the token ids of the training example at index 1 back to text.
tokenizer.decode(tokenized_train[1]["input_ids"])

"[CLS] you will learn later that the person who usually poured out mrs. inglethorp's medicine was always extremely careful not to shake the bottle, but to leave the sediment at the bottom of it undisturbed. [SEP] the person who poured mrs. inglethorp's medicine never shook the bottle so as to leave the sediment untouched. [SEP]"

In [16]:
# Display the tokenized validation Dataset summary.
tokenized_val

Dataset({
    features: ['premise', 'hypothesis', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1374
})

___

In [52]:
# Default training configuration; only the output directory is specified.
training_args = TrainingArguments(output_dir="test-trainer")

In [60]:
# Assemble the Trainer from the model, training arguments, tokenized splits,
# padding collator and tokenizer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

In [None]:
# Fine-tune the model using the trainer's configured datasets and arguments.
trainer.train()

Step,Training Loss


___

In [27]:
# Run inference on the validation split, then print the shapes of the
# returned prediction scores and label ids.
predictions = trainer.predict(tokenized_val)
print(*(arr.shape for arr in (predictions.predictions, predictions.label_ids)))

100%|██████████| 172/172 [00:01<00:00, 92.64it/s]

(1374, 3) (1374,)





In [34]:
# Inspect the type of the object returned by trainer.predict().
type(predictions)

transformers.trainer_utils.PredictionOutput

In [28]:
import numpy as np

# Predicted class per row = index of the largest score along the last axis.
preds = predictions.predictions.argmax(axis=-1)

In [53]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 for the validation predictions; the text report
# is kept in results and printed.
results = classification_report(predictions.label_ids, preds)
print(results)

              precision    recall  f1-score   support

           0       0.67      0.73      0.70       474
           1       0.61      0.55      0.58       452
           2       0.66      0.65      0.66       448

    accuracy                           0.65      1374
   macro avg       0.65      0.65      0.64      1374
weighted avg       0.65      0.65      0.65      1374



In [54]:
# Show the report as a raw string (repr form, with newlines escaped).
results

'              precision    recall  f1-score   support\n\n           0       0.67      0.73      0.70       474\n           1       0.61      0.55      0.58       452\n           2       0.66      0.65      0.66       448\n\n    accuracy                           0.65      1374\n   macro avg       0.65      0.65      0.64      1374\nweighted avg       0.65      0.65      0.65      1374\n'

___

Passing `compute_metrics` to the `Trainer` is not working for multiple metrics with multiclass classification.

A custom trainer is needed.

In [None]:
class CustomTrainer(Trainer):
    """Trainer that scores batches with a class-weighted cross-entropy loss.

    Attributes:
        class_weights: Per-class loss weights, indexed by label id. The length
            must equal the model's ``num_labels`` (three in this notebook).
    """

    # Example weights for a 3-label problem; override on a subclass to change them.
    class_weights = (1.0, 2.0, 3.0)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained.
            inputs: Batch mapping; must contain "labels" plus the model inputs.
            return_outputs: When True, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted for compatibility with Trainer
                versions that pass it; not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Build the weights on the logits' device/dtype so the loss also works
        # when the model runs on an accelerator.
        weight = torch.tensor(self.class_weights, dtype=logits.dtype, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: The object Trainer passes to ``compute_metrics``; it unpacks
            into ``(logits, labels)``.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1" as Python floats.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    # Macro averaging gives one score per metric across all classes;
    # zero_division=0 avoids warnings when a class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted, average="macro", zero_division=0
    )
    return {
        "accuracy": float(accuracy_score(labels, predicted)),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
    }


# Evaluate on eval_dataset at the end of every epoch.
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
# Fresh, un-fine-tuned model; uses the class already imported by this notebook.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)