In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from transformers import set_seed

### How to change the number of inputs?
Normally, with transfer learning, we only change the head of the neural network while keeping the input and middle layers the same. But how do we change the number of inputs? We don't need to: we can train the transformer to understand multiple input sentences concatenated into a single input. This works with RNNs as well.

Format of input text : "[CLS] Some Text ABC. [SEP] Another text statement. [SEP]"

Two BERT tasks that take a pair of sentences as input:
1. Next Sentence Prediction
2. Textual Entailment

### Train Textual Entailment Model

In [3]:
from datasets import load_dataset

In [4]:
# Fetch the "rte" configuration of the GLUE benchmark (train/validation/test splits).
raw_dataset = load_dataset(path="glue", name="rte")

Downloading data: 100%|██████████| 697k/697k [00:00<00:00, 21.6MB/s]
Generating train split: 100%|██████████| 2490/2490 [00:00<00:00, 12202.05 examples/s]
Generating validation split: 100%|██████████| 277/277 [00:00<00:00, 19343.39 examples/s]
Generating test split: 100%|██████████| 3000/3000 [00:00<00:00, 23765.83 examples/s]


In [5]:
# Display the available splits with their column names and row counts.
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 2490
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 277
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3000
    })
})

In [6]:
# Inspect the column schema (names and dtypes) of the training split.
train_raw = raw_dataset['train']
train_raw.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['entailment', 'not_entailment'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [7]:
# Preview the first ten `sentence1` values.
# Slice the rows first and then select the column: indexing the column first
# materializes all rows of `sentence1` just to keep ten of them.
raw_dataset['train'][:10]['sentence1']

['No Weapons of Mass Destruction Found in Iraq Yet.',
 'A place of sorrow, after Pope John Paul II died, became a place of celebration, as Roman Catholic faithful gathered in downtown Chicago to mark the installation of new Pope Benedict XVI.',
 'Herceptin was already approved to treat the sickest breast cancer patients, and the company said, Monday, it will discuss with federal regulators the possibility of prescribing the drug for more breast cancer patients.',
 'Judie Vivian, chief executive at ProMedica, a medical service company that helps sustain the 2-year-old Vietnam Heart Institute in Ho Chi Minh City (formerly Saigon), said that so far about 1,500 children have received treatment.',
 "A man is due in court later charged with the murder 26 years ago of a teenager whose case was the first to be featured on BBC One's Crimewatch. Colette Aram, 16, was walking to her boyfriend's house in Keyworth, Nottinghamshire, on 30 October 1983 when she disappeared. Her body was later found i

In [20]:
# Name of the pretrained checkpoint shared by the tokenizer and the model.
checkpoint = "distilbert-base-cased"

In [21]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer

In [22]:
# Load the tokenizer that matches the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

In [23]:
# Tokenize the first training pair as a single two-segment input.
# Fetch the row first and then read its fields: column-first indexing would
# materialize both full columns just to use their first element.
first_example = raw_dataset['train'][0]
tokenizer(first_example['sentence1'], first_example['sentence2'])

{'input_ids': [101, 1302, 20263, 1104, 8718, 14177, 17993, 17107, 1107, 5008, 6355, 119, 102, 20263, 1104, 8718, 14177, 17993, 17107, 1107, 5008, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [25]:
# Build the encoding explicitly instead of reading IPython's `_` (last cell
# output): `_` depends on which cell happened to run immediately before this
# one, so the notebook would not survive Restart & Run All.
first_example = raw_dataset['train'][0]
results = tokenizer(first_example['sentence1'], first_example['sentence2'])

In [26]:
# List the fields the tokenizer produced for the sentence pair.
results.keys()

dict_keys(['input_ids', 'attention_mask'])

In [28]:
# Turn the token ids back into text to see how the two sentences were joined.
token_ids = results['input_ids']
tokenizer.decode(token_ids)

'[CLS] No Weapons of Mass Destruction Found in Iraq Yet. [SEP] Weapons of Mass Destruction Found in Iraq. [SEP]'

In [29]:
# Seed the random number generators before building the model: the
# classification head is newly (randomly) initialized, so without a seed each
# kernel restart would start fine-tuning from different weights.
# 42 is also the default value of `TrainingArguments.seed`.
set_seed(42)

# Take the number of output classes from the dataset's ClassLabel feature
# rather than hard-coding it (RTE is binary: entailment / not_entailment).
num_labels = raw_dataset['train'].features['label'].num_classes
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# Fine-tuning configuration.
training_args = TrainingArguments(
    output_dir="textual_entrailment_model_dir",
    evaluation_strategy='epoch',  # run evaluation at the end of every epoch
    save_strategy='epoch',        # checkpoint on the same schedule as evaluation
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    logging_steps=150,
    # Restore the best checkpoint when training finishes instead of keeping the
    # last epoch's weights: on this small dataset the validation loss climbs
    # while the training loss keeps falling, so the final epoch is not
    # necessarily the best one. Requires matching evaluation/save strategies.
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

In [32]:
from datasets import load_metric

In [33]:
# Load the metric object for the GLUE "rte" task.
# NOTE(review): the recorded output echoes this line, which looks like the tail
# of a deprecation warning for `load_metric` — confirm against the installed
# `datasets` version before relying on it.
metric = load_metric("glue","rte")

  metric = load_metric("glue","rte")


In [34]:
def compute_metrics(logits_and_labels):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        logits_and_labels: A pair ``(logits, labels)``. The predicted class of
            each example is the argmax of ``logits`` along the last axis.

    Returns:
        dict: ``{"accuracy": float}`` — the fraction of predictions that equal
        their reference label.
    """
    logits, labels = logits_and_labels
    predictions = np.argmax(logits, axis=-1)
    # Accuracy is computed directly with NumPy so this function is
    # self-contained and does not rely on a module-level metric object created
    # through the deprecated `datasets.load_metric`.
    accuracy = (predictions == np.asarray(labels)).mean()
    return {"accuracy": float(accuracy)}

In [35]:
def tokenize_func(batch):
    """Tokenize a batch of sentence pairs.

    Args:
        batch: Mapping with ``'sentence1'`` and ``'sentence2'`` entries; each
            pair is encoded together as one input.

    Returns:
        The tokenizer's encoding for the batch, with truncation enabled.
    """
    first_sentences = batch['sentence1']
    second_sentences = batch['sentence2']
    return tokenizer(first_sentences, second_sentences, truncation=True)

In [36]:
# Tokenize every split, passing examples to `tokenize_func` in batches.
tokenized_datasets = raw_dataset.map(function=tokenize_func, batched=True)

Map: 100%|██████████| 2490/2490 [00:00<00:00, 16762.67 examples/s]
Map: 100%|██████████| 277/277 [00:00<00:00, 16476.94 examples/s]
Map: 100%|██████████| 3000/3000 [00:00<00:00, 11788.23 examples/s]


In [37]:
# Wire the model, training configuration, data and metric function together.
train_tokenized = tokenized_datasets['train']
validation_tokenized = tokenized_datasets['validation']

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=validation_tokenized,  # evaluated with compute_metrics
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [38]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
train_output = trainer.train()
train_output

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6992,0.691848,0.494585
2,0.64,0.708972,0.555957
3,0.3813,0.982503,0.574007
4,0.1855,1.628551,0.584838
5,0.0854,2.146535,0.545126


TrainOutput(global_step=780, training_loss=0.3865704298019409, metrics={'train_runtime': 69.8309, 'train_samples_per_second': 178.288, 'train_steps_per_second': 11.17, 'total_flos': 544524318051096.0, 'train_loss': 0.3865704298019409, 'epoch': 5.0})