In [26]:
# !pip install transformers
# !pip install datasets
# !pip install accelerate
# !pip install evaluate
import evaluate
import numpy as np
import torch
from tqdm import tqdm

# Chapter 1: The Pipeline

In [2]:
# High level code
from transformers import pipeline

classifier = pipeline('sentiment-analysis')
classifier(["Oppenheimer is shit movie for sure!"])

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


[{'label': 'NEGATIVE', 'score': 0.9963310360908508}]

# Chapter 2: Behind the Pipeline

## Under the hood of pipeline

In [3]:
# Loading the particular tokenizer
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [4]:
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors='pt')
print(inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [5]:
# Loading particular model
from transformers import AutoModel  # Model without any head

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(checkpoint)


In [6]:
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)

torch.Size([2, 16, 768])


In [7]:
from transformers import AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [8]:
outputs = model(**inputs)
print(outputs.logits.shape)

torch.Size([2, 2])


In [9]:
# Post processing the model output
import torch.nn.functional as F

predictions = F.softmax(outputs.logits, dim=1)
print(predictions)

tensor([[4.0195e-02, 9.5980e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)


In [10]:
print(model.config.id2label)

{0: 'NEGATIVE', 1: 'POSITIVE'}


## Going in details of individual components: Model

### a) Instantiating model directly

#### Using automodel class

In [11]:
from transformers import AutoModel

In [12]:
bert_model = AutoModel.from_pretrained('bert-base-cased')

#### Using some specific class

In [13]:
from transformers import BertModel

In [14]:
bert_model = BertModel.from_pretrained("bert-base-cased")

### b) Instantiating Configuration for model and then the model


#### Using autoconfig and giving it checkpoint or folder name

In [15]:
from transformers import AutoConfig

In [16]:
bert_config = AutoConfig.from_pretrained('bert-base-cased')

In [17]:
print(type(bert_config))

<class 'transformers.models.bert.configuration_bert.BertConfig'>


#### Using bertconfig without any checkpoint name: returns random config file

In [18]:
from transformers import BertConfig

In [19]:
bert_config_random = BertConfig()

#### Using bertconfig with the checkpoint name: returns non-random config file

In [20]:
from transformers import BertModel

In [21]:
bert_config = BertConfig.from_pretrained('bert-base-cased')

#### Finally instantiating the model using any of the above configuration files

In [22]:
bert_model = BertModel(bert_config)

### c) Saving the model

In [23]:
bert_model.save_pretrained('/content/')

## Going in details of individual components: Tokenizer

In [24]:
'''
There are three types of tokenizers:
1. Word based
2. Character based
3. Subword based
'''

'\nThere are three types of tokenizers:\n1. Word based\n2. Character based\n3. Subword based\n'

### a) Word based tokenizers

In [25]:
'''
Issues:
  1. Similar words may not have similar tokens
  2. Vocabulary size can become very large :( Large vocabularies result in heavy models
  3. A lot of unknown words will occur

To get word based tokens, create a unique vocabulary, assign each unique word
an id to get a word-to-id dictionary, and then use this dictionary to assign tokens to the input text.
'''

'\nIssues:\n  1. Similar words may not have similar tokens\n  2. Vocabulary size can become very large :( Large vocabularies result in heavy models\n  3. A lot of unknown words will occur\n\nTo get word based tokens, create a unique vocabulary, assign each unique word\nan id to get a word-to-id dictionary, and then use this dictionary to assign tokens to the input text.\n'

### b) Character Based Tokens

In [26]:
'''
. This has two primary benefits:
    - The vocabulary is much smaller.
    - There are much fewer out-of-vocabulary (unknown) tokens, since every word can be built from characters.

. Problem:
    - We’ll end up with a very large amount of tokens to be processed by our model.
'''

'\n. This has two primary benefits:\n    - The vocabulary is much smaller.\n    - There are much fewer out-of-vocabulary (unknown) tokens, since every word can be built from characters.\n\n. Problem:\n    - We’ll end up with a very large amount of tokens to be processed by our model.\n'

### c) Subword Tokenization



In [27]:
'''
- Do not split frequently occuring words
- Otherwise split
'''

'\n- Do not split frequently occuring words\n- Otherwise split\n'

### d) Code

In [28]:
from transformers import AutoTokenizer
'''
Autotokenizer.from_pretrained takes in model checkpoint,
and returns the algorithm(similar to config file) which the
model used for tokenization during pretraining, and the
vocabulary(similar to weights of the model)
'''

'\nAutotokenizer.from_pretrained takes in model checkpoint,\nand returns the algorithm(similar to config file) which the\nmodel used for tokenization during pretraining, and the\nvocabulary(similar to weights of the model)\n'

In [29]:
model_name = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer('Yoo ma boi, how are you?')

{'input_ids': [101, 14941, 1186, 12477, 171, 8136, 117, 1293, 1132, 1128, 136, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [30]:
from transformers import BertTokenizer

In [31]:
model_name = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)

In [32]:
tokenizer("I love it when you call me Senorita...")

{'input_ids': [101, 146, 1567, 1122, 1165, 1128, 1840, 1143, 14895, 9012, 1777, 119, 119, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

### e) Under the hood of tokenizer('string')

In [33]:
model_name = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [34]:
input_text = "Binte Khan was Bin Hassan's first Love"

In [35]:
tokens = tokenizer.tokenize(input_text)
print(tokens)

['Bin', '##te', 'Khan', 'was', 'Bin', 'Hassan', "'", 's', 'first', 'Love']


In [36]:
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(input_ids)

[21700, 1566, 4340, 1108, 21700, 13583, 112, 188, 1148, 2185]


In [37]:
final_inputs = tokenizer.prepare_for_model(input_ids)
print(final_inputs)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': [101, 21700, 1566, 4340, 1108, 21700, 13583, 112, 188, 1148, 2185, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [38]:
decoded = tokenizer.decode(final_inputs['input_ids'])
print(decoded)

[CLS] Binte Khan was Bin Hassan's first Love [SEP]


### f) Batching input sequences

In [39]:
import torch
'''
Following line will give error as the model expects
not a single sentence, but a batch of sentences.
'''
# model(torch.tensor(final_inputs['input_ids']))

'\nFollowing line will give error as the model expects\nnot a single sentence, but a batch of sentences.\n'

In [40]:
# This line won't give any error
model(torch.tensor([final_inputs['input_ids']]))

SequenceClassifierOutput(loss=None, logits=tensor([[ 2.4303, -2.0763]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [41]:
# Apart from just getting the right tokens to input into
# the model, we also need to pad tokens of all sentences
# to equal length, and to get attention masks.

# But keep in mind that pad token id should be the same
# as the model used during training

# All of this is done by tokenizer by default :)

### g) More on tokenizer functionalities

In [42]:
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

model_inputs = tokenizer(sequences)

In [43]:
# Will pad the sequences up to the maximum sequence length
model_inputs = tokenizer(sequences, padding="longest")

# Will pad the sequences up to the model max length
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, padding="max_length")

# Will pad the sequences up to the specified max length
model_inputs = tokenizer(sequences, padding="max_length", max_length=8)

In [44]:
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

# Will truncate the sequences that are longer than the model max length
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, truncation=True)

# Will truncate the sequences that are longer than the specified max length
model_inputs = tokenizer(sequences, max_length=8, truncation=True)

In [45]:
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

# Returns PyTorch tensors
model_inputs = tokenizer(sequences, padding=True, return_tensors="pt")

# Returns TensorFlow tensors
model_inputs = tokenizer(sequences, padding=True, return_tensors="tf")

# Returns NumPy arrays
model_inputs = tokenizer(sequences, padding=True, return_tensors="np")

### h) Conclusion

In [46]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
output = model(**tokens)
print(output)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9137]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


#

# Chapter 3: Finetuning a pre-trained model

## One step of model training, with two sentences as dataset

In [47]:
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True,
                  truncation=True, return_tensors='pt')  # dictionary

batch['labels'] = torch.tensor([1, 1])

optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Exploring the Dataset library


In [48]:
from datasets import load_dataset

In [49]:
raw_datasets = load_dataset("glue", "mrpc")

In [50]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [51]:
raw_train_dataset = raw_datasets["train"]
print(raw_train_dataset)
print('---------------------')
# Viewing a single example
example = raw_train_dataset[0]
print(example)

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})
---------------------
{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}


In [52]:
# To see which label represents what:
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [53]:
# Tokenizing all the elements of the dataset
from transformers import AutoTokenizer

checkpoint = "bert-base-cased"
toeknizer = AutoTokenizer.from_pretrained(checkpoint)

# Noice how we gave 2, and not 1 sentence as input to the tokenizer()
def tokenize_function(example):
  return tokenizer(
      example["sentence1"], example["sentence2"], padding="max_length",
      truncation=True, max_length=128
  )

tokenize_function(example)

{'input_ids': [101, 7277, 2180, 5303, 4806, 1117, 1711, 117, 2292, 1119, 1270, 107, 1103, 7737, 107, 117, 1104, 9938, 4267, 12223, 21811, 1117, 2554, 119, 102, 11336, 6732, 3384, 1106, 1140, 1112, 1178, 107, 1103, 7737, 107, 117, 7277, 2180, 5303, 4806, 1117, 1711, 1104, 9938, 4267, 12223, 21811, 1117, 2554, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [54]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print(tokenized_datasets.column_names)

{'train': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'validation': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'test': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask']}


In [55]:
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})


In [56]:
print(tokenized_datasets["train"])

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})


In [57]:
# Manipulating the dataset according to what model is expecting.
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence1", "sentence2"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

In [58]:
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})


In [59]:
# Other way to tokenize the dataset
tokenized_dataset_v2 = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True
)

In [60]:
print(tokenized_dataset_v2)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



The problem with the second approach is that it returns a python dictionary and will work only if you have enough RAM. To keep the data as a dataset, we will use the Dataset.map() method.

## Exploring the Dataset Library (Putting together)

In [61]:
from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
  return tokenizer(
      example["sentence1"], example["sentence2"], padding="max_length",
      truncation=True, max_length=128
  )


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence1", "sentence2"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.with_format('torch')
print(tokenized_datasets["train"][0])

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

{'labels': tensor(1), 'input_ids': tensor([  101,  7277,  2180,  5303,  4806,  1117,  1711,   117,  2292,  1119,
         1270,   107,  1103,  7737,   107,   117,  1104,  9938,  4267, 12223,
        21811,  1117,  2554,   119,   102, 11336,  6732,  3384,  1106,  1140,
         1112,  1178,   107,  1103,  7737,   107,   117,  7277,  2180,  5303,
         4806,  1117,  1711,  1104,  9938,  4267, 12223, 21811,  1117,  2554,
          119,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

## Giving this dataset to torch dataloader

In [62]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(tokenized_datasets["train"],
                              batch_size=16,
                              shuffle=True)

for step, batch in enumerate(train_dataloader):
  print(batch['input_ids'].shape)
  if step > 5:
    break

torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])


### Doing above two sections by Dynamic Padding

In [63]:
from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
  return tokenizer(
      example["sentence1"], example["sentence2"],
      truncation=True, max_length=128
  )

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence1", "sentence2"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.with_format('torch')

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [64]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer)
train_dataloader = DataLoader(tokenized_datasets["train"],
                              batch_size=16,
                              shuffle=True,
                              collate_fn=data_collator)

for step, batch in enumerate(train_dataloader):
  print(batch['input_ids'].shape)
  if step > 5:
    break

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


torch.Size([16, 80])
torch.Size([16, 72])
torch.Size([16, 87])
torch.Size([16, 84])
torch.Size([16, 77])
torch.Size([16, 74])
torch.Size([16, 69])


## Trainer API

In [65]:
from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
  return tokenizer(
      example["sentence1"], example["sentence2"],
      truncation=True, max_length=128
  )

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer)

In [66]:
# !pip install transformers
from transformers import TrainingArguments
training_args = TrainingArguments('test-trainer')

In [67]:
from transformers import AutoModelForSequenceClassification

checkpoint = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [68]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [69]:
# trainer.train()

In [70]:
# Evaluating the model
# Trainer.predict returns named tuple containing 3 elements
# predictions, labels, metric value
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


(408, 2) (408,)


In [71]:
print(predictions.predictions)

[[-0.6967151  -0.3756896 ]
 [-0.6876184  -0.391532  ]
 [-0.68211657 -0.37875637]
 [-0.6914974  -0.38169637]
 [-0.69527936 -0.38151777]
 [-0.70225835 -0.38775015]
 [-0.6917361  -0.40015525]
 [-0.6837981  -0.40252933]
 [-0.68638617 -0.37447944]
 [-0.69432944 -0.36988056]
 [-0.6873695  -0.382127  ]
 [-0.7064969  -0.39345717]
 [-0.67053777 -0.3508614 ]
 [-0.68649125 -0.37919453]
 [-0.692019   -0.38762498]
 [-0.6945637  -0.3777609 ]
 [-0.69388425 -0.39638898]
 [-0.6602345  -0.38809955]
 [-0.68243265 -0.38765866]
 [-0.66972566 -0.38133958]
 [-0.63704437 -0.42157665]
 [-0.678729   -0.38502657]
 [-0.68850255 -0.39314327]
 [-0.69666815 -0.39865604]
 [-0.6876614  -0.38122246]
 [-0.7025974  -0.39227557]
 [-0.672801   -0.40824246]
 [-0.68693805 -0.3873577 ]
 [-0.68496865 -0.39938754]
 [-0.68684095 -0.37949717]
 [-0.6943828  -0.38600424]
 [-0.6897569  -0.39814025]
 [-0.6925422  -0.3841939 ]
 [-0.6759791  -0.37521875]
 [-0.69334537 -0.37910366]
 [-0.67045295 -0.37173864]
 [-0.68579894 -0.38805583]
 

In [72]:
print(predictions.label_ids)

[1 0 0 1 0 1 0 1 1 1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 1 1 0 1 1 1 0 1 1 1 1 0 0
 0 1 1 0 1 0 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1
 1 1 0 1 1 1 0 1 1 0 1 0 1 0 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 0 1 1 0 0 1 1
 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 0 1 1 0 0 1 0 1 0 0 1 0 0 1 1
 1 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 0 1 1 0 1 1 0 0 1 1 0
 1 0 1 0 1 1 0 1 1 0 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 0 1 1 0 1 1 1 1 0 1 1 1
 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 1 1 1 1 1 0 1 1 1 0 1 0 1 0 1 1 1
 0 1 0 1 1 0 1 1 0 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 0
 0 0 1 1 1 1 1 1 1 1 0 1 0 0 0 1 1 0 1 0 1 1 0 0 0 0 0 1 0 1 0 1 1 1 1 1 0
 1 1 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 0 1 0 1 0 0 1 1 1 1 0 1 1 0 1 1 0 1 0 0
 1 1 1 1 0 0 0 0 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0
 1]


In [73]:
import numpy as np

preds = np.argmax(predictions.predictions, axis=-1)

In [74]:
# !pip install evaluate
import evaluate

metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}

## Putting together the Trainer API code

In [11]:
# Wrapping everything togther, we can get our compute_metrics function
# !pip install evaluate
import evaluate
import numpy as np
def compute_metrics(eval_preds):
  metric = evaluate.load("glue", "mrpc")
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

In [9]:
from transformers import AutoTokenizer
from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc")  # 1
checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)  # 2

def tokenize_function(example):
  return tokenizer(
      example["sentence1"], example["sentence2"], truncation=True,
      max_length=128
  )

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)  # 3

from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer)  # 4

from transformers import TrainingArguments
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")  # 5

from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)  # 6

from transformers import Trainer
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)  # 7

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.684488,0.816176,0.877651
2,0.308400,0.917826,0.835784,0.888147
3,0.129600,0.990127,0.857843,0.900344


Epoch,Training Loss,Validation Loss


TrainOutput(global_step=1377, training_loss=0.17081803105062734, metrics={'train_runtime': 218.6065, 'train_samples_per_second': 50.337, 'train_steps_per_second': 6.299, 'total_flos': 559439881459200.0, 'train_loss': 0.17081803105062734, 'epoch': 3.0})

## Putting together training code, but without Trainer API use

In [None]:
# Wrapping everything togther, we can get our compute_metrics function
def compute_metrics(eval_preds):
  metric = evaluate.load("glue", "mrpc")
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

In [14]:
from transformers import AutoTokenizer
from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc")  # 1
checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)  # 2

def tokenize_function(example):
  return tokenizer(
      example["sentence1"], example["sentence2"], truncation=True,
      max_length=128
  )

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence1", "sentence2"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.with_format('torch')

from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer)  # 4

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [16]:
# Defining my own dataloader
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

In [31]:
# To check there is no mistake, we can check a batch:
for batch in train_dataloader:
  break
{k: v.shape for k, v in batch.items()}

In [20]:
# Data processing done, time to load model
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)  # 6/

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.7016, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [25]:
# Preparation for training loop
from transformers import AdamW
from transformers import get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
  for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()
    optimizer.step()

    lr_scheduler.step()
    progress_bar.update(1)

  0%|          | 0/1377 [00:00<?, ?it/s]

In [26]:
    import evaluate
    metric = evaluate.load("glue", "mrpc")
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    metric.compute()

{'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}

## Putting together training code, but without Trainer API use, but this time updating training loop to work with 'accelerator'

In [20]:
from datasets import load_dataset
raw_datasets = load_dataset("glue", "mrpc")

from transformers import AutoTokenizer
checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
  return tokenizer(
      example["sentence1"], example["sentence2"], truncation=True, max_length=128
  )

tokenized_datasets = raw_datasets.map(tokenize_function)
tokenized_datasets = tokenized_datasets.remove_columns(['sentence1', 'sentence2', 'idx'])
tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')
tokenized_datasets.set_format('torch')

from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer)

from torch.utils.data import DataLoader
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=16, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], shuffle=True, batch_size=16, collate_fn=data_collator
)

# To check there is no mistake, we can check a batch:
for batch in train_dataloader:
  break
{k: v.shape for k, v in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([16]),
 'input_ids': torch.Size([16, 105]),
 'token_type_ids': torch.Size([16, 105]),
 'attention_mask': torch.Size([16, 105])}

In [21]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
# Preparing for the training loop
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)

from accelerate import Accelerator
accelerator = Accelerator()
train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
from transformers import get_scheduler
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

progress_bar = tqdm(range(num_training_steps))
model.train()
for epoch in range(num_epochs):
  for batch in train_dataloader:
    outputs = model(**batch)
    loss = outputs.loss
    accelerator.backward(loss)

    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)

In [30]:
    import evaluate
    metric = evaluate.load("glue", "mrpc")
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=accelerator.gather(predictions), references=accelerator.gather(batch["labels"]))

    metric.compute()

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

{'accuracy': 0.6862745098039216, 'f1': 0.8134110787172011}