# Model overview

In [1]:
# The overall pipeline
from transformers import pipeline

# The toakenizer
from transformers import AutoTokenizer
# Model architectures 
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification

from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import Trainer

from datasets import load_dataset
import datasets
import torch

In [2]:
dataset_name = {'path': 'glue', 'name': 'mrpc'}
dataset_name_sst2 = {'path': 'glue', 'name': 'sst2'}

In [5]:
text = [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
        "Great! I messed it up again!"
]
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# checkpoint = "distilbert-base-cased-distilled-squad"

## Running a model directly

In [6]:
classifier = pipeline("sentiment-analysis", model=checkpoint)
classifier(text)

[{'label': 'POSITIVE', 'score': 0.9598049521446228},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455},
 {'label': 'POSITIVE', 'score': 0.9654763340950012}]

## Running a model by step

### Tokenization
Do this preprocessing the same way as when the model was pretrained by using from_pretrained to update the tokenizer with the pretraining info from the original model  
The raw_inputs are the text; The output of the tokenizer is the input of the model. It contains the input IDs (ID of each token/word for the checkpoint) as a PyTorch tensor, and the attention_mask

In [7]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_inputs = text
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0],
        [  101,  2307,   999,  1045, 18358,  2009,  2039,  2153,   999,   102,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])}


### Modeling - output the last hidden state only
The output of the checkpoint is hidden state, not the final label  

Dimensions of the output
- Batch size: The number of sequences processed at a time (3 in our example).
- Sequence length: The length of the numerical representation of the sequence (16 in our example).
- Hidden size: The vector dimension of each model input.

In [11]:
model = AutoModel.from_pretrained(checkpoint)
outputs = model(**inputs)
# outputs is like named tuple. Check its keys by outputs.keys()
print(outputs.last_hidden_state.shape)

Some weights of the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing DistilBertModel: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'classifier.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([3, 16, 768])


### Redo the modeling - output the logits
Do this with AutoModelForSequenceClassification  
Now the outputs is the logits [N_row, N_label]  
Use softmax to get the probilities

In [12]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
print(outputs.logits)

tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464],
        [-1.6187,  1.7123]], grad_fn=<AddmmBackward0>)


In [13]:
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

# Get labels
print(model.config.id2label)

tensor([[4.0195e-02, 9.5980e-01],
        [9.9946e-01, 5.4418e-04],
        [3.4524e-02, 9.6548e-01]], grad_fn=<SoftmaxBackward0>)


## Running a specific model

In [15]:
checkpoint = "bert-base-cased"

#### The automatic way

In [16]:
from transformers import AutoModel
bert_model = AutoModel.from_pretrained(checkpoint)
print(type(bert_model))

from transformers import AutoConfig
bert_config = AutoConfig.from_pretrained(checkpoint)
print(type(bert_config))

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<class 'transformers.models.bert.modeling_bert.BertModel'>
<class 'transformers.models.bert.configuration_bert.BertConfig'>


#### The specific way - better for customization

In [17]:
from transformers import BertConfig, BertModel

# Initialize a model (with random weights)
config = BertConfig()
model = BertModel(config)

# Or load a pre-trained model
model = BertModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
print(config)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



## Save a model

In [None]:
model.save_pretrained("./")

# Tokenizers

In [19]:
# The toakenizer
from transformers import AutoTokenizer
# Model architectures 
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification

import torch

## Overview

In [20]:
from transformers import BertTokenizer  # can use AutoTokenizer as well
tokenizer = BertTokenizer.from_pretrained(checkpoint)

inputs = tokenizer(text)
print(inputs)

tokenizer.save_pretrained("./")

{'input_ids': [[101, 146, 112, 1396, 1151, 2613, 1111, 170, 20164, 10932, 2271, 7954, 1736, 1139, 2006, 1297, 119, 102], [101, 146, 4819, 1142, 1177, 1277, 106, 102], [101, 2038, 106, 146, 20147, 1122, 1146, 1254, 106, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


('./tokenizer_config.json',
 './special_tokens_map.json',
 './vocab.txt',
 './added_tokens.json')

## Step by step

### The detailed way

In [21]:
from transformers import BertTokenizer

# Tokens
tokenizer = BertTokenizer.from_pretrained(checkpoint)
tokens = tokenizer.tokenize(text[0])
print(tokens)

# Token ids
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

# The final output used by models also contains sentence marks
inputs = tokenizer.prepare_for_model(ids)
print(inputs["input_ids"])

# Decode
decoded_string = tokenizer.decode(ids)
print(decoded_string)
decoded_string = tokenizer.decode(inputs["input_ids"])
print(decoded_string)

['I', "'", 've', 'been', 'waiting', 'for', 'a', 'Hu', '##gging', '##F', '##ace', 'course', 'my', 'whole', 'life', '.']
[146, 112, 1396, 1151, 2613, 1111, 170, 20164, 10932, 2271, 7954, 1736, 1139, 2006, 1297, 119]
[101, 146, 112, 1396, 1151, 2613, 1111, 170, 20164, 10932, 2271, 7954, 1736, 1139, 2006, 1297, 119, 102]
I've been waiting for a HuggingFace course my whole life.
[CLS] I've been waiting for a HuggingFace course my whole life. [SEP]


### Put it all together

In [22]:
# This is the same with 
# the output of passing text directly to the tokenizer instance
inputs = tokenizer(text[0])
print(inputs["input_ids"])

decoded_string = tokenizer.decode(inputs["input_ids"])
print(decoded_string)

[101, 146, 112, 1396, 1151, 2613, 1111, 170, 20164, 10932, 2271, 7954, 1736, 1139, 2006, 1297, 119, 102]
[CLS] I've been waiting for a HuggingFace course my whole life. [SEP]


## Process multiple sequences

In [23]:
print(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
print(tokenizer.pad_token_id)

bert-base-cased


Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification 

0


In [24]:
sequence = "I've been waiting for a HuggingFace course my whole life."
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

### The input of a model has to be a list of sequences

In [25]:
# It works when passing a list of sequences (of 1 element)
input_ids = torch.tensor([ids])
print(f'Input IDs: {input_ids};\nLogits: {model(input_ids).logits}\n')

# It fails when passing a sequence
input_ids = torch.tensor(ids)
print(input_ids)
print(model(input_ids))

Input IDs: tensor([[  146,   112,  1396,  1151,  2613,  1111,   170, 20164, 10932,  2271,
          7954,  1736,  1139,  2006,  1297,   119]]);
Logits: tensor([[-0.3675, -0.2447]], grad_fn=<AddmmBackward0>)

tensor([  146,   112,  1396,  1151,  2613,  1111,   170, 20164, 10932,  2271,
         7954,  1736,  1139,  2006,  1297,   119])


ValueError: not enough values to unpack (expected 2, got 1)

In [26]:
# Now batch multiple sequences
batched_ids = [ids, ids]
inputs = torch.tensor(batched_ids)
model(inputs)

SequenceClassifierOutput(loss=None, logits=tensor([[-0.3675, -0.2447],
        [-0.3675, -0.2447]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

### Padding
Pad short sequences so that they all have the same length with the longest one

In [27]:
padding_id = tokenizer.pad_token_id  # What padding id to use when padding
print(padding_id)

0


In [29]:
seq1_ids = [200, 200, 200]
seq2_ids = [200, 200]
batched_ids = [
    seq1_ids, 
    seq2_ids + [padding_id],
]
print(model(torch.tensor([seq1_ids])).logits)
print(model(torch.tensor([seq2_ids])).logits)
print(model(torch.tensor(batched_ids)).logits)
print("The results are not the same after batching!!! Need attention masks (see below)")

tensor([[-0.2686, -0.2606]], grad_fn=<AddmmBackward0>)
tensor([[-0.2446, -0.2774]], grad_fn=<AddmmBackward0>)
tensor([[-0.2686, -0.2606],
        [-0.3394, -0.1513]], grad_fn=<AddmmBackward0>)
The results are not the same after batching!!! Need attention masks (see below)


### Attention masks

In [30]:
attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]
print(f'batched_ids: {batched_ids};\nattention_mask: {attention_mask}')
outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
print(outputs.logits)
print("Now the logits are the same")

batched_ids: [[200, 200, 200], [200, 200, 0]];
attention_mask: [[1, 1, 1], [1, 1, 0]]
tensor([[-0.2686, -0.2606],
        [-0.2446, -0.2774]], grad_fn=<AddmmBackward0>)
Now the logits are the same


### Do the above automatically

In [31]:
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
model_inputs = tokenizer(sequences)
print(f"No padding: {model_inputs}\n")

padding_type = "longest"
model_inputs = tokenizer(sequences, padding=padding_type)
print(f"{padding_type} padding: {model_inputs}\n")

padding_type = "max_length"  # The default max of the model
model_inputs = tokenizer(sequences, padding=padding_type)
print(f"{padding_type} padding: {model_inputs}\n")

padding_type = "max_length"  # The defined max
model_inputs = tokenizer(sequences, padding=padding_type, max_length=8)
print(f"{padding_type} padding: {model_inputs}\n")

# Truncate: by default it's max_length of the model but we can redefined the max
model_inputs = tokenizer(sequences, truncation=True, padding=padding_type, max_length=8)
print(f"Truncated: {model_inputs}\n")

# Return a tensor
model_inputs = tokenizer(sequences, padding=padding_type, max_length=16, return_tensors="pt")
print(f"max_length padding, max_length = 16, return a PT tensor: \n{model_inputs}\n")

No padding: {'input_ids': [[101, 146, 112, 1396, 1151, 2613, 1111, 170, 20164, 10932, 2271, 7954, 1736, 1139, 2006, 1297, 119, 102], [101, 1573, 1138, 146, 106, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

longest padding: {'input_ids': [[101, 146, 112, 1396, 1151, 2613, 1111, 170, 20164, 10932, 2271, 7954, 1736, 1139, 2006, 1297, 119, 102], [101, 1573, 1138, 146, 106, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}

max_length padding: {'input_ids': [[101, 146, 112, 1396, 1151, 2613, 1111, 170, 20164, 10932, 2271, 7954, 1736, 1139, 2006, 1297, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

# All together: From tokenizer to model

In [61]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
inputs = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# Get the output
output = model(**inputs)
print(output)

# Get prediction
predictions = torch.nn.functional.softmax(output.logits, dim=-1)
print(predictions)

# Get labels
print(model.config.id2label)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9137]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([[4.0195e-02, 9.5980e-01],
        [5.3534e-04, 9.9946e-01]], grad_fn=<SoftmaxBackward0>)
{0: 'NEGATIVE', 1: 'POSITIVE'}


# Fine-tunning a pretrained model

## Processing the data

### The load_dataset lib

In [33]:
from datasets import load_dataset

raw_datasets = load_dataset(**dataset_name)  # first time it will download the data
raw_datasets

Found cached dataset glue (C:/Users/yuwei/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [34]:
raw_datasets['train'][:2]

{'sentence1': ['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
  "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion ."],
 'sentence2': ['Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
  "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 ."],
 'label': [1, 0],
 'idx': [0, 1]}

In [35]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Option 1(bad): process directly. This returns a dict with values in lists of lists,
# instead of a dataframe-like class
tokenized_datasets = tokenizer(raw_datasets['train']['sentence1'], 
                              raw_datasets['train']['sentence2'],
                              padding=True,
                              truncation=True,
                             )
## View in pandas
# import pandas as pd
# tmp = pd.DataFrame(tokenized_dataset.values(), index=tokenized_dataset.keys()).T

In [36]:
# Option 2(good)
def tokenize_function(example):
    return tokenizer(example['sentence1'], example['sentence2'], truncation=True)

In [37]:
tokenize_function(raw_datasets['train'][0]).keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [38]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

Loading cached processed dataset at C:\Users\yuwei\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-5e7bb59e1000f23e.arrow
Loading cached processed dataset at C:\Users\yuwei\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-c41a5c23770baece.arrow
Loading cached processed dataset at C:\Users\yuwei\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-039385fc50c4881c.arrow


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

### Dynamic padding and collator

In [39]:
from transformers import DataCollatorWithPadding
# A tokenizer instance is needed to know which padding token to use,
# and whether the model expects padding to be on the left or on the right of the inputs
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [41]:
# After slicing, Dataset object returns a dict
samples = tokenized_datasets['train'][:8]
samples = {k: v for k, v in samples.items() if k not in ['idx', 'sentence1', 'sentence2']}
print(samples.keys())
print(f'''length of original sequence: {[len(x) for x in samples['input_ids']]}''')

dict_keys(['label', 'input_ids', 'token_type_ids', 'attention_mask'])
length of original sequence: [50, 59, 47, 67, 59, 50, 62, 32]


In [42]:
# The input of data_collator has to be a dict, not a Dataset object
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

### All together

In [43]:
import datasets

#### Get dataset information 

In [44]:
# two ways of getting info
# Use dataset_name_sst2 ('glue', 'sst2')
info_glue = datasets.get_dataset_infos('glue')
print(info_glue['sst2'].description)

ds_builder = datasets.load_dataset_builder(**dataset_name_sst2)
print(ds_builder.info.description)  # feature, name, etc.

GLUE, the General Language Understanding Evaluation benchmark
(https://gluebenchmark.com/) is a collection of resources for training,
evaluating, and analyzing natural language understanding systems.


GLUE, the General Language Understanding Evaluation benchmark
(https://gluebenchmark.com/) is a collection of resources for training,
evaluating, and analyzing natural language understanding systems.




In [46]:
ds_builder.info?

#### The processing

In [47]:
raw_datasets = load_dataset(**dataset_name_sst2)
raw_datasets

Found cached dataset glue (C:/Users/yuwei/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [48]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(sample):
    return tokenizer(sample['sentence'], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function)

Loading cached processed dataset at C:\Users\yuwei\.cache\huggingface\datasets\glue\sst2\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-9652eec03887107f.arrow
Loading cached processed dataset at C:\Users\yuwei\.cache\huggingface\datasets\glue\sst2\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-e1f87f6532a2e09f.arrow


Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

  arrays.append(pa.array(typed_sequence))


In [49]:
ds_collator = DataCollatorWithPadding(tokenizer)
samples = tokenized_datasets['validation']
samples = samples.remove_columns(['sentence', 'idx',])
# Slicing or use to_dict to convert to a dict
batch = ds_collator(samples[:4])

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [50]:
batch

{'input_ids': tensor([[  101,  2009,  1005,  1055,  1037, 11951,  1998,  2411, 12473,  4990,
          1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  4895, 10258,  2378,  8450,  2135, 21657,  1998,  7143,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  4473,  2149,  2000,  3246,  2008, 13401,  2003, 22303,  2000,
         28866,  1037,  2350,  2476,  2004,  1037,  3293,  2664,  1999, 15338,
          3512, 12127,  1012,   102,     0,     0,     0,     0],
        [  101,  1996,  3772,  1010, 12703,  1010,  2189,  1010, 16434,  1998,
          2614,  2024,  2035,  2004, 24826, 15683,  2445,  1996,  2537,  1005,
          1055, 17151,  3334,  2063,  2334,  2229,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

## Load the data again for training below

In [51]:
raw_datasets = load_dataset(**dataset_name)

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(sample):
    return tokenizer(sample['sentence1'], sample['sentence2'], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function)

ds_builder = datasets.load_dataset_builder(**dataset_name_sst2)
num_labels = ds_builder.info.features['label'].num_classes 

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Found cached dataset glue (C:/Users/yuwei/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\yuwei\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-6805c5561369e395.arrow
Loading cached processed dataset at C:\Users\yuwei\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-7828b119295c81b3.arrow
Loading cached processed dataset at C:\Users\yuwei\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-dc0ea7dc358506f3.arrow


## Create the trainer args 

In [52]:
# Create the trainer arg object
from transformers import TrainingArguments
training_args = TrainingArguments("test-trainer")  # this is the output path

# Create the model
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

## Train with the Trainer API

### The follows run a long while on my X1 so are executed on colab
https://colab.research.google.com/drive/1_orJU8HBXfE24HhtOPVyVZR6spNQJ515?usp=sharing

### Train without eval
The eval_dataset is not used below

In [53]:
# Define the trainer
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

### Define the compute_metrics function

In [54]:
predictions = trainer.predict(tokenized_datasets['validation'])
print(predictions.predictions.shape, predictions.label_ids.shape)

import numpy as np
import evaluate

def compute_metrics(eval_preds):
    """computer model performance metrics
    Args:
        eval_preds: a tuple of (logits_predictions, labels)
    Return:
        a dict of {'accuracy', 'f1}
    """
    
    evaluate.load(dataset_name['path'], dataset_name['name'])
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds, references=labels)

compute_metrics((predictions.predictions, predictions.label_ids))

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 

### Train with eval

In [None]:
training_args = TrainingArguments('test-trainer', evaluation_strategy='epoch')
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

## Train without the Trainer API

### Postprocessing the tokenized datasets
- Remove the columns corresponding to values the model does not expect (like the sentence1 and sentence2 columns).
- Rename the column label to labels (because the model expects the argument to be named labels).
- Set the format of the datasets so they return PyTorch tensors instead of lists.

In [55]:
tokenized_datasets = tokenized_datasets.remove_columns(['sentence1', 
                                                        'sentence2',
                                                        'idx']
                                                      )
tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')
tokenized_datasets.set_format('torch')
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

### Batch using DataLoader

In [56]:
from torch.utils.data import DataLoader
batch_size = 8
train_dataloader = DataLoader(
    tokenized_datasets['train'], shuffle=True, batch_size=batch_size, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets['validation'], shuffle=True, batch_size=batch_size, collate_fn=data_collator
)

In [57]:
for batch in train_dataloader:
    break
print(batch.keys())
print([i.shape for i in batch.values()])

dict_keys(['labels', 'input_ids', 'token_type_ids', 'attention_mask'])
[torch.Size([8]), torch.Size([8, 71]), torch.Size([8, 71]), torch.Size([8, 71])]


### Training

In [59]:
from transformers import AutoModelForSequenceClassification, AdamW, get_scheduler
import torch
from tqdm.auto import tqdm
from accelerate import Accelerator

accelerator = Accelerator()

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)
# outputs = model(**batch)
# print(outputs.loss, outputs.logits.shape)

# Adam with weight decay regularization
optimizer = AdamW(model.parameters(), lr=5e-5)

# Use GPU
# 1. If working on one CPU or GPU
# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# model.to(device)
# print(device)
# 2. If using accelerator to work on multiple GPUs or TPUs
# This will wrap those objects in the proper container to make sure 
# your distributed training works as intended.
train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

# learning rate
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)  # epoch * batches
lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
# print(num_training_steps)

progress_bar = tqdm(range(num_training_steps))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

  0%|          | 0/1377 [00:00<?, ?it/s]

In [22]:
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        
        # 1. If working on one CPU or GPU
#         batch = {k: v.to(device) for k, v in batch.items()}  # put batch to device
#         outputs = model(**batch)
#         loss = outputs.loss
#         loss.backward()
        
        # 2. If using accelerator to work on multiple GPUs or TPUs
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        break
    break

  0%|          | 0/1377 [00:00<?, ?it/s]

### Eval

In [25]:
import evaluate

metric = evaluate.load('glue', 'mrpc')
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k,v in batch.items()}
    with torch.no_grad():  # do not calculate gradients since we are doing eval and not backward
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch['labels'])
    break
    
metric.compute()

{'accuracy': 0.75, 'f1': 0.8571428571428571}

# Sharing models

In [63]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Using the push_to_hub API
integrated in the transformers lib

### Push a model to hub during training

In [65]:
from transformers import TrainingArguments
from transformers import Trainer

# push the model to the hub each epoch (when the model is saved)
# This works with the Trainer API
training_args = TrainingArguments(
    "tutorial", 
    save_strategy="epoch", 
    push_to_hub=True,
#     hub_model_id = "my_organization/my_repo_name",  # For orgs
)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

# Do a final push after the trainer is done. This generates the Model Card page
trainer.push_to_hub("End of trainer")

### A dummy example

In [66]:
from transformers import AutoModelForMaskedLM, AutoTokenizer

checkpoint = "camembert-base"

model = AutoModelForMaskedLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

In [67]:
tokenizer.push_to_hub("dummy-model")
model.push_to_hub("dummy-model")

# # Or with orgs and a specific token
# tokenizer.push_to_hub("dummy-model", organization="huggingface", use_auth_token="<TOKEN>")

# # If there are lables
# label_names = raw_datasets["train"].features["label"].names
# model.config.id2label = {str(i): lbl for i, lbl in enumerate(label_names)}
# model.config.label2id = {lbl: str(i) for i, lbl in enumerate(label_names)}

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/yuwei2342/dummy-model/commit/9eb4e1680c0421e1c262ca850d3707f390f5e151', commit_message='Upload CamembertForMaskedLM', commit_description='', oid='9eb4e1680c0421e1c262ca850d3707f390f5e151', pr_url=None, pr_revision=None, pr_num=None)

## Using the huggingface_hub lib

In [82]:
from huggingface_hub import (
    # User management
    login, logout, whoami,
    # Repo creation and management
    create_repo, delete_repo, update_repo_visibility,
    # Info
    list_models, list_datasets, list_metrics, list_repo_files, upload_file, delete_file,
    # git-like repo management
    Repository,
)

In [77]:
delete_repo("dummy-model2")
create_repo(
    "dummy-model2", 
#     private=True,
#     token="some-token",
#     repo_type="space",      
)

RepoUrl('https://huggingface.co/yuwei2342/dummy-model2', endpoint='https://huggingface.co', repo_type='model', repo_id='yuwei2342/dummy-model2')

In [81]:
upload_file(
    path_or_fileobj="tokenizer_config.json",
    path_in_repo="tokenizer_config.json",
    repo_id="yuwei2342/dummy-model2",
)

'https://huggingface.co/yuwei2342/dummy-model2/blob/main/tokenizer_config.json'

In [None]:
# # Git-like commands
# repo = Repository("<path_to_dummy_folder>", clone_from="<namespace>/dummy-model")
# repo.git_pull()
# repo.git_add()
# repo.git_commit("Add model and tokenizer files")
# repo.git_push()

## Using git

In [None]:
# # Install git large file system from https://git-lfs.com/

# # Initiailize git-lfs
# !git lfs install

# # Then all the git stuff
# git clone https://huggingface.co/<namespace>/<your-model-id>
# git lfs status
# git add
# git commit ....