# Handling multiple sequences (PyTorch)

Install the Transformers and Datasets libraries to run this notebook.

In [1]:
!pip install datasets transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 31.6 MB/s 
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 57.4 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 61.7 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 76.5 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for this my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(ids)
# This line will fail.
model(input_ids)

IndexError: ignored

In [4]:
# Correct way

input_ids = torch.tensor([ids])
print(input_ids)
model(input_ids)


tensor([[1045, 1005, 2310, 2042, 3403, 2005, 2023, 2026, 2878, 2166, 1012]])


SequenceClassifierOutput([('logits',
                           tensor([[-3.1463,  3.3041]], grad_fn=<AddmmBackward0>))])

In [5]:
tokenized_inputs = tokenizer(sequence, return_tensors="pt")
print(tokenized_inputs["input_ids"])

model(tokenized_inputs["input_ids"])


tensor([[ 101, 1045, 1005, 2310, 2042, 3403, 2005, 2023, 2026, 2878, 2166, 1012,
          102]])


SequenceClassifierOutput([('logits',
                           tensor([[-2.6078,  2.6051]], grad_fn=<AddmmBackward0>))])

In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for this my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

input_ids = torch.tensor([ids])
print("Input IDs:", input_ids)

output = model(input_ids)
print("Logits:", output.logits)

Input IDs: tensor([[1045, 1005, 2310, 2042, 3403, 2005, 2023, 2026, 2878, 2166, 1012]])
Logits: tensor([[-3.1463,  3.3041]], grad_fn=<AddmmBackward0>)


In [7]:
batched_ids = [
    [200, 200, 200],
    [200, 200]
]

In [8]:
torch.tensor(batched_ids)

ValueError: ignored

In [None]:
padding_id = 100

batched_ids = [
    [200, 200, 200],
    [200, 200, padding_id],
]

In [None]:
torch.tensor(batched_ids)

In [None]:
print(tokenizer.pad_token_id)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)
print(model(torch.tensor(batched_ids)).logits)

Use of Attention Mask

In [None]:
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
print(outputs.logits)

Try it out! Apply the tokenization manually on the two sentences used in section 2 (“I’ve been waiting for this my whole life.” and “I hate this so much!”). Pass them through the model and check that you get the same logits as in section 2. Now batch them together using the padding token, then create the proper attention mask. Check that you obtain the same results when going through the model!

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
             "I’ve been waiting for this whole life.", 
             "I hate this so much"
             ]
ids = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
print(ids)
output = model(**ids)
print(output.logits)



{'input_ids': tensor([[ 101, 1045, 1521, 2310, 2042, 3403, 2005, 2023, 2878, 2166, 1012,  102],
        [ 101, 1045, 5223, 2023, 2061, 2172,  102,    0,    0,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])}
tensor([[-2.3011,  2.2917],
        [ 4.2141, -3.4158]], grad_fn=<AddmmBackward0>)


In [10]:
id1 = tokenizer(sequences[0], padding=True, truncation=True, return_tensors="pt")
print(id1)
output1 = model(**id1)
print(output1.logits)



{'input_ids': tensor([[ 101, 1045, 1521, 2310, 2042, 3403, 2005, 2023, 2878, 2166, 1012,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
tensor([[-2.3011,  2.2917]], grad_fn=<AddmmBackward0>)


In [12]:
id2 = tokenizer(sequences[1], padding=True, truncation=True, return_tensors="pt")
print(id2)
output2 = model(**id2)
print(output2.logits)


{'input_ids': tensor([[ 101, 1045, 5223, 2023, 2061, 2172,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}
tensor([[ 4.2141, -3.4158]], grad_fn=<AddmmBackward0>)
