<a href="https://colab.research.google.com/github/ua-datalab/NLP-Speech/blob/main/Introduction_to_transformers/training_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction to Transformers

![Promotional still from the movie Transformers One](https://www.sky.de/static/img/filmhighlights/sky_25-03_transformer-one_hero_s.jpg)

## Slide deck: https://docs.google.com/presentation/d/1mL4ygUlKjNWPSqCiG9GqIQ4iCATj_JuGYfv0xw6VEvM/edit?usp=sharing




# Let’s train some transformers:

In [None]:
pip install transformers

In [None]:
from transformers import pipeline

# Build a ready-to-use sentiment classifier. The checkpoint is named explicitly
# (the same one loaded by hand in the cells below) instead of relying on the
# task's implicit default, so results stay reproducible and the
# "No model was supplied" warning is avoided.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# The pipeline accepts a list of sentences; the value of this last expression
# is displayed as the cell output.
classifier(
    [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
    ]
)

In [None]:
# Simplest possible tokenization: cut the sentence on whitespace.
sentence = "This movie was really scary"
tokenized_text = sentence.split()
print(tokenized_text)

['This', 'movie', 'was', 'really', 'scary']


In [None]:
from transformers import AutoTokenizer

# Hub identifier of the checkpoint whose tokenizer files (config + vocabulary)
# are downloaded on first use.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

# AutoTokenizer selects the tokenizer class that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
# Two example sentences of different lengths, processed together as one batch.
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

# padding=True pads the shorter sentence up to the longest one in the batch,
# truncation=True cuts over-long inputs, and return_tensors="pt" asks for
# PyTorch tensors.
tokenizer_options = {
    "padding": True,
    "truncation": True,
    "return_tensors": "pt",
}
inputs = tokenizer(raw_inputs, **tokenizer_options)
print(inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [None]:
from transformers import AutoModel

# Same checkpoint as the tokenizer above, repeated so this cell is readable on
# its own.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

# AutoModel loads the base model, whose output exposes `last_hidden_state`
# (used in the next cell).
model = AutoModel.from_pretrained(
    checkpoint,
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
import torch

# Inference only: run the forward pass without recording an autograd graph,
# which saves memory and time because no gradients are needed here.
with torch.no_grad():
    outputs = model(**inputs)

# One hidden vector per token: (batch size, sequence length, hidden size).
print(outputs.last_hidden_state.shape)

torch.Size([2, 16, 768])


In [None]:
import torch
from transformers import AutoModelForSequenceClassification

# Load the same checkpoint again, this time through the sequence-classification
# class, whose output exposes `logits` (one score per label).
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Inference only: skip autograd bookkeeping since no gradients are needed.
with torch.no_grad():
    outputs = model(**inputs)

In [None]:
# Raw, unnormalized scores: one row per input sentence, one column per label.
logits = outputs.logits
print(logits.shape)
print(logits)

torch.Size([2, 2])
tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>)


In [None]:
import torch
from torch.nn import functional as F

# Turn the logits into probabilities; dim=-1 normalizes across the label axis
# so that every row sums to 1.
predictions = F.softmax(outputs.logits, dim=-1)
print(predictions)


tensor([[4.0195e-02, 9.5980e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)


In [None]:
# Mapping from class index (column of the logits/probabilities) to label name.
id2label = model.config.id2label
id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

In [None]:
# Convert the probability tensor to nested Python lists: one row of per-label
# probabilities for each input sentence.
out = predictions.tolist()

print(raw_inputs)
for row in out:
    # The position of the largest probability in the row is the predicted
    # class id, which id2label turns into a readable label.
    index = row.index(max(row))
    print(model.config.id2label[index])


["I've been waiting for a HuggingFace course my whole life.", 'I hate this so much!']
POSITIVE
NEGATIVE


In [None]:
from transformers import pipeline, set_seed

# Text-generation pipeline backed by the original OpenAI GPT checkpoint.
generator = pipeline('text-generation', model='openai-gpt')

# Fix the random seed so the generated continuations are reproducible.
set_seed(42)

# truncation=True makes explicit the truncation that `max_length` otherwise
# triggers implicitly, which silences the "Truncation was not explicitly
# activated" warning. The prompt is far shorter than max_length, so it is
# not actually cut.
generator(
    "Hello, I'm a language model,",
    max_length=30,
    truncation=True,
    num_return_sequences=5,
)


config.json:   0%|          | 0.00/656 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/479M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/816k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/458k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': "Hello, I'm a language model, for'e's a very big one. they call me'e'n e'n''"},
 {'generated_text': 'Hello, I\'m a language model, " he said and leaned his left shoulder against the corner of the door. \n i couldn\'t take my'},
 {'generated_text': "Hello, I'm a language model,'came their reply. \n'it's a man thing! if you have three hundred thousand words,"},
 {'generated_text': 'Hello, I\'m a language model, " he said, the sound of his voice sending a shiver down her spine. \n " a language model'},
 {'generated_text': "Hello, I'm a language model,'he said.'you call me a'man out'in... well, er, in the"}]

In [None]:
!pip install spacy ftfy

In [None]:
from transformers import OpenAIGPTTokenizer, OpenAIGPTModel
import torch

# Load the OpenAI GPT tokenizer and base model (PyTorch weights).
tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
model = OpenAIGPTModel.from_pretrained("openai-gpt")

# Encode a single sentence as PyTorch tensors.
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

# Inference only: skip autograd bookkeeping since no gradients are needed.
with torch.no_grad():
    outputs = model(**inputs)

# One hidden vector per input token.
last_hidden_states = outputs.last_hidden_state
print(last_hidden_states)


tensor([[[ 0.4653,  0.0642,  0.5910,  ...,  0.1177, -0.0021, -1.2262],
         [-0.3697, -0.0957,  0.6613,  ..., -0.0344, -0.2164,  0.1205],
         [ 0.1700, -0.3252,  0.0407,  ...,  0.1589, -0.8057, -0.2830],
         [-0.3669, -0.0448,  0.8061,  ..., -0.0090, -0.0872, -0.5224],
         [-0.5047,  0.6522,  0.6932,  ...,  0.0811,  0.6475,  0.3190],
         [-0.2972,  0.0591,  1.2333,  ..., -0.7394, -0.2600,  0.0863]]],
       grad_fn=<ViewBackward0>)


In [None]:
from transformers import OpenAIGPTTokenizer, TFOpenAIGPTModel

# Same experiment as the previous cell, this time with the TensorFlow model
# class. Note that this rebinds `tokenizer`, `model`, `inputs` and `outputs`.
pretrained_name = "openai-gpt"
tokenizer = OpenAIGPTTokenizer.from_pretrained(pretrained_name)
model = TFOpenAIGPTModel.from_pretrained(pretrained_name)

# return_tensors="tf" asks the tokenizer for TensorFlow tensors; the encoded
# batch is handed to the model as a single argument.
text = "Hello, my dog is cute"
inputs = tokenizer(text, return_tensors="tf")
outputs = model(inputs)

last_hidden_states = outputs.last_hidden_state
print(last_hidden_states)
