<a href="https://colab.research.google.com/github/xpdlaldam/nlp/blob/master/Hugging%20Face/transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1. pipeline**

In [None]:
!pip install datasets gradio evaluate transformers[sentencepiece]

# Libraries

In [None]:
from transformers import pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm

## 1-1. sentiment analysis

In [None]:
### sentiment-analysis
# No model argument is passed, so the checkpoint is left to the pipeline.
classifier = pipeline("sentiment-analysis")

# Probe sentences: one positive, one negative, one meant to read as neutral.
sents = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
    "neutral i'd say"
    ]
# A single string (e.g. sents[2]) can be scored on its own;
# passing the list scores every sentence in one call.
classifier(sents) # simultaneous

## 1-2. customize labels

In [None]:
### zero-shot-classification: the candidate labels are passed with each call
classifier = pipeline("zero-shot-classification")

sents = ["this is biology 101", "president trump", "capex was over 1B this time"]
topics = ["education", "politics", "business"]

# every sentence is scored against every candidate label
classifier(sents, candidate_labels=topics)

## 1-3. generate text

In [None]:
# Text generation with the distilgpt2 checkpoint.
# Other checkpoint noted by the author: deepset/roberta-base-squad2
generator = pipeline("text-generation", model="distilgpt2")
# The prompt is continued by the model; the result is displayed as the cell output.
generator("summarize AMD's most recent financial report")

In [None]:
# Only a model id is given; no task name is passed to pipeline().
pipe = pipeline(model="FacebookAI/roberta-large-mnli")

# Same three probe sentences as in the sentiment-analysis cell.
sents = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
    "neutral i'd say"
    ]

# Run all sentences through the pipeline in one call.
pipe(sents)

## 1-4. speech recognition

In [None]:
import torch

# Use the first GPU when one is visible; -1 keeps the pipeline on the CPU so the
# cell also runs on a CPU-only runtime instead of failing on a hard-coded device=0.
asr_device = 0 if torch.cuda.is_available() else -1
pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=asr_device)
dataset = load_dataset("superb", name="asr", split="test")

# KeyDataset (only *pt*) will simply return the item in the dict returned by the dataset item
# as we're not interested in the *target* part of the dataset. For sentence pair use KeyPairDataset
for out in tqdm(pipe(KeyDataset(dataset, "file"))):
    print(out)
    # {"text": "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD NIGHT HUSBAND"}
    # {"text": ....}
    # ....

In [None]:
# Show the `dataset` object returned by load_dataset in the speech-recognition cell.
dataset

In [None]:
from huggingface_hub import list_datasets

# Listing every dataset on the Hub is slow and floods the output, so only a
# small sample of ids is requested. The loop variable is not called `dataset`
# to avoid confusion with the `dataset` object loaded earlier.
print([info.id for info in list_datasets(limit=20)])

In [None]:
# Load the train split of the "ko-KR" configuration of PolyAI/minds14 and display it.
minds = load_dataset("PolyAI/minds14", name="ko-KR", split="train")
minds

In [None]:
# Look at the first example of `minds`.
minds[0]

In [None]:
# int2str of the "intent_class" feature is kept as `id2label`; it is called with an integer class id.
id2label = minds.features["intent_class"].int2str
# Label of the first example's intent class.
id2label(minds[0]["intent_class"])

In [None]:
# Shuffle with a fixed seed so the example shown is the same on every run.
minds.shuffle(seed=42)[0]

In [None]:
import gradio as gr

def generate_audio():
    """Return the first `minds` example as ((sampling_rate, samples), intent_label)."""
    first = minds[0]
    clip = first["audio"]
    # sampling rate (Hz) first, then the array of numbers that represents the sound
    waveform = (clip["sampling_rate"], clip["array"])
    intent = id2label(first["intent_class"])
    return waveform, intent


# One audio player inside a column: gr.Audio receives the (sampling_rate, samples)
# tuple as its value and the intent label as its caption. The former
# single-iteration loop and the unused `output` binding were dropped.
with gr.Blocks() as demo:
    with gr.Column():
        audio, label = generate_audio()
        gr.Audio(audio, label=label)

demo.launch(debug=True)

In [None]:
# Take the first example and display its "audio" entry.
example = minds[0]
example['audio']

In [None]:
## visualize the waveform of the first example
import librosa
import librosa.display
import matplotlib.pyplot as plt

example = minds[0]
clip = example["audio"]

# one figure, 12 inches wide, with a single axes that the waveform is drawn onto
fig, ax = plt.subplots()
fig.set_figwidth(12)
librosa.display.waveshow(clip["array"], sr=clip["sampling_rate"], ax=ax)

## 1-5. Fill in the blank

In [None]:
from transformers import pipeline

# Fill-mask pipeline with no explicit model; <mask> marks the blank to be filled in.
unmasker = pipeline("fill-mask")
# top_k=5 is passed to ask for five candidate fills.
unmasker("The cheapest flight from EWR to ICN is <mask>.", top_k=5)

In [None]:
# Named-entity recognition. aggregation_strategy="simple" groups the tokens of
# one entity into a single result; it replaces the deprecated
# grouped_entities=True flag, which mapped to the same strategy.
ner = pipeline("ner", aggregation_strategy="simple")
ner("Elon Musk and Trump are on the same boat")

In [None]:
# Translation pipeline with the Helsinki-NLP/opus-mt-tc-big-ko-en checkpoint
# (presumably Korean -> English, going by the checkpoint name); the input is a Korean sentence.
pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-big-ko-en")
pipe("설악산은 한국의 100대 명산 중 하나이다")

# 2. The Transformer architecture

In [None]:
# The Transformer model is based on the paper "Attention Is All You Need"
# "Attention Is All You Need" describes an encoder-decoder (aka sequence-to-sequence transformer) model made up of encoders and decoders
# The encoder "encodes" text into numerical representations
# These numerical representations are also called "embeddings" or "features"
# The decoder "decodes" the representations from the encoder

## 2-1. Encoder models

In [None]:
# ex) Welcome to Korea => each word has its own numerical representation (aka feature vector or tensor) consisting of a sequence of numbers
# ex) "Welcome" corresponds to [.1, .3, .2, ...]

# ex) BERT

# The feature vector looks at the left and right of the word, hence capturing "context" of the sentence thanks to the self-attention mechanism

# Encoder models are good at obtaining an understanding of sequences and the relationship b/w words
  # Extracting meaningful information
  # Masked Language Modeling: guessing a randomly masked word
  # Classification => Sentiment analysis ex) is the sentence positive or negative

## 2-2. Decoders

In [None]:
### great at:
## uni-directional: access to either the left or right context

## causal tasks: guesses the next word in a sequence (auto-regressive)
# ex) my => name
# my name => is
# my name is => Peter

## generating sequences

# words can only see the words on their left side; the right side is hidden
# => means unidirectional

## 2-3. Encoder-Decoder

In [None]:
## aka sequence-to-sequence model
# step 1) the encoder takes a sequence of words such as "Welcome to Korea"
# step 2) the encoder outputs a numerical representation for each word
# step 3) the decoder takes the first word Welcome as an input
# step 4) the decoder outputs the second word based on the first word (at this point we don't need to use the encoder)

##
# the encoder takes care of understanding the sequence
# the decoder takes care of generating a sequence according to the understanding of the encoder, hence it "decodes"
# the weights are not necessarily shared b/w an encoder and a decoder

## good at:
# summarizing text

# 3. How does the pipeline function work?

In [None]:
## The pipeline has 3 stages: tokenizer -> model -> postprocessing
## 1. Tokenizer
# converts raw text into input ids
# ex) hi my name is => [101, 2342, 1212, 2357]
# the AutoTokenizer class can load the tokenizer for any checkpoint (language model)
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_inputs = [
    "i'm pretty hungry right now",
    "i need food now",
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
inputs

# Arguments passed to the tokenizer:
# padding=True: pad the sentences so that they all have the same length
# truncation=True: ensure no sentence exceeds the maximum length the model can handle
# return_tensors="pt": pt means PyTorch tensors
# The result is a dictionary with two keys:
# input_ids: one row per sentence => unique identifiers of the tokens of that sentence
# attention_mask: indicates where padding was applied so the model does not pay attention to it

In [None]:
## 2. Model
# The input ids produced by the tokenizer are passed to the model.
# AutoModel is used here, and the cell inspects its "hidden states" (aka features).
# NOTE(review): the hidden states are usually the input to another part of the model,
# known as the "head"; the logits are read from the model with a sequence
# classification head in the next cell, not from this one.
# For each model input (a sentence in this case) we get a high-dimensional vector that
# represents the contextual understanding of that input. The output has three dimensions:
# batch size: number of sequences (sentences in this case)
# sequence length: length of each (padded) sequence
# hidden size: the vector dimension of each model input
from transformers import AutoModel

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(checkpoint)

# `inputs` is the dictionary produced by the tokenizer cell; ** unpacks it into keyword arguments
outputs = model(**inputs)

# shape noted by the author: [2, 9, 768]
# 2: number of sequences
# 9: sequence length
# 768: vector dimension of each model input
outputs.last_hidden_state.shape

In [None]:
## 3. Postprocessing
# For our example, we need a model with a sequence classification head
from transformers import AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)

# The dimensionality is much lower now: the model head took the high-dimensional
# vectors and produced a 2 x 2 tensor
# (two sequences, one value per label => positive and negative for each sequence)
print(outputs.logits.shape)

# Note that these numbers are logits (the raw, unnormalized scores), not probabilities.
# They need to go through a SoftMax layer to be converted to probabilities.
# Q. Why do all Transformers models output logits?
# A. Because the loss function used for training generally fuses the last activation
# function (such as SoftMax) with the actual loss (such as cross entropy).
print(outputs.logits)
import torch
# dim=-1: the softmax is taken over the last dimension, i.e. across the labels of each sequence
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

In [None]:
# id2label from the model config: shows which label each output index stands for
model.config.id2label

# 4. How To Instantiate a Transformers model

##

In [None]:
### How to load a pretrained configuration / pretrained weights
## Method 1: AutoConfig loads the configuration of the checkpoint
# (the returned object is neither stored nor displayed here)
from transformers import AutoConfig
AutoConfig.from_pretrained("bert-base-cased")

## Method 2: build the model itself from the checkpoint with BertModel.from_pretrained
from transformers import BertConfig, BertModel
model = BertModel.from_pretrained("bert-base-cased")

In [None]:
## Config file: a blueprint that contains all the info needed to create the model architecture
# the loaded configuration is displayed as the cell output
BertConfig.from_pretrained("bert-base-cased")

In [None]:
## to use a different number of layers, ex) 10 layers instead of 12
# the modified configuration has to be kept and passed to BertModel as an instance
# (passing the BertConfig class itself does not work)
bert_config_10_layers = BertConfig.from_pretrained("bert-base-cased", num_hidden_layers=10)

# however, a model built from a config alone starts with randomly assigned weights
BertModel(bert_config_10_layers)

# 5. Tokenizers

## 5-1. Tokenizers convert raw text to numbers called "encoding"

In [None]:
# the package is called `transformers` (plural)
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

seq1 = "what's it like living in Ireland"

# calling the tokenizer directly performs the whole encoding
print(tokenizer(seq1))

# the same conversion shown as two explicit steps
tokens = tokenizer.tokenize(seq1) # step 1. split the text into tokens
print(tokens)

print(tokenizer.convert_tokens_to_ids(tokens)) # step 2. convert the tokens to input IDs

In [None]:
seq2 = "how's it like living in Korea"

# tokenize the sentence defined in this cell (the cell previously re-tokenized seq1)
tokens = tokenizer.tokenize(seq2)
print(tokens)

# input IDs of the seq2 tokens
print(tokenizer.convert_tokens_to_ids(tokens))

In [None]:
## Decoding is the opposite of encoding: it converts vocabulary indices (input IDs) back to text
# This behavior is very useful for models that predict new text (e.g., text generation from a prompt, seq-to-seq problems like translation or summarization)
tokenizer.decode([1293, 112, 188, 1122, 1176, 1690, 1107, 3577])

# 6. Handling Multiple Sequences

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

## 1. Choose the checkpoint (used for both the tokenizer and the model)
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

## 2. Define Tokenizer based on defined checkpoint in 1.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

## 3. Define sentiment analysis model => will be task-specific
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

## 4. Make seq test case
seq = "I've never been to Ireland. I'ld like to go someday"

## 5. Apply tokenizer defined in 2. => split the text into tokens
tokens = tokenizer.tokenize(seq)

## 6. Convert tokens to numbers (input IDs)
ids = tokenizer.convert_tokens_to_ids(tokens)

## 7. Wrap the ids in a list so the tensor gets a leading batch dimension
# (the model is called with a batch of sequences, here a batch of one)
input_ids = torch.tensor([ids])
print(input_ids)

## Logit
outputs = model(input_ids)

## Convert logit to probability using softmax
preds = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(preds)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

## 1. Choose the checkpoint (used for both the tokenizer and the model)
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

## 2. Define Tokenizer based on defined checkpoint in 1.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

## 3. Define sentiment analysis model => will be task-specific
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

## 4. Make seq test case: two sentences handled as one batch
seqs = ["I've been waiting very long to finally meet you", "I missed you so much"]

## 5. Apply tokenizer defined in 2.
# padding / truncation / return_tensors="pt" are requested so both sentences end up in one PyTorch batch
tokens = tokenizer(seqs, padding=True, truncation=True, return_tensors="pt")
# ** unpacks the tokenizer output into keyword arguments of the model call
output = model(**tokens)
print(output)

# softmax over the last dimension of the logits
preds = torch.nn.functional.softmax(output.logits, dim=-1)
print(preds)

# 7. Fine-Tuning a Pretrained Model

## 7-1. Preprocessing

In [None]:
## goal: given a pair of sentences, the model should tell whether they are paraphrases
from datasets import load_dataset

# "mrpc" configuration of the "glue" dataset; the returned object is displayed below
raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

In [None]:
# Pick the "train" split and look at its first example.
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

In [None]:
# Show the `features` attribute of the train split.
raw_train_dataset.features

In [None]:
### Tokenizing sentence pairs
# Each example is a pair of sentences, so both sentences are handed to the tokenizer together.
# Caution: tokenizing whole columns in one direct call stores the result in RAM,
# which can make the notebook crash.
# Instead, Dataset.map() keeps the result as a "dataset" of the Datasets library
# (Apache Arrow files stored on the disk).
# Exception: when training on a TPU, fixed shapes are preferred, even when that requires extra padding.

from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

## in-RAM variant: works, but takes significantly more time as the data is stored in RAM
# tokenized_dataset = tokenizer(
#     raw_datasets["train"]["sentence1"],
#     raw_datasets["train"]["sentence2"],
#     padding=True,
#     truncation=True,
# )

## preferred variant: Dataset.map()
# * Padding is skipped at this step: padding the entire data to the max length is inefficient.
#   It is done per batch instead, aka "dynamic padding".
# * Dynamic padding is implemented by a "collate function", which puts the samples of a batch together.
def tokenize_function(example):
    """Tokenize the "sentence1"/"sentence2" entries of `example` as a pair, with truncation on."""
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

In [None]:
# First training example after tokenization.
tokenized_datasets['train'][0]

In [None]:
from transformers import DataCollatorWithPadding
# The collator is built around the tokenizer loaded above; it is used further down to pad a batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator

In [None]:
# Column names of the tokenized train split.
tokenized_datasets["train"].column_names

In [None]:
## take the first five training examples and treat them as our first batch
first_five = tokenized_datasets["train"][:5]

## idx, sentence1 and sentence2 are left out as we can't create tensors w/ strings
skipped_columns = ("idx", "sentence1", "sentence2")
samples = {
    name: values
    for name, values in first_five.items()
    if name not in skipped_columns
}

## the lengths vary; with dynamic padding the whole batch should be padded to
## its longest sequence (67 according to the author's note)
list(map(len, samples["input_ids"]))

In [None]:
## check whether the dynamically padded length is 67 for this batch
# the collator turns the sample dict into a batch; the shape of every entry is listed
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

## 7-2. Fine-tuning a model with the Trainer API

In [None]:
pip install datasets

In [None]:
## recap of section 7-1: load MRPC, tokenize the sentence pairs, set up dynamic padding
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "bert-base-uncased"
raw_datasets = load_dataset("glue", "mrpc")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the "sentence1"/"sentence2" entries of `example` as a pair, with truncation on."""
    pair = (example["sentence1"], example["sentence2"])
    return tokenizer(*pair, truncation=True)

## Dataset.map() applies the tokenizer while keeping the data as a "dataset"
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

## DataCollatorWithPadding takes care of the dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Display the collator built in the previous cell.
data_collator

### 7-2-1. Method 1: w/o using evaluation metrics during training

In [None]:
### Step 1
## Define the training setup using TrainingArguments
# contains all the hyperparameters
# here the only argument we pass is the directory where the trained model will be saved
from transformers import TrainingArguments
training_args = TrainingArguments("test-trainer")

# list of all the available parameters for training
training_args

In [None]:
### Step 2
## Define model
## Caveat: we get a warning as the model we chose (BERT) hasn't been trained on classifying
# pairs of sentences
from transformers import AutoModelForSequenceClassification
# num_labels=2: the classification model is created with two labels
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model

In [None]:
### Step 3
## Bundle all the objects constructed up to now into a Trainer
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    # passed explicitly; per the author's note DataCollatorWithPadding is also the Trainer's default
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
)
trainer

In [None]:
### Step 4
## Fine-tune
# TODO: this takes too long => change the training parameters and retry
trainer.train()

In [None]:
### Step 5
## Evaluation
predictions = trainer.predict(tokenized_datasets["validation"])

# predict() returns a named tuple: predictions, label_ids, metrics (loss, execution time etc)
# shapes noted by the author: (408, 2) and (408,)
print(predictions.predictions.shape, predictions.label_ids.shape)

## why argmax()?
# predictions.predictions holds one row of logits per example, one column per class.
# np.argmax over the last axis returns, for each row, the index of the largest logit,
# i.e. which of the two classes was predicted; that index can be compared with the labels.
import numpy as np
preds = np.argmax(predictions.predictions, axis=-1)

# we now compare preds to the labels using the GLUE/MRPC metric
import evaluate
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

### 7-2-2. Method 2: by using evaluation metrics during training

In [None]:
# imported here as well so the cell does not depend on the evaluation cell of method 1
import evaluate
import numpy as np

def compute_metrics(eval_preds):
    """Score one evaluation pass with the GLUE/MRPC metric.

    Args:
        eval_preds: pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the argmax predictions.
    """
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    # the predicted class is the index of the largest logit in each row
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

## eval_strategy="epoch": tells the Trainer to evaluate at the end of every epoch
# (`eval_strategy` is the current name of the deprecated `evaluation_strategy` argument)
training_args = TrainingArguments("test-trainer", eval_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    # same keyword as in method 1; `tokenizer=` is the deprecated spelling
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

## 7-3. Full Training using PyTorch

In [18]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the "sentence1"/"sentence2" entries of `example` as a pair, with truncation on."""
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

## apply tokenizer keeping the data as a "dataset" using the Dataset.map() method
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

## dynamic padding: the DataLoader cell of this section uses `data_collator` as its
## collate_fn, so it is built here instead of relying on the one left over from section 7-2
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [19]:
# Remove the sentence1 / sentence2 / idx columns, rename "label" to "labels",
# then switch the dataset format to "torch".
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")

# columns that are left for training
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [20]:
from torch.utils.data import DataLoader

# Both loaders share the batch size and the padding collator; only the training data is shuffled.
loader_kwargs = dict(batch_size=8, collate_fn=data_collator)
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(tokenized_datasets["validation"], **loader_kwargs)

In [21]:
# Pull a single batch from the training loader and list the shape of every entry in it.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 63]),
 'token_type_ids': torch.Size([8, 63]),
 'attention_mask': torch.Size([8, 63])}

In [22]:
from transformers import AutoModelForSequenceClassification
# Sequence classification model created from `checkpoint` with two labels.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Forward pass on the sampled batch; the batch contains "labels", and the output carries a loss next to the logits.
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.6756, grad_fn=<NllLossBackward0>) torch.Size([8, 2])
