# 2. Transformers Library
## 2-1. Pipeline 내부 실행 과정

- Tokenizer > Model > Post processing
#### Tokenizer
    1. input을 token으로 분할
    2. 각 token을 integer로 mapping
    3. 모델에 유용할 수 있는 additional inputs 추가
#### Preprocessing

In [2]:
# Create Tokenizer
# Passing sentences to the tokenizer yields a Python dictionary that can be
# handed straight to the model.
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [3]:
# Tokenize a list of sentences in a single call and ask for PyTorch tensors.
# With padding enabled, the shorter sentence is padded so both rows of the
# resulting tensors have the same length.
raw_inputs = [
    "I've been waiting for a Hugging Face course my whole life.",
    "I hate this so much",
]
inputs = tokenizer(
    raw_inputs,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
print(inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662,  2227,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [4]:
# Load the model trained with the checkpoint above
from transformers import AutoModel

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# from_pretrained: downloads the same checkpoint the pipeline used and instantiates the model
model = AutoModel.from_pretrained(checkpoint)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:
# The output is a high-dimensional vector.
# (Batch size, Sequence length, Hidden size)

outputs = model(**inputs) # **: unpacks the dictionary so each key/value pair is passed as a keyword argument
print(f"output: {outputs.last_hidden_state.shape} == (Batch Size, Input Sequence len, Hidden Size)")


output: torch.Size([2, 16, 768]) == (Batch Size, Input Sequence len, Hidden Size)


In [10]:
# Display the whole output object; its fields can also be accessed by key, like a dictionary
outputs

BaseModelOutput(last_hidden_state=tensor([[[-0.0308,  0.1668,  0.6030,  ...,  0.0668,  0.5981, -0.1910],
         [ 0.4810,  0.4916,  0.1850,  ...,  0.1947,  0.8521, -0.1492],
         [ 0.9573,  0.1291,  0.3315,  ...,  0.4574,  0.0402, -0.8591],
         ...,
         [ 0.4769,  0.4884,  0.2280,  ..., -0.0569,  0.6654, -0.1550],
         [ 0.8315,  0.1056,  0.2724,  ...,  0.6057,  0.1320, -0.8378],
         [ 0.2346,  0.1887,  0.2689,  ...,  0.4420,  0.6957, -0.4821]],

        [[-0.3088,  0.7332, -0.1860,  ..., -0.1305, -0.9360, -0.0433],
         [-0.3340,  0.9830, -0.0946,  ..., -0.3825, -0.6176,  0.2008],
         [-0.1687,  0.8781, -0.1117,  ..., -0.2380, -0.7790,  0.0935],
         ...,
         [-0.3047,  0.7850, -0.2043,  ..., -0.1101, -0.7665, -0.0563],
         [-0.4014,  0.8999, -0.2245,  ..., -0.1877, -0.6910,  0.0329],
         [-0.2933,  0.7947, -0.2140,  ..., -0.1002, -0.7508, -0.0623]]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)

In [11]:
# Use a model that includes a sequence-classification head to classify sentences as positive/negative
from transformers import AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)          
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-3.1071,  3.2654],
        [ 4.2141, -3.4158]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [12]:
# The model head takes the high-dimensional vectors as input and outputs lower-dimensional vectors
print(outputs.logits.shape)

torch.Size([2, 2])


In [30]:
# The raw output values are not meaningful on their own, so convert them to probabilities
# logits (raw outputs) -> softmax -> probabilities
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(f"predictions: {predictions}")

predictions: tensor([[1.7051e-03, 9.9829e-01],
        [9.9951e-01, 4.8549e-04]], grad_fn=<SoftmaxBackward0>)


In [31]:
# Check model.config.id2label to get the label that corresponds to each output position

print(model.config.id2label)

{0: 'NEGATIVE', 1: 'POSITIVE'}


## 2-2. Model Generation & Using

#### 2-2-1. BERT Model initialization

In [38]:
# Load the Configuration object
from transformers import BertConfig, BertModel

# load configuration itself
config = BertConfig()

# Build a model from that config
model = BertModel(config)
print (config)

# A model used in this state (built from a config only, not from a pretrained
# checkpoint) produces garbage output.

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [39]:
# So load an already-pretrained Transformer model instead
from transformers import BertModel

model = BertModel.from_pretrained("bert-base-cased")

In [40]:
# Save the model into the saved_models/ directory
model.save_pretrained("saved_models/")

In [41]:
!ls saved_models
# config.json: stores the attributes needed to build the model architecture,
# plus some metadata (where the checkpoint came from, the library version used at the last save, etc.)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


config.json  model.safetensors


In [42]:
# Create Tokenizer
# NOTE(review): this loads the distilbert *uncased* SST-2 tokenizer, while the
# most recent `model` in this section was loaded from "bert-base-cased" —
# confirm this tokenizer/model pairing is intended.
from transformers import AutoTokenizer
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Three short sentences tokenized as one batch, returned as PyTorch tensors
sequences = ["Hello!", "Cool.", "Nice!"]
inputs = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
print(inputs)

{'input_ids': tensor([[ 101, 7592,  999,  102],
        [ 101, 4658, 1012,  102],
        [ 101, 3835,  999,  102]]), 'attention_mask': tensor([[1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1]])}


In [45]:
# Tokenize the same sentences again, this time without return_tensors;
# the input ids are converted to a tensor by hand afterwards.
encoded_inputs = tokenizer(sequences)
print(encoded_inputs)

tensor([[ 101, 7592,  999,  102],
        [ 101, 4658, 1012,  102],
        [ 101, 3835,  999,  102]])

In [49]:
# Build the input tensor by hand from the encoded input ids and run the model on it.
# NOTE(review): `model` is presumably the BertModel loaded from "bert-base-cased"
# earlier, while these ids come from the distilbert uncased tokenizer — the
# vocabularies likely differ; confirm before relying on the result.
model_inputs = torch.tensor(encoded_inputs['input_ids'])
output = model(model_inputs)
print(f"output's keys: {output.keys()}")

output's keys: odict_keys(['last_hidden_state', 'pooler_output'])


In [50]:
# Compare the pooled outputs of the three sentences pairwise using cosine
# similarity along the last dimension.
from torch import nn

pop = output["pooler_output"]
cos = nn.CosineSimilarity(dim=-1, eps=1e-6)

# Pairs are printed in the order (0, 1), (2, 1), (2, 0), one result per line.
print(
    *(cos(pop[first], pop[second]) for first, second in ((0, 1), (2, 1), (2, 0))),
    sep="\n",
)

tensor(0.9860, grad_fn=<SumBackward1>)
tensor(0.9899, grad_fn=<SumBackward1>)
tensor(0.9891, grad_fn=<SumBackward1>)


## 2-3. Tokenizer
#### 2-3-1. Tokenization

In [19]:
# Simplest possible tokenization: break the text on whitespace with str.split
raw_text = "Jim Henson was a puppeteer"
tokenized_text = str.split(raw_text)
print(tokenized_text)

['Jim', 'Henson', 'was', 'a', 'puppeteer']


#### 2-3-2. Load & Save tokenizer

In [21]:
# Load the tokenizer through the model-specific class
from transformers import BertTokenizer
tokenizer1 = BertTokenizer.from_pretrained("bert-base-cased")

from transformers import AutoTokenizer
# AutoTokenizer fetches the tokenizer that corresponds to the checkpoint name
tokenizer2 = AutoTokenizer.from_pretrained("bert-base-cased")

In [22]:
# Using tokenizer: call it directly on a sentence
tokenizer2("Using a Transformer network is simple")

{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [23]:
# Save tokenizer into the saving_folder directory
tokenizer2.save_pretrained("saving_folder")

('saving_folder/tokenizer_config.json',
 'saving_folder/special_tokens_map.json',
 'saving_folder/vocab.txt',
 'saving_folder/added_tokens.json',
 'saving_folder/tokenizer.json')

In [24]:
# How input_ids are produced
# Encoding, tokenize step: split the text into tokens
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)

print(tokens)

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']


In [25]:
# Tokens -> input IDs: map each token to its integer id
ids = tokenizer.convert_tokens_to_ids(tokens)

print(ids)

[7993, 170, 13809, 23763, 2443, 1110, 3014]


In [26]:
# Decoding: recover the readable original sentence from the ids
decoded_string = tokenizer.decode(ids)

print(decoded_string)

Using a Transformer network is simple


## 2-4. Multiple sequence handling

In [28]:
# model input을 batch화 해보자
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(ids)
# model(input_ids)  # this whill fail
model(input_ids.unsqueeze(-1))

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.5416, -0.3122],
        [-0.6212,  0.6812],
        [ 0.7320, -0.3175],
        [ 0.3401, -0.0835],
        [ 0.1112,  0.1108],
        [ 0.1154,  0.2029],
        [ 0.1899,  0.1824],
        [ 0.4792, -0.3667],
        [ 2.3476, -1.9814],
        [ 1.1809, -0.8643],
        [ 0.3525,  0.0292],
        [ 1.6505, -1.3613],
        [-1.6671,  1.8471],
        [ 1.0687, -0.9100]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [29]:
# For comparison: calling the tokenizer directly with return_tensors="pt"
# returns tensors that already carry a leading batch dimension.
tokenized_inputs = tokenizer(sequence, return_tensors="pt")
print (tokenized_inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [30]:
# Fix error
input_ids = torch.tensor([ids])
print(f"Input IDs:{input_ids}")

output = model(input_ids)
print(f"Logits:{output}")

Input IDs:tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits:SequenceClassifierOutput(loss=None, logits=tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [31]:
# Padding
# - Feed two sequences to the model separately, then together as one batch.
# - The shorter sequence gets a padding token appended so the batch is rectangular.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

# Print the logits for each input in turn: first sequence, second sequence, batch.
for _candidate in (sequence1_ids, sequence2_ids, batched_ids):
    print(model(torch.tensor(_candidate)).logits)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3373, -1.2163]], grad_fn=<AddmmBackward0>)


In [32]:
# attention_mask: lets the attention layers ignore the padding token.
# The 0 lines up with the padded position in the second row of batched_ids.
attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

# Same batch as before, now with the mask passed alongside the ids
outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
print(outputs.logits)

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
