# Google Drive Mount

In [1]:
# Mount Google Drive at /content/drive so later cells can read and write
# files under that directory (Colab-only helper).
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# 3.2.1 Transformers 설치


In [2]:
!pip install datasets==2.20.0

Collecting datasets==2.20.0
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets==2.20.0)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets==2.20.0)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets==2.20.0)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets==2.20.0)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_

In [3]:
!pip install transformers==4.41.2



In [4]:
# Confirm which transformers version is installed by filtering the pip listing.
!pip list | grep transformers

transformers                     4.41.2


# 3.2.2 Tokenizer

### Tokenizer 다운로드

In [1]:
from transformers import BertTokenizer

# Load the BERT tokenizer published under the "klue/bert-base" model id.
model_name = "klue/bert-base"
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
# Print the tokenizer's built-in API documentation (the output is long).
help(tokenizer)

In [None]:
# Print, in order: the vocabulary size, the full token -> id mapping, and the
# special-token map of the tokenizer.
for tokenizer_info in (
    tokenizer.vocab_size,
    tokenizer.get_vocab(),
    tokenizer.special_tokens_map,
):
    print(tokenizer_info)

### 토큰화 작업

In [3]:
sentence = "안녕하세요. 이건 테스트입니다."

# Step 1: split the sentence into subword tokens.
tokens1 = tokenizer.tokenize(sentence)

# Step 2: convert the tokens into input ids.
ids1 = tokenizer.convert_tokens_to_ids(tokens1)

# Calling the tokenizer directly performs both steps and returns the full
# model inputs (input_ids, token_type_ids, attention_mask).
ids2 = tokenizer(sentence)

print(tokens1, ids1, ids2, sep="\n")

['안녕', '##하', '##세요', '.', '이건', '테스트', '##입니다', '.']
[5891, 2205, 5971, 18, 5370, 7453, 12190, 18]
{'input_ids': [2, 5891, 2205, 5971, 18, 5370, 7453, 12190, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [4]:
# 디코딩
decoded_string1 = tokenizer.decode(ids1)
print(decoded_string1)

decoded_string2 = tokenizer.decode(ids2["input_ids"])
print(decoded_string2)

decoded_string3 = tokenizer.decode(ids2["input_ids"], skip_special_tokens=True)
print(decoded_string3)

안녕하세요. 이건 테스트입니다.
[CLS] 안녕하세요. 이건 테스트입니다. [SEP]
안녕하세요. 이건 테스트입니다.


### 데이터셋 전처리

In [5]:
from datasets import load_dataset

# Download the "ynat" configuration of the KLUE dataset and keep a handle on
# its training split for the tokenization examples that follow.
dataset = load_dataset(path="klue", name="ynat")
raw_train_dataset = dataset["train"]

Downloading data:   0%|          | 0.00/4.17M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/847k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/45678 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/9107 [00:00<?, ? examples/s]

In [6]:
from transformers import BertTokenizer

model_name = "klue/bert-base"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Tokenize every training title in a single call, padding each sequence to the
# tokenizer's maximum length and truncating anything longer.
train_titles = raw_train_dataset["title"]
tokenized_examples = tokenizer(train_titles, padding="max_length", truncation=True)

In [7]:
def tokenize_function(sample):
    """Tokenize the "title" field of a dataset sample (or batch of samples).

    Uses the module-level `tokenizer`; no padding or truncation is requested,
    and the tokenizer's output is returned unchanged.
    """
    titles = sample["title"]
    return tokenizer(titles)

# Run tokenize_function over every split in batches of 1000 rows and drop the
# listed raw columns from the result.
columns_to_drop = ["guid", "title", "url", "date"]
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=1000,
    remove_columns=columns_to_drop,
)

Map:   0%|          | 0/45678 [00:00<?, ? examples/s]

Map:   0%|          | 0/9107 [00:00<?, ? examples/s]

In [8]:
# Display the splits and remaining columns of the tokenized dataset.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 45678
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9107
    })
})

# 3.2.3 DataCollator

### DataCollator 사용

In [9]:
# Look at the input ids of the first training example and at their Python type.
first_input_ids = tokenized_datasets["train"][0]["input_ids"]
print(first_input_ids)
print(type(first_input_ids))

[2, 10637, 8474, 22, 2210, 2299, 2118, 28940, 3691, 4101, 3792, 3]
<class 'list'>


In [10]:
from pprint import pprint
from transformers import DataCollatorWithPadding

# Take the first eight training examples and show their token counts.
samples = [tokenized_datasets["train"][idx] for idx in range(8)]
print([len(example["input_ids"]) for example in samples])

# Collate the examples into one batch and show the size of each resulting tensor.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
batch = data_collator(samples)
pprint({key: tensor.size() for key, tensor in batch.items()})

[12, 12, 17, 16, 18, 13, 14, 5]
{'attention_mask': torch.Size([8, 18]),
 'input_ids': torch.Size([8, 18]),
 'labels': torch.Size([8]),
 'token_type_ids': torch.Size([8, 18])}


# 3.2.4 Model

### Model 다운로드

In [19]:
# List the contents of the local Hugging Face hub cache directory.
!ls ~/.cache/huggingface/hub

models--bert-base-uncased  models--klue--bert-base  version.txt


In [41]:
from transformers import BertTokenizer, BertModel

# Keep the checkpoint id in `model_name` (as the other cells do) so that `model`
# only ever refers to the loaded model object, never to a string.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [29]:
# Write the tokenizer files and the model weights/config into one directory on
# the mounted Drive, then list that directory to show what was written.
model_path = "/content/drive/MyDrive/Books/outputs/MyBertModel/"
tokenizer.save_pretrained(model_path)
model.save_pretrained(model_path)

!ls -l {model_path}

total 427917
-rw------- 1 root root       690 Jun 23 13:14 config.json
-rw------- 1 root root 437951328 Jun 23 13:14 model.safetensors
-rw------- 1 root root       695 Jun 23 13:14 special_tokens_map.json
-rw------- 1 root root      1242 Jun 23 13:14 tokenizer_config.json
-rw------- 1 root root    231508 Jun 23 13:14 vocab.txt


In [30]:
# Load the model and tokenizer back from the directory stored in `model_path`
# instead of from a hub model id.
model = BertModel.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

### Model 추론 실습

In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM

model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
# The masked-LM head is required here: the bare BertModel output has no
# `.logits`, so the prediction step below would raise AttributeError.
model = BertForMaskedLM.from_pretrained(model_name)

inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")

# Inference only, so skip gradient tracking.
with torch.no_grad():
    logits = model(**inputs).logits

# retrieve index of [MASK]
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

# Pick the highest-scoring vocabulary id at the masked position and decode it.
predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
tokenizer.decode(predicted_token_id)

# 3.2.5 AutoClass

### AutoClass로 Tokenizer, Model 다운로드

In [39]:
# torch is imported here so the cell does not rely on another cell's import.
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM


model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")

# Inference only, so skip gradient tracking.
with torch.no_grad():
    logits = model(**inputs).logits

# retrieve index of [MASK]
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

# Pick the highest-scoring vocabulary id at the masked position and decode it.
predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
tokenizer.decode(predicted_token_id)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


'paris'

# 3.2.7 Pipelines

### 허깅페이스 허브에 있는 모델 가져오기

In [43]:
from transformers import pipeline

# Build a text-classification pipeline directly from a hub model id and
# classify a single headline.
pipe = pipeline(
    task="text-classification",
    model="JiHoon-kim/bert-base-klue-ynat-finetuned",
)
prediction = pipe("유튜브 내달 2일까지 크리에이터 지원 공간 운영")
print(prediction)

[{'label': 'LABEL_3', 'score': 0.5010399222373962}]


### 미세조정 모델 경로로 가져오기

In [44]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline
)

# Absolute path under the Drive mount point, so loading does not depend on the
# notebook's current working directory.
model_name = "/content/drive/MyDrive/Books/outputs/trained_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Build the pipeline from the already-loaded tokenizer and model objects.
pipe = pipeline(task="text-classification", tokenizer=tokenizer, model=model)
print(pipe("유튜브 내달 2일까지 크리에이터 지원 공간 운영"))

[{'label': 'LABEL_2', 'score': 0.46643710136413574}]


In [45]:
model_name = "JiHoon-kim/bert-base-klue-ynat-finetuned"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Bind the result to `pipe`, not `pipeline`: reusing the name would overwrite
# the imported factory function and break this cell on re-run, as well as any
# later `pipeline(task=...)` call.
pipe = pipeline(task="text-classification", model=model, tokenizer=tokenizer)
print(pipe("유튜브 내달 2일까지 크리에이터 지원 공간 운영"))

[{'label': 'LABEL_3', 'score': 0.5010399222373962}]


### 직접 구현

In [46]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification


model_name = "JiHoon-kim/bert-base-klue-ynat-finetuned"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()

# Tokenize one headline and move the input tensors to the model's device.
inputs = tokenizer(
    "유튜브 내달 2일까지 크리에이터 지원 공간 운영",
    return_tensors="pt"
).to(device)

# Inference only: no gradients needed. Softmax turns the logits into
# per-class probabilities (computed on the CPU).
with torch.no_grad():
    output = model(**inputs)
    probs = torch.softmax(output.logits.cpu(), -1)

# One {"label", "score"} dict per input row, using the highest-probability class.
result = [
    {"label": f"LABEL_{label_id}", "score": probs[row, label_id].item()}
    for row, label_id in enumerate(probs.argmax(-1).tolist())
]
print(result)

[{'label': 'LABEL_3', 'score': 0.5010401606559753}]
