In [1]:
from dotenv import load_dotenv
import os

load_dotenv()

hugging_face_token = os.getenv("HUGGING_FACE_TOKEN")

In [2]:
from huggingface_hub import login

login(hugging_face_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/joyuiyeong/.cache/huggingface/token
Login successful


In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto")

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [23]:
tokenizer.special_tokens_map

{'bos_token': '<bos>',
 'eos_token': '<eos>',
 'unk_token': '<unk>',
 'pad_token': '<pad>',
 'additional_special_tokens': ['<start_of_turn>', '<end_of_turn>']}

In [4]:
import torch

my_device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

input_text = "What is your name?"

input_ids = tokenizer(input_text, return_tensors="pt").to(my_device)

outputs = model.generate(**input_ids)
print(tokenizer.decode(outputs[0]))



<bos>What is your name?

What is your age?

What is your gender?

What


In [6]:
tokens = input_ids["input_ids"]
print(tokens)

logits = model(**input_ids).logits
for i in range(tokens.shape[-1]):
    token = tokens[0, i].item()
    print(logits[0, i, token])

tensor([[     2,   1841,    603,    861,   1503, 235336]], device='mps:0')
tensor(-18.2746, device='mps:0', grad_fn=<SelectBackward0>)
tensor(-33.2665, device='mps:0', grad_fn=<SelectBackward0>)
tensor(-23.9536, device='mps:0', grad_fn=<SelectBackward0>)
tensor(-27.7627, device='mps:0', grad_fn=<SelectBackward0>)
tensor(-19.6064, device='mps:0', grad_fn=<SelectBackward0>)
tensor(-21.0372, device='mps:0', grad_fn=<SelectBackward0>)


## Zero-shot classification 구현해보기

### Zero-shot learning
- 정의: 모델이 이전에 본 적 없는 클래스나 작업을 수행하는 능력입니다.
- 특징: 훈련 데이터에 없던 새로운 카테고리의 데이터를 처리할 수 있습니다.
- 작동 원리: 기존 지식을 바탕으로 새로운 상황에 일반화하여 적용합니다.
- 예시: 개와 고양이를 구분하도록 훈련된 모델이 사자 이미지를 보고 '고양이과' 동물로 분류하는 경우

### Few-shot learning
- 정의: 매우 적은 수의 예시만으로 새로운 작업을 수행하는 능력입니다.
- 특징: 소수의 학습 예제만으로 새로운 개념을 빠르게 습득합니다.
- 작동 원리: 주어진 소수의 예시를 바탕으로 패턴을 파악하고 일반화합니다.
- 예시: 5개의 한국어 문장과 영어 번역을 보고, 새로운 한국어 문장을 영어로 번역하는 경우

In [26]:
def zero_shot_classification(device, text, task_description, labels):
    tokenized_question = tokenizer(task_description + text, return_tensors="pt").to(
        device
    )
    question_input_ids = tokenized_question["input_ids"]
    question_attention_mask = tokenized_question["attention_mask"]

    probs = []
    for label in labels:
        tokenized_label = tokenizer(label, return_tensors="pt").to(device)
        label_input_ids = tokenized_label["input_ids"]
        label_attention_mask = tokenized_label["attention_mask"]
        num_label_tokens = (
            label_input_ids.shape[-1] - 1
        )  # 문장 나누는 special token 을 뺀 것

        concatenated_input_ids = torch.concatenate(
            [question_input_ids, label_input_ids[:, 1:]], axis=-1
        )
        concatenated_attention_mask = torch.concatenate(
            [question_attention_mask, label_attention_mask[:, 1:]], axis=-1
        )

        logits = model(
            input_ids=concatenated_input_ids, attention_mask=concatenated_attention_mask
        ).logits
        prob = 0
        num_total_token = concatenated_input_ids.shape[-1]
        for i in range(num_label_tokens, 0, -1):
            token = label_input_ids[0, i].item()
            prob += logits[0, num_total_token - i, token].item()
        probs.append(prob)

        torch.mps.empty_cache()
    return probs

In [25]:
probs = zero_shot_classification(
    my_device,
    "I am happy!",
    "Is the sentence positive or negative?: ",
    ["positive", "negative"],
)
print(probs)

[-10.642311096191406, -11.575439453125]


## 영화 리뷰에 대해서 Zero-Shot 해보기

In [27]:
from datasets import load_dataset

imdb = load_dataset("imdb")


def preprocess_function(data):
    return tokenizer(data["text"], truncation=True, padding="max_length")


tokenized_imdb = imdb.map(preprocess_function, batched=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [28]:
import numpy as np
from tqdm import tqdm

n_correct = 0
for i in tqdm(range(50)):
    text = tokenized_imdb["test"][i]["text"]
    label = tokenized_imdb["test"][i]["label"]
    probs = zero_shot_classification(
        device=my_device,
        text=text,
        task_description="A movie review is given. Decide that the movie review is positive or negative: ",
        labels=["Answer: negative.", "Answer: positive."],
    )
    pred = np.argmax(np.array(probs))
    if pred == label:
        n_correct += 1

100%|██████████| 50/50 [00:34<00:00,  1.46it/s]

43





In [30]:
print(n_correct, n_correct / 50)

43 0.86
