# LLM 으로 뉴스기사 분류해보기

In [1]:
import os

from dotenv import load_dotenv
from huggingface_hub import login

load_dotenv()

hugging_face_token = os.getenv("HUGGING_FACE_TOKEN")

login(hugging_face_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/joyuiyeong/.cache/huggingface/token
Login successful


## GEMMA 모델과 Tokenizer 로드하기

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto")
print(model.device)
model

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

mps:0


GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): GemmaRMSNorm((2048,), eps=1e-

In [3]:
import torch


def get_device():
    if torch.cuda.is_available():
        return torch.device("cuda")
    elif torch.backends.mps.is_available():
        return torch.device("mps")
    else:
        return torch.device("cpu")


def clear_cache():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif torch.backends.mps.is_available():
        torch.mps.empty_cache()


my_device = get_device()
my_device

device(type='mps')

## Zero-Shot 분류 함수 정의

In [4]:
def tokenize(device, text):
    tokenized_text = tokenizer(text, return_tensors="pt").to(device)
    return tokenized_text["input_ids"], tokenized_text["attention_mask"]


def zero_shot_classification(device, task_description, text, candidate_labels):
    question_input_ids, question_attention_mask = tokenize(
        device, task_description + text
    )
    scores = []
    for label in candidate_labels:
        label_input_ids, label_attention_mask = tokenize(device, label)
        num_label_tokens = label_input_ids.shape[-1] - 1

        input_ids = torch.concatenate(
            [question_input_ids, label_input_ids[..., 1:]], axis=-1
        )
        attention_mask = torch.concatenate(
            [question_attention_mask, label_attention_mask[..., 1:]], axis=-1
        )

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        num_total_token = input_ids.shape[-1]
        score = sum(
            [
                logits[0, num_total_token - i, label_input_ids[0, i].item()]
                for i in range(num_label_tokens, 0, -1)
            ]
        )
        scores.append(score)

        del input_ids
        del attention_mask
        del logits

        clear_cache()
    return scores

In [5]:
def zero_shot_classification2(device, task_description, text, candidate_labels):
    question_input_ids, question_attention_mask = tokenize(
        device, task_description + text
    )
    scores = []
    for label in candidate_labels:
        label_input_ids, label_attention_mask = tokenize(device, label)
        num_label_tokens = label_input_ids.shape[-1] - 1

        input_ids = torch.concatenate(
            [question_input_ids, label_input_ids[..., 1:]], axis=-1
        )
        attention_mask = torch.concatenate(
            [question_attention_mask, label_attention_mask[..., 1:]], axis=-1
        )

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        num_total_token = input_ids.shape[-1]
        score = sum(
            [
                logits[0, num_total_token - i, label_input_ids[0, i].item()]
                for i in range(num_label_tokens, 0, -1)
            ]
        )
        scores.append(score)

        del input_ids
        del attention_mask
        del logits

        clear_cache()
    return scores

## AG News 데이터셋 로드하기
- 4개의 뉴스 카테고리
    - 1: World, 2: Sports, 3: Business, 4: Science/Technology

In [6]:
from datasets import load_dataset

ds_ag_news = load_dataset("fancyzhx/ag_news")
ds_ag_news

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [7]:
def preprocess_function(data):
    return tokenizer(data["text"])


tokenized_ds = ds_ag_news.map(preprocess_function, batched=True)

## test 데이터셋으로 분류해보기

In [8]:
from tqdm import tqdm

NUM_TEST = 50


def classify(dataset, num_test, task_description, candidate_labels):
    total_correctness = 0
    for i in tqdm(range(num_test)):
        text = dataset[i]["text"]
        label = dataset[i]["label"]

        scores = zero_shot_classification(
            device=my_device,
            task_description=task_description,
            text=text,
            candidate_labels=candidate_labels,
        )

        prediction = torch.argmax(torch.Tensor(scores)).item()
        if prediction == label:
            total_correctness += 1
    return total_correctness

### 여러 종류의 task_description 형태와 labels 형태로 분류해보기

In [9]:
examples = [
    {
        "task_description": "A short news article is given. Decide which category the article belongs to. Article: ",
        "candidate_labels": [
            "Answer: World",
            "Answer: Sports",
            "Answer: Business",
            "Answer: Science/Technology",
        ],
    },
    {
        "task_description": "Classify the following news article into an appropriate category. News: ",
        "candidate_labels": [
            "Answer: World",
            "Answer: Sports",
            "Answer: Business",
            "Answer: Science/Technology",
        ],
    },
    {
        "task_description": "Read the news snippet and determine its main topic. Snippet: ",
        "candidate_labels": [
            "Answer: World",
            "Answer: Sports",
            "Answer: Business",
            "Answer: Science/Technology",
        ],
    },
    {
        "task_description": "Categorize this piece of news into one of the following sections. News piece: ",
        "candidate_labels": [
            "Answer: World",
            "Answer: Sports",
            "Answer: Business",
            "Answer: Science/Technology",
        ],
    },
    {
        "task_description": "What type of news is this article? Article: ",
        "candidate_labels": [
            "Answer: World",
            "Answer: Sports",
            "Answer: Business",
            "Answer: Science/Technology",
        ],
    },
    {
        "task_description": "Identify the most suitable category for the given news item. News item: ",
        "candidate_labels": [
            "Answer: World",
            "Answer: Sports",
            "Answer: Business",
            "Answer: Science/Technology",
        ],
    },
    {
        "task_description": "To which section of a newspaper would this article belong? Article text: ",
        "candidate_labels": [
            "World",
            "Sports",
            "Business",
            "Science/Technology",
        ],
    },
    {
        "task_description": "Determine the primary focus of this news story. Story: ",
        "candidate_labels": [
            "World",
            "Sports",
            "Business",
            "Science/Technology",
        ],
    },
    {
        "task_description": "What's the main theme of the following news excerpt? Excerpt: ",
        "candidate_labels": [
            "World",
            "Sports",
            "Business",
            "Science/Technology",
        ],
    },
    {
        "task_description": "Assign a category to this news bulletin. Bulletin: ",
        "candidate_labels": [
            "World",
            "Sports",
            "Business",
            "Science/Technology",
        ],
    },
    {
        "task_description": "Choose the most appropriate news section for this article. Article content: ",
        "candidate_labels": [
            "World",
            "Sports",
            "Business",
            "Science/Technology",
        ],
    },
    {
        "task_description": "What kind of news story is this? Story details: ",
        "candidate_labels": [
            "World",
            "Sports",
            "Business",
            "Science/Technology",
        ],
    },
    {
        "task_description": "Classify the topic of this news report. Report: ",
        "candidate_labels": [
            "World",
            "Sports",
            "Business",
            "Science/Technology",
        ],
    },
    {
        "task_description": "In which category would you place this news item? News: ",
        "candidate_labels": [
            "World",
            "Sports",
            "Business",
            "Science/Technology",
        ],
    },
    {
        "task_description": "What's the primary subject matter of this news article? Article text: ",
        "candidate_labels": [
            "World",
            "Sports",
            "Business",
            "Science/Technology",
        ],
    },
]

In [10]:
answers = []
for example in examples:
    answers.append(
        classify(
            tokenized_ds["test"],
            NUM_TEST,
            example["task_description"],
            example["candidate_labels"],
        )
    )

100%|██████████| 50/50 [00:30<00:00,  1.64it/s]
100%|██████████| 50/50 [00:26<00:00,  1.86it/s]
100%|██████████| 50/50 [00:25<00:00,  1.94it/s]
100%|██████████| 50/50 [00:25<00:00,  1.96it/s]
100%|██████████| 50/50 [00:25<00:00,  1.99it/s]
100%|██████████| 50/50 [00:25<00:00,  1.97it/s]
100%|██████████| 50/50 [00:24<00:00,  2.08it/s]
100%|██████████| 50/50 [00:26<00:00,  1.91it/s]
100%|██████████| 50/50 [00:29<00:00,  1.68it/s]
100%|██████████| 50/50 [00:30<00:00,  1.65it/s]
100%|██████████| 50/50 [00:29<00:00,  1.71it/s]
100%|██████████| 50/50 [00:27<00:00,  1.82it/s]
100%|██████████| 50/50 [00:25<00:00,  1.94it/s]
100%|██████████| 50/50 [00:25<00:00,  1.96it/s]
100%|██████████| 50/50 [00:25<00:00,  1.94it/s]


In [15]:
for answer, example in zip(answers, examples):
    print(
        "task_description: ",
        example["task_description"],
        "candidate_labels: ",
        example["candidate_labels"],
    )
    print("Total Correctness: ", answer, "Accuracy: ", answer / NUM_TEST)

task_description:  A short news article is given. Decide which category the article belongs to. Article:  candidate_labels:  ['Answer: World', 'Answer: Sports', 'Answer: Business', 'Answer: Science/Technology']
Total Correctness:  9 Accuracy:  0.18
task_description:  Classify the following news article into an appropriate category. News:  candidate_labels:  ['Answer: World', 'Answer: Sports', 'Answer: Business', 'Answer: Science/Technology']
Total Correctness:  12 Accuracy:  0.24
task_description:  Read the news snippet and determine its main topic. Snippet:  candidate_labels:  ['Answer: World', 'Answer: Sports', 'Answer: Business', 'Answer: Science/Technology']
Total Correctness:  12 Accuracy:  0.24
task_description:  Categorize this piece of news into one of the following sections. News piece:  candidate_labels:  ['Answer: World', 'Answer: Sports', 'Answer: Business', 'Answer: Science/Technology']
Total Correctness:  12 Accuracy:  0.24
task_description:  What type of news is this art

### 결과 분석

- 성능 변동
   - 가장 낮은 성능: "Identify the most suitable category for the given news item." (0.2)
   - 가장 높은 성능: "What kind of news story is this? Story details:" (0.38)
- "Answer:" 접두사가 있는 경우(첫 두 행)가 없는 경우보다 일반적으로 성능이 낮습니다.
- 간단하고 직접적인 질문("What kind of news story is this?")이 더 복잡한 질문보다 더 나은 성능을 보입니다.
- 즉, 너무 구체적인 질문("To which section of a newspaper would this article belong?")보다는 일반적인 질문("What's the main theme?")이 더 나은 성능을 보이는 경향이 있습니다.

In [13]:
import gc

gc.collect()

clear_cache()

In [26]:
def get_embedding(model, tokenizer, text):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    # Get the hidden states from the model
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states
        # Use the last hidden state (or you can experiment with others)
        embedding = hidden_states[-1][:, 0, :]  # CLS token embedding
    return embedding


def zero_shot_classification_v2(
    model, tokenizer, task_description, text, candidate_labels
):
    # Get the embedding for the input text
    text_embedding = get_embedding(model, tokenizer, task_description + text)

    # Get the embeddings for each label
    label_embeddings = [
        get_embedding(model, tokenizer, label) for label in candidate_labels
    ]

    # Compute dot product between text embedding and each label embedding
    similarities = [
        torch.dot(text_embedding.squeeze(), label_embedding.squeeze())
        for label_embedding in label_embeddings
    ]

    # Select the label with the highest similarity score
    return torch.argmax(torch.tensor(similarities))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Predicted label: Positive
