In [None]:
!pip install transformers
!pip install captum

Collecting captum
  Downloading captum-0.7.0-py3-none-any.whl.metadata (26 kB)
Downloading captum-0.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: captum
Successfully installed captum-0.7.0


In [None]:
# 런타임 30초 소요
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from captum.attr import LayerIntegratedGradients, visualization as viz
import torch

def visualize_sentiment(text: str):
    """
    Visualize token-level attributions for a DistilBERT sentiment prediction.

    Loads the SST-2 fine-tuned DistilBERT checkpoint, predicts the sentiment
    class of ``text``, attributes that prediction to the input tokens with
    Layer Integrated Gradients over the embedding layer, and renders the
    result with Captum's text visualization.

    Args:
        text (str): The text to visualize. Text longer than the model's
            position limit (minus the two special tokens) is truncated.

    Returns:
        None. The visualization is rendered by ``viz.visualize_text``.
    """

    # Pre-trained model and tokenizer.
    model_path = 'distilbert-base-uncased-finetuned-sst-2-english'
    model = DistilBertForSequenceClassification.from_pretrained(model_path)
    tokenizer = DistilBertTokenizer.from_pretrained(model_path)
    model.eval()

    def construct_input_and_baseline(input_text: str):
        """Build input ids, an all-[PAD] baseline of equal length, and the token strings."""
        # The position-embedding table bounds the usable sequence length;
        # reserve two slots for the [CLS] and [SEP] tokens added below.
        max_text_tokens = model.config.max_position_embeddings - 2
        baseline_token_id = tokenizer.pad_token_id
        sep_token_id = tokenizer.sep_token_id
        cls_token_id = tokenizer.cls_token_id

        text_ids = tokenizer.encode(
            input_text,
            max_length=max_text_tokens,
            truncation=True,
            add_special_tokens=False)
        input_ids = [cls_token_id] + text_ids + [sep_token_id]
        # The baseline keeps the special tokens and replaces every text token with [PAD].
        baseline_input_ids = [cls_token_id] + [baseline_token_id] * len(text_ids) + [sep_token_id]
        token_list = tokenizer.convert_ids_to_tokens(input_ids)

        return (
            torch.tensor([input_ids], device='cpu'),
            torch.tensor([baseline_input_ids], device='cpu'),
            token_list,
        )

    # Build the input and the baseline.
    input_ids, baseline_input_ids, all_tokens = construct_input_and_baseline(text)

    def model_output(inputs):
        """Forward function for Captum: return the classification logits."""
        return model(inputs)[0]

    # Predict once, without tracking gradients. Softmax turns the logits into
    # probabilities so that pred_prob is an actual probability in [0, 1].
    with torch.no_grad():
        logits = model_output(input_ids)
    probabilities = torch.softmax(logits, dim=-1).squeeze(0)
    pred_class = int(torch.argmax(probabilities))
    pred_prob = probabilities[pred_class].item()

    # Layer Integrated Gradients over the embedding layer.
    lig = LayerIntegratedGradients(model_output, model.distilbert.embeddings)

    # Only the predicted class is visualized, so attribute that class alone.
    attributions, delta = lig.attribute(
        inputs=input_ids,
        baselines=baseline_input_ids,
        target=pred_class,
        return_convergence_delta=True,
        internal_batch_size=1)

    # Summarize: sum over the embedding dimension, then scale by the norm of
    # the full attribution tensor. Skip the division when the norm is zero
    # (input identical to baseline, e.g. empty text) to avoid NaN scores.
    summarized_attr = attributions.sum(dim=-1).squeeze(0)
    attr_norm = torch.norm(attributions)
    if attr_norm > 0:
        summarized_attr = summarized_attr / attr_norm

    # Human-readable name of the attributed class; fall back to the class index.
    attr_label = model.config.id2label.get(pred_class, str(pred_class))

    # Assemble the visualization record.
    score_vis = viz.VisualizationDataRecord(
                        word_attributions=summarized_attr,
                        pred_prob=pred_prob,
                        pred_class=pred_class,
                        true_class=None,
                        attr_class=attr_label,
                        attr_score=summarized_attr.sum(),
                        raw_input_ids=all_tokens,
                        convergence_score=delta)

    # Render the result.
    viz.visualize_text([score_vis])


In [None]:
# Takes about 20 seconds to run.
# Example review with a negated negative ("not bad") followed by clearly
# positive wording; the call renders the attribution table for it.
text = "The movie was not bad as mentioned by critics. It was in fact awesome; I enjoyed the whole time"
visualize_sentiment(text)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
,1 (4.65),The movie was not bad as mentioned by critics. It was in fact awesome; I enjoyed the whole time,12.92,[CLS] the movie was not bad as mentioned by critics . it was in fact awesome ; i enjoyed the whole time [SEP]
,,,,
