In [1]:
from IPython.display import display, HTML
display(HTML("""
<style>
div.container{width:90% !important;}
div.cell.code_cell.rendered{width:100%;}
div.input_prompt{padding:0px;}
div.CodeMirror {font-family:Consolas; font-size:16pt;}
div.text_cell_render.rendered_html{font-size:16pt;}
div.output {font-size:12pt; 
las; font-size:16pt;}
div.prompt {min-width:70px;}
div#toc-wrapper{padding-top:120px;}
div.text_cell_render ul li{font-size:16pt;padding:5px;}
table.dataframe{font-size:16px;}
</style>
"""))

# <span style="color:red">ch1. 허깅페이스</span>
- Inference API 이용 : 모델의 결과를 surver에서
- pipeline() 이용 : 모델을 다운로드받아 모델의 결과를 local에서
    * raw text -> tokenizer -> model -> [0.11, 0.55, 0.XX,~] logits값으로 prediction결과 출력
```
허깅페이스 transformers에서 지원하는 task
"sentiment-analysis" : "text-classification"의 별칭(감정분석 전용으로 사용)
"text-classification" : 감정분석, 뉴스분류, 리뷰 분류 등 일반적인 문장 분류
"zero-shot-classification" : 레이블을 학습 없이 주어진 후보군 중에서 분류
"token-calssification" : 개체명 인식(NER : Named Entity Recognition) 등 단위 라벨링
"ner" : "token-calssification"의 별칭
"fill-mask" : 빈칸 채우기
"text-generation" : 텍스트 생성 (GPT류 모델에 사용)
"text2text-generation" : 번역, 요약 등 입력 -> 출력 변환
"translation" : 번역
"summarization" : 텍스트요약
"question-answering" : 주어진 context를 보고 질문에 답하기
"image-to-text" : 그림을 설명
"image-classification" : 이미지분류
```

## 1. 텍스트 기반 감정분석(긍정/부정)
- C:\Users\Admin\.cache\huggingface/hub 모델 다운로드

In [2]:
import warnings
import os
import logging
# 경고 제거
warnings.filterwarnings('ignore')

# transformers 로깅 레벨 조정
logging.getLogger("transformers").setLevel(logging.ERROR)

# Hugging Face symlink 경고 제거
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# from transformers import pipeline, logging as hf_logging
# hf_logging.set_verbosity_error()

In [5]:
from transformers import pipeline
classifier = pipeline(task="sentiment-analysis")
classifier("I've been waiting for a HuggingFace course my whole life.")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9598049521446228}]

In [19]:
classifier = pipeline(task="sentiment-analysis",
                     model="matthewburke/korean_sentiment")
texts = ['미워하지만 사랑해', '그립지만 용서못해', '못생겼어', '잘생겼어']
result = classifier(texts)

Device set to use cpu


In [20]:
for text , result in zip(texts, classifier(texts)):
    label = '긍정' if result['label']=='LABEL_1' else '부정'
    print(f"{text} => {label} : {result['score']}")

미워하지만 사랑해 => 긍정 : 0.9526359438896179
그립지만 용서못해 => 부정 : 0.7990366816520691
못생겼어 => 부정 : 0.9636958241462708
잘생겼어 => 긍정 : 0.9676699638366699


## 2. 제로샷분류

In [21]:
classifier = pipeline("zero-shot-classification")
classifier(
    "This is a course about the Transformers library",
    candidate_labels=["education", "politics", "business"],
)

No model was supplied, defaulted to facebook/bart-large-mnli and revision d7645e1 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


{'sequence': 'This is a course about the Transformers library',
 'labels': ['education', 'business', 'politics'],
 'scores': [0.8445960283279419, 0.11197635531425476, 0.04342760890722275]}

## 3.text 생성

In [22]:
generator = pipeline("text-generation", "gpt2") # 택스트 생성 gpt3부터는 허깅페이스 없음
generator("In this course, we will teach you how to",
         pad_token_id=generator.tokenizer.eos_token_id)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


[{'generated_text': 'In this course, we will teach you how to write a Python program that will make use of the power of the power of Python and the power of the powerful Python programming language. All you need is a Python interpreter, a Windows computer or a Python interpreter written in C++.\n\nYou will learn Python in Python 3, 2, 3, 4, 5, 6 and 7.\n\nTo get started, you need to start by reading the introduction to the language.\n\nYou will also need a Python installation, which you can install from an external location.\n\nWe will discuss using this as an introduction to the language, and where to find it.\n\nPython 3.x\n\nIn this course, we will be using Python 3.x to write some code. We will use it to help us write code for other projects. First of all, we will use C++ to write some Python programs.\n\nWe will use the functions that C++ provides to create and access variables. Next, we will use the built-in constants to make the value of a variable. Finally, we will use the buil

In [30]:
generator = pipeline("text-generation", "skt/kogpt2-base-v2") # 택스트 생성 gpt3부터는 허깅페이스 없음
result = generator('무녀 히나와 용사 키라는 마왕을 무찌르기 위해 여행',
         pad_token_id=generator.tokenizer.eos_token_id,
         max_new_tokens = 100, # 생성할 최대 길이(생성할 토큰 수)
         num_return_sequences=1, # 생성할 문장 갯수
         do_sample=True, # 다양한 샘플 사용
         top_k=50, # top_k 샘플링(확률 높은 상위 50개 토큰만 사용)
         top_p=0.95, # 확률이 높은 순서대로 95%가 될 때까지의 단어들로만 후보로 사용
         temperature=1.2, # 창의성 조절(낮을수록 보수적)
         no_repeat_ngram_size=2) # 반복방지
print(result[0]['generated_text'])

Device set to use cpu


무녀 히나와 용사 키라는 마왕을 무찌르기 위해 여행하는 것이 꿈이라며 다른 세 개의 왕, 용사를 거느리고 마왕으로부터 마왕에 의해 암살당할 위기에 처한 세 여신을 대신해 그녀를 구출한다.
마왕 역시 그녀에게 의지하고 있는 만큼 그녀의 정체는 바로 히나를 죽인 범인일 가능성이 크다.
그들은 세 명의 왕 중 가장 먼저 탈출하여 그의 계획을 실행하고, 자신의 임무를 성공적으로 완수하기 위해 그녀를 세 명에 나누어 살해한다.
이윽고 그는 새로운 길을 떠난다.
세 왕과는 달리 히나는 한 번의 여행에 불과했다.
또한 히나가 가장 오랫동안


## 4. 마스크(빈칸) 채우기

In [33]:
unmasker = pipeline("fill-mask",
                   model="google-bert/bert-base-uncased")
unmasker("Hello [MASK] my dalring", top_k=2)

Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


[{'score': 0.8635818958282471,
  'token': 1010,
  'token_str': ',',
  'sequence': 'hello, my dalring'},
 {'score': 0.07325759530067444,
  'token': 2000,
  'token_str': 'to',
  'sequence': 'hello to my dalring'}]

### ※ inferenceAPI 사용

In [35]:
from dotenv import load_dotenv
import os
load_dotenv()
# os.environ['HF_TOKEN']
# 허깅페이스 토근을 READ권한으로 생성하여 .env에 추가

True

In [36]:
from huggingface_hub import InferenceClient
client = InferenceClient(
            provider="hf-inference",
            api_key=os.environ['HF_TOKEN'] #허깅페이스 토큰 키
            
)
result = client.fill_mask(
        "Hello [MASK] my dalring",
        model="google-bert/bert-base-uncased",
        top_k=2 # 기본 5개
)
print(result)

[FillMaskOutputElement(score=0.8635819554328918, sequence='hello, my dalring', token=1010, token_str=',', fill_mask_output_token_str=None), FillMaskOutputElement(score=0.07325738668441772, sequence='hello to my dalring', token=2000, token_str='to', fill_mask_output_token_str=None)]


In [38]:
[f'{r.sequence} ({r.score:.2%})' for r in result]

['hello, my dalring (86.36%)', 'hello to my dalring (7.33%)']

## 5. 개채명 인식(NER : Named Entity Recognition)

In [39]:
ner = pipeline(task="ner",
              model="dbmdz/bert-large-cased-finetuned-conll03-english",
              grouped_entities=True) # 개체들은 그릅으로 묶을지 말지 여부
ner("My name is taro and i work at Hugging face in Korea")

config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


[{'entity_group': 'ORG',
  'score': 0.7420061,
  'word': 'Hugging face',
  'start': 30,
  'end': 42},
 {'entity_group': 'LOC',
  'score': 0.9996061,
  'word': 'Korea',
  'start': 46,
  'end': 51}]

## 6. 질의 응답

In [41]:
question_answer = pipeline("question-answering",
                          "distilbert/distilbert-base-cased-distilled-squad")
question_answer(
        question="Where do I work?",
        context= "My name is Taro and I work at hugging face in bangkok")

Device set to use cpu


{'score': 0.661173720494844,
 'start': 30,
 'end': 53,
 'answer': 'hugging face in bangkok'}

In [42]:
context = "My name is Taro and I work at hugging face in bangkok"
result = question_answer(question="Where do I work?", context=context)

In [43]:
result.get('answer'), context[result.get('start') : result.get('end')], result.get('score')

('hugging face in bangkok', 'hugging face in bangkok', 0.661173720494844)

## 7. 문서요약
- 현재 torch 버전이 2.6이하면 허깅페이스에서 강제로 막고 있음

In [49]:
summarizer = pipeline(task="summarization",
                     model="sshleifer/distilbart-cnn-12-6")
summarizer(
    """
Device set to use cpu
Your max_length is set to 130, but your input_length is only 73. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=36)    
    """,
    max_length=130, # 요약할 내용의 최대 토큰 수
    min_length=30, # 요약할 내용의 최소 토큰 수
    do_sample=False # 랜덤성이 없음/ 항상 비슷한 요약
)

Device set to use cpu
Your max_length is set to 130, but your input_length is only 79. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)


[{'summary_text': ' Your max_length is set to 130, but your input_length has only 73 . Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max length manually .'}]

## 8. 번역
- pip install sacremoses : 한영번역에서의 경고를 줄이고, 

In [59]:
translator = pipeline(task="translation",
                     model="Helsinki-NLP/opus-mt-tc-big-en-ko")
translator(
"""
you
"""
)

Device set to use cpu


[{'translation_text': '일반'}]