In [2]:
%pip install datasets




In [1]:
import warnings
warnings.filterwarnings('ignore')

# 필수 라이브러리 임포트
import os
import torch
import numpy as np
from datetime import datetime

# Transformers 라이브러리
from transformers import (
    pipeline,                              # 고수준 API - 가장 쉬운 방법
    AutoTokenizer,                         # 자동 토크나이저
    AutoModelForQuestionAnswering,         # QA 모델 자동 로더
    DistilBertTokenizerFast,              # DistilBERT 고속 토크나이저
    DistilBertForQuestionAnswering,        # DistilBERT QA 모델
    ElectraTokenizer,                      # ELECTRA 토크나이저 (한글)
    ElectraForQuestionAnswering,           # ELECTRA QA 모델 (한글)
    DefaultDataCollator,                   # 기본 데이터 콜레이터
    TrainingArguments,                     # 학습 하이퍼파라미터
    Trainer,                               # 범용 트레이너
)




In [4]:
from datasets import load_dataset

[1] 모델을 불러서 사용하기 (Inference)
- pipeline
- AutoModel / AutoTokenizer

[2] 학습 데이터 (SQuAD)
- Fine-tuning용 정답 데이터

[3] 모델 학습시키기 (Fine-tuning)
- 전처리
- 학습
- 평가

<span style="color: Gold"> pipeline</span>

In [5]:
# Build a ready-made extractive QA pipeline from a DistilBERT checkpoint
# that has already been fine-tuned on SQuAD.
question_answer = pipeline(
    "question-answering",
    model='distilbert-base-cased-distilled-squad',
)

# Passage from which the answers are extracted.
context = """Text mining, also referred to as text data mining (abbr.: TDM), similar to text analytics, 
is the process of deriving high-quality information from text. It involves 
"the discovery by computer of new, previously unknown information, 
by automatically extracting information from different written resources." 
Written resources may include websites, books, emails, reviews, and articles. 
High-quality information is typically obtained by devising patterns and trends 
by means such as statistical pattern learning. According to Hotho et al. (2005)
we can distinguish between three different perspectives of text mining: 
information extraction, data mining, and a KDD (Knowledge Discovery in Databases) process."""

question1 = "What is text mining?"
question2 = "What are the perspectives of text mining?"

# Run question answering for both questions against the same passage.
answer1 = question_answer(question=question1, context=context)
answer2 = question_answer(question=question2, context=context)

# A result whose score is below 0.1 is reported as "no answer" (답변 없음).
print(f'answer1 : 답변 없음' if answer1['score'] < 0.1 else f"answer1 : {answer1['answer']}")
print(f'answer2 : 답변 없음' if answer2['score'] < 0.1 else f"answer2 : {answer2['answer']}")

# Dump the raw result dicts returned by the pipeline.
print(f' answer1 = {answer1 }')
print(f' answer2 = {answer2 }')

Device set to use cpu


answer1 : the process of deriving high-quality information from text
answer2 : 답변 없음
 answer1 = {'score': 0.42419031262397766, 'start': 95, 'end': 153, 'answer': 'the process of deriving high-quality information from text'}
 answer2 = {'score': 0.04562292993068695, 'start': 624, 'end': 670, 'answer': 'information extraction, data mining, and a KDD'}


<span style="color: Gold"> AutoModel</span>

In [6]:
# AutoModel: run the same checkpoint by hand instead of through pipeline().
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased-distilled-squad')
model = AutoModelForQuestionAnswering.from_pretrained('distilbert-base-cased-distilled-squad')

# Use the GPU when one is available, and switch the model to inference mode.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device).eval()

# Encode the (question, context) pair and run a forward pass without gradients.
inputs = tokenizer(question1, context, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)

# The highest start logit and the highest end logit delimit the predicted span.
start_score, end_score = outputs.start_logits, outputs.end_logits
answer_start = start_score.argmax()
answer_end = end_score.argmax()

# Convert the selected token ids back into text.
span_ids = inputs["input_ids"][0][answer_start:answer_end + 1]
span_tokens = tokenizer.convert_ids_to_tokens(span_ids)
answer = tokenizer.convert_tokens_to_string(span_tokens)
print(f'answer1 : {answer}')

answer1 : the process of deriving high - quality information from text


<span style="color: Gold"> SQuAD</span>

In [None]:
%pip install datasets

In [9]:
from datasets import load_dataset

RuntimeError: Only a single TORCH_LIBRARY can be used to register the namespace prims; please put all of your definitions in a single TORCH_LIBRARY block.  If you were trying to specify implementations, consider using TORCH_LIBRARY_IMPL (which can be duplicated).  If you really intended to define operators for a single namespace in a distributed way, you can use TORCH_LIBRARY_FRAGMENT to explicitly indicate this.  Previous registration of TORCH_LIBRARY was registered at c:\Users\SAMSUNG\miniconda3\envs\llm_env\lib\site-packages\torch\_prims\__init__.py:37; latest registration was registered at c:\Users\SAMSUNG\miniconda3\envs\llm_env\lib\site-packages\torch\_prims\__init__.py:37

In [None]:
# Load and inspect the SQuAD dataset.
# SQuAD is the question-answering benchmark published by Stanford University
# and the standard dataset for extractive QA.

squad = load_dataset('squad', split='train[:5000]')
squad = squad.train_test_split(test_size=0.2, seed=42)

# Look at the first training record.
first_record = squad['train'][0]
first_answer_text = first_record['answers']['text'][0]
first_answer_start = first_record['answers']['answer_start'][0]

print(first_record['context'][:10])
print(first_record['question'][:10])
print(first_record['answers'])
print(first_record['answers']['answer_start'])

# Slice the context with the record's own answer offset and length instead of
# hard-coded values, so the check stays valid if the sample or seed changes.
# The printed text should equal the answer text shown above.
print(first_record['context'][first_answer_start:first_answer_start + len(first_answer_text)])


NameError: name 'load_dataset' is not defined

<span style="color: Gold"> fine-tuning</span>

In [None]:
# distilbert-base-uncased is a pre-trained-only model (its QA head is freshly initialised).
# It can be trained on Korean text, but performance is not guaranteed and it is inefficient.
# distilbert-base-uncased is an English-only model: when Korean is preprocessed, stems,
# parts of speech, etc. differ, so the text can be badly distorted.
# For Korean, fine-tune a Korean-specific base model, or use a multilingual model,
# e.g. mBERT (bert-base-multilingual-cased), klue/bert-base, and so on.

# NOTE: this name was previously misspelled `tokenzier`, so the following cell's
# `tokenizer` silently resolved to the tokenizer loaded for the cased SQuAD
# checkpoint, which does not match this uncased model.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

In [None]:
# The QA head has not been trained yet (random weights are in effect),
# so the span predicted below is not expected to be a sensible answer.
test_context = """The city is the birthplace of many cultural movements, including the Harlem 
Renaissance in literature and visual art; abstract expressionism 
(also known as the New York School) in painting; and hip hop, punk, salsa, disco, 
freestyle, Tin Pan Alley, and Jazz in music. New York City has been considered 
the dance capital of the world. The city is also widely celebrated in popular lore, 
frequently the setting for books, movies, and television programs."""
    
test_question = "The dance capital of the world is what city in the US?"

# Run the model that has not been fine-tuned.
inputs = tokenizer(test_question, test_context, return_tensors='pt').to(device)
with torch.no_grad():
    outputs = model(**inputs)

# Token indices with the highest start / end logits.
start = torch.argmax(outputs.start_logits)
end = torch.argmax(outputs.end_logits)

# Convert the predicted token span back into text.
inputs_ids = inputs['input_ids'].tolist()[0]
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs_ids[start:end+1]))

# Show the result: previously `start, end` was a bare expression in the middle
# of the cell (displays nothing) and `answer` was computed but never shown.
# The answer is empty whenever the predicted end precedes the predicted start.
print(f'start = {int(start)}, end = {int(end)}')
print(f'answer : {answer!r}')


In [None]:

import warnings
warnings.filterwarnings('ignore')

# 필수 라이브러리 임포트
import os
import torch
import numpy as np
from datetime import datetime

# Transformers 라이브러리
from transformers import (
    pipeline,                              # 고수준 API - 가장 쉬운 방법
    AutoTokenizer,                         # 자동 토크나이저
    AutoModelForQuestionAnswering,         # QA 모델 자동 로더
    DistilBertTokenizerFast,              # DistilBERT 고속 토크나이저
    DistilBertForQuestionAnswering,        # DistilBERT QA 모델
    ElectraTokenizer,                      # ELECTRA 토크나이저 (한글)
    ElectraForQuestionAnswering,           # ELECTRA QA 모델 (한글)
    DefaultDataCollator,                   # 기본 데이터 콜레이터
    TrainingArguments,                     # 학습 하이퍼파라미터
    Trainer,                               # 범용 트레이너
)
from datasets import load_dataset
     

In [None]:
# Extractive QA pipeline backed by a DistilBERT checkpoint fine-tuned on SQuAD.
question_answer = pipeline(
    "question-answering",
    model='distilbert-base-cased-distilled-squad',
)

# Passage from which the answers are extracted.
context = """Text mining, also referred to as text data mining (abbr.: TDM), similar to text analytics,
is the process of deriving high-quality information from text. It involves
"the discovery by computer of new, previously unknown information,
by automatically extracting information from different written resources."
Written resources may include websites, books, emails, reviews, and articles.
High-quality information is typically obtained by devising patterns and trends
by means such as statistical pattern learning. According to Hotho et al. (2005)
we can distinguish between three different perspectives of text mining:
information extraction, data mining, and a KDD (Knowledge Discovery in Databases) process."""

question1 = "What is text mining?"
question2 = "What are the perspectives of text mining?"

# Run question answering for both questions against the same passage.
answer1 = question_answer(question=question1, context=context)
answer2 = question_answer(question=question2, context=context)

# A result whose score is below 0.1 is reported as "no answer" (답변 없음).
print(f'answer1 : 답변 없음' if answer1['score'] < 0.1 else f"answer1 : {answer1['answer']}")
print(f'answer2 : 답변 없음' if answer2['score'] < 0.1 else f"answer2 : {answer2['answer']}")

# AutoModel: load tokenizer and model explicitly and decode the answer by hand.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")

# Use the GPU when one is available, and switch the model to inference mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()

# Encode the (question, context) pair and run a gradient-free forward pass.
inputs = tokenizer(question1, context, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)

# The highest start logit and the highest end logit delimit the predicted span.
start_score, end_score = outputs.start_logits, outputs.end_logits
answer_start = start_score.argmax()
answer_end = end_score.argmax()

# Convert the selected token ids back into text.
span_ids = inputs["input_ids"][0][answer_start:answer_end + 1]
span_tokens = tokenizer.convert_ids_to_tokens(span_ids)
answer = tokenizer.convert_tokens_to_string(span_tokens)
print(f"answer1 : {answer}")

In [None]:
# Load the SQuAD dataset for fine-tuning.
# SQuAD is the question-answering benchmark published by Stanford University
# and the standard dataset for extractive QA.
# Only the first 5000 training rows and the first 1000 validation rows are used.
squad = load_dataset('squad', split='train[:5000]')
valid = load_dataset('squad', split='validation[:1000]')

# Base checkpoint to fine-tune, and its matching tokenizer.
model_name = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization window: maximum tokens per feature, and the stride passed to
# the tokenizer for contexts that overflow into several windows.
max_length, stride = 384, 128

def preprocess(example):
    """Tokenize a batch of SQuAD rows and label every window with its answer span.

    Long contexts are split by the tokenizer into several overlapping windows
    (``return_overflowing_tokens=True``), so one input row can yield more than
    one output feature.  Relies on the notebook-level ``tokenizer``,
    ``max_length`` and ``stride`` variables.

    Args:
        example: Batched mapping with ``"question"``, ``"context"`` and
            ``"answers"`` columns; each answers entry holds ``"answer_start"``
            and ``"text"`` lists, of which only the first element is used.

    Returns:
        The tokenizer output (without ``offset_mapping`` and
        ``overflow_to_sample_mapping``) extended with ``start_positions`` and
        ``end_positions``: one token index pair per window.  A window that
        does not contain the whole answer is labelled ``(0, 0)``.
    """
    questions = [q.strip() for q in example["question"]]
    contexts = example["context"]

    tokenized = tokenizer(
        questions,
        contexts,
        truncation="only_second",  # never truncate the question, only the context
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Character offsets per token, and the source-row index of every window.
    offset_mapping = tokenized.pop("offset_mapping")
    sample_map = tokenized.pop("overflow_to_sample_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        answer = example["answers"][sample_map[i]]

        # Rows without any answer get the "no answer" label instead of raising
        # an IndexError on the empty list.
        if len(answer["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        sequence_ids = tokenized.sequence_ids(i)

        # First and last token that belong to the context (sequence id 1).
        ctx_start = sequence_ids.index(1)
        ctx_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        # The answer must lie ENTIRELY inside this window.  Checking only the
        # start character let answers that are cut off by the window end slip
        # through, which labelled them with end = ctx_end + 1 (a non-context
        # token) instead of (0, 0).
        if offsets[ctx_start][0] > start_char or offsets[ctx_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Start token: last context token whose start offset is <= start_char.
        start_token = ctx_start
        while start_token <= ctx_end and offsets[start_token][0] <= start_char:
            start_token += 1
        start_positions.append(start_token - 1)

        # End token: first context token (scanning backwards) whose end offset
        # is >= end_char.
        end_token = ctx_end
        while end_token >= ctx_start and offsets[end_token][1] >= end_char:
            end_token -= 1
        end_positions.append(end_token + 1)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions

    return tokenized


# Tokenize both splits in batches. The original columns are removed so that
# only the tokenizer output and the span labels remain in each dataset.
train_dataset = squad.map(
    preprocess,
    batched=True,
    remove_columns=squad.column_names,
)
valid_dataset = valid.map(
    preprocess,
    batched=True,
    remove_columns=valid.column_names,
)

In [None]:

# Show the tokenized training set next to the raw split it was built from.
print(train_dataset, squad, sep="\n")

In [None]:
# ================================================================================
# 3. Load the model
# ================================================================================
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# ================================================================================
# 4. Training configuration
# ================================================================================
training_args = TrainingArguments(
    # Output and reporting
    output_dir="./distilbert_squad",
    logging_steps=50,
    report_to="none",
    # Evaluate and save a checkpoint once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    # Optimisation hyperparameters
    num_train_epochs=3,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# ================================================================================
# 5. Build the Trainer
# ================================================================================
# NOTE(review): newer transformers releases appear to deprecate Trainer's
# `tokenizer=` argument in favour of `processing_class=` - confirm against the
# installed version before switching.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DefaultDataCollator(),
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

trainer.train()