# KoBART 기반 모델링

### 1. 필요 모듈 임포트

In [1]:
import accelerate #GPU 사용 시 필요
import transformers
import torch
import pandas as pd
import numpy as np
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW, BitsAndBytesConfig, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from tqdm.notebook import tqdm
## Show long text columns in full instead of truncating them in DataFrame output
pd.set_option("display.max_colwidth", 1000)

## Select the compute device: CUDA when a GPU is available, otherwise CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

  torch.utils._pytree._register_pytree_node(


Using device: cpu


  torch.utils._pytree._register_pytree_node(


### 2. 모델 임포트

In [2]:
# Load the pretrained KoBART checkpoint and its matching tokenizer by name.
# Both are looked up under the same identifier so the vocabulary matches the model.
model_name = "gogamza/kobart-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


### 3. 데이터 로드 및 전처리

In [3]:
# Read the parallel corpus into a DataFrame; later cells read its 'dialect' and 'standard' columns.
train_data = pd.read_csv('./중노년층_한국어_방언_데이터/data_따라말하기.csv')

In [4]:
## Direction tags prepended to each source sentence so that a single model can
## translate both ways. They are used as plain text here; this cell does not
## register them as special tokens.
dialect_token = "[방언]"
standard_token = "[표준]"

## Build the bidirectional corpus: every CSV row yields two training pairs
bidirectional_data = []

for dialect, standard in zip(train_data['dialect'], train_data['standard']):
    ## [방언] tag: dialect -> standard
    bidirectional_data.append({
        "source": dialect_token + " " + dialect,
        "target": standard
    })
    ## [표준] tag: standard -> dialect
    bidirectional_data.append({
        "source": standard_token + " " + standard,
        "target": dialect
    })

## Tokenize every pair. Both sides are truncated/padded to 64 tokens and
## returned as PyTorch tensors with a leading batch dimension of 1.
tokenized_data = []
for item in bidirectional_data:
    source_encodings = tokenizer(item['source'], max_length=64, truncation=True, padding="max_length", return_tensors="pt")
    ## `text_target=` is the supported replacement for the deprecated
    ## `tokenizer.as_target_tokenizer()` context manager.
    target_encodings = tokenizer(text_target=item['target'], max_length=64, truncation=True, padding="max_length", return_tensors="pt")
    tokenized_data.append({
        "input_ids": source_encodings["input_ids"],
        "attention_mask": source_encodings["attention_mask"],
        "labels": target_encodings["input_ids"]
    })




In [6]:
from datasets import Dataset, DatasetDict  # Dataset containers from the Hugging Face `datasets` library
import pandas as pd


def _first_row(tensor, dtype):
    """Return the first row of a batched tensor as a 1-D NumPy array of *dtype*."""
    return np.array(tensor.numpy().tolist()[0], dtype=dtype)


## Flatten each tokenized example into one DataFrame row of compact integer arrays
formatted_rows = []
for fd in tokenized_data:
    formatted_rows.append({
        "input_ids": _first_row(fd["input_ids"], np.uint16),
        "attention_mask": _first_row(fd["attention_mask"], np.uint8),
        "labels": _first_row(fd["labels"], np.uint16),
    })
formatted_data_df = pd.DataFrame(formatted_rows)

## Convert the DataFrame into a Hugging Face Dataset
train_dataset = Dataset.from_pandas(formatted_data_df)

## Display the dataset summary
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 193902
})

In [7]:
## Split the data into training and evaluation subsets (90% train, 10% eval).
## A fixed seed keeps the split identical across runs, so evaluation losses are
## comparable and a re-run does not move former evaluation rows into training.
train_test_split = train_dataset.train_test_split(test_size=0.1, seed=42)
dataset_dict = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

In [8]:
# Seq2Seq training utilities from transformers
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

## Training hyperparameters, grouped by purpose
training_args = Seq2SeqTrainingArguments(
    ## Checkpointing
    output_dir="checkpoints",          ## directory where checkpoints are written
    save_total_limit=3,                ## keep at most this many checkpoints
    ## Optimization
    num_train_epochs=3,                ## number of epochs
    per_device_train_batch_size=32,    ## training batch size per device
    learning_rate=2e-5,                ## learning rate
    weight_decay=0.01,                 ## weight decay
    ## Evaluation
    evaluation_strategy="epoch",       ## evaluate at the end of every epoch
    predict_with_generate=True,        ## use generate() when producing predictions
)

In [10]:
import os

import wandb  # Web service for monitoring training (of limited use here: no project was created beforehand)

## Authenticate with Weights & Biases without embedding the secret in the notebook.
## The key is read from the WANDB_API_KEY environment variable; when it is unset,
## key=None is passed and wandb is expected to fall back to its own credential
## lookup (stored login or interactive prompt) -- confirm for the installed version.
## NOTE(review): an API key was previously hard-coded in this cell and must be
## treated as leaked -- revoke it in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

## Assemble the trainer
trainer = Seq2SeqTrainer(
    model=model,                         ## model to fine-tune
    args=training_args,                  ## training configuration
    train_dataset=dataset_dict['train'], ## training split
    eval_dataset=dataset_dict['test'],   ## evaluation split
    tokenizer=tokenizer
)

try:
    ## Run training
    trainer.train()
finally:
    ## Always close the W&B run, even if training raises
    wandb.finish()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/jeeho/.netrc
dataloader_config = DataLoaderConfiguration(dispatch_batches=None)
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.0974,0.089637
2,0.0817,0.082317
3,0.0751,0.079814
