# LoRA

### LoRA를 활용한 GPT-2 감성 분석 모델 튜닝

In [2]:
!pip install peft datasets transformers

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [None]:
from huggingface_hub import login
login(token="YOURE_TOKEN")

In [6]:
import torch
import os
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from datasets import load_dataset

In [7]:
# gpu 사용 설정을 위한 device 변수 생성
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [8]:
# 병렬 토크나이저 경고 방지
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [9]:
# 모델 로드
model_name = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
base_model.config.pad_token_id = tokenizer.pad_token_id

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# LoRA 설정
lora_config = LoraConfig(
    r=8,                                         # LoRA 저랭크 차원
    lora_alpha=32,                               # LoRA 가중치 스케일링 파라미터 
    target_modules=["c_attn", "c_fc", "c_proj"], # LoRA 적용할 계층
    lora_dropout=0.1                             # 드롭아웃 설정
)

In [11]:
# 기반 모델에 LoRA 적용 (+ GPU로 이동)
model = get_peft_model(base_model, lora_config).to(device)



In [12]:
# 데이터셋 로드
dataset = load_dataset("imdb")

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [14]:
# 학습용 데이터 준비
# 샘플링 편향 방지
positive_samples = [sam for sam in dataset["train"] if sam["label"] == 1][:500]
negative_samples = [sam for sam in dataset["train"] if sam["label"] == 0][:500]

# X, y 데이터 구분
train_texts = [sam["text"] for sam in positive_samples + negative_samples]
train_labels = [sam["label"] for sam in positive_samples + negative_samples]

In [16]:
# 검증용 데이터 준비
# 샘플링 편향 방지
positive_evals = [sam for sam in dataset["test"] if sam["label"] == 1][:100]
negative_evals = [sam for sam in dataset["test"] if sam["label"] == 0][:100]

# X, y 데이터 구분 
eval_texts = [sam["text"] for sam in positive_evals + negative_evals]
eval_labels = [sam["label"] for sam in positive_evals + negative_evals]

In [17]:
def preprocess_data(texts, labels):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )
    encodings["labels"] = torch.tensor(labels, dtype=torch.long)
    return encodings

In [18]:
# 토큰화 처리
train_encodings = preprocess_data(train_texts, train_labels)
eval_encodings = preprocess_data(eval_texts, eval_labels)

In [19]:
# 데이터셋 변환을 위한 클래스
class IMDBDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {
            key: val[idx]
            for key, val in self.encodings.items()
        }

In [20]:
train_dataset = IMDBDataset(train_encodings)
eval_dataset = IMDBDataset(eval_encodings)

In [21]:
# 배치를 GPU로 자동 이동시키는 data_colloator 함수
def collate_fn(batch):
    batch = {key: torch.stack([item[key] for item in batch]) for key in batch[0]}
    return batch

In [22]:
# 학습 설정
training_args= TrainingArguments(
    output_dir='./results',        # 모델 저장 경로
    per_device_train_batch_size=4, # 훈련 배치 크기
    per_device_eval_batch_size=4,  # 평가(검증) 배치 크기
    num_train_epochs=5,            # 학습 횟수 (에포크)
    save_steps=100,                # 저장 주기
    save_total_limit=2,            # 최대 저장 모델 개수
    eval_strategy='epoch',         # 에포크 단위 평가
    logging_dir='./logs',          # 로그 저장 경로
    logging_steps=10,              # 로그 출력 주기
    fp16=True                      # FP16 연산 최적화
)

In [23]:
# 트레이너 설정
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn
)

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [24]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,1.0651,No log
2,0.5455,No log
3,0.4691,No log
4,0.2938,No log
5,0.2639,No log




TrainOutput(global_step=625, training_loss=0.7871975387573242, metrics={'train_runtime': 159.136, 'train_samples_per_second': 31.42, 'train_steps_per_second': 3.927, 'total_flos': 1324603146240000.0, 'train_loss': 0.7871975387573242, 'epoch': 5.0})

In [25]:
def predict_sentiment(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=-1).item()

    return "긍정" if prediction == 1 else "부정"

In [27]:
# test_review = "I enjoyed watching the movie!"
test_review = "It was very boring..."
result = predict_sentiment(test_review)
result

'부정'