In [2]:
"""문장 생성을 위한 GPT-2 모델의 구조"""

from transformers import GPT2LMHeadModel
#트랜스포머 라이브러리의 GPT2MHeadModel 클래스

model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path="gpt2")
#사전 학습된 모델(pretrained_model_name_or_path)을 from_pretrained 메서드로 불러옴

for main_name, main_module in model.named_children():
    print(main_name)
    for sub_name, sub_module in main_module.named_children():
        print("└",sub_name)
        for ssub_name, ssub_module in sub_module.named_children():
            print("│  └", ssub_name)
            for sssub_name, sssub_module in ssub_module.named_children():
                print("│  │  └",sssub_name)
#실습 모델은 12개의 디코더 계층을 사용하는 간소화 모델
#단어 토큰 임베딩(wte), 단어 위치 임베딩(wpe), 드롭아웃(drop), 트랜스포머 디코더 계층(h), 계층정규화(ln_), 어텐션 메커니즘(attn), 완전 연결 계층(mlp)(순방향 레이어), 선형 임베딩 및 언어 모델(lm_head)

transformer
└ wte
└ wpe
└ drop
└ h
│  └ 0
│  │  └ ln_1
│  │  └ attn
│  │  └ ln_2
│  │  └ mlp
│  └ 1
│  │  └ ln_1
│  │  └ attn
│  │  └ ln_2
│  │  └ mlp
│  └ 2
│  │  └ ln_1
│  │  └ attn
│  │  └ ln_2
│  │  └ mlp
│  └ 3
│  │  └ ln_1
│  │  └ attn
│  │  └ ln_2
│  │  └ mlp
│  └ 4
│  │  └ ln_1
│  │  └ attn
│  │  └ ln_2
│  │  └ mlp
│  └ 5
│  │  └ ln_1
│  │  └ attn
│  │  └ ln_2
│  │  └ mlp
│  └ 6
│  │  └ ln_1
│  │  └ attn
│  │  └ ln_2
│  │  └ mlp
│  └ 7
│  │  └ ln_1
│  │  └ attn
│  │  └ ln_2
│  │  └ mlp
│  └ 8
│  │  └ ln_1
│  │  └ attn
│  │  └ ln_2
│  │  └ mlp
│  └ 9
│  │  └ ln_1
│  │  └ attn
│  │  └ ln_2
│  │  └ mlp
│  └ 10
│  │  └ ln_1
│  │  └ attn
│  │  └ ln_2
│  │  └ mlp
│  └ 11
│  │  └ ln_1
│  │  └ attn
│  │  └ ln_2
│  │  └ mlp
└ ln_f
lm_head


In [4]:
"""GPT-2를 이용한 문장 생성"""

from transformers import pipeline 

generator = pipeline(task="text-generation", model="gpt2")
#파이프라인 클래스는 입력된 작업(task)에 모델(model)로 적합한 파이프라인을 구축한다.
outputs = generator(
    text_inputs = "Machine learning is", #문장의 입력 문맥
    max_length = 20, #최대 길이
    num_return_sequences = 3, #변환 시퀀스 개수
    pad_token_id = generator.tokenizer.eos_token_id #생성된 텍스트의 길이가 max_len보다 작으면 eos로 패딩 처리
)
print(outputs)

[{'generated_text': 'Machine learning is about discovering patterns. To find the best path to solving these problems you need to apply'}, {'generated_text': 'Machine learning is the study of machine learning concepts which have been shown to have important applications in the fields'}, {'generated_text': "Machine learning is at its earliest stage of life, and we've learned a lot about how to use"}]


In [11]:
"""CoLA 데이터세트 불러오기"""

import torch
from torchtext.datasets import CoLA
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

def collator(batch, tokenizer, device):
    source, labels, texts = zip(*batch)
    tokenized = tokenizer(
        texts,
        padding = "longest", #패딩 : 가장 긴 시퀀스에 대해 패딩을 적용
        truncation = True, #절사 : 입력 시퀀스 길이가 최대 길이를 초과하는 경우 해당 시퀀스를 자른다.
        return_tensors = "pt" #반환 형식 설정 : 파이토치 텐서로 결과를 반환
    )
    input_ids = tokenized["input_ids"].to(device)
    attention_mask = tokenized["attention_mask"].to(device)
    labels = torch.tensor(labels, dtype=torch.long).to(device)
    return input_ids, attention_mask, labels

train_data = list(CoLA(split="train"))
valid_data = list(CoLA(split="dev"))
test_data = list(CoLA(split="test"))

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token #gpt2는 사전 학습 시 패딩 기법을 사용하지 않기 때문에 토크나이저의 패딩토큰이 포함되어 있지 않아서 따로 추가함

epochs = 3
batch_size = 16
device = "cuda" if torch.cuda.is_available() else "cpu"

train_dataloader = DataLoader(
    train_data,
    batch_size = batch_size,
    collate_fn = lambda x : collator(x, tokenizer, device),
    shuffle=True
)
valid_dataloader = DataLoader(valid_data, batch_size, collate_fn=lambda x : collator(x, tokenizer, device))
test_dataloader = DataLoader(test_data, batch_size, collate_fn=lambda x : collator(x, tokenizer, device))

print("Train Dataset Length :", len(train_data))
print("Valid Dataset Length :", len(valid_data))
print("Test Dataset Length :",len(test_data))

Train Dataset Length : 8550
Valid Dataset Length : 526
Test Dataset Length : 515


In [12]:
"""GPT-2 모델 설정"""

from torch import optim
from transformers import GPT2ForSequenceClassification
#GPT-2를 기반으로 하는 시퀀스 분류 모델

model = GPT2ForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="gpt2",
    num_labels = 2 #올바른 문장과 올바르지 않은 문장 분류를 위해 분류 레이블 수를 2로 설정
).to(device)
model.config.pad_token_id = model.config.eos_token_id #GPT2 모델에는 패딩 토큰 미포함. 따라서 포함시켜줌
optimizer = optim.Adam(model.parameters(), lr=5e-5)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
"""GPT-2 모델 학습 및 검증"""

import numpy as np
from torch import nn

def calc_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat)/len(labels_flat)

def train(model, optimizer, dataloader):
    model.train()
    train_loss = 0.0
    for input_ids, attention_mask, labels in dataloader:
        outputs = model(
            input_ids = input_ids,
            attention_mask = attention_mask,
            labels = labels
        )

        loss = outputs.loss
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss = train_loss / len(dataloader)
    return train_loss

def evaluation(model, dataloader):
    with torch.no_grad():
        model.eval()
        criterion = nn.CrossEntropyLoss()
        val_loss, val_accuracy = 0.0, 0.0

        for input_ids, attention_mask, labels in dataloader:
            outputs = model(
                input_ids = input_ids,
                attention_mask = attention_mask,
                labels = labels
            )
            logits = outputs.logits

            loss = criterion(logits, labels)
            logits = logits.detach().cpu().numpy()
            label_ids = labels.to("cpu").numpy()
            accuracy = calc_accuracy(logits, label_ids)

            val_loss += loss
            val_accuracy += accuracy

    val_loss = val_loss/len(dataloader)
    val_accuracy = val_accuracy/len(dataloader)
    return val_loss, val_accuracy

best_loss = 10000
for epoch in range(epochs):
    train_loss = train(model, optimizer, train_dataloader)
    val_loss, val_accuracy = evaluation(model, valid_dataloader)
    print(f"Epoch:{epoch+1}, Train loss:{train_loss: .4f}, Val loss : {val_loss:.4f}, Val accuracy: {val_accuracy:.4f}")

    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model.state_dict(),"models/GPT2ForSequenceClassification.pt")
        print("Saved the model weights")

Epoch:1, Train loss: 0.3050, Val loss : 0.5303, Val accuracy: 0.7879
Saved the model weights
Epoch:2, Train loss: 0.2055, Val loss : 0.6521, Val accuracy: 0.7538
Epoch:3, Train loss: 0.1451, Val loss : 0.7367, Val accuracy: 0.7787


In [17]:
model = GPT2ForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="gpt2",
    num_labels=2
).to(device)
model.config.pad_token_id = model.config.eos_token_id
model.load_state_dict(torch.load("models/GPT2ForSequenceClassification.pt"))

test_loss, test_accuracy = evaluation(model, test_dataloader)
print(f"Test_loss : {test_loss : .4f}")
print(f"Test Accuracy : {test_accuracy : .4f}")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test_loss :  0.6494
Test Accuracy :  0.7355
