KoELECTRA의 전이학습을 사용해 진행해보려 했다. 그러나 pytorch에 익숙하지 않았고, 모델의 학습 과정에서 메모리 부족, 모델의 저장 및 불러오기 과정, 예측 함수 실행 등 많은 과정에서 에러가 발생해 최종적으로 프로젝트에 기여한 파일은 아니다.

In [1]:
import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm

In [2]:
from transformers import TFElectraModel
#모델 불러오기
device = torch.device("cuda")
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v2-discriminator").to(device)

Some weights of the model checkpoint at monologg/koelectra-small-v2-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v2-discriminator and are newly initialized

In [4]:
#클래스 설정
class NSMCDataset(Dataset):
      
  def __init__(self, csv_file):
    # NAN 제거
    self.dataset = pd.read_csv(csv_file, sep='\t').dropna(axis=0) 
    # 중복제거
    self.dataset.drop_duplicates(subset=['document'], inplace=True)
    self.tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-small-v2-discriminator")

    print(self.dataset.describe())
  
  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self, idx):
    row = self.dataset.iloc[idx, 1:3].values
    text = row[0]
    y = row[1]

    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=256,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [5]:
train_dataset = NSMCDataset("ratings_train.txt")
test_dataset = NSMCDataset("ratings_test.txt")

                 id          label
count  1.461820e+05  146182.000000
mean   6.779186e+06       0.498283
std    2.919223e+06       0.499999
min    3.300000e+01       0.000000
25%    4.814832e+06       0.000000
50%    7.581160e+06       0.000000
75%    9.274760e+06       1.000000
max    1.027815e+07       1.000000
                 id         label
count  4.915700e+04  49157.000000
mean   6.752945e+06      0.502695
std    2.937158e+06      0.499998
min    6.010000e+02      0.000000
25%    4.777143e+06      0.000000
50%    7.565415e+06      1.000000
75%    9.260204e+06      1.000000
max    1.027809e+07      1.000000


In [22]:
#캐시 비우는 셀
import gc
gc.collect()
torch.cuda.empty_cache()

In [14]:
epochs = 5
batch_size = 32

optimizer = AdamW(model.parameters(), lr=1e-5)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

In [15]:
losses = []
accuracies = []

for i in range(epochs):
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

    batches += 1
    if batches % 100 == 0:
      print("Batch Loss:", total_loss, "Accuracy:", correct.float() / total)
  
  losses.append(total_loss)
  accuracies.append(correct.float() / total)
  print("Train Loss:", total_loss, "Accuracy:", correct.float() / total)

  0%|          | 0/4569 [00:00<?, ?it/s]

Batch Loss: 68.80780601501465 Accuracy: tensor(0.5813, device='cuda:0')
Batch Loss: 134.16938388347626 Accuracy: tensor(0.6414, device='cuda:0')
Batch Loss: 191.4479764997959 Accuracy: tensor(0.6760, device='cuda:0')
Batch Loss: 241.68531349301338 Accuracy: tensor(0.7037, device='cuda:0')
Batch Loss: 289.9933234155178 Accuracy: tensor(0.7201, device='cuda:0')
Batch Loss: 336.61875000596046 Accuracy: tensor(0.7326, device='cuda:0')
Batch Loss: 381.182342261076 Accuracy: tensor(0.7424, device='cuda:0')
Batch Loss: 423.5377233326435 Accuracy: tensor(0.7518, device='cuda:0')
Batch Loss: 466.2489667534828 Accuracy: tensor(0.7583, device='cuda:0')
Batch Loss: 508.70631174743176 Accuracy: tensor(0.7637, device='cuda:0')
Batch Loss: 551.265048250556 Accuracy: tensor(0.7679, device='cuda:0')
Batch Loss: 591.6827571243048 Accuracy: tensor(0.7727, device='cuda:0')
Batch Loss: 632.2074180990458 Accuracy: tensor(0.7763, device='cuda:0')
Batch Loss: 672.1392481327057 Accuracy: tensor(0.7795, device=

  0%|          | 0/4569 [00:00<?, ?it/s]

Batch Loss: 32.66043856739998 Accuracy: tensor(0.8594, device='cuda:0')
Batch Loss: 66.4147911220789 Accuracy: tensor(0.8559, device='cuda:0')
Batch Loss: 99.04862867295742 Accuracy: tensor(0.8582, device='cuda:0')
Batch Loss: 131.95213943719864 Accuracy: tensor(0.8577, device='cuda:0')
Batch Loss: 165.73741210252047 Accuracy: tensor(0.8559, device='cuda:0')
Batch Loss: 197.34910338371992 Accuracy: tensor(0.8569, device='cuda:0')
Batch Loss: 229.0286364480853 Accuracy: tensor(0.8575, device='cuda:0')
Batch Loss: 262.9490259960294 Accuracy: tensor(0.8555, device='cuda:0')
Batch Loss: 293.0583957359195 Accuracy: tensor(0.8576, device='cuda:0')
Batch Loss: 324.95573679357767 Accuracy: tensor(0.8582, device='cuda:0')
Batch Loss: 355.86186626553535 Accuracy: tensor(0.8593, device='cuda:0')
Batch Loss: 387.946705378592 Accuracy: tensor(0.8589, device='cuda:0')
Batch Loss: 421.6264896467328 Accuracy: tensor(0.8585, device='cuda:0')
Batch Loss: 455.96832383424044 Accuracy: tensor(0.8582, devic

  0%|          | 0/4569 [00:00<?, ?it/s]

Batch Loss: 28.356926880776882 Accuracy: tensor(0.8744, device='cuda:0')
Batch Loss: 57.051466040313244 Accuracy: tensor(0.8755, device='cuda:0')
Batch Loss: 87.81436567753553 Accuracy: tensor(0.8741, device='cuda:0')
Batch Loss: 116.74185113608837 Accuracy: tensor(0.8759, device='cuda:0')
Batch Loss: 147.26824869215488 Accuracy: tensor(0.8740, device='cuda:0')
Batch Loss: 177.57634741067886 Accuracy: tensor(0.8732, device='cuda:0')
Batch Loss: 206.45984891057014 Accuracy: tensor(0.8729, device='cuda:0')
Batch Loss: 235.38966889679432 Accuracy: tensor(0.8726, device='cuda:0')
Batch Loss: 263.90865544974804 Accuracy: tensor(0.8734, device='cuda:0')
Batch Loss: 292.3339872062206 Accuracy: tensor(0.8737, device='cuda:0')
Batch Loss: 321.22397039085627 Accuracy: tensor(0.8734, device='cuda:0')
Batch Loss: 350.10021747648716 Accuracy: tensor(0.8735, device='cuda:0')
Batch Loss: 379.6919996291399 Accuracy: tensor(0.8737, device='cuda:0')
Batch Loss: 408.67237758636475 Accuracy: tensor(0.8740

  0%|          | 0/4569 [00:00<?, ?it/s]

Batch Loss: 28.24914515018463 Accuracy: tensor(0.8847, device='cuda:0')
Batch Loss: 55.38689959794283 Accuracy: tensor(0.8861, device='cuda:0')
Batch Loss: 82.10857161134481 Accuracy: tensor(0.8863, device='cuda:0')
Batch Loss: 109.42385046929121 Accuracy: tensor(0.8878, device='cuda:0')
Batch Loss: 136.06794829666615 Accuracy: tensor(0.8879, device='cuda:0')
Batch Loss: 163.1275979578495 Accuracy: tensor(0.8884, device='cuda:0')
Batch Loss: 190.49173633754253 Accuracy: tensor(0.8876, device='cuda:0')
Batch Loss: 218.47568356990814 Accuracy: tensor(0.8871, device='cuda:0')
Batch Loss: 244.60915421694517 Accuracy: tensor(0.8874, device='cuda:0')
Batch Loss: 269.7690526768565 Accuracy: tensor(0.8879, device='cuda:0')
Batch Loss: 295.17988200485706 Accuracy: tensor(0.8882, device='cuda:0')
Batch Loss: 322.38947205245495 Accuracy: tensor(0.8877, device='cuda:0')
Batch Loss: 348.7889391183853 Accuracy: tensor(0.8875, device='cuda:0')
Batch Loss: 376.89732099324465 Accuracy: tensor(0.8871, d

  0%|          | 0/4569 [00:00<?, ?it/s]

Batch Loss: 25.547676615417004 Accuracy: tensor(0.8822, device='cuda:0')
Batch Loss: 49.990578323602676 Accuracy: tensor(0.8897, device='cuda:0')
Batch Loss: 74.73963459581137 Accuracy: tensor(0.8908, device='cuda:0')
Batch Loss: 101.15627886354923 Accuracy: tensor(0.8921, device='cuda:0')
Batch Loss: 125.94734793156385 Accuracy: tensor(0.8938, device='cuda:0')
Batch Loss: 150.28235943615437 Accuracy: tensor(0.8942, device='cuda:0')
Batch Loss: 174.80936724692583 Accuracy: tensor(0.8945, device='cuda:0')
Batch Loss: 198.94953168183565 Accuracy: tensor(0.8957, device='cuda:0')
Batch Loss: 223.96615609899163 Accuracy: tensor(0.8958, device='cuda:0')
Batch Loss: 248.35097406432033 Accuracy: tensor(0.8963, device='cuda:0')
Batch Loss: 271.11639020219445 Accuracy: tensor(0.8977, device='cuda:0')
Batch Loss: 295.3084304444492 Accuracy: tensor(0.8978, device='cuda:0')
Batch Loss: 319.41107895597816 Accuracy: tensor(0.8975, device='cuda:0')
Batch Loss: 343.9091587215662 Accuracy: tensor(0.8977

In [16]:
losses, accuracies

([1827.4496524631977,
  1452.8047158643603,
  1318.4067158661783,
  1224.1696892157197,
  1139.8345448412],
 [tensor(0.8206, device='cuda:0'),
  tensor(0.8621, device='cuda:0'),
  tensor(0.8766, device='cuda:0'),
  tensor(0.8865, device='cuda:0'),
  tensor(0.8958, device='cuda:0')])

In [18]:
# 모델 저장하기
torch.save(model.state_dict(), "model.pt")

In [None]:
#테스트셋 정확도 평가, 메모리 부족으로 실행은 하지 않음
model.eval()

test_correct = 0
test_total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
  y_batch = y_batch.to(device)
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  test_correct += (predicted == y_batch).sum()
  test_total += len(y_batch)

print("Accuracy:", test_correct.float() / test_total)

In [9]:
#모델 불러오기, 어떻게....?
model = ElectraForSequenceClassification()
model.load_state_dict(torch.load('model.pt'))
model.eval()

TypeError: __init__() missing 1 required positional argument: 'config'

In [32]:
#예측 함수
tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-small-v2-discriminator")
max_input_length = 256
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

def predict_sentiment(model, tokenizer, sentence):
    model.eval()
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length-2]
    indexed = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [34]:
predict_sentiment(model, tokenizer, "이 영화 진짜 재밌었다!!")

TypeError: sigmoid(): argument 'input' (position 1) must be Tensor, not SequenceClassifierOutput