# PyTorch + HuggingFace

## KcELECTRA Model

이준범님의 KcELECTRA-base 사용

https://github.com/Beomi/KcELECTRA


In [None]:
# Mount Google Drive so files under /content/drive (the review CSV and the
# saved model used later in this notebook) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install HuggingFace transformers
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 37.8 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 44.7 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 45.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

# Contents

* 1. READ TRAIN DATASETS
    
* 2. KcELECTRA
   
* 3. MODEL TRAIN / INFERENCE
 
* 4. PREDICT

# 1. READ TRAIN DATASETS

* 배달 리뷰 텍스트와 긍/부정 레이블로 구성된 csv 파일을 불러온 후 Train set과 Valid set으로 나눠줍니다.


In [None]:
import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import AutoTokenizer, AutoModel, AdamW 
import numpy as np
from tqdm.notebook import tqdm

In [None]:
# Load the review CSV from Google Drive into a DataFrame.
# 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark if present.
train_data = pd.read_csv('/content/drive/MyDrive/AI Spark/cafe_review.csv',encoding='utf-8-sig')

In [None]:
# Preview the first 10 rows (columns shown in the output: text, label).
train_data[:10]

Unnamed: 0,text,label
0,히히 오늘은 포장했는데 복숭아 상태 안좋다고미안하시다며 서비스를주시는데주문한거보다 ...,1
1,메뉴가 다양해서 좋아요 맛도 최공,1
2,리뷰 쓰기 전에 개를 이미 먹고 있었는데 일반 크로플 개 시켰는데도 그대로 있네요 ...,1
3,서비스을 엄청 주셨어요 ㅠㅠ 감사합니다 정말 맛있어요,1
4,밀크티 넘 맛있어요 서비스도 마니주시구 크로플 바삭바삭하고 맛있었어용 밀크티에 들...,1
5,배달도 빠르고 전부 다 너무 맛있었습니다 서비스도 감사합니다 또 주문할 것 같아용 ...,1
6,원래 복숭아그릭요커트를 두개씩 주시나여 푸짐한 양에 깜짝 놀라버렸어요 좀 비싸다 생...,1
7,허버허버 먹느라 사진은 못 찍었는데 티라미수크로플 다들 꼭 먹어주세요,1
8,밀크티와 크로플을 함께 즐길 수 있다는 것 자체가 큰 행복입니다 맛있어요,1
9,아니 복숭아그릭요거트 두개나 주시다뇨 진짜 하나도 안시고 줜맛탱커리,1


In [None]:
# Convert the DataFrame into a list of [sentence, label-as-string] records,
# the layout TrainDataset reads as record[0] (text) and record[1] (label).
train_dataset = [
    [sen, str(label)]
    for sen, label in zip(train_data['text'], train_data['label'])
]

In [None]:
# Total number of [sentence, label] records.
data_size = len(train_dataset)

In [None]:
# 75% of the records (rounded down) are used for training;
# whatever remains is held out for validation, so the two sizes sum to data_size.
train_size = int(data_size * 0.75)
validation_size = data_size - train_size

In [None]:
# Randomly partition the records into train / validation subsets.
# A fixed-seed generator makes the partition reproducible, so validation
# accuracy is comparable between notebook runs instead of depending on
# which rows happened to land in the held-out set.
train_dataset, validation_dataset = random_split(
    train_dataset,
    [train_size, validation_size],
    generator=torch.Generator().manual_seed(42),
)

# 2. KcELECTRA

* 사전 학습된 KcELECTRA-base 토크나이저를 불러와 문장을 토큰화(tokenization)한다.

In [None]:
class TrainDataset(Dataset):
  """Dataset that tokenizes (sentence, label) records on access.

  Each record must be indexable: element 0 is the review text and element 1
  is the label in any form ``np.int32`` can parse (e.g. the string ``'1'``).

  Args:
    dataset: iterable of ``[sentence, label]`` records.
    tokenizer: optional already-loaded tokenizer to reuse. When omitted, the
      ``beomi/KcELECTRA-base`` tokenizer is loaded, as before.
    max_length: every sample is truncated / padded to this many tokens.
  """

  def __init__(self, dataset, tokenizer=None, max_length=128):
    if tokenizer is None:
      tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")
    self.tokenizer = tokenizer
    self.max_length = max_length

    # The records are read twice below; materialise them once so one-shot
    # iterables work as well.
    records = list(dataset)

    # str(text), not str([text]): stringifying a one-element list yields
    # "['...']", which would feed brackets and quote characters to the
    # tokenizer as if they were part of the review.
    self.sentences = [str(record[0]) for record in records]
    self.labels = [np.int32(record[1]) for record in records]

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, i):
    """Return ``(input_ids, attention_mask, label)`` for sample ``i``."""
    encoded = self.tokenizer(
        self.sentences[i],
        return_tensors='pt',
        truncation=True,
        max_length=self.max_length,
        # `padding='max_length'` replaces the deprecated pad_to_max_length=True.
        padding='max_length',
        add_special_tokens=True,
    )

    # With return_tensors='pt' a single sentence comes back with a leading
    # axis of size 1; index 0 drops it so the DataLoader can stack samples.
    input_ids = encoded['input_ids'][0]
    attention_mask = encoded['attention_mask'][0]

    return input_ids, attention_mask, self.labels[i]

In [None]:
# Wrap both splits in TrainDataset so indexing them yields
# (input_ids, attention_mask, label) tuples.
# NOTE: `train_dataset` is rebound here from the raw split to the wrapped dataset.
train_dataset = TrainDataset(train_dataset)
valid_dataset = TrainDataset(validation_dataset)

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on a runtime without CUDA instead of failing when the
# model is moved to the device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

* 사전 학습된 "KcELECTRA-base" 모델을 불러온다.

In [None]:
from torch import nn

# Load the pretrained KcELECTRA-base weights.
# NOTE(review): the load log for this cell reports "ElectraModel", i.e. the
# bare encoder, so `num_labels=2` does not appear to add a classification
# head. The later cells slice the model's first output with [:, -1, :] and
# pass it to CrossEntropyLoss directly. Presumably
# AutoModelForSequenceClassification was intended - confirm before changing,
# because the train / validation / predict cells all depend on this output.
model = AutoModel.from_pretrained("beomi/KcELECTRA-base", num_labels=2)

# Move the model to the selected device.
model = model.to(device)

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# 3. MODEL TRAIN / INFERENCE


In [None]:
# Training hyper-parameters.
batch_size = 16  # samples per training mini-batch
epochs = 5  # number of passes over the training DataLoader

In [None]:
# torch.optim.AdamW replaces transformers.AdamW, which is deprecated in
# transformers. eps and weight_decay are pinned to the values
# transformers.AdamW used by default so the optimisation itself is unchanged
# (torch's own defaults are eps=1e-8, weight_decay=0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)

# Training batches are reshuffled every epoch.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, num_workers=5, shuffle=True)

# Evaluation gains nothing from shuffling; a fixed order keeps the progress
# prints comparable between runs.
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=8, num_workers=5, shuffle=False)

  cpuset_checked))


* 훈련 데이터셋을 사용해 모델을 학습시킨다.

In [None]:
# Lists intended to hold the history of training loss / accuracy values.
losses = []
accuracies = []

# Helper for measuring accuracy
def calc_accuracy(X,Y):
    """Return the fraction of rows of X whose arg-max column equals Y.

    X: 2-D tensor of per-class scores, one row per sample.
    Y: 1-D tensor of integer class indices aligned with the rows of X.
    """
    predicted = X.argmax(dim=1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    n_samples = predicted.size()[0]
    return n_correct / n_samples

loss_fn = nn.CrossEntropyLoss()

# Fine-tune for `epochs` passes over the training DataLoader.
for epoch in range(epochs):
  train_acc = 0.0   # running sum of per-batch accuracies
  total_loss = 0.0  # running sum of per-batch losses
  batches = 0       # number of batches processed so far in this epoch

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_dataloader):
    optimizer.zero_grad()
    y_batch = y_batch.long().to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    # NOTE(review): only the vector at the LAST sequence position is kept and
    # used as class scores. Inputs are padded to a fixed length, so for short
    # reviews this is presumably a padding position - confirm. The validation
    # loop and predict() use the same slice, so it is left unchanged here.
    y_pred = y_pred[:, -1, :]
    loss = loss_fn(y_pred, y_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()
    train_acc += calc_accuracy(y_pred, y_batch)

    batches += 1
    if batches % 50 == 0:
      # `batches` already counts the current batch, so it is the correct
      # divisor; dividing by batches+1 under-reported the running accuracy.
      print("epoch {} loss {} train acc {}".format(epoch+1, loss.item(), train_acc / batches))

  # Skip the summary when the loader yielded nothing (no loss to report).
  if batches:
    epoch_loss = total_loss / batches
    epoch_acc = train_acc / batches
    losses.append(epoch_loss)
    accuracies.append(epoch_acc)
    # The epoch summary reports the mean loss over the epoch rather than the
    # loss of whichever batch happened to come last.
    print("epoch {} loss {} train acc {}".format(epoch+1, epoch_loss, epoch_acc))
  model.eval()
  


  0%|          | 0/64 [00:00<?, ?it/s]

  cpuset_checked))


epoch 1 loss 2.6612725257873535 train acc 0.49019607843137253
epoch 1 loss 2.0012857913970947 train acc 0.5306623931623932


  0%|          | 0/64 [00:00<?, ?it/s]



epoch 2 loss 0.36052393913269043 train acc 0.8198529411764706
epoch 2 loss 0.27973684668540955 train acc 0.8413461538461539


  0%|          | 0/64 [00:00<?, ?it/s]



epoch 3 loss 0.21145769953727722 train acc 0.9166666666666666
epoch 3 loss 1.0348143577575684 train acc 0.9127136752136752


  0%|          | 0/64 [00:00<?, ?it/s]



epoch 4 loss 0.25882500410079956 train acc 0.9473039215686274
epoch 4 loss 0.0912681296467781 train acc 0.9471153846153846


  0%|          | 0/64 [00:00<?, ?it/s]



epoch 5 loss 0.09546708315610886 train acc 0.9522058823529411
epoch 5 loss 0.17382656037807465 train acc 0.9605769230769231


* 검증 데이터셋에서 추론을 진행한다.

In [None]:
# Evaluate on the held-out validation set.
model.eval()
valid_acc = 0.0   # running sum of per-batch accuracies
total_loss = 0.0  # running sum of per-batch losses
batches = 0       # number of batches processed so far

# Evaluation must not update the weights: no zero_grad / backward / step here.
# (model.eval() alone does not stop an optimizer step from changing the
# parameters.) no_grad() also avoids building the autograd graph.
with torch.no_grad():
  for input_ids_batch, attention_masks_batch, y_batch in tqdm(valid_dataloader):
    y_batch = y_batch.long().to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    # Same last-position slice as the training loop, so scores are comparable.
    y_pred = y_pred[:, -1, :]
    loss = loss_fn(y_pred, y_batch)

    total_loss += loss.item()
    valid_acc += calc_accuracy(y_pred, y_batch)

    batches += 1
    if batches % 50 == 0:
      # `batches` already includes the current batch; dividing by batches+1
      # under-reported the accuracy.
      print("epoch {} loss {} valid acc {}".format(0, loss.item(), valid_acc / batches))

# Final summary: mean loss and mean per-batch accuracy over the validation set.
if batches:
  print("epoch {} loss {} valid acc {}".format(0, total_loss / batches, valid_acc / batches))

  0%|          | 0/43 [00:00<?, ?it/s]

  cpuset_checked))


epoch 0 loss 1.070393681526184 valid acc 0.9119318181818182


In [None]:
# Save the model: only the state_dict (parameter tensors) is written to Google Drive.
torch.save(model.state_dict(), "/content/drive/MyDrive/AI Spark/model.pt")

# 4. PREDICT 

* predict를 위한 함수를 정의한 후, 학습이 잘 되었는지 확인할 수 있다.

In [None]:
# Helper for running prediction on a single sentence
def predict(sentence):
    """Return the predicted class index for one review sentence.

    The sentence is pushed through the same TrainDataset / DataLoader path
    used for training so it is tokenized and padded identically.

    Args:
        sentence: raw review text.

    Returns:
        The index of the largest score in the model output for the sentence
        (the value returned by ``np.argmax``).
    """
    # TrainDataset expects [text, label] records; '0' is a placeholder label
    # that is never used for inference.
    single_item = TrainDataset([[sentence, '0']])
    test_dataloader = torch.utils.data.DataLoader(single_item)

    model.eval()

    # Exactly one record was wrapped, so the loader yields exactly one batch.
    input_ids_batch, attention_masks_batch, _ = next(iter(test_dataloader))

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        out = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]

    # Same last-position slice the training loop uses for its class scores.
    out = out[:, -1, :]
    return np.argmax(out[-1].cpu().numpy())

In [None]:
# Try the classifier on a negative-sounding review
# (roughly: "The food at this place really doesn't taste good").
predict("이 집은 진짜 맛이 없어요")



0