In [3]:
import pandas as pd
import numpy as np

import torch 
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup


from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

import datetime
import time
import random

In [4]:
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-SXM3-32GB


In [5]:
# path = '/Users/richardpark/Desktop/nlp_project'

In [6]:
ls

DO_NOT_STORE_PERSISTENT_FILES_HERE.md
Untitled.ipynb
Untitled1-Copy1.ipynb
Untitled1-Copy2.ipynb
Untitled1.ipynb
id_container
naver_movie_review_test.csv
naver_movie_review_train.csv
parkdaehee1.ipynb
pytorch_matthewburke:korean_sentiment.ipynb
[0m[01;34mwandb[0m/
weightsmodel_state_dict.pt


In [7]:
train_df = pd.read_csv('./naver_movie_review_train.csv')
test_df = pd.read_csv('./naver_movie_review_test.csv')

In [8]:
train_df.shape, test_df.shape

((143620, 3), (48852, 3))

In [9]:
# from transformers import AutoModelForSequenceClassification

# #load model 

MODEL_NAME = 'matthewburke/korean_sentiment'

tokenizer = AutoTokenizer.from_pretrained('matthewburke/korean_sentiment')

In [10]:
# MODEL_NAME = 'matthewburke/korean_sentiment'
# model_test = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [11]:
class NaverMovieReviewDatasetTokenizer ():
    tokenizer = AutoTokenizer.from_pretrained('matthewburke/korean_sentiment')
    
    def __init__(self, df):
        self.df = df
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.df)
    
    '''
    __getitem__ 메서드
    
    class 내부적으로 idx를 돌면서 순회 하고 나중에 값을 index로 추출해볼수 있다.
    '''
    
    def __getitem__(self, idx):
        # id, document, label
        tokenized_example = self.tokenizer(self.df.iloc[idx, 1])
        label = self.df.iloc[idx, 2]
        tokenized_example = {k: torch.tensor(v) for k, v in tokenized_example.items()}
        tokenized_example.update({'labels': torch.tensor(label, dtype=torch.long)})
        
        return tokenized_example
    

In [80]:
train_set = NaverMovieReviewDatasetTokenizer(train_df)

In [81]:
test_set =  NaverMovieReviewDatasetTokenizer(test_df)

In [82]:
train_set[0]

{'input_ids': tensor([    2,  2431, 48502,  7997, 15775,  8028, 10855,     3]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1]),
 'labels': tensor(0)}

In [83]:
# a = [train_set.__getitem__(i) for i in range(train_set.__len__())]

In [12]:
class NaverMovieReviewDatasetLoader():
    def __init__(self, dataset, batch_size):
        self.train_set = dataset
        self.batch_size = batch_size
        
    def loader(self):
        data_loader = DataLoader(
            self.train_set, 
            batch_size = self.batch_size, 
            shuffle = True, 
            collate_fn=DataCollatorWithPadding( 
                tokenizer=tokenizer, 
                padding=True
                )
            )
        return data_loader
        

In [13]:
def load_pretrained_model(model_name) :
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return model

In [14]:
def load_tokenizer(model_name) :
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return tokenizer


In [87]:
train_loader = NaverMovieReviewDatasetLoader(train_set, 16)

In [88]:
test_loader = NaverMovieReviewDatasetLoader(test_set, 16)

In [89]:
# for step, batch in enumerate(train_loader.loader()) :
#     print(step, batch)

In [15]:
import torch.nn as nn
from transformers import AutoModelForSequenceClassification
import time
import torch


class CustomTextClassifier(nn.Module):
    def __init__(self, model_name):
        super(CustomTextClassifier, self).__init__()
        self.MODEL = AutoModelForSequenceClassification.from_pretrained(model_name)
        # self.loss_fn = nn.CrossEntropyLoss()
        # self.optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    def forward(self, input_ids, token_type_ids, attention_mask, labels=None):
        output = self.MODEL(input_ids, 
                            token_type_ids=token_type_ids, 
                            attention_mask=attention_mask, 
                            labels=labels)
        return output

In [107]:
# from model import CustomTextClassifier


model = CustomTextClassifier('matthewburke/korean_sentiment')


In [108]:
device = torch.device('cuda')

In [109]:
model.to(device)

CustomTextClassifier(
  (MODEL): ElectraForSequenceClassification(
    (electra): ElectraModel(
      (embeddings): ElectraEmbeddings(
        (word_embeddings): Embedding(50135, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): ElectraEncoder(
        (layer): ModuleList(
          (0): ElectraLayer(
            (attention): ElectraAttention(
              (self): ElectraSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): ElectraSelfOutput(
                (dense): Linear(in_featur

In [110]:
# 옵티마이저 설정
learning_rate = 2e-5
optimizer = AdamW(model.parameters(),
                  lr = learning_rate, # 학습률
                  eps = 1e-8 # 0으로 나누는 것을 방지하기 위한 epsilon 값
                )

# loss function 
loss_fn = nn.CrossEntropyLoss()

# 에폭수
epochs = 4

# 총 훈련 스텝
total_steps = len(train_loader.loader()) * epochs

# lr 조금씩 감소시키는 스케줄러
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [16]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# 시간 표시 함수
def format_time(elapsed):
    # 반올림
    elapsed_rounded = int(round((elapsed)))
    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [112]:
def train_loop(train_dataloader, model, optimizer, epochs):
    for epoch in range(0, epochs):
        # ==========================================================
        # ======================== Training ========================
        # ==========================================================
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
        print('Training...')
        
        
        # 시작 시간 설정
        t0 = time.time()

        # 로스 초기화
        total_loss = 0
        
        # 훈련모드로 변경 
        model.train()
        
        
        for step, batch in enumerate(train_dataloader):
            batch = batch.to(device)
            # 경과 정보 표시
            if step % 500 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            output = model(**batch)
            loss = output[0]
            total_loss += loss.item()
            
            # backpropagation 
            optimizer.zero_grad()
            output['loss'].backward()
            optimizer.step()
            
            # Gradient Clipping
            torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)
            
            scheduler.step()
            
            model.zero_grad()
        
        # Average Loss Calculation
        avg_train_loss = total_loss / len(train_dataloader)
        
        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

In [113]:
train_loop(train_loader.loader(), model, optimizer, 1)


Training...


  torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)


  Batch   500  of  8,977.    Elapsed: 0:00:32.
  Batch 1,000  of  8,977.    Elapsed: 0:01:04.
  Batch 1,500  of  8,977.    Elapsed: 0:01:35.
  Batch 2,000  of  8,977.    Elapsed: 0:02:07.
  Batch 2,500  of  8,977.    Elapsed: 0:02:39.
  Batch 3,000  of  8,977.    Elapsed: 0:03:11.
  Batch 3,500  of  8,977.    Elapsed: 0:03:42.
  Batch 4,000  of  8,977.    Elapsed: 0:04:14.
  Batch 4,500  of  8,977.    Elapsed: 0:04:46.
  Batch 5,000  of  8,977.    Elapsed: 0:05:17.
  Batch 5,500  of  8,977.    Elapsed: 0:05:49.
  Batch 6,000  of  8,977.    Elapsed: 0:06:20.
  Batch 6,500  of  8,977.    Elapsed: 0:06:52.
  Batch 7,000  of  8,977.    Elapsed: 0:07:23.
  Batch 7,500  of  8,977.    Elapsed: 0:07:55.
  Batch 8,000  of  8,977.    Elapsed: 0:08:27.
  Batch 8,500  of  8,977.    Elapsed: 0:08:58.

  Average training loss: 0.25
  Training epcoh took: 0:09:29


In [114]:
def test_loop(test_dataloader, model):
    t0 = time.time()
    
    # eavaluation model
    model.eval()
    
    eval_loss = 0
    eval_accuracy = 0
    nb_eval_steps, eb_eval_examples = 0, 0
    
    for step, batch in enumerate(test_dataloader):
        batch = batch.to(device)
        
        # 경과 정보 표시
        if step % 100 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))
        
        with torch.no_grad():
            output = model(**batch)
        
        
        logits = output[1]
        logits = logits.detach().cpu().numpy()
        label_ids = batch['labels'].to('cpu').numpy()
        
        # 출력 로짓과 라벨을 비교하여 정확도 계산
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
        
    print("")
    print("Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("Test took: {:}".format(format_time(time.time() - t0)))
    bert_base_multilingual_cased_accuracy = eval_accuracy/nb_eval_steps

In [115]:
test_loop(test_loader.loader(), model)

  Batch   100  of  3,054.    Elapsed: 0:00:02.
  Batch   200  of  3,054.    Elapsed: 0:00:04.
  Batch   300  of  3,054.    Elapsed: 0:00:05.
  Batch   400  of  3,054.    Elapsed: 0:00:07.
  Batch   500  of  3,054.    Elapsed: 0:00:09.
  Batch   600  of  3,054.    Elapsed: 0:00:11.
  Batch   700  of  3,054.    Elapsed: 0:00:12.
  Batch   800  of  3,054.    Elapsed: 0:00:14.
  Batch   900  of  3,054.    Elapsed: 0:00:16.
  Batch 1,000  of  3,054.    Elapsed: 0:00:18.
  Batch 1,100  of  3,054.    Elapsed: 0:00:20.
  Batch 1,200  of  3,054.    Elapsed: 0:00:21.
  Batch 1,300  of  3,054.    Elapsed: 0:00:23.
  Batch 1,400  of  3,054.    Elapsed: 0:00:25.
  Batch 1,500  of  3,054.    Elapsed: 0:00:27.
  Batch 1,600  of  3,054.    Elapsed: 0:00:29.
  Batch 1,700  of  3,054.    Elapsed: 0:00:30.
  Batch 1,800  of  3,054.    Elapsed: 0:00:32.
  Batch 1,900  of  3,054.    Elapsed: 0:00:34.
  Batch 2,000  of  3,054.    Elapsed: 0:00:36.
  Batch 2,100  of  3,054.    Elapsed: 0:00:38.
  Batch 2,200

In [64]:
cd work

/home/work


In [116]:
# save model
PATH = './weights'
torch.save(model.state_dict(), PATH+'model_state_dict.pt')

In [69]:
# load model 
new_model = CustomTextClassifier('matthewburke/korean_sentiment')
new_model.load_state_dict(torch.load('weightsmodel_state_dict.pt'))
new_model.eval()

CustomTextClassifier(
  (MODEL): ElectraForSequenceClassification(
    (electra): ElectraModel(
      (embeddings): ElectraEmbeddings(
        (word_embeddings): Embedding(50135, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): ElectraEncoder(
        (layer): ModuleList(
          (0): ElectraLayer(
            (attention): ElectraAttention(
              (self): ElectraSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): ElectraSelfOutput(
                (dense): Linear(in_featur

In [70]:
new_model

CustomTextClassifier(
  (MODEL): ElectraForSequenceClassification(
    (electra): ElectraModel(
      (embeddings): ElectraEmbeddings(
        (word_embeddings): Embedding(50135, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): ElectraEncoder(
        (layer): ModuleList(
          (0): ElectraLayer(
            (attention): ElectraAttention(
              (self): ElectraSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): ElectraSelfOutput(
                (dense): Linear(in_featur

In [175]:
review = '이 영화 진짜 재미있다.'

In [176]:
tokenized_example = tokenizer(review)

In [177]:
tokenized_example

{'input_ids': [2, 2741, 9477, 7997, 27931, 4058, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [178]:
for k,v in tokenized_example.items():
    print(k,v)

input_ids [2, 2741, 9477, 7997, 27931, 4058, 18, 3]
token_type_ids [0, 0, 0, 0, 0, 0, 0, 0]
attention_mask [1, 1, 1, 1, 1, 1, 1, 1]


In [179]:
prediction_set = {k: torch.tensor([v]) for k, v in tokenized_example.items()}

In [180]:
prediction_set

{'input_ids': tensor([[    2,  2741,  9477,  7997, 27931,  4058,    18,     3]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [184]:
def prediction_loop(review, model):
    tokenized_example = tokenizer(review)
    prediction_set = {k: torch.tensor([v]) for k, v in tokenized_example.items()}
    with torch.no_grad():
        output = model(**prediction_set)
   
    pred = output[0]
    
    if (pred[:,0] < pred[:,1]):
        print('긍정입니다!')
    else:
        print('부정입니다!')
    

In [186]:
prediction_loop(review, new_model)

긍정입니다!
