In [132]:
import pandas as pd
import numpy as np

import torch 
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup


from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

import datetime
import time
import random

In [107]:
# Choose the compute device once: CUDA when PyTorch reports it as available,
# otherwise the CPU. The outcome is reported on stdout.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

No GPU available, using the CPU instead.


In [108]:
# Absolute, machine-specific location of the project directory.
# NOTE(review): `path` is not referenced by any later cell visible in this
# notebook, and the hard-coded home directory will not exist on other machines.
path = '/Users/richardpark/Desktop/nlp_project'

__pycache__/                  naver_movie_review_test.csv
best_model.h5                 naver_movie_review_train.csv
data_load.py                  preprocessing.py
data_load_class.ipynb         ratings_test.txt
library/                      ratings_train.txt
models/                       weightsmodel_state_dict.pt
naver_movie_review_data.csv


In [5]:
pwd

'/Users/richardpark/Desktop/nlp_project'

In [112]:
# Read the train and test splits from CSV files in the current working
# directory (train first, then test).
train_df, test_df = map(
    pd.read_csv,
    ('./naver_movie_review_train.csv', './naver_movie_review_test.csv'),
)

In [113]:
# Display the (rows, columns) shape of each split as a quick sanity check.
train_df.shape, test_df.shape

((143620, 3), (48852, 3))

In [214]:
# Checkpoint identifier passed to `from_pretrained`.
MODEL_NAME = 'matthewburke/korean_sentiment'

# Tokenizer belonging to that checkpoint. Built from MODEL_NAME so the
# identifier is written in exactly one place in this cell.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [215]:
# Checkpoint identifier (same value as the one assigned in the tokenizer cell).
MODEL_NAME = 'matthewburke/korean_sentiment'
# Sequence-classification model loaded straight from the checkpoint.
# NOTE(review): `model_test` is not referenced by any other cell visible here.
model_test = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [216]:
class NaverMovieReviewDatasetTokenizer ():
    """Map-style dataset that tokenizes one review per index, on demand.

    Rows of ``df`` are read positionally: column 1 is the review text and
    column 2 is the label (column 0 is not read).

    Args:
        df: DataFrame holding the reviews in the column layout above.
        tokenizer: Optional tokenizer to use. When omitted, a default tokenizer
            for ``DEFAULT_CHECKPOINT`` is loaded on first use and then shared
            by every instance through the class attribute ``tokenizer``.
    """

    # Checkpoint used to build the shared default tokenizer.
    DEFAULT_CHECKPOINT = 'matthewburke/korean_sentiment'

    # Shared default tokenizer. Loaded lazily so that defining the class does
    # no I/O and so an explicitly supplied tokenizer never triggers a load.
    tokenizer = None

    def __init__(self, df, tokenizer=None):
        if tokenizer is None:
            cls = type(self)
            if cls.tokenizer is None:
                cls.tokenizer = AutoTokenizer.from_pretrained(cls.DEFAULT_CHECKPOINT)
            tokenizer = cls.tokenizer

        self.df = df
        # Instance attribute; never depends on a module-level `tokenizer`.
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows (examples) in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its tensors plus the label.

        Returns:
            dict: every field produced by the tokenizer converted to a
            ``torch.Tensor``, plus ``'labels'`` as a ``torch.long`` scalar.
        """
        # Column layout: id, document, label
        text = self.df.iloc[idx, 1]
        label = self.df.iloc[idx, 2]

        # truncation=True asks the tokenizer to cut over-long reviews down to
        # its configured maximum length instead of emitting sequences the
        # model cannot consume.
        encoded = self.tokenizer(text, truncation=True)

        example = {key: torch.tensor(value) for key, value in encoded.items()}
        example['labels'] = torch.tensor(int(label), dtype=torch.long)
        return example
    

In [115]:
# Wrap the training DataFrame so rows are tokenized when they are indexed.
train_set = NaverMovieReviewDatasetTokenizer(train_df)

In [116]:
# Inspect the first example: tokenizer tensors plus the 'labels' entry.
train_set[0]

{'input_ids': tensor([    2,  2431, 48502,  7997, 15775,  8028, 10855,     3]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1]),
 'labels': tensor(0)}

In [18]:
# a = [train_set.__getitem__(i) for i in range(train_set.__len__())]

In [217]:
class NaverMovieReviewDatasetLoader():
    """Factory for padded, batched ``DataLoader`` objects over a dataset.

    Args:
        dataset: Map-style dataset yielding dicts of tensors per example.
        batch_size: Examples per batch (defaults to 16).
        shuffle: Whether batches are drawn in random order. Defaults to True,
            the previous hard-coded behaviour; pass False for evaluation.
        tokenizer: Tokenizer the collator uses for padding. When omitted, the
            dataset's own ``tokenizer`` attribute is used, and only if that is
            missing does it fall back to the module-level ``tokenizer``.
    """

    def __init__(self, dataset, batch_size=16, shuffle=True, tokenizer=None):
        self.train_set = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.tokenizer = tokenizer

    def loader(self):
        """Build and return a new ``DataLoader`` with dynamic padding."""
        padding_tokenizer = self.tokenizer
        if padding_tokenizer is None:
            # Prefer the tokenizer that produced the examples.
            padding_tokenizer = getattr(self.train_set, 'tokenizer', None)
        if padding_tokenizer is None:
            # Legacy fallback: the notebook-level tokenizer used originally.
            padding_tokenizer = tokenizer

        collate = DataCollatorWithPadding(
            tokenizer=padding_tokenizer,
            padding=True,  # pad each batch to its own longest sequence
        )
        return DataLoader(
            self.train_set,
            batch_size=self.batch_size,
            shuffle=self.shuffle,
            collate_fn=collate,
        )
        

In [218]:
# Directory that holds the saved weights file (absolute, machine-specific).
file_path = '/Users/richardpark/Desktop/nlp_project/'


def load_my_model(model_name, file_path) :
    """Build a model from a checkpoint and load locally saved weights into it.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        file_path: Directory containing ``weightsmodel_state_dict.pt``; a
            trailing separator is optional.

    Returns:
        The model with the saved weights loaded, switched to eval mode.
    """
    import os

    file_name = 'weightsmodel_state_dict.pt'
    weights_path = os.path.join(file_path, file_name)

    my_model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # map_location='cpu' lets weights saved from a GPU run load on a machine
    # without CUDA. NOTE(review): torch.load unpickles the file, so only load
    # weight files from a trusted source.
    state_dict = torch.load(weights_path, map_location='cpu')

    # NOTE(review): the save/load snippets elsewhere in this notebook use
    # CustomTextClassifier, whose parameters live under the `MODEL` attribute.
    # A state dict saved from that wrapper has every key prefixed with
    # 'MODEL.', which the bare model cannot match, so strip the prefix when
    # it is present on all keys.
    wrapper_prefix = 'MODEL.'
    if state_dict and all(key.startswith(wrapper_prefix) for key in state_dict):
        state_dict = {
            key[len(wrapper_prefix):]: value for key, value in state_dict.items()
        }

    my_model.load_state_dict(state_dict)
    my_model.eval()
    return my_model

In [219]:
def load_tokenizer(model_name) :
    """Return the pretrained tokenizer for the checkpoint ``model_name``."""
    return AutoTokenizer.from_pretrained(model_name)

In [220]:
# Loader factory over the training set with 16 examples per batch; call
# `.loader()` on it to obtain an actual DataLoader.
train_loader = NaverMovieReviewDatasetLoader(train_set, 16)

In [221]:
# for step, batch in enumerate(train_loader.loader()) :
#     print(step, batch)

In [229]:
import torch.nn as nn
from transformers import AutoModelForSequenceClassification
import time
import torch


class CustomTextClassifier(nn.Module):
    """Thin ``nn.Module`` wrapper around a pretrained sequence classifier.

    Args:
        model_name: Checkpoint identifier passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        # Wrapped model; its parameters appear under the 'MODEL.' prefix in
        # this module's state dict.
        self.MODEL = AutoModelForSequenceClassification.from_pretrained(model_name)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Run the wrapped model and return its output unchanged.

        Args:
            input_ids: Token ids.
            token_type_ids: Optional segment ids, forwarded as given.
            attention_mask: Optional attention mask, forwarded as given.
            labels: Optional labels, forwarded as given. Passing them makes
                training-style calls work exactly as before; omitting them
                allows inference without dummy labels.
                NOTE(review): without labels the wrapped model is expected to
                return no loss, so positional indexing of the output may
                shift; prefer ``output.logits`` in that case. Confirm against
                the installed transformers version.

        Returns:
            Whatever the wrapped model returns.
        """
        return self.MODEL(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

In [230]:
# from model import CustomTextClassifier


# Instantiate the wrapper around the pretrained Korean sentiment checkpoint.
# NOTE(review): the model is left on its default device; `device` computed
# earlier in the notebook is not applied here.
model = CustomTextClassifier('matthewburke/korean_sentiment')


In [231]:
# Display the module tree of the wrapped classifier.
model

CustomTextClassifier(
  (MODEL): ElectraForSequenceClassification(
    (electra): ElectraModel(
      (embeddings): ElectraEmbeddings(
        (word_embeddings): Embedding(50135, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): ElectraEncoder(
        (layer): ModuleList(
          (0): ElectraLayer(
            (attention): ElectraAttention(
              (self): ElectraSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): ElectraSelfOutput(
                (dense): Linear(in_featur

In [232]:
# Optimizer settings.
# torch.optim.AdamW replaces the deprecated transformers.AdamW. Setting
# weight_decay=0.0 reproduces the transformers default, so the update rule is
# unchanged (the PyTorch default would otherwise be 0.01).
learning_rate = 2e-5
optimizer = optim.AdamW(model.parameters(),
                        lr = learning_rate, # learning rate
                        eps = 1e-8, # epsilon that guards against division by zero
                        weight_decay = 0.0
                        )

# Loss function.
# NOTE(review): the training loop visible in this notebook takes the loss from
# the model output and never calls loss_fn.
loss_fn = nn.CrossEntropyLoss()

# Number of epochs
epochs = 4

# Total number of training steps (batches per epoch * epochs)
total_steps = len(train_loader.loader()) * epochs

# Scheduler that linearly decreases the learning rate, with no warm-up steps
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [233]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        preds: 2-D array-like of per-class scores, one row per example.
        labels: NumPy array of true class ids; flattened before comparison.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

# Helper for displaying elapsed time
def format_time(elapsed):
    """Format a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [234]:
def train_loop(train_dataloader, model, optimizer, epochs, scheduler=None, max_grad_norm=1.0):
    """Train ``model`` for ``epochs`` passes over ``train_dataloader``.

    Args:
        train_dataloader: Sized iterable of batches. Each batch is a mapping
            that is unpacked into ``model(**batch)``.
        model: Module whose output exposes the loss at index 0.
        optimizer: Optimizer that updates the model parameters.
        epochs: Number of passes over the data.
        scheduler: Optional learning-rate scheduler, stepped once per batch.
            When omitted, a module-level ``scheduler`` is used if one exists
            (the original behaviour of this function).
        max_grad_norm: Maximum total gradient norm applied before each
            optimizer step; ``None`` disables clipping.

    Returns:
        list: the average training loss of each epoch.
    """
    if scheduler is None:
        # Backward compatibility with callers relying on the notebook global.
        scheduler = globals().get('scheduler')

    # Batches are moved to wherever the model's parameters live, so the loop
    # works whether or not the model was placed on a GPU.
    first_param = next(model.parameters(), None)
    num_batches = len(train_dataloader)
    epoch_losses = []

    for epoch in range(epochs):
        # ==========================================================
        # ======================== Training ========================
        # ==========================================================
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
        print('Training...')

        # Start time of this epoch
        t0 = time.time()

        # Reset the running loss
        total_loss = 0

        # Switch to training mode
        model.train()

        for step, batch in enumerate(train_dataloader):
            # Progress report
            if step % 500 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, num_batches, elapsed))

            if first_param is not None:
                batch = {
                    key: (value.to(first_param.device) if hasattr(value, 'to') else value)
                    for key, value in batch.items()
                }

            output = model(**batch)
            loss = output[0]
            total_loss += loss.item()

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()

            # Clip BEFORE the optimizer step; clipping afterwards has no
            # effect on the update that was just applied.
            if max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            optimizer.step()

            if scheduler is not None:
                scheduler.step()

        # Average loss over the epoch (guarded against an empty loader)
        avg_train_loss = total_loss / max(num_batches, 1)
        epoch_losses.append(avg_train_loss)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    return epoch_losses

In [1]:
# Run training. `epochs` comes from the optimizer cell (the same cell that
# defines `optimizer`), so the loop length stays consistent with the
# scheduler's total step count instead of repeating a literal.
train_loop(train_loader.loader(), model, optimizer, epochs)

In [None]:
def test_loop(test_dataloader, moel, loss_fn):
    t0 = time.time()
    
    # eavaluation model
    model.eval()
    
    eval_loss = 0
    eval_accuracy = 0
    nb_eval_steps, eb_eval_examples = 0, 0
    
    for step, batch in enumerate(test_dataloader):
        # 경과 정보 표시
        if step % 100 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))
        
        with torch.no_grad():
            output = model(**batch)
        
        
        logits = output[1]
        logits = logits.detach().cpu().numpy()
        label_ids = batch['labels'].to('cpu').numpy()
        
        # 출력 로짓과 라벨을 비교하여 정확도 계산
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
        
    print("")
    print("Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("Test took: {:}".format(format_time(time.time() - t0)))
        

In [None]:
## save model 
# PATH = 
# torch.save(model.state_dict(), PATH+'model_state_dict.pt')

In [None]:
# # load model 
# new_model = CustomTextClassifier('matthewburke/korean_sentiment')
# new_model.load_state_dict(torch.load('weightsmodel_state_dict.pt'))
# new_model.eval()

In [None]:
# Sample Korean review text (roughly: "This movie is really fun.").
review = '이 영화 진짜 잼있다.'

In [6]:
def prediction(review) :
    """Print and return the model's logits for a single review string.

    Uses the notebook-level ``model``.

    Args:
        review: Review text. Empty or ``None`` input yields ``None``.

    Returns:
        The logits tensor for the review, or ``None`` when ``review`` is empty.
    """
    if not review:
        return None

    # The dataset wrapper reads column 1 as the text and column 2 as the
    # label, so wrap the single review in a one-row frame. The label is a
    # placeholder: it only satisfies the model's call signature and does not
    # influence the logits.
    review_df = pd.DataFrame({'id': [0], 'document': [review], 'label': [0]})
    tokenized_review = NaverMovieReviewDatasetTokenizer(review_df)
    data_loader = NaverMovieReviewDatasetLoader(tokenized_review, 1)

    # Disable dropout so repeated calls give the same result.
    model.eval()
    first_param = next(model.parameters(), None)

    logits = None
    for batch in data_loader.loader():
        if first_param is not None:
            batch = {
                key: (value.to(first_param.device) if hasattr(value, 'to') else value)
                for key, value in batch.items()
            }

        with torch.no_grad():
            output = model(**batch)

        # Labels were supplied, so the output is (loss, logits, ...).
        logits = output[1]
        print(logits)

    return logits