<a href="https://colab.research.google.com/github/tedsong3170/nlp/blob/main/kor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install torch

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 13.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 53.7MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 57.7MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=cbe291c4f7a8

In [None]:
# 네이버 영화리뷰 감정분석 데이터 다운로드
!git clone https://github.com/e9t/nsmc.git

Cloning into 'nsmc'...
remote: Enumerating objects: 14763, done.[K
remote: Total 14763 (delta 0), reused 0 (delta 0), pack-reused 14763[K
Receiving objects: 100% (14763/14763), 56.19 MiB | 23.47 MiB/s, done.
Resolving deltas: 100% (1749/1749), done.
Checking out files: 100% (14737/14737), done.


In [None]:
# Mount Google Drive at /content/gdrive (Colab only); checkpoints and prediction
# CSV files in this notebook are read from / written to /content/gdrive/MyDrive/.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import tensorflow as tf
import torch

from transformers import ElectraTokenizer
from transformers import ElectraForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime
import os.path

# Maximum input token sequence length (sequences are truncated/padded to this length)
MAX_LEN = 128
# Batch size used for the DataLoaders
BATCH_SIZE = 32
# NOTE: despite its name, this value is passed to AdamW as the learning rate (lr)
TRAIN_PERCENT = 3e-5
# Passed to AdamW as its eps argument
EPSILON = 1e-8
# Number of training epochs
EPOCHS = 4


class KorSentimentAnalyzer:
    """Two-class sentiment classifier built on a KoELECTRA sequence-classification model.

    Bundles data loading/cleaning, tokenization, fine-tuning, evaluation,
    CSV prediction and checkpoint saving.
    """
    # Cleaned training DataFrame; assigned by preprocess("train")
    train = None
    # Cleaned test DataFrame; assigned by preprocess("test")
    test = None
    # torch.device chosen in __init__ (CUDA when available, otherwise CPU)
    device = None
    # ElectraForSequenceClassification instance created in __init__
    model = None
    # Checkpoint directory passed to __init__, or None
    pretrainedModelPath = None

    def __init__(self, pretrainedModelPath=None):
        
        self.pretrainedModelPath = pretrainedModelPath
        # 디바이스 설정
        if torch.cuda.is_available():
            # GPU 디바이스 이름 구함
            device_name = tf.test.gpu_device_name()

            # GPU 디바이스 이름 검사
            if device_name == '/device:GPU:0':
                print('Found GPU at: {}'.format(device_name))
            else:
                raise SystemError('GPU device not found')

            self.device = torch.device("cuda")
            print('There are %d GPU(s) available.' % torch.cuda.device_count())
            print('We will use the GPU:', torch.cuda.get_device_name(0))
        else:
            self.device = torch.device("cpu")
            print('No GPU available, using the CPU instead.')

        # 분류를 위한 BERT 모델 생성
        if self.pretrainedModelPath is not None:
            if os.path.isdir(self.pretrainedModelPath) is True:
                self.model = ElectraForSequenceClassification.from_pretrained(self.pretrainedModelPath, num_labels=2)
                print("pretrained Model loaded")
            else:
                self.model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator", num_labels=2)
        else:
            self.model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator", num_labels=2)

        if torch.cuda.is_available():
            self.model.cuda()

    def loadCSVFile(self, path):
        df = pd.read_csv(path, sep='\t')
        df.drop_duplicates("document", False, True)
        mask = df['document'].str.contains('^[a-zA-Z0-9]*$', regex=True)
        df = df[~mask]
        df = df.drop(df[df.document.str.len() < 5].index)

        return df

    def preprocess(self, target, targetPath=None):
        """Tokenize one dataset and package it as tensors / DataLoaders.

        Every sentence is wrapped in ``[CLS] ... [SEP]``, tokenized with the
        KoELECTRA tokenizer, converted to vocabulary ids and truncated/padded
        (with 0) to ``MAX_LEN``.

        Args:
            target: ``"train"`` (reads ``nsmc/ratings_train.txt``), ``"test"``
                (reads ``nsmc/ratings_test.txt``) or ``"predict"``.
            targetPath: CSV file used when ``target == "predict"``; it is read as
                comma-separated cp949 text and must have a ``Sentence`` column.

        Returns:
            * ``"train"``: ``(train_dataloader, validation_dataloader)`` built
              from a 90/10 split of the training file.
            * ``"test"``: a single DataLoader over the test file.
            * ``"predict"``: ``(input_ids, attention_masks)`` tensors.

        Raises:
            ValueError: If ``target`` is not one of the three supported values.
        """
        labels = None

        if target == "train":
            self.train = self.loadCSVFile('nsmc/ratings_train.txt')
            raw_sentences = self.train['document']
            labels = self.train['label'].values
        elif target == "test":
            self.test = self.loadCSVFile('nsmc/ratings_test.txt')
            raw_sentences = self.test['document']
            labels = self.test['label'].values
        elif target == "predict":
            # Stored under its own name: assigning to ``self.predict`` would replace
            # the predict() method on this instance with a DataFrame.
            self.predictData = pd.read_csv(targetPath, sep=',', encoding='cp949')
            raw_sentences = self.predictData['Sentence']
        else:
            raise ValueError("target must be 'train', 'test' or 'predict', got {!r}".format(target))

        # Use the same [CLS] ... [SEP] wrapping for every target so inference inputs
        # look like the training inputs; str() guards against non-string cells.
        sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in raw_sentences]

        # Split the sentences into tokens with the ELECTRA tokenizer
        tokenizer = ElectraTokenizer.from_pretrained('monologg/koelectra-base-v3-discriminator', do_lower_case=False)
        tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

        # Convert the tokens to vocabulary indices
        input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

        # Cut every sequence to MAX_LEN and fill the remainder with padding id 0
        input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

        # Attention mask: 1 for real tokens, 0 for padding, so padded positions
        # are not attended to. Computed on the whole array at once.
        attention_masks = (input_ids > 0).astype("int64")

        if target == "train":
            # One call splits ids, masks and labels with the same shuffled indices
            (train_inputs, validation_inputs,
             train_masks, validation_masks,
             train_labels, validation_labels) = train_test_split(input_ids,
                                                                 attention_masks,
                                                                 labels,
                                                                 random_state=2018,
                                                                 test_size=0.1)

            # Convert the data to PyTorch tensors
            train_inputs = torch.tensor(train_inputs, dtype=torch.long)
            train_labels = torch.tensor(train_labels, dtype=torch.long)
            train_masks = torch.tensor(train_masks, dtype=torch.long)
            validation_inputs = torch.tensor(validation_inputs, dtype=torch.long)
            validation_labels = torch.tensor(validation_labels, dtype=torch.long)
            validation_masks = torch.tensor(validation_masks, dtype=torch.long)

            # Bundle inputs, masks and labels; batches are shuffled for training
            train_data = TensorDataset(train_inputs, train_masks, train_labels)
            train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=BATCH_SIZE)

            validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
            validation_dataloader = DataLoader(validation_data,
                                               sampler=SequentialSampler(validation_data),
                                               batch_size=BATCH_SIZE)

            return train_dataloader, validation_dataloader

        if target == "test":
            test_inputs = torch.tensor(input_ids, dtype=torch.long)
            test_labels = torch.tensor(labels, dtype=torch.long)
            test_masks = torch.tensor(attention_masks, dtype=torch.long)

            # Evaluation does not need shuffling; sequential order keeps runs repeatable
            test_data = TensorDataset(test_inputs, test_masks, test_labels)
            test_dataloader = DataLoader(test_data, sampler=SequentialSampler(test_data), batch_size=BATCH_SIZE)

            return test_dataloader

        # target == "predict": no labels, return the raw tensors
        predict_inputs = torch.tensor(input_ids, dtype=torch.long)
        predict_masks = torch.tensor(attention_masks, dtype=torch.long)

        return predict_inputs, predict_masks

    def makeModel(self, trainDataloader, validationDataloader):
        """Fine-tune ``self.model`` and report validation accuracy after each epoch.

        Runs ``EPOCHS`` epochs with AdamW and a linear learning-rate schedule
        without warm-up, clips gradients to norm 1.0, and calls ``saveModel``
        with the name ``ver4_<epoch index>`` at the end of every epoch.

        Args:
            trainDataloader: DataLoader yielding
                ``(input_ids, attention_mask, labels)`` training batches.
            validationDataloader: DataLoader yielding batches of the same
                layout, used for the per-epoch accuracy report.
        """
        # Optimizer (TRAIN_PERCENT is the learning rate)
        optimizer = AdamW(self.model.parameters(), lr=TRAIN_PERCENT, eps=EPSILON)

        # Total number of optimizer steps: batches per epoch * epochs
        steps_per_epoch = len(trainDataloader)
        total_steps = steps_per_epoch * EPOCHS

        # Scheduler that changes the learning rate step by step
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=0,
                                                    num_training_steps=total_steps)

        # Fix the random seeds for reproducibility
        seed_val = 42
        random.seed(seed_val)
        np.random.seed(seed_val)
        torch.manual_seed(seed_val)
        torch.cuda.manual_seed_all(seed_val)

        # Reset gradients
        self.model.zero_grad()

        for epoch_i in range(EPOCHS):

            # ========================================
            #               Training
            # ========================================

            print("")
            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))
            print('Training...')

            t0 = time.time()
            total_loss = 0

            # Switch to training mode
            self.model.train()

            for step, batch in enumerate(trainDataloader):
                # Progress report every 500 batches
                if step % 500 == 0 and not step == 0:
                    elapsed = self.format_time(time.time() - t0)
                    print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, steps_per_epoch, elapsed))

                # Move the batch to the selected device and unpack it
                b_input_ids, b_input_mask, b_labels = tuple(t.to(self.device) for t in batch)

                # Forward pass; with labels supplied, the first output is the loss
                outputs = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
                loss = outputs[0]
                total_loss += loss.item()

                # Backward pass computes the gradients
                loss.backward()

                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

                # Update the weights, then advance the learning-rate schedule
                optimizer.step()
                scheduler.step()

                # Reset gradients for the next batch
                self.model.zero_grad()

            # Mean loss per batch (nan instead of ZeroDivisionError for an empty loader)
            avg_train_loss = total_loss / steps_per_epoch if steps_per_epoch else float("nan")

            print("")
            print("  Average training loss: {0:.2f}".format(avg_train_loss))
            print("  Training epcoh took: {:}".format(self.format_time(time.time() - t0)))

            # ========================================
            #               Validation
            # ========================================

            print("")
            print("Running Validation...")

            t0 = time.time()

            # Switch to evaluation mode
            self.model.eval()

            # Count correct predictions over all examples. Averaging per-batch
            # accuracies would over-weight a smaller final batch.
            correct, seen = 0, 0

            for batch in validationDataloader:
                b_input_ids, b_input_mask, b_labels = tuple(t.to(self.device) for t in batch)

                # No gradients are needed for evaluation
                with torch.no_grad():
                    outputs = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

                # Without labels, the first output holds the logits
                logits = outputs[0].detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()

                predicted = np.argmax(logits, axis=1).flatten()
                correct += int(np.sum(predicted == label_ids.flatten()))
                seen += label_ids.size

            accuracy = correct / seen if seen else float("nan")
            print("  Accuracy: {0:.2f}".format(accuracy))
            print("  Validation took: {:}".format(self.format_time(time.time() - t0)))

            self.saveModel(F'ver4_{epoch_i}')

        print("")
        print("Training complete!")

    def testModel(self, test_dataLoader):
        # 시작 시간 설정
        t0 = time.time()

        # 평가모드로 변경
        self.model.eval()

        # 변수 초기화
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        # 데이터로더에서 배치만큼 반복하여 가져옴
        for step, batch in enumerate(test_dataLoader):
            # 경과 정보 표시
            if step % 100 == 0 and not step == 0:
                elapsed = self.format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataLoader), elapsed))

            # 배치를 GPU에 넣음
            batch = tuple(t.to(self.device) for t in batch)

            # 배치에서 데이터 추출
            b_input_ids, b_input_mask, b_labels = batch

            # 그래디언트 계산 안함
            with torch.no_grad():
                # Forward 수행
                outputs = self.model(b_input_ids,
                                     token_type_ids=None,
                                     attention_mask=b_input_mask)

            # 로스 구함
            logits = outputs[0]

            # CPU로 데이터 이동
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            # 출력 로짓과 라벨을 비교하여 정확도 계산
            tmp_eval_accuracy = self.flat_accuracy(logits, label_ids)
            eval_accuracy += tmp_eval_accuracy
            nb_eval_steps += 1

        print("")
        print("Accuracy: {0:.2f}".format(eval_accuracy / nb_eval_steps))
        print("Test took: {:}".format(self.format_time(time.time() - t0)))

    def predict(self, inputPath, outputPath):
        """Classify every sentence of a CSV file and write the labels to a CSV file.

        Args:
            inputPath: CSV file read as comma-separated cp949 text; it must have
                a ``Sentence`` column.
            outputPath: Destination CSV with the columns ``Id`` (0-based row
                number) and ``Predicted`` (index of the largest logit).
        """
        # Read from the inputPath argument (the name ``path`` is not defined here)
        frame = pd.read_csv(inputPath, sep=',', encoding='cp949')

        # Wrap the inputs in [CLS] ... [SEP] exactly as preprocess() does for the
        # training data; str() guards against non-string cells such as NaN.
        sentences = ["[CLS] " + str(sent) + " [SEP]" for sent in frame['Sentence']]

        # Tokenize, convert to vocabulary ids and truncate/pad to MAX_LEN
        tokenizer = ElectraTokenizer.from_pretrained('monologg/koelectra-base-v3-discriminator', do_lower_case=False)
        token_ids = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent)) for sent in sentences]
        input_ids = pad_sequences(token_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

        # Attention mask: 1 for real tokens, 0 for padding
        attention_masks = (input_ids > 0).astype("int64")

        # Switch to evaluation mode
        self.model.eval()

        predictions = []
        # Run inference in batches instead of one sentence per forward pass
        for start in range(0, len(input_ids), BATCH_SIZE):
            stop = start + BATCH_SIZE
            b_input_ids = torch.tensor(input_ids[start:stop], dtype=torch.long).to(self.device)
            b_input_mask = torch.tensor(attention_masks[start:stop], dtype=torch.long).to(self.device)

            # No gradients are needed for inference
            with torch.no_grad():
                outputs = self.model(b_input_ids,
                                     token_type_ids=None,
                                     attention_mask=b_input_mask)

            # Without labels, the first output holds the logits
            logits = outputs[0].detach().cpu().numpy()
            predictions.extend(int(label) for label in np.argmax(logits, axis=1))

        # Explicit columns keep the header even when the input has no rows
        result_frame = pd.DataFrame({'Id': list(range(len(predictions))), 'Predicted': predictions},
                                    columns=['Id', 'Predicted'])
        result_frame.to_csv(outputPath, index=False)

    # 정확도 계산 함수
    def flat_accuracy(self, preds, labels):
        """Return the fraction of rows whose highest-scoring class equals the label.

        Args:
            preds: 2-D array of per-class scores, one row per example.
            labels: Array of expected class indices.
        """
        # Index of the largest score in each row
        predicted = np.argmax(preds, axis=1).ravel()
        expected = labels.ravel()

        hits = (predicted == expected).sum()
        return hits / expected.size

    # 시간 표시 함수
    def format_time(self, elapsed):
        """Format a duration given in seconds as an ``h:mm:ss`` string.

        Args:
            elapsed: Duration in seconds; it is rounded to whole seconds first.
        """
        whole_seconds = int(round(elapsed))
        duration = datetime.timedelta(seconds=whole_seconds)
        return str(duration)

    def saveModel(self, path):
        if os.path.isdir('/content/gdrive/MyDrive/'):
            a.model.save_pretrained(F'/content/gdrive/MyDrive/{path}')



Found GPU at: /device:GPU:0
There are 1 GPU(s) available.
We will use the GPU: Tesla T4


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=451776329.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

          id  document
label                 
0      71934     71934
1      71412     71412


Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1
...,...,...,...
149995,6222902,인간이 문제지.. 소는 뭔죄인가..,0
149996,8549745,평점이 너무 낮아서...,1
149997,9311800,이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다?,0
149998,2376369,청춘 영화의 최고봉.방황과 우울했던 날들의 자화상,1


In [None]:
# 학습
a = KorSentimentAnalyzer()
train_dataLoader, validation_dataLoader = a.preprocess("train")
test_dataLoader = a.preprocess("test")
a.makeModel(train_dataLoader, validation_dataLoader)
a.testModel(test_dataLoader)

In [None]:
# 예측
a = KorSentimentAnalyzer('/content/gdrive/MyDrive/ver4_3/')
a.predict('/content/gdrive/MyDrive/ko_data.csv', '/content/gdrive/MyDrive/ko_result_4_3.csv')