In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive 
drive.mount('/content/gdrive') 

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import sys

# Project folders on the mounted Google Drive: data files and the polarity-word dictionaries.
DATA_PATH = '/content/gdrive/My Drive/Colab Notebooks/KU-NLP-2020-1/Data/'
DIC_PATH = '/content/gdrive/My Drive/Colab Notebooks/KU-NLP-2020-1/Dic/'

# Put both folders on the module search path (data folder first, dictionary folder second).
sys.path.extend([DATA_PATH, DIC_PATH])

In [None]:
# Switch the working directory to the project folder on Google Drive and print it to confirm.
%cd '/content/gdrive/My Drive/Colab Notebooks/KU-NLP-2020-1/'
!pwd

/content/gdrive/My Drive/Colab Notebooks/KU-NLP-2020-1
/content/gdrive/My Drive/Colab Notebooks/KU-NLP-2020-1


In [None]:
# Install the Cloud TPU client and the PyTorch/XLA 1.7 wheel, then the Hugging Face transformers library.
# TODO(review): `transformers` is installed without a version pin, so a fresh run may pull an
# incompatible release -- pin the version this notebook was developed against.
!pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.7-cp36-cp36m-linux_x86_64.whl
!pip install transformers
# Optional: install the Mecab morphological analyzer (only needed when mecab_ues is True)
#!pip install konlpy
#!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
#!bash /content/Mecab-ko-for-Google-Colab/install_mecab-ko_on_colab190912.sh

In [None]:
import tensorflow as tf
import torch
# TPU 사용시 import
import torch_xla
import torch_xla.core.xla_model as xm

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import ElectraTokenizer
from transformers import ElectraModel, ElectraForSequenceClassification
#from tokenization_kocharelectra import KoCharElectraTokenizer
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime

#from konlpy.tag import Mecab
#mecab = Mecab()



In [None]:
# Load the training set, the labelled test set and the Kaggle test set with pandas.
# The two ratings files are tab-separated; the Kaggle file is a CP949-encoded CSV.
train = pd.read_csv(f"{DATA_PATH}ratings_train.txt", sep='\t')
test = pd.read_csv(f"{DATA_PATH}ratings_test.txt", sep='\t')
final_test = pd.read_csv(f"{DATA_PATH}ko_data.csv", sep=',', encoding='CP949')

# Report (rows, columns) of each frame, in loading order.
for frame in (train, test, final_test):
    print(frame.shape)

(150000, 3)
(50000, 3)
(11187, 2)


In [None]:
mecab_ues = False # whether to run the Mecab-based preprocessing cells
dic_ues = True # whether to add the positive/negative word dictionaries to the training data

In [None]:
# Append the positive / negative polarity word lists to the training data.
# Every dictionary word becomes one extra training "document" labelled 1 (positive) or 0 (negative).
if dic_ues:
  pos_dic = pd.read_csv(DIC_PATH + "pos_pol_word.txt", names=['document'])
  neg_dic = pd.read_csv(DIC_PATH + "neg_pol_word.txt", names=['document'])
  pos_dic['id'] = pos_dic.index
  neg_dic['id'] = neg_dic.index
  pos_dic['label'] = 1
  neg_dic['label'] = 0

  # ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the
  # dictionary rows reuse index labels already present in `train`, so label-based
  # lookups such as train.loc[0] would return several rows.
  dic = pd.concat([pos_dic, neg_dic], ignore_index=True)
  train = pd.concat([train, dic], ignore_index=True)
  print(train.shape)

(164673, 3)


In [None]:
# Pull the review text column out of the training frame and preview the first ten rows
sentences = train['document']
sentences.head(10)

0                                  아 더빙.. 진짜 짜증나네요 목소리
1                    흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나
2                                    너무재밓었다그래서보는것을추천한다
3                        교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정
4    사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...
5        막 걸음마 뗀 3세부터 초등학교 1학년생인 8살용영화.ㅋㅋㅋ...별반개도 아까움.
6                                원작의 긴장감을 제대로 살려내지못했다.
7    별 반개도 아깝다 욕나온다 이응경 길용우 연기생활이몇년인지..정말 발로해도 그것보단...
8                               액션이 없는데도 재미 있는 몇안되는 영화
9        왜케 평점이 낮은건데? 꽤 볼만한데.. 헐리우드식 화려함에만 너무 길들여져 있나?
Name: document, dtype: object

In [None]:
# Extract the sentiment labels of the training frame as a NumPy array
labels = train['label'].to_numpy()
labels

array([0, 1, 0, ..., 0, 0, 0])

In [None]:
# Optional Mecab-based preprocessing of the training sentences.
# NOTE(review): `spacer` and `mecab` are not defined in the visible cells (the Mecab import is
# commented out) -- they must be created before enabling mecab_ues.
if mecab_ues:
  import re  # used by the punctuation clean-up below; not imported anywhere else in the notebook

  tokenized_texts = []
  kept_indices = []  # positions of the sentences that were tokenized successfully
  for idx, s in enumerate(sentences):
    if not s:
      # Empty sentence: skipped, and its label is dropped too (it is not in kept_indices).
      continue
    try:
      # Strip selected punctuation, fix spacing, then split into morphemes.
      text = spacer.space(re.sub('[-=.#/!?:$}]', '', s))
      text = mecab.morphs(text)
    except Exception:
      # Best effort: report the sentence that could not be processed and move on.
      print(idx, s)
      continue

    text.insert(0, '[CLS]')
    text.append('[SEP]')
    tokenized_texts.append(text)
    kept_indices.append(idx)

  token_df = pd.DataFrame(data=tokenized_texts)
  token_df.to_csv(DATA_PATH + 'token_train.csv', index=False)

  # Keep only the labels of sentences that survived, so tokens and labels stay aligned.
  # This replaces a list of hard-coded np.delete positions that only matched one specific run.
  labels = labels[kept_indices]

  labels_df = pd.DataFrame(data=labels)
  # The label file is written without a header row.
  labels_df.to_csv(DATA_PATH + 'labels_train.csv', index=False, header=False)

In [None]:
# Load the KoELECTRA tokenizer and split every training sentence into tokens.
pretrained_weights = 'monologg/koelectra-base-v3-discriminator' #'monologg/kocharelectra-base-discriminator' 
#tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
tokenizer = ElectraTokenizer.from_pretrained(pretrained_weights, do_lower_case=False)
#tokenizer = KoCharElectraTokenizer.from_pretrained("monologg/kocharelectra-base-discriminator")

if mecab_ues:
  # Reload the Mecab tokens saved by the preprocessing cell. Rows were padded to equal
  # width when written, so read everything as text and drop the empty padding cells.
  tokenized_df = pd.read_csv(DATA_PATH + 'token_train.csv', dtype=str, keep_default_na=False)
  tokenized_texts = [[token for token in row if token != '']
                     for row in tokenized_df.values.tolist()]

  # Training labels live in labels_train.csv (the original read labels_test.csv here).
  # The file has no header row, so header=None keeps the first label.
  labels_df = pd.read_csv(DATA_PATH + 'labels_train.csv', header=None)
  labels = labels_df[0].values

else:
  # Wrap each sentence in the [CLS] ... [SEP] markers expected by BERT-style models
  sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]
  
  tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

print (sentences[0])
print (tokenized_texts[0])

[CLS] 아 더빙.. 진짜 짜증나네요 목소리 [SEP]
['[CLS]', '아', '더빙', '.', '.', '진짜', '짜증', '##나', '##네', '##요', '목소리', '[SEP]']


In [None]:
# Compute the longest and the average token count over all tokenized sentences
token_counts = [len(text) for text in tokenized_texts]

max_length = max(token_counts, default=0)
average_length = sum(token_counts) / len(tokenized_texts)

print("Max Text Length", max_length)
print("Average Text Length", average_length)

Max Text Length 142
Average Text Length 21.1219082666861


In [None]:
# Maximum input sequence length, in tokens
MAX_LEN = 142 #128

# Map every token to its vocabulary id, then cut / zero-pad each sequence at the end
# so that all rows have exactly MAX_LEN entries.
input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts],
    maxlen=MAX_LEN,
    dtype="long",
    truncating="post",
    padding="post",
)

input_ids[0]

array([    2,  3079, 33345,    18,    18,  7082, 13215,  4065,  4116,
        4150,  6933,     3,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [None]:
# Build the attention masks: 1.0 where the token id is positive (a real token),
# 0.0 where it is the padding value 0, so the model does not attend to padding.
# The result is a list of lists of floats, one row per sequence.
attention_masks = (input_ids > 0).astype(float).tolist()

print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [None]:
# Split inputs, attention masks and labels into training (90%) and validation (10%) parts.
# One call splits all three with the same shuffled indices, so rows stay aligned.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(input_ids,
                                                     attention_masks,
                                                     labels,
                                                     random_state=2018,
                                                     test_size=0.1)

# Convert every part to a PyTorch tensor
train_inputs, validation_inputs = torch.tensor(train_inputs), torch.tensor(validation_inputs)
train_labels, validation_labels = torch.tensor(train_labels), torch.tensor(validation_labels)
train_masks, validation_masks = torch.tensor(train_masks), torch.tensor(validation_masks)

# Show the first example of each tensor as a sanity check
for tensor in (train_inputs, train_labels, train_masks,
               validation_inputs, validation_labels, validation_masks):
    print(tensor[0])

tensor([    2,  8681,  6394,  4239,  4129,    37,  4027,  4234, 13186,  6394,
         4594,  4073,  6996,  4279, 17164, 26525,  4425,  4234,  8268,  4880,
        11134,  4112, 17698,  3671,  4070,  2411,  4025, 11533,  4594,  4125,
         4129,  4799,  4097,  4073,  4129, 11500, 12605,  4418,  4292,  9813,
            3,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [None]:
# Number of examples per batch
batch_size = 32

# Bundle (input ids, attention mask, label) triples into datasets
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# Training batches are drawn in random order, validation batches in stored order
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

# The loaders hand out batch_size examples at a time
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, sampler=validation_sampler)

In [None]:
# Pull the review text column out of the labelled test frame and preview the first ten rows
sentences = test['document']
sentences.head(10)

0                                                  굳 ㅋ
1                                 GDNTOPCLASSINTHECLUB
2               뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아
3                     지루하지는 않은데 완전 막장임... 돈주고 보기에는....
4    3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??
5                                   음악이 주가 된, 최고의 음악영화
6                                              진정한 쓰레기
7             마치 미국애니에서 튀어나온듯한 창의력없는 로봇디자인부터가,고개를 젖게한다
8    갈수록 개판되가는 중국영화 유치하고 내용없음 폼잡다 끝남 말도안되는 무기에 유치한c...
9       이별의 아픔뒤에 찾아오는 새로운 인연의 기쁨 But, 모든 사람이 그렇지는 않네..
Name: document, dtype: object

In [None]:
# Pull the sentence column out of the Kaggle test frame and preview the first ten rows
final_sentences = final_test['Sentence']
final_sentences.head(10)

0                          정말 많이 울었던 영화입니다.
1                                  시간 낭비예요.
2    포스터를 저렇게밖에 만들지 못했던 제작자의 소심함에 침을 뱉고 싶다.
3      지금 봐도 재미있는 영화!!! 코믹과 감동!!! 그리고 요리!!!
4                 이걸 영화로 만드는 거야?얼마나 가는지 보자.
5        잔잔한 감동을 주는 영화가 좋은 영화다. 줄리안 무어의 매력!
6                   프랑스 영화, 정말 재미없다는 말밖에...
7                        이보다 더 자연스러울 수는 없다.
8                              잠만 자고 있었는데~~
9             오프닝 씬이... (이 영화와 젊음에 대해 말한다.)
Name: Sentence, dtype: object

In [None]:
# Extract the sentiment labels of the labelled test frame as a NumPy array
labels = test['label'].to_numpy()
labels

array([1, 0, 0, ..., 0, 0, 0])

In [None]:
# Optional Mecab-based preprocessing of the labelled test sentences.
# NOTE(review): `spacer` and `mecab` are not defined in the visible cells (the Mecab import is
# commented out) -- they must be created before enabling mecab_ues.
if mecab_ues:
  import re  # used by the punctuation clean-up below; not imported anywhere else in the notebook

  tokenized_texts = []
  kept_indices = []  # positions of the sentences that were tokenized successfully
  for idx, s in enumerate(sentences):
    if not s:
      # Empty sentence: skipped, and its label is dropped too (it is not in kept_indices).
      continue
    try:
      # Strip selected punctuation, fix spacing, then split into morphemes.
      text = spacer.space(re.sub('[-=.#/!?:$}]', '', s))
      text = mecab.morphs(text)
    except Exception:
      # Best effort: report the sentence that could not be processed and move on.
      print(idx, s)
      continue

    text.insert(0, '[CLS]')
    text.append('[SEP]')
    tokenized_texts.append(text)
    kept_indices.append(idx)

  # pd.DataFrame was misspelled as pd.DataFrma in the original, which raised AttributeError.
  token_df = pd.DataFrame(data=tokenized_texts)
  token_df.to_csv(DATA_PATH + 'token_test.csv', index=False)

  # Keep only the labels of sentences that survived, so tokens and labels stay aligned.
  # This replaces a list of hard-coded np.delete positions that only matched one specific run.
  labels = labels[kept_indices]

  labels_df = pd.DataFrame(data=labels)
  # The label file is written without a header row.
  labels_df.to_csv(DATA_PATH + 'labels_test.csv', index=False, header=False)

In [None]:
# Split the test sentences into tokens with the tokenizer created for training.
#tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
#tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator", do_lower_case=False)

if mecab_ues:
  # Reload the Mecab tokens saved by the preprocessing cell. Rows were padded to equal
  # width when written, so read everything as text and drop the empty padding cells.
  tokenized_df = pd.read_csv(DATA_PATH + 'token_test.csv', dtype=str, keep_default_na=False)
  tokenized_texts = [[token for token in row if token != '']
                     for row in tokenized_df.values.tolist()]

  # The label file has no header row; header=None keeps the first label.
  labels_df = pd.read_csv(DATA_PATH + 'labels_test.csv', header=None)
  labels = labels_df[0].values

else:
  # Wrap each sentence in the [CLS] ... [SEP] markers expected by BERT-style models
  sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]
  
  tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

# The Kaggle sentences always go through the plain tokenizer path
final_sentence = ["[CLS] " + str(raw_sentence) + " [SEP]" for raw_sentence in final_sentences]
final_tokenized_texts = [tokenizer.tokenize(sent) for sent in final_sentence]

print (sentences[0])
print (tokenized_texts[0])
print (final_tokenized_texts[0])

[CLS] 굳 ㅋ [SEP]
['[CLS]', '굳', 'ㅋ', '[SEP]']
['[CLS]', '정말', '많이', '울', '##었', '##던', '영화', '##입니다', '.', '[SEP]']


In [None]:
# Maximum input sequence length, in tokens
MAX_LEN = 142 #128

# Shared padding settings: cut / zero-pad at the end so every row has MAX_LEN entries
pad_options = dict(maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Map tokens to vocabulary ids and pad, for the labelled test set and the Kaggle set
input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts],
    **pad_options
)
final_input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in final_tokenized_texts],
    **pad_options
)

input_ids[0]

array([   2, 2104,  287,    3,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

In [None]:
# Build the attention masks for both evaluation sets: 1.0 where the token id is positive
# (a real token), 0.0 where it is the padding value 0, so the model does not attend to padding.
# Each result is a list of lists of floats, one row per sequence.
attention_masks = (input_ids > 0).astype(float).tolist()
final_attention_masks = (final_input_ids > 0).astype(float).tolist()

print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [None]:
# Convert the labelled test data to PyTorch tensors
test_inputs = torch.tensor(input_ids)
test_masks = torch.tensor(attention_masks)
test_labels = torch.tensor(labels)

# Convert the Kaggle test data (row ids, inputs, masks) to PyTorch tensors
final_test_id = torch.tensor(final_test['Id'])
final_test_inputs = torch.tensor(final_input_ids)
final_test_masks = torch.tensor(final_attention_masks)

# Show the first Kaggle example as a sanity check
for sample in (final_test_id, final_test_inputs, final_test_masks):
    print(sample[0])

tensor(0)
tensor([    2,  6595,  6341,  3194,  4480,  4820,  6394, 10561,    18,     3,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     

In [None]:
# Number of examples per batch
batch_size = 32

# Bundle inputs, masks and labels of the labelled test set; evaluation reads them batch by batch.
test_data = TensorDataset(test_inputs, test_masks, test_labels)
# Evaluation gains nothing from shuffling: a sequential sampler visits the examples in
# stored order, so repeated evaluation runs see identical batches.
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Kaggle set: there are no labels, so the row ids travel with the inputs instead.
# One example per batch -- the prediction cell pairs each id with its prediction per batch.
final_test_data = TensorDataset(final_test_id, final_test_inputs, final_test_masks)
final_test_dataloader = DataLoader(final_test_data, batch_size=1)

In [None]:
# Choose between TPU and GPU execution
tpu_use = True

if tpu_use:
  # Acquire the default Cloud TPU core
  device = xm.xla_device()
else:
  # Ask TensorFlow for the GPU device name and stop if no GPU is attached
  device_name = tf.test.gpu_device_name()
  if device_name != '/device:GPU:0':
      raise SystemError('GPU device not found')
  print('Found GPU at: {}'.format(device_name))

In [None]:
# Pick the torch device: TPU when requested, otherwise CUDA if present, otherwise CPU
if tpu_use:
    device = xm.xla_device()
elif not torch.cuda.is_available():
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
# Create the two-class sequence-classification model from the pretrained ELECTRA weights
# (the commented-out lines are alternative model choices) and move it to the selected device.
#model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
model = ElectraForSequenceClassification.from_pretrained(pretrained_weights, num_labels = 2)
#model = ElectraModel.from_pretrained(pretrained_weights, return_dict=True)
model.to(device)

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [None]:
'''
# 기존 모델을 기반으로 별도 레이어를 만들때
import torch.nn as nn

class Model(nn.Module):
  def __init__(self):
    super().__init__()
    self.bert_tokenizer = ElectraTokenizer.from_pretrained(pretrained_weights, do_lower_case=False)
    self.bert_model =  ElectraModel.from_pretrained(pretrained_weights, output_hidden_states=True) #, return_dict=True
    self.linear1 = torch.nn.Linear(768, 256)
    self.linear2 = torch.nn.Linear(256, 2)

  def forward(self, b_input_ids, token_type_ids, attention_mask):    
    hidden_tensor = self.bert_model(input_ids=b_input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]
   
    linear1 = self.linear1(hidden_tensor[:,0,:]) #.view(-1,768)) 
    logit = self.linear2(linear1)  

    print(logit)

    return logit
 
model = Model()
#model.cuda()
model.to(device) 
'''

'\n\nimport torch.nn as nn\n\nclass Model(nn.Module):\n  def __init__(self):\n    super().__init__()\n    self.bert_tokenizer = ElectraTokenizer.from_pretrained(pretrained_weights, do_lower_case=False)\n    self.bert_model =  ElectraModel.from_pretrained(pretrained_weights, output_hidden_states=True) #, return_dict=True\n    self.linear1 = torch.nn.Linear(768, 256)\n    self.linear2 = torch.nn.Linear(256, 2)\n\n  def forward(self, b_input_ids, token_type_ids, attention_mask):    \n    hidden_tensor = self.bert_model(input_ids=b_input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]\n   \n    linear1 = self.linear1(hidden_tensor[:,0,:]) #.view(-1,768)) \n    logit = self.linear2(linear1)  \n\n    print(logit)\n\n    return logit\n \nmodel = Model()\n#model.cuda()\nmodel.to(device) \n'

In [None]:
# Number of passes over the training data
epochs = 1

# Total number of optimizer steps: batches per epoch times epochs
total_steps = len(train_dataloader) * epochs

# AdamW optimizer over all model parameters
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,   # learning rate (1e-6 was also tried)
    eps=1e-8,  # epsilon that guards against division by zero
)

# Learning-rate schedule with no warm-up steps, spanning all training steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
# 정확도 계산 함수
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    preds:  2-D array-like of per-class scores, one row per example.
    labels: NumPy array of integer class ids (flattened before comparison).
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()

    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

In [None]:
# 시간 표시 함수
def format_time(elapsed):
    """Format a duration given in seconds as an h:mm:ss string.

    The value is rounded to whole seconds before formatting.
    """
    whole_seconds = int(round(elapsed))
    return str(datetime.timedelta(seconds=whole_seconds))

In [None]:
# Fix the random seeds for reproducibility
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# NOTE(review): `criterion` is not used below -- the loss is taken from the model output,
# which includes it because `labels` is passed to the forward call. Kept in case it is used elsewhere.
criterion = torch.nn.CrossEntropyLoss().to(device)

# Clear any gradients left over from earlier runs
model.zero_grad()

# One iteration per epoch: a full training pass followed by a validation pass
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Start time of this epoch
    t0 = time.time()

    # Running sum of the per-batch losses
    total_loss = 0

    # Switch to training mode
    model.train()
        
    for step, batch in enumerate(train_dataloader):
        # Progress report every 500 batches (not on the very first one)
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Move the batch to the selected device and unpack it
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Forward pass; labels are passed in, so the first output is the loss
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        loss = outputs[0]

        total_loss += loss.item()

        # Backward pass computes the gradients
        loss.backward()

        # Clip the gradient norm to 1.0
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the weights; on TPU the step goes through the XLA helper
        if tpu_use == True:
            xm.optimizer_step(optimizer, barrier=True)
        else:
            optimizer.step()

        # Advance the learning-rate schedule
        scheduler.step()

        # Reset the gradients for the next batch
        model.zero_grad()

    # Mean loss over all training batches
    avg_train_loss = total_loss / len(train_dataloader)            

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    # Start time of the validation pass
    t0 = time.time()

    # Switch to evaluation mode
    model.eval()

    # Reset the counters
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    eval_correct = 0  # number of correctly classified validation examples so far

    for batch in validation_dataloader:
        # Move the batch to the selected device and unpack it
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        # No gradients are needed for evaluation
        with torch.no_grad():     
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # Without labels the first output holds the logits
        logits = outputs[0]

        # Bring logits and labels back to the CPU as NumPy arrays
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Weight each batch accuracy by its size: the last batch is usually smaller, and a
        # plain mean of batch accuracies would give its examples too much influence.
        batch_examples = len(label_ids)
        eval_correct += flat_accuracy(logits, label_ids) * batch_examples
        nb_eval_examples += batch_examples
        nb_eval_steps += 1

    # Accuracy over all validation examples
    eval_accuracy = eval_correct / nb_eval_examples

    print("  Accuracy: {0:.2f}".format(eval_accuracy))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch   500  of  4,632.    Elapsed: 0:04:39.
  Batch 1,000  of  4,632.    Elapsed: 0:08:36.
  Batch 1,500  of  4,632.    Elapsed: 0:12:34.
  Batch 2,000  of  4,632.    Elapsed: 0:16:31.
  Batch 2,500  of  4,632.    Elapsed: 0:20:29.
  Batch 3,000  of  4,632.    Elapsed: 0:24:27.
  Batch 3,500  of  4,632.    Elapsed: 0:28:25.
  Batch 4,000  of  4,632.    Elapsed: 0:32:23.
  Batch 4,500  of  4,632.    Elapsed: 0:36:21.

  Average training loss: 0.09
  Training epcoh took: 0:37:24

Running Validation...
  Accuracy: 0.91
  Validation took: 0:00:50

Training complete!


In [None]:
# Evaluate the model on the labelled test set
# Start time of the evaluation
t0 = time.time()

# Switch to evaluation mode
model.eval()

# Reset the counters
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
eval_correct = 0  # number of correctly classified test examples so far

for step, batch in enumerate(test_dataloader):
    # Progress report every 100 batches (not on the very first one)
    if step % 100 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))

    # Move the batch to the selected device and unpack it
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    
    # No gradients are needed for evaluation
    with torch.no_grad():     
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)
    
    # Without labels the first output holds the logits
    logits = outputs[0]

    # Bring logits and labels back to the CPU as NumPy arrays
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    # Weight each batch accuracy by its size: the last batch is usually smaller, and a
    # plain mean of batch accuracies would give its examples too much influence.
    batch_examples = len(label_ids)
    eval_correct += flat_accuracy(logits, label_ids) * batch_examples
    nb_eval_examples += batch_examples
    nb_eval_steps += 1

# Accuracy over all test examples
eval_accuracy = eval_correct / nb_eval_examples

print("")
print("Accuracy: {0:.2f}".format(eval_accuracy))
print("Test took: {:}".format(format_time(time.time() - t0)))

  Batch   100  of  1,563.    Elapsed: 0:00:10.
  Batch   200  of  1,563.    Elapsed: 0:00:20.
  Batch   300  of  1,563.    Elapsed: 0:00:29.
  Batch   400  of  1,563.    Elapsed: 0:00:39.
  Batch   500  of  1,563.    Elapsed: 0:00:49.
  Batch   600  of  1,563.    Elapsed: 0:00:58.
  Batch   700  of  1,563.    Elapsed: 0:01:08.
  Batch   800  of  1,563.    Elapsed: 0:01:18.
  Batch   900  of  1,563.    Elapsed: 0:01:27.
  Batch 1,000  of  1,563.    Elapsed: 0:01:37.
  Batch 1,100  of  1,563.    Elapsed: 0:01:47.
  Batch 1,200  of  1,563.    Elapsed: 0:01:56.
  Batch 1,300  of  1,563.    Elapsed: 0:02:06.
  Batch 1,400  of  1,563.    Elapsed: 0:02:16.
  Batch 1,500  of  1,563.    Elapsed: 0:02:26.

Accuracy: 0.91
Test took: 0:02:37


In [None]:
# Predict labels for the Kaggle test set
# Start time of the prediction run
t0 = time.time()

# Switch to evaluation mode
model.eval()

# Reset the counters (not used for this unlabelled set)
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
# One array([id, predicted_label]) entry per example
final_result = []

for step, batch in enumerate(final_test_dataloader):
    # Progress report every 1000 batches (not on the very first one)
    if step % 1000 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(final_test_dataloader), elapsed))

    # Move the batch to the selected device and unpack it
    batch = tuple(t.to(device) for t in batch)
    b_id, b_input_ids, b_input_mask = batch
    
    # No gradients are needed for prediction
    with torch.no_grad():     
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)
    
    # Without labels the first output holds the logits
    logits = outputs[0]

    # Bring the logits back to the CPU and pick the highest-scoring class per example
    logits = logits.detach().cpu().numpy()
    pred_flat = np.argmax(logits, axis=1).flatten()
    b_id = b_id.cpu().numpy()

    # Pair every id with its prediction row by row. Unlike flattening ids and predictions
    # into one array, this stays correct when the loader yields more than one example per batch.
    final_result.extend(np.column_stack((b_id, pred_flat)))

  Batch 1,000  of  11,187.    Elapsed: 0:00:38.
  Batch 2,000  of  11,187.    Elapsed: 0:01:12.
  Batch 3,000  of  11,187.    Elapsed: 0:01:48.
  Batch 4,000  of  11,187.    Elapsed: 0:02:23.
  Batch 5,000  of  11,187.    Elapsed: 0:02:58.
  Batch 6,000  of  11,187.    Elapsed: 0:03:33.
  Batch 7,000  of  11,187.    Elapsed: 0:04:07.
  Batch 8,000  of  11,187.    Elapsed: 0:04:41.
  Batch 9,000  of  11,187.    Elapsed: 0:05:17.
  Batch 10,000  of  11,187.    Elapsed: 0:05:52.
  Batch 11,000  of  11,187.    Elapsed: 0:06:27.


In [None]:
# Save the Kaggle predictions (1 = positive, 0 = negative) as a CSV file
submission_columns = ['Id', 'Predicted']
rdf = pd.DataFrame(final_result, columns=submission_columns)
rdf.to_csv(DATA_PATH + 'sample.csv', index=False)

final_result[:10]

[array([0, 1]),
 array([1, 0]),
 array([2, 1]),
 array([3, 1]),
 array([4, 0]),
 array([5, 1]),
 array([6, 0]),
 array([7, 1]),
 array([8, 0]),
 array([9, 1])]

In [None]:
# Save the fine-tuned weights.
# Each tensor is copied to the CPU first so that the checkpoint does not depend on the
# device the model was trained on and can be loaded on a CPU, GPU or TPU runtime.
cpu_state_dict = {name: tensor.cpu() for name, tensor in model.state_dict().items()}
torch.save(cpu_state_dict, DATA_PATH +  "nsmc_model.pt")

In [None]:
# Reload the saved weights into the existing model object
#model = torch.load(DATA_PATH +  "nsmc_model.pt")
import os
# NOTE(review): looks like a debugging leftover (makes CUDA calls synchronous); it only matters on a GPU runtime.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# Load onto the CPU first: mapping straight to "cuda:0" fails on runtimes without a GPU
# (this notebook runs with tpu_use = True). The model is moved to `device` right after.
state_dict = torch.load(DATA_PATH +  "nsmc_model.pt", map_location="cpu")
model.load_state_dict(state_dict)
model.to(device)

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm