colab 환경에서 실행

# 전처리

## install packages

In [2]:
# Install the third-party packages that the import cells below rely on
# (transformers, hanja, tensorflow_addons).
# NOTE(review): versions are unpinned; the recorded run resolved
# transformers 4.9.1 — consider pinning for reproducibility.
!pip install transformers
!pip install hanja
!pip install tensorflow_addons

Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 8.3 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 59.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 59.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 58.9 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninsta

In [2]:
from tensorflow_addons.optimizers import AdamW

In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
#from konlpy.tag import Mecab
import re
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from transformers import ElectraTokenizer
from transformers import TFElectraForSequenceClassification
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import hanja
from hanja import hangul

In [4]:
# Load the train / test CSVs. The first CSV column is consumed as the index
# and then discarded, so both frames end up with a fresh default index.
train = pd.read_csv('../data/train_data.csv', index_col=0)
train = train.reset_index(drop=True)

test = pd.read_csv('../data/test_data.csv', index_col=0)
test = test.reset_index(drop=True)

In [5]:
# Convert hanja to hangul, then strip special characters.
def prepare_text(data):
    """Return the cleaned ``title`` of every row in ``data``.

    Each title is first passed through ``hanja.translate`` in
    ``'substitution'`` mode; afterwards every character that is not an ASCII
    letter, a digit, or a character in the range 가-힣 is replaced by a space
    (one space per removed character, runs are not collapsed).

    Args:
        data: DataFrame with a ``title`` column.

    Returns:
        list: the cleaned titles, one per row, in row order.
    """
    def _clean(title):
        # hanja -> hangul first, so converted characters survive the filter
        hangul_text = hanja.translate(title, 'substitution')
        return re.sub(r"[^a-zA-Z0-9가-힣]", " ", hangul_text)

    return [_clean(row['title']) for _, row in data.iterrows()]

## prepare data

In [6]:
# Overwrite the `title` column of both frames with the cleaned titles
# returned by prepare_text (the original raw text is not kept).
train['title'] = prepare_text(train)
test['title'] = prepare_text(test)

In [8]:
# Sanity check: show the last 20 rows of the training frame after cleaning.
train.tail(20)

Unnamed: 0,title,topic_idx
45634,NBA 미네소타 뎅 영입 팀버울브스 아니고 팀버불스,5
45635,한국무용협회 예술대상에 박명숙 김근희 김운미 박재근,3
45636,기고 아인슈타인 지휘에 맞춰 블랙홀 듀엣 연주가 시작됐다,3
45637,1보 코로나19 확진자 총 1만156명 어제 하루 94명 증가,2
45638,일자민당 원로 헌법9조는 세계유산 개정 바늘귀만큼도 안돼,4
45639,민주일반연맹 비정규직 차별철폐 공동행동,2
45640,전국대학원생노조 경북대 실험실 폭발 피해자 끝까지 책임져야,2
45641,창원 시내버스 노조 파업 투표 가결 30일 운행 멈춘다종합,2
45642,장석주 시인 사랑은 새로운 이야기가 잉태되는 순간,3
45643,남친과 함께 잔혹하게 동거인 학대 20대 여성 구속영장,2


In [9]:
# Sanity check: show the first rows of the test frame after cleaning.
test.head()

Unnamed: 0,title
0,유튜브 내달 2일까지 크리에이터 지원 공간 운영
1,어버이날 맑다가 흐려져 남부지방 옅은 황사
2,내년부터 국가RD 평가 때 논문건수는 반영 않는다
3,김명자 신임 과총 회장 원로와 젊은 과학자 지혜 모을 것
4,회색인간 작가 김동식 양심고백 등 새 소설집 2권 출간


## tokenize

In [10]:
# Sequence length handed to the tokenizer as `max_length`.
MAX_LEN = 45
# Batch size handed to model.fit.
BATCH_SIZE = 32
# Number of epochs handed to model.fit (an early-stopping callback is also attached there).
NUM_EPOCHS = 5

In [11]:
# Load the tokenizer of the same KoELECTRA checkpoint that is fine-tuned below,
# so token ids match the model's vocabulary.
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=263326.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=61.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467.0, style=ProgressStyle(description_…




In [12]:
def electra_tokenizer(sent, MAX_LEN):
    """Encode a single sentence with the module-level ``tokenizer``.

    Args:
        sent: Text to encode.
        MAX_LEN: Target sequence length; shorter inputs are padded up to it
            and longer inputs are truncated down to it.

    Returns:
        tuple: ``(input_id, attention_mask, token_type_id)``, i.e. the
        ``input_ids``, ``attention_mask`` and ``token_type_ids`` entries of
        the tokenizer output.
    """
    encoded_dict = tokenizer.encode_plus(
        text=sent,
        add_special_tokens=True,        # add '[CLS]' and '[SEP]'
        max_length=MAX_LEN,
        # `pad_to_max_length=True` is deprecated; `padding='max_length'` is
        # the supported way to pad every sequence up to `max_length`.
        padding='max_length',
        # Ask for truncation explicitly. Leaving it unset made the tokenizer
        # emit a "Truncation was not explicitly activated" warning before
        # defaulting to the same 'longest_first' strategy.
        truncation=True,
        return_attention_mask=True,     # distinguishes padding from real tokens
        return_token_type_ids=True,     # segment ids (all one segment for a single sentence)
    )

    return (
        encoded_dict['input_ids'],
        encoded_dict['attention_mask'],
        encoded_dict['token_type_ids'],
    )

In [14]:
def tokenize_electra(data):
    """Encode every title in ``data`` for the ELECTRA model.

    Args:
        data: DataFrame with a ``title`` column.

    Returns:
        tuple: three integer numpy arrays — token ids, attention masks and
        token type ids — with one row per title, in row order.

    Raises:
        Exception: whatever ``electra_tokenizer`` raised for a title. The
            error and the offending title are printed first, then the
            exception is re-raised.
    """
    input_ids = []
    attention_masks = []
    token_type_ids = []

    for data_sent in tqdm(data["title"]):
        try:
            input_id, attention_mask, token_type_id = electra_tokenizer(data_sent, MAX_LEN)
        except Exception as e:
            # Report which title failed, then stop. Silently skipping the row
            # would leave the returned arrays shorter than `data`, so they
            # would no longer line up with the labels / submission rows that
            # callers take from the same frame.
            print(e)
            print(data_sent)
            raise

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)

    return (
        np.array(input_ids, dtype=int),
        np.array(attention_masks, dtype=int),
        np.array(token_type_ids, dtype=int),
    )

In [15]:
# Encode the training titles; the three arrays are kept together as the
# tuple that is later passed to model.fit, and also unpacked individually.
train_inputs = tokenize_electra(train)
train_input_ids, train_attention_masks, train_token_type_ids = train_inputs

# Class ids from the `topic_idx` column, stored as float32.
train_labels = np.asarray(train['topic_idx'], dtype=np.float32)

  0%|          | 0/45654 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 45654/45654 [00:12<00:00, 3758.45it/s]


In [16]:
# Encode the test titles the same way; the tuple is later passed to
# model.predict.
test_inputs = tokenize_electra(test)
test_input_ids, test_attention_masks, test_token_type_ids = test_inputs

100%|██████████| 9131/9131 [00:02<00:00, 3491.60it/s]


# Model build : 뉴스 토픽 분류 미세조정

In [23]:
# Load the KoELECTRA discriminator checkpoint from its PyTorch weights
# (from_pt=True) with a 7-label classification head, overriding both the
# attention and hidden dropout probabilities with 0.3.
model = TFElectraForSequenceClassification.from_pretrained(
    'monologg/koelectra-base-v3-discriminator',
    from_pt=True,
    num_labels=7,
    attention_probs_dropout_prob=0.3,
    hidden_dropout_prob=0.3,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'electra.embeddings.position_ids']
- This IS expected if you are initializing TFElectraForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFElectraForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dens

In [28]:
# Sparse loss/metric: targets are class ids rather than one-hot vectors, and
# from_logits=True because the loss is fed unnormalized scores.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# `weight_decay` is a float hyper-parameter. The previous `False` only worked
# because it is coerced to 0.0; spell the value out so it is obvious that no
# decoupled weight decay is applied.
optimizer = AdamW(learning_rate=1e-5, weight_decay=0.0)

# Stop once validation accuracy has not improved by at least 1e-4 for two
# consecutive epochs. restore_best_weights=True rolls the model back to the
# best epoch when training is stopped early, so the predictions made right
# after training do not come from weights that are already past the peak.
earlystop_callback = EarlyStopping(
    monitor='val_accuracy',
    min_delta=0.0001,
    patience=2,
    restore_best_weights=True,
)

In [27]:
# Attach the optimizer, loss and accuracy metric defined in the previous cell.
# The metric is named 'accuracy', which is why the early-stopping callback
# monitors 'val_accuracy'.
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [29]:
# Fine-tune the classifier. validation_split=0.3 reserves 30% of the training
# data for validation, which produces the `val_accuracy` watched by the
# early-stopping callback.
model.fit(
    train_inputs,
    train_labels,
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.3,
    callbacks=[earlystop_callback],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fb267678fd0>

In [30]:
# This run: English characters kept in the titles, dropout applied to both
# the attention and the hidden layers.
results = model.predict(test_inputs)

# The class scores live under results['logits'] (one row per test title), so
# a single argmax over axis 1 yields the predicted class ids. Iterating over
# the output container itself and transposing the stacked result only worked
# because the container happens to hold exactly one entry.
topic = np.argmax(results['logits'], axis=1)

submission = pd.read_csv('../data/sample_submission.csv')
submission['topic_idx'] = topic
submission.to_csv('../submission/electra_8.csv', index=False)



In [None]:
# Persist the fine-tuned model to the '../electra5' directory.
model.save('../electra5')





INFO:tensorflow:Assets written to: /content/drive/MyDrive/dacon/뉴스 토픽 분류/electra5/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/dacon/뉴스 토픽 분류/electra5/assets


In [31]:
# Keep the raw test logits (not just the argmax) on disk.
# NOTE(review): presumably for later ensembling of several runs — confirm.
np.save('../submission/electra8_save', results['logits'])

In [None]:
# electra1: 5 epochs, lr=1e-5, dropout 0.3 each (attention and hidden)