# 전처리

## install packages

In [None]:
!pip install transformers

!pip install hanja

Collecting hanja
[?25l  Downloading https://files.pythonhosted.org/packages/56/97/ce51b5c771e7c9a673568232125e587cbc378ff1dd13057f237bedcd71e8/hanja-0.13.3.tar.gz (120kB)
[K     |████████████████████████████████| 122kB 7.6MB/s 
[?25hCollecting pyyaml==5.1.2
[?25l  Downloading https://files.pythonhosted.org/packages/e3/e8/b3212641ee2718d556df0f23f78de8303f068fe29cdaa7a91018849582fe/PyYAML-5.1.2.tar.gz (265kB)
[K     |████████████████████████████████| 266kB 36.2MB/s 
Collecting pytest-cov
  Downloading https://files.pythonhosted.org/packages/ba/84/576b071aef9ac9301e5c0ff35d117e12db50b87da6f12e745e9c5f745cc2/pytest_cov-2.12.1-py2.py3-none-any.whl
Collecting coverage>=5.2.1
[?25l  Downloading https://files.pythonhosted.org/packages/16/e0/fc9f7bd9b84e6b41d0aad1a113e36714aac0c0a9b307aca5f9af443bc50f/coverage-5.5-cp37-cp37m-manylinux2010_x86_64.whl (242kB)
[K     |████████████████████████████████| 245kB 43.9MB/s 
Building wheels for collected packages: hanja, pyyaml
  Building wheel fo

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
#from konlpy.tag import Mecab
import re
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from transformers import TFBertForSequenceClassification, BertTokenizerFast
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import hanja
from hanja import hangul

In [None]:
# Read the train/test CSVs. The first CSV column is consumed as the index and
# then discarded in favour of a fresh 0..n-1 index.
train = pd.read_csv('../train_data.csv', index_col=0)
train = train.reset_index(drop=True)
test = pd.read_csv('../test_data.csv', index_col=0)
test = test.reset_index(drop=True)

In [None]:
def prepare_text(data):
    """Clean the ``title`` column of ``data`` before tokenization.

    Each title is processed in row order:
      1. Hanja is converted with ``hanja.translate(..., 'substitution')``.
      2. Every character that is not an ASCII letter, a digit or a Hangul
         syllable is replaced by a space.
      3. The substring '종합' is replaced by a space and the result is stripped.

    Args:
        data: DataFrame with a ``'title'`` column.

    Returns:
        list[str]: one cleaned title per row, in the same order as ``data``.
        A missing title (None / NaN) yields an empty string so the result
        stays aligned with the rows of ``data``.
    """
    # Compile once instead of resolving the pattern on every row.
    non_text = re.compile(r"[^a-zA-Z0-9가-힣]")

    transfer = []
    # Iterate the column directly; iterrows() would build a full Series per
    # row only to read this single field.
    for title in data['title']:
        if pd.isna(title):
            # A missing value is not text and cannot be regex-cleaned.
            transfer.append("")
            continue
        result = hanja.translate(str(title), 'substitution')  # Hanja -> Korean
        con = non_text.sub(" ", result)                        # drop special characters
        con = con.replace('종합', " ").strip()
        transfer.append(con)
    return transfer

## prepare data

In [None]:
# Overwrite the raw titles of both frames with their cleaned versions.
for frame in (train, test):
    frame['title'] = prepare_text(frame)

In [None]:
# Show the first cleaned training title (the index was reset, so label 0 is row 0).
train.loc[0, 'title']

'인천 핀란드 항공기 결항 휴가철 여행객 분통'

In [None]:
# Inspect the last 20 rows of the cleaned training frame.
train.tail(n=20)

Unnamed: 0,title,topic_idx
45634,NBA 미네소타 뎅 영입 팀버울브스 아니고 팀버불스,5
45635,한국무용협회 예술대상에 박명숙 김근희 김운미 박재근,3
45636,기고 아인슈타인 지휘에 맞춰 블랙홀 듀엣 연주가 시작됐다,3
45637,1보 코로나19 확진자 총 1만156명 어제 하루 94명 증가,2
45638,일자민당 원로 헌법9조는 세계유산 개정 바늘귀만큼도 안돼,4
45639,민주일반연맹 비정규직 차별철폐 공동행동,2
45640,전국대학원생노조 경북대 실험실 폭발 피해자 끝까지 책임져야,2
45641,창원 시내버스 노조 파업 투표 가결 30일 운행 멈춘다종합,2
45642,장석주 시인 사랑은 새로운 이야기가 잉태되는 순간,3
45643,남친과 함께 잔혹하게 동거인 학대 20대 여성 구속영장,2


In [None]:
# Inspect the first rows of the cleaned test frame.
test.head(n=5)

Unnamed: 0,title
0,유튜브 내달 2일까지 크리에이터 지원 공간 운영
1,어버이날 맑다가 흐려져 남부지방 옅은 황사
2,내년부터 국가RD 평가 때 논문건수는 반영 않는다
3,김명자 신임 과총 회장 원로와 젊은 과학자 지혜 모을 것
4,회색인간 작가 김동식 양심고백 등 새 소설집 2권 출간


## tokenize

In [None]:
# Hyper-parameters shared by the tokenization and training cells.
NUM_EPOCHS = 5    # passed to model.fit as `epochs`
BATCH_SIZE = 32   # passed to model.fit as `batch_size`
MAX_LEN = 45      # passed to the tokenizer as `max_length`

In [None]:
# Load the fast tokenizer of the same checkpoint the model is built from.
tokenizer = BertTokenizerFast.from_pretrained(
    'beomi/kcbert-base',
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=249928.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=49.0, style=ProgressStyle(description_w…




In [None]:
def bert_tokenizer(sent, MAX_LEN):
    """Encode one sentence into fixed-length BERT inputs.

    Uses the module-level ``tokenizer``.

    Args:
        sent: text to encode.
        MAX_LEN: length every encoded sequence is padded or truncated to.

    Returns:
        tuple: ``(input_ids, attention_mask, token_type_ids)`` exactly as
        produced by the tokenizer for ``sent``.
    """
    encoded_dict = tokenizer.encode_plus(
        text=sent,
        add_special_tokens=True,        # add '[CLS]' and '[SEP]'
        max_length=MAX_LEN,
        # Explicit padding/truncation replaces the deprecated
        # `pad_to_max_length=True` and the implicit truncation that the
        # tokenizer warns about when only `max_length` is given.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,     # separates real tokens from padding
        return_token_type_ids=True,     # read below, so request it explicitly
    )

    return (
        encoded_dict['input_ids'],
        encoded_dict['attention_mask'],
        encoded_dict['token_type_ids'],
    )

In [None]:
def tokenize_bert(data):
    """Encode every title in ``data`` into BERT input arrays.

    Each entry of ``data["title"]`` is encoded with ``bert_tokenizer`` using
    the module-level ``MAX_LEN``.

    Args:
        data: object whose ``"title"`` entry is an iterable of titles
            (the notebook passes DataFrames).

    Returns:
        tuple of three integer ``np.ndarray``:
        ``(input_ids, attention_masks, token_type_ids)``, one row per title
        and in the same order as ``data["title"]``.
    """
    input_ids = []
    attention_masks = []
    token_type_ids = []

    for data_sent in tqdm(data["title"]):
        try:
            encoded = bert_tokenizer(data_sent, MAX_LEN)
        except Exception as e:
            # Still best-effort: report the offending title, but do not drop
            # the row. Dropping it would leave the returned arrays shorter
            # than `data` and misalign them with labels / submission rows,
            # so an empty-string encoding is used as a placeholder.
            print(e)
            print(data_sent)
            encoded = bert_tokenizer("", MAX_LEN)

        input_id, attention_mask, token_type_id = encoded
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)

    return (
        np.array(input_ids, dtype=int),
        np.array(attention_masks, dtype=int),
        np.array(token_type_ids, dtype=int),
    )

In [None]:
# Encode the training titles; the 3-tuple of arrays is what model.fit receives.
train_inputs = tokenize_bert(train)
train_input_ids, train_attention_masks, train_token_type_ids = train_inputs

# Labels are the `topic_idx` column as a float32 array.
train_labels = train['topic_idx'].to_numpy(dtype=np.float32)

  0%|          | 0/45654 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 45654/45654 [00:07<00:00, 6148.93it/s]


In [None]:
# Encode the test titles the same way as the training titles.
test_inputs = tokenize_bert(test)
test_input_ids, test_attention_masks, test_token_type_ids = test_inputs

100%|██████████| 9131/9131 [00:01<00:00, 7697.92it/s]


# Model build : 토픽 분류 미세조정

In [None]:
# Build a 7-class sequence classifier from the PyTorch checkpoint, with a
# dropout of 0.3 applied to both the attention probabilities and the hidden layers.
model = TFBertForSequenceClassification.from_pretrained(
    'beomi/kcbert-base',
    from_pt=True,
    num_labels=7,
    attention_probs_dropout_prob=0.3,
    hidden_dropout_prob=0.3,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=619.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=438218004.0, style=ProgressStyle(descri…




All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# The model outputs raw logits and the labels are integer class indices.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# Stop once validation accuracy has not improved by at least 1e-4 for two
# consecutive epochs. restore_best_weights=True rolls the model back to the
# best epoch when that happens; without it the model would keep the weights
# from two epochs after the best one, and those are used for prediction.
earlystop_callback = EarlyStopping(
    monitor='val_accuracy',
    min_delta=0.0001,
    patience=2,
    restore_best_weights=True,
)

In [None]:
# Adam with a learning rate of 1e-5, using the loss and metric defined above.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
# Fine-tune with 20% of the training data held out for validation; the
# early-stopping callback monitors the resulting validation accuracy.
model.fit(
    train_inputs,
    train_labels,
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.2,
    callbacks=[earlystop_callback],
)

Epoch 1/5
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fc996b972d0>

In [None]:
# Predict on the test set and write the submission file.
results = model.predict(test_inputs)

# One predicted class index per test row: argmax over the class axis of the
# logits. Reading the logits by name yields a 1-D array directly, instead of
# looping over the fields of the output object and transposing the result.
topic = np.argmax(results['logits'], axis=1)

submission = pd.read_csv('../data/sample_submission.csv')
# Guard against silently writing misaligned predictions.
assert len(topic) == len(submission), "prediction count does not match the submission template"
submission['topic_idx'] = topic
submission.to_csv('../submission/bert_4.csv', index=False)



In [None]:
# Run inference on the test set again and store the raw logits as a .npy file.
results = model.predict(test_inputs)
logits = results['logits']
np.save('../submission/bert4_save.npy', logits)