In [1]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import json
import os
import tqdm

from konlpy.tag import Mecab

import sklearn
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import log_loss, accuracy_score,f1_score
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import transformers
from transformers import *

from sklearn.feature_extraction.text import TfidfVectorizer

warnings.filterwarnings(action='ignore')

In [2]:
train      = pd.read_csv("open/train_data.csv", encoding="utf-8")
test       = pd.read_csv("open/test_data.csv", encoding="utf-8")
submission = pd.read_csv("open/sample_submission.csv", encoding="utf-8")
topic_dict = pd.read_csv("open/topic_dict.csv", encoding="utf-8")

In [3]:
train.head()

Unnamed: 0,index,title,topic_idx
0,0,인천→핀란드 항공기 결항…휴가철 여행객 분통,4
1,1,실리콘밸리 넘어서겠다…구글 15조원 들여 美전역 거점화,4
2,2,이란 외무 긴장완화 해결책은 미국이 경제전쟁 멈추는 것,4
3,3,NYT 클린턴 측근韓기업 특수관계 조명…공과 사 맞물려종합,4
4,4,시진핑 트럼프에 중미 무역협상 조속 타결 희망,4


In [4]:
print(f'train.shape:{train.shape}')
print(f'test.shape:{test.shape}')
print(f'train label 개수: {train.topic_idx.nunique()}')

train.shape:(45654, 3)
test.shape:(9131, 2)
train label 개수: 7


In [5]:
#random seed 고정
tf.random.set_seed(1234)
np.random.seed(1234)
BATCH_SIZE = 32
NUM_EPOCHS = 3
VALID_SPLIT = 0.2
MAX_LEN=100

In [6]:
tokenizer=BertTokenizer.from_pretrained('bert-base-multilingual-cased',  cache_dir='bert_ckpt', do_lower_case=False)

def bert_tokenizer(sent, MAX_LEN):
    
    encoded_dict=tokenizer.encode_plus(
        text = sent, 
        add_special_tokens=True, 
        max_length=MAX_LEN, 
        pad_to_max_length=True, 
        return_attention_mask=True,
        truncation = True)
    
    input_id=encoded_dict['input_ids']
    attention_mask=encoded_dict['attention_mask']
    token_type_id = encoded_dict['token_type_ids']
    
    return input_id, attention_mask, token_type_id

input_ids =[]
attention_masks =[]
token_type_ids =[]
train_data_labels = []

def clean_text(sent):
    sent_clean=re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", " ", sent)
    return sent_clean

for train_sent, train_label in zip(train['title'], train['topic_idx']):
    try:
        input_id, attention_mask, token_type_id = bert_tokenizer(clean_text(train_sent), MAX_LEN=MAX_LEN)
        
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        #########################################
        train_data_labels.append(train_label)
        
    except Exception as e:
        print(e)
        print(train_sent)
        pass

train_input_ids=np.array(input_ids, dtype=int)
train_attention_masks=np.array(attention_masks, dtype=int)
train_token_type_ids=np.array(token_type_ids, dtype=int)
###########################################################
train_inputs=(train_input_ids, train_attention_masks, train_token_type_ids)
train_labels=np.asarray(train_data_labels, dtype=np.int32)

In [7]:
# # 계층 교차 검증
# n_fold = 5  
# seed = 42

# cv = StratifiedKFold(n_splits = n_fold, shuffle=True, random_state=seed)

# # 테스트데이터의 예측값 담을 곳 생성
# test_y = np.zeros((test_x.shape[0], 7))

# # 조기 종료 옵션 추가
# es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3,
#                    verbose=1, mode='min', baseline=None, restore_best_weights=True)

# for i, (i_trn, i_val) in enumerate(cv.split(train_x, Y_train), 1):
#     print(f'training model for CV #{i}')

#     model3.fit(train_x[i_trn], 
#             to_categorical(Y_train[i_trn]),
#             validation_data=(train_x[i_val], to_categorical(Y_train[i_val])),
#             epochs=10,
#             batch_size=512,
#             callbacks=[es])     # 조기 종료 옵션
                      
#     test_y += model3.predict(test_x) / n_fold    # 나온 예측값들을 교차 검증 횟수로 나눈다

In [8]:
input_ids =[]
attention_masks =[]
token_type_ids =[]
train_data_labels = []

def clean_text(sent):
    sent_clean=re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", " ", sent)
    return sent_clean

for test_sent in test['title']:
    try:
        input_id, attention_mask, token_type_id = bert_tokenizer(clean_text(test_sent), MAX_LEN=100)
        
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        #########################################
       
    except Exception as e:
        print(e)
        print(test_sent)
        pass
    
test_input_ids=np.array(input_ids, dtype=int)
test_attention_masks=np.array(attention_masks, dtype=int)
test_token_type_ids=np.array(token_type_ids, dtype=int)
###########################################################
test_inputs=(test_input_ids, test_attention_masks, test_token_type_ids)

In [9]:
results=np.zeros((len(test),7))

In [10]:
results

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
from sklearn.model_selection import StratifiedKFold

class TFBertClassifier(tf.keras.Model):
    def __init__(self, model_name, dir_path, num_class):
        super(TFBertClassifier, self).__init__()

        self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)
        self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(num_class, 
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range), 
                                                name="classifier")
        
    def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
        
        #outputs 값: # sequence_output, pooled_output, (hidden_states), (attentions)
        outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1] 
        pooled_output = self.dropout(pooled_output, training=training)
        logits = self.classifier(pooled_output)

        return logits

cls_model = TFBertClassifier(model_name='bert-base-multilingual-cased',
                                  dir_path='bert_ckpt',
                                  num_class=7)

# 학습 준비하기
optimizer = tf.keras.optimizers.Adam(3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
cls_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

model_name = "tf2_bert_classifier"

# overfitting을 막기 위한 ealrystop 추가
earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001,patience=2, restore_best_weights=True)
# min_delta: the threshold that triggers the termination (acc should at least improve 0.0001)
# patience: no improvment epochs (patience = 1, 1번 이상 상승이 없으면 종료)\

checkpoint_path = os.path.join(model_name, 'weights.h5')
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create path if exists
if os.path.exists(checkpoint_dir):
    print("{} -- Folder already exists \n".format(checkpoint_dir))
else:
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} -- Folder create complete \n".format(checkpoint_dir))
    
cp_callback = ModelCheckpoint(
    checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)

n_fold = 3
a = 0
cv = StratifiedKFold(n_splits = n_fold, shuffle=True)
score=[]

# 학습과 eval 시작
for train_index, test_index in cv.split(train['title'], train['topic_idx']):
    history = cls_model.fit(train_inputs, train_labels, epochs=10, batch_size=32,
                            validation_split = VALID_SPLIT, callbacks=[earlystop_callback, cp_callback])
    
    print("====== predicting test===========")
    results += cls_model.predict(test_inputs)
    
    print("====== predicting test Done===========")
    print(f"score for fold {a} is",results[1])
    
    print("==="*20)
    score.append(results[1])
    
    a+=1
    
print("final LB score",np.mean(score))

Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


tf2_bert_classifier -- Folder already exists 

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.

Epoch 00001: val_accuracy improved from -inf to 0.74614, saving model to tf2_bert_classifier\weights.h5
Epoch 2/10

Epoch 00002: val_accuracy improved from 0.74614 to 0.76782, saving model to tf2_bert_classifier\weights.h5
Epoch 3/10

Epoch 00003: v


Epoch 00003: val_accuracy did not improve from 0.78830
Epoch 4/10

Epoch 00004: val_accuracy did not improve from 0.78830
Epoch 5/10

Epoch 00005: val_accuracy did not improve from 0.78830
score for fold 4 is [ -6.37229657  -9.10835779   2.91837014  40.7464757   -6.85529755
  -4.23812643 -10.98394799]
final LB score 0.8724027872085571


In [12]:
# results = cls_model.predict(test_inputs)
results=results/5
results=tf.argmax(results, axis=1)

In [13]:
submission['topic_idx']=results

In [14]:
submission

Unnamed: 0,index,topic_idx
0,45654,0
1,45655,3
2,45656,2
3,45657,2
4,45658,3
...,...,...
9126,54780,3
9127,54781,2
9128,54782,2
9129,54783,2


In [15]:
submission.to_csv('3rd_bert_baseline.csv', index=False)

In [16]:
#submission.to_csv(path, index = False)