In [None]:
pip install transformers

In [2]:
from keras.models import load_model
import os
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow_hub as hub

import pandas as pd
import numpy as np
import re

# Token length passed as `max_length` to the tokenizer helper below when the
# test sentence pairs are encoded.
MAX_LEN = 35

In [4]:
# Load the BERT tokenizer for the multilingual cased checkpoint.
# Lower-casing is disabled (do_lower_case=False).
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-multilingual-cased",
    do_lower_case=False,
)

In [5]:
# BERT regression model definition

class TFBertRegressor(tf.keras.Model):
    """Pre-trained BERT encoder followed by dropout and a sigmoid Dense head.

    Args:
        model_name: Name of the pre-trained checkpoint handed to
            ``TFBertModel.from_pretrained``.
        dir_path: Directory passed as ``cache_dir`` for the downloaded weights.
        num_class: Number of units in the output Dense layer.
    """

    def __init__(self, model_name, dir_path, num_class):
        super().__init__()

        # Load the pre-trained encoder.
        self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)
        self.num_class = num_class

        # Dropout rate and initializer range are taken from the encoder's own config.
        bert_config = self.bert.config
        self.dropout = tf.keras.layers.Dropout(bert_config.hidden_dropout_prob)
        self.regressor = tf.keras.layers.Dense(
            self.num_class,
            activation='sigmoid',
            kernel_initializer=tf.keras.initializers.TruncatedNormal(bert_config.initializer_range),
            name="regressor",
        )

    def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
        """Run the encoder and return the sigmoid output of the Dense head.

        Args:
            inputs: Forwarded to the BERT encoder as its first argument.
            attention_mask: Optional attention mask forwarded to the encoder.
            token_type_ids: Optional segment ids forwarded to the encoder.
            training: Passed to the dropout layer to switch it on or off.

        Returns:
            Output of the ``regressor`` Dense layer (sigmoid-activated, so it is
            not a raw logit).
        """
        # Encoder outputs: sequence_output, pooled_output, (hidden_states), (attentions)
        bert_outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled = self.dropout(bert_outputs[1], training=training)
        return self.regressor(pooled)

In [6]:
# Build the model with a single sigmoid output unit; the pre-trained encoder
# weights are cached under 'bert_ckpt'.
regression_model = TFBertRegressor(
    model_name='bert-base-multilingual-cased',
    dir_path='bert_ckpt',
    num_class=1,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1083389348.0, style=ProgressStyle(descr…




Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [7]:
# Restore the saved weights from the checkpoint on the mounted drive.
# Kept as the last expression of the cell so the returned load-status object is displayed.
regression_model.load_weights(
    '/content/drive/MyDrive/tf2_BERT/weights.ckpt'
)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f8926e4d4e0>

## TEST

In [8]:
# Tokenizer helper (data preprocessing)

def bert_tokenizer_v2(sent1, sent2, MAX_LEN):
    """Encode a sentence pair into fixed-length BERT inputs.

    Uses the module-level ``tokenizer``. Padding and truncation are requested
    explicitly (``padding='max_length'``, ``truncation=True``) instead of the
    deprecated ``pad_to_max_length=True``, which left truncation implicit and
    made the tokenizer emit a "Truncation was not explicitly activated" warning.

    Args:
        sent1: First sentence, passed as ``text``.
        sent2: Second sentence, passed as ``text_pair``.
        MAX_LEN: Length passed as ``max_length``; every encoded pair is padded
            or truncated to it.

    Returns:
        Tuple ``(input_id, attention_mask, token_type_id)`` holding the
        ``'input_ids'``, ``'attention_mask'`` and ``'token_type_ids'`` entries
        of the encoding.
    """
    # Two sentences are taken as input.
    encoded_dict = tokenizer.encode_plus(
        text=sent1,
        text_pair=sent2,
        add_special_tokens=True,      # add the '[CLS]' / '[SEP]' tokens
        max_length=MAX_LEN,
        padding='max_length',         # pad every pair up to max_length
        truncation=True,              # explicit truncation to max_length ('longest_first')
        return_attention_mask=True,   # construct attention masks
        return_token_type_ids=True,   # the segment ids are read below, so request them explicitly
    )

    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']  # differentiates padding from non-padding
    token_type_id = encoded_dict['token_type_ids']   # differentiates the two sentences

    return input_id, attention_mask, token_type_id

# Remove special characters with a regular expression

def clean_text(sent):
    """Replace every disallowed character in ``sent`` with a single space.

    Characters that are kept: ASCII letters, digits, Hangul in the ranges
    ㄱ-ㅣ and 가-힣, and whitespace. Each other character becomes one space,
    so the length of the string is unchanged.
    """
    disallowed = re.compile("[^a-zA-Z0-9ㄱ-ㅣ가-힣\\s]")
    return disallowed.sub(" ", sent)

In [9]:
# Load the test dataset (cp949-encoded CSV), keep only the first three
# columns and drop rows that contain missing values.
test_data = (
    pd.read_csv('/content/drive/MyDrive/chatbot_data/test_set.csv', encoding='cp949')
    .iloc[:, :3]
    .dropna()
)

In [10]:
# Build the test set inputs the same way: clean both sentences, encode the
# pair, and collect the encodings together with the similarity label.
input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []

for sent1, sent2, score in test_data[['질문1', '질문2', '유사도']].values:
    try:
        input_id, attention_mask, token_type_id = bert_tokenizer_v2(clean_text(sent1), clean_text(sent2), MAX_LEN)
    except Exception as e:
        # Best effort: report the failing pair and skip it, so inputs and labels stay aligned.
        print(e)
        print(sent1, sent2)
    else:
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        data_labels.append(score)

# Convert the collected lists to integer arrays and bundle them in the order
# (input ids, attention masks, token type ids).
test_input_ids, test_attention_masks, test_type_ids = (
    np.array(rows, dtype=int)
    for rows in (input_ids, attention_masks, token_type_ids)
)
test_inputs = (test_input_ids, test_attention_masks, test_type_ids)
test_data_labels = np.array(data_labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [11]:
# Compile the model so that `evaluate` can report loss and accuracy:
# Adam with learning rate 3e-5, binary cross-entropy loss, eager execution.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
regression_model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
    run_eagerly=True,
)

In [12]:
# Evaluate on the test inputs in batches of 512; `results` holds the loss
# followed by the compiled metric (accuracy).
results = regression_model.evaluate(
    test_inputs,
    test_data_labels,
    batch_size=512,
)
print("test loss, test accuracy: ", results)

test loss, test accuracy:  [0.1823945790529251, 0.9666666388511658]


In [None]:
# Predict on the test inputs, then threshold each prediction at 0.5:
# the 0/1 label is stored in `a` and printed, one per line.
y = regression_model.predict(test_inputs)

a = []

for i in y:
    a.append(1 if i > 0.5 else 0)
    print(a[-1])