In [None]:
# The NSMC files are read from ./nsmc; the repository can be fetched with:
# !git clone https://github.com/e9t/nsmc.git
# For each split: drop rows containing missing values, then renumber the
# index from 0 (the old index is discarded).
train = pd.read_table("nsmc/" + "ratings_train.txt").dropna().reset_index(drop=True)
test = pd.read_table("nsmc/" + "ratings_test.txt").dropna().reset_index(drop=True)

def preprocessing_sentence_to_BERTinput(df, tokenizer, colname_data, colname_target=None, seq_len=128):
    """Convert a DataFrame text column into BERT-style input arrays.

    Every row's text is encoded with ``tokenizer.encode`` (padded/truncated
    to ``seq_len``); an attention mask and an all-zero segment-id vector are
    built alongside it.

    Args:
        df: DataFrame holding the text column and, optionally, the target
            column. Rows are read by position, so any index is accepted.
        tokenizer: Object exposing ``encode(text, max_length=..., padding=...,
            truncation=...)`` that returns a list of token ids. If it has a
            non-None ``pad_token_id`` attribute, that id is treated as
            padding; otherwise 0 is assumed.
        colname_data: Name of the column with the sentences to encode.
        colname_target: Name of the label column, or ``None`` when the data
            is unlabeled.
        seq_len: Fixed sequence length requested from the tokenizer.

    Returns:
        A tuple ``([tokens, masks, segments], targets)``. The first three are
        NumPy arrays with one row per DataFrame row. ``targets`` is a NumPy
        array of labels when ``colname_target`` is given, otherwise an empty
        list.
    """
    # Positional access: `df[col][i]` is label-based and breaks (or silently
    # misaligns rows) when the index is not exactly 0..n-1.
    texts = df[colname_data].tolist()

    # Use the tokenizer's own padding id instead of hard-coding 0.
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0

    tokens, masks, segments = [], [], []
    for text in tqdm(texts):
        # Tokenize the sentence into a fixed-length list of ids.
        token = tokenizer.encode(text, max_length=seq_len, padding='max_length', truncation=True)

        # Attention mask: 1 for real tokens, 0 for padding. Built per
        # position so it stays correct wherever the padding is placed.
        mask = [int(token_id != pad_id) for token_id in token]

        # Segment ids distinguish sentence A from sentence B; with a single
        # sentence per example they are all zero.
        segment = [0] * seq_len

        tokens.append(token)
        masks.append(mask)
        segments.append(segment)

    # Convert the collected lists to arrays.
    tokens = np.array(tokens)
    masks = np.array(masks)
    segments = np.array(segments)

    targets = []
    if colname_target is not None:
        targets = np.array(df[colname_target].tolist())

    return [tokens, masks, segments], targets

from transformers import pipeline, AutoTokenizer, BertTokenizer, BertTokenizerFast
from transformers import AutoModel, AutoModelForTokenClassification, TFBertModel, TFBertForSequenceClassification

# Pretrained checkpoint whose tokenizer is used to encode the reviews.
MODEL_NAME = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Encode both splits: text comes from 'document', the target from 'label'.
X_train, Y_train = preprocessing_sentence_to_BERTinput(
    train, tokenizer=tokenizer, colname_data='document', colname_target='label',
)
X_test, Y_test = preprocessing_sentence_to_BERTinput(
    test, tokenizer=tokenizer, colname_data='document', colname_target='label',
)

def modeling_BERTsentiment(model_name, optimizer, seq_len=128):
    """Build and compile a binary sentiment classifier on a pretrained BERT.

    Args:
        model_name: Identifier passed to ``TFBertModel.from_pretrained``.
        optimizer: Optimizer handed to ``compile``.
        seq_len: Length of each of the three integer input sequences.

    Returns:
        A compiled ``tf.keras.Model`` that takes the inputs named
        ``input_ids``, ``input_masks`` and ``input_segments`` (in that order)
        and emits a single sigmoid unit per example.
    """
    # Input conversion: three int32 inputs of identical shape.
    input_names = ('input_ids', 'input_masks', 'input_segments')
    token_in, mask_in, segment_in = [
        tf.keras.layers.Input(shape=(seq_len,), dtype=tf.int32, name=input_name)
        for input_name in input_names
    ]

    # Model loading: the pretrained encoder is called on the three inputs and
    # element 1 of its output is kept (the pooled output per the
    # transformers documentation).
    bert = TFBertModel.from_pretrained(model_name)
    pooled = bert([token_in, mask_in, segment_in])[1]

    # Model assembly: a single sigmoid unit on top of the encoder output.
    head_init = tf.keras.initializers.TruncatedNormal(stddev=0.02)
    head = tf.keras.layers.Dense(1, activation='sigmoid', kernel_initializer=head_init)
    probability = head(pooled)

    model_sentiment = tf.keras.Model([token_in, mask_in, segment_in], probability)
    model_sentiment.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=['accuracy'],
    )
    return model_sentiment

import tensorflow_addons as tfa
# Training hyperparameters, named once so the optimizer schedule and fit()
# cannot drift apart.
EPOCHS = 4
BATCH_SIZE = 100
LEARNING_RATE = 1.0e-5

# RectifiedAdam applies `warmup_proportion` only when `total_steps` is
# positive; with the default of 0 the warmup is silently skipped. The total
# number of optimizer steps is therefore passed explicitly.
# `min_lr` is pinned to the base rate so that, after the warmup, the learning
# rate stays at LEARNING_RATE instead of decaying towards 0.
steps_per_epoch = (len(Y_train) + BATCH_SIZE - 1) // BATCH_SIZE
opt = tfa.optimizers.RectifiedAdam(
    learning_rate=LEARNING_RATE,  # `lr` is a deprecated alias of this argument
    weight_decay=0.0025,
    total_steps=steps_per_epoch * EPOCHS,
    warmup_proportion=0.05,
    min_lr=LEARNING_RATE,
)
MODEL_NAME = 'bert-base-multilingual-cased'

model = modeling_BERTsentiment(model_name=MODEL_NAME, optimizer=opt)
model.fit(X_train, Y_train, epochs=EPOCHS, shuffle=True, batch_size=BATCH_SIZE,
          validation_data=(X_test, Y_test))
