# Bert Model 스팸메일 분류

참고 :
https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/1

L-12 : hidden Layer 12개

H-768 : hidden size 768

A-12 : Attention Heads 12

### Tensorflow hub 설치 (Bert Model Download)

In [5]:
!pip install tensorflow-hub



### Bert Tokenization 모듈 설치

In [0]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [3]:
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 1.4MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.85


In [0]:
import tokenization

In [5]:
import numpy as np
import pandas as pd
import tensorflow_hub as hub
import tensorflow as tf
import keras
from keras import backend as K

Using TensorFlow backend.


In [0]:
root_path = '/content/drive/My Drive/'
file_path = '/content/drive/My Drive/MachineLearning_project/sms-spam-collection-dataset/'

### Data load

In [7]:
data = pd.read_csv( file_path + 'spam.csv', encoding='latin-1')
data[:5]

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [8]:
x_data = list(data['v2'])
y_data = list(data['v1'].replace(['ham','spam'],[0,1]))

print(len(x_data))
print(len(y_data))

5572
5572


### Train, Test 데이터 분류

In [9]:
train_len = int(len(x_data) * 0.8)
test_len = len(x_data) - train_len

print('train length : ', train_len)
print('test length : ', test_len)

train length :  4457
test length :  1115


In [10]:
x_train = np.array(x_data[:train_len])
y_train = np.array(y_data[:train_len])
x_test = np.array(x_data[train_len:])
y_test = np.array(y_data[train_len:])

print(x_train.shape)
print(x_test.shape)

(4457,)
(1115,)


### Build BERT Model

In [0]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Lambda, Input

In [0]:
# text에 스페셜 토큰 추가 후 토큰화
def bert_tokenizer(texts, tokenizer, max_len=1024) : 
  all_tokens = []
  all_masks = []
  all_segments = []

  for text in texts :
    text = tokenizer.tokenize(text)
    text = text[:max_len-2] 
    input_seq = ["[CLS]"] + text + ["[SEP]"]
    pad_len = max_len - len(input_seq)

    tokens = tokenizer.convert_tokens_to_ids(input_seq)
    tokens += [0] * pad_len
    pad_masks = [1] * len(input_seq) + [0] * pad_len
    segment_ids = [0] * max_len

    all_tokens.append(tokens)
    all_masks.append(pad_masks)
    all_segments.append(segment_ids)

  return np.array(all_tokens), np.array(all_masks), np.array(all_segments)


In [27]:
te = tokenizer.tokenize("test ha")
te = te[:160-2]
te

['test', 'ha']

In [0]:
# BERT 모델 파인튜닝 부분
# bert_layer : bert임베딩 층
def build_model(bert_layer, max_len=1024) :
  input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
  input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
  segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

  _, seq_output = bert_layer([input_word_ids, input_mask, segment_ids])
  clf_output = seq_output[:, 0, :] #Tensor("strided_slice:0", shape=(None, 768), dtype=float32)

  out = Dense(1, activation='sigmoid')(clf_output)

  model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)

  return model

  #d1,d2 = bert_layer([input_word_ids, input_mask, segment_ids])
  #print(d1) #Tensor("keras_layer_1/Identity:0", shape=(None, 768), dtype=float32)
  #print(d2) #Tensor("keras_layer_1/Identity_1:0", shape=(None, None, 768), dtype=float32)
  #print(d2[:, 0, :]) #Tensor("strided_slice:0", shape=(None, 768), dtype=float32)


In [0]:
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/1", trainable=True)

In [19]:
#build_model(bert_layer,max_len=512)

Tensor("keras_layer_1/Identity:0", shape=(None, 768), dtype=float32)
Tensor("keras_layer_1/Identity_1:0", shape=(None, None, 768), dtype=float32)
Tensor("strided_slice:0", shape=(None, 768), dtype=float32)


In [23]:
#build_model(bert_layer,max_len=512)

Tensor("keras_layer_3/Identity:0", shape=(None, 768), dtype=float32)
Tensor("keras_layer_3/Identity_1:0", shape=(None, None, 768), dtype=float32)
Tensor("strided_slice_3:0", shape=(None, 768), dtype=float32)
Tensor("strided_slice_4:0", shape=(None, 768), dtype=float32)
Tensor("strided_slice_5:0", shape=(None, 768), dtype=float32)


In [0]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()

In [39]:
vocab_file

b'/tmp/tfhub_modules/a7f4eb577e5eeec24c73b9dace49639b7c8193ed/assets/vocab.txt'

In [0]:
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [0]:
train_input = bert_tokenizer(x_train, tokenizer, max_len=160)
test_input = bert_tokenizer(x_test, tokenizer, max_len=160)

In [31]:
train_input

(array([[  101, 14439, 11444, ...,     0,     0,     0],
        [  101, 84591, 37818, ...,     0,     0,     0],
        [  101, 16122, 14722, ...,     0,     0,     0],
        ...,
        [  101, 23894, 10230, ...,     0,     0,     0],
        [  101, 14535, 13028, ...,     0,     0,     0],
        [  101, 19672, 20687, ...,     0,     0,     0]]),
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]]),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]))

In [32]:
model_BERT = build_model(bert_layer, max_len=160)
model_BERT.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 177853441   input_word_ids[0][0]             
                                                                 input_mask[0][0]           

In [0]:
model_BERT.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [34]:
train_history = model_BERT.fit(train_input, y_train,
    #validation_split = 0.2,
    epochs = 1, # recomended 3-5 epochs
    batch_size = 32
)



### Evaluate Model

In [36]:
print("\n 테스트 정확도: %.4f" % (model_BERT.evaluate(test_input, y_test)[1]))


 테스트 정확도: 0.8700
