# Bertを利用した利用したReview分類

## TenosrflowををImport

In [1]:
!pip install transformers==2.11.0

Collecting transformers==2.11.0
  Downloading transformers-2.11.0-py3-none-any.whl (674 kB)
[K     |████████████████████████████████| 674 kB 4.2 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 36.0 MB/s 
Collecting tokenizers==0.7.0
  Downloading tokenizers-0.7.0-cp37-cp37m-manylinux1_x86_64.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 16.3 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 35.5 MB/s 
Installing collected packages: tokenizers, sentencepiece, sacremoses, transformers
Successfully installed sacremoses-0.0.46 sentencepiece-0.1.96 tokenizers-0.7.0 transformers-2.11.0


In [2]:
import numpy as np
import transformers
from sklearn.metrics import accuracy_score
import tensorflow as tf
import re
import glob

In [3]:
transformers 

<module 'transformers' from '/usr/local/lib/python3.7/dist-packages/transformers/__init__.py'>

## データを構築

In [18]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
ls drive/MyDrive/研究/DialogueAct-Prediction/work/Bert-Twitter/review_data

[0m[01;34mdata[0m/  [01;34mlabel[0m/  [01;34mtext[0m/


In [20]:
text_path = "drive/MyDrive/研究/DialogueAct-Prediction/work/Bert-Twitter/review_data/text/"
label_path = "drive/MyDrive/研究/DialogueAct-Prediction/work/Bert-Twitter/review_data/label/"

import glob
dir_text_path = glob.glob(text_path+"*.txt")
dir_label_path = glob.glob(label_path+"*.txt")

dir_label_path.sort()
dir_text_path.sort()

In [21]:
texts=[0]*len(dir_text_path)
labels=[0]*len(dir_text_path)
for idx, path in enumerate(dir_text_path):
  with open(path) as f:
    l = f.read().split(",")
  texts[idx] = l

for idx, path in enumerate(dir_label_path):
  with open(path) as f:
    l = f.read().split("\n")
  labels[idx] = l

In [34]:
labels_=[]
texts_=[]
idx=0

for i,z in zip(texts, labels):
  if len(i) < len(z):
    texts_.append(texts[idx])
    labels_.append(labels[idx][0:len(i)])
  elif len(i) > len(z):
    labels_.append(labels[idx])
    texts_.append(texts[idx][0:len(z)])
  idx+=1

In [30]:
len(texts[0])

648

In [35]:
to_vec_labels=[]
for label in labels_:
  l=[0]*len(label)
  for i, v in enumerate(label):
    if l=='positive':
      l[i] = 1
  to_vec_labels.append(l)

In [36]:
for i,v in enumerate(texts_):
  if len(to_vec_labels[i]) != len(v):
    print(len(to_vec_labels[i]), len(v))
    print("miss")

In [38]:
data_texts=[]
data_labels=[]

for i,v in enumerate(texts_):
  for j,r in enumerate(v):
    data_texts.append(r)
    data_labels.append(to_vec_labels[i][j])

In [39]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data_texts, data_labels, test_size=0.2, random_state=100)

## モデル構築

In [40]:
model_name = "cl-tohoku/bert-base-japanese"
tokenizer = transformers.BertTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/258k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/110 [00:00<?, ?B/s]

### テキストをBERTに適応する変換

In [41]:
def to_features(texts, max_length):
    shape = (len(texts), max_length)
    # input_idsやattention_mask, token_type_idsの説明はglossaryに記載(cf. https://huggingface.co/transformers/glossary.html)
    input_ids = np.zeros(shape, dtype="int32")
    attention_mask = np.zeros(shape, dtype="int32")
    token_type_ids = np.zeros(shape, dtype="int32")
    for i, text in enumerate(texts):
        encoded_dict = tokenizer.encode_plus(text, max_length=max_length, pad_to_max_length=True)
        input_ids[i] = encoded_dict["input_ids"]
        attention_mask[i] = encoded_dict["attention_mask"]
        token_type_ids[i] = encoded_dict["token_type_ids"]
    return [input_ids, attention_mask, token_type_ids]

### Bertの単一モデルの構築

In [42]:
def build_model(model_name, num_classes, max_length):
    input_shape = (max_length, )
    input_ids = tf.keras.layers.Input(input_shape, dtype=tf.int32)
    attention_mask = tf.keras.layers.Input(input_shape, dtype=tf.int32)
    token_type_ids = tf.keras.layers.Input(input_shape, dtype=tf.int32)
    bert_model = transformers.TFBertModel.from_pretrained(model_name)
    last_hidden_state, pooled_output = bert_model([input_ids,
                                                  attention_mask,
                                                  token_type_ids])
    
    flatten_output = tf.keras.layers.Flatten()(pooled_output)
    drop_output = tf.keras.layers.Dropout(0.1)(drop_output)
    output = tf.keras.layers.Dense(64, activation='relu')(drop_output)
    drop_output = tf.keras.layers.Dropout(0.1)(output)
    output = tf.keras.layers.Dense(num_classes, activation="softmax")(drop_output)
    model = tf.keras.Model(inputs=[input_ids, attention_mask, token_type_ids], outputs=[output])
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
    model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["acc"])
    return model

### データセット/モデル構築

In [43]:
num_classes = 2
max_length = 128
batch_size = 16
epochs = 30

x_train = to_features(X_train, max_length)
y_train = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)

In [44]:
model = build_model(model_name, num_classes=num_classes, max_length=max_length)
model.summary()

Downloading:   0%|          | 0.00/479 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/545M [00:00<?, ?B/s]

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 128, 768), ( 110617344   input_1[0][0]                    
                                                                 input_2[0][0]                

### モデルの訓練

In [46]:
# 訓練
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2
)

Epoch 1/30
 10/393 [..............................] - ETA: 5:10 - loss: 4.4703e-09 - acc: 1.0000

KeyboardInterrupt: ignored