# Bertを利用したReview分類

## TensorflowをImport

In [None]:
# Pin transformers to 2.11.0: later cells call `encode_plus(..., pad_to_max_length=True)`
# and tuple-unpack the TFBertModel outputs, which may behave differently in
# newer releases — confirm before upgrading.
!pip install transformers==2.11.0

In [None]:
import numpy as np
import transformers
from sklearn.metrics import accuracy_score
import tensorflow as tf
import re
import glob

In [None]:
# Notebook echo of the imported module object (presumably to confirm the
# installed package — TODO confirm intent).
transformers 

## データを構築

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the data paths
# used below are given relative to this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# List the review_data directory on the mounted drive (IPython `ls` shorthand).
# NOTE(review): the loading cell reads from .../data/text/ and .../data/label/,
# not from review_data — confirm this is the intended directory.
ls drive/MyDrive/研究/DialogueAct-Prediction/work/Bert-Twitter/review_data

In [None]:
import glob

# Directories on the mounted drive that hold the text files and the label files.
text_path = "drive/MyDrive/研究/DialogueAct-Prediction/work/Bert-Twitter/data/text/"
label_path = "drive/MyDrive/研究/DialogueAct-Prediction/work/Bert-Twitter/data/label/"

# Collect every .txt file and order both listings lexicographically, so the
# i-th text file is paired with the i-th label file (presumably the two
# directories use matching file names — TODO confirm).
dir_text_path = sorted(glob.glob(text_path + "*.txt"))
dir_label_path = sorted(glob.glob(label_path + "*.txt"))

In [None]:
# Read every text file into a list of sentences. Each file keeps its sentences
# in one string separated by "', '" (presumably a stringified Python list, in
# which case the first/last items still carry the surrounding "['" / "']" —
# TODO confirm).
# The encoding is passed explicitly because the corpus is Japanese and the
# platform default encoding is not guaranteed to be UTF-8 (assumes the files
# are UTF-8 — TODO confirm).
texts = []
for path in dir_text_path:
    with open(path, encoding="utf-8") as f:
        texts.append(f.read().split("', '"))

# Read every label file as one label per line. splitlines() discards only the
# empty string produced by a trailing newline, so the last label is kept even
# when a file does not end with a newline (split("\n")[:-1] would drop it).
labels = []
for path in dir_label_path:
    with open(path, encoding="utf-8") as f:
        labels.append(f.read().splitlines())

In [None]:
# Sanity check: print the (label count, text count) pair of every file where
# the two counts disagree.
for file_labels, file_texts in zip(labels, texts):
    n_labels, n_texts = len(file_labels), len(file_texts)
    if n_labels != n_texts:
        print(n_labels, n_texts)

In [None]:
# Align each file's texts and labels by trimming the longer of the two to the
# length of the shorter one, so both hold one entry per sentence.
labels_ = []
texts_ = []

for file_texts, file_labels in zip(texts, labels):
    shared_len = min(len(file_texts), len(file_labels))
    texts_.append(file_texts[:shared_len])
    labels_.append(file_labels[:shared_len])

In [None]:
# Print the (label count, text count) pair of every aligned file for inspection.
for aligned_labels, aligned_texts in zip(labels_, texts_):
    n_labels = len(aligned_labels)
    n_texts = len(aligned_texts)
    print(n_labels, n_texts)

In [None]:
# Encode each file's string labels as integers: 1 for 'positive', 0 for
# anything else.
# The comparison must be made on the individual label string. The previous
# code compared the output list itself to 'positive', which is never true,
# so every label silently stayed 0.
to_vec_labels = [
    [1 if label == 'positive' else 0 for label in file_labels]
    for file_labels in labels_
]

In [None]:
# Sanity check: each encoded label list must be as long as its text list;
# report any file where that does not hold.
for file_idx, file_texts in enumerate(texts_):
    n_labels = len(to_vec_labels[file_idx])
    n_texts = len(file_texts)
    if n_labels != n_texts:
        print(n_labels, n_texts)
        print("miss")

In [None]:
# Flatten the per-file lists into one flat list of sentences and a parallel
# flat list of integer labels (same file order, same sentence order).
data_texts = [sentence for file_texts in texts_ for sentence in file_texts]
data_labels = [
    to_vec_labels[file_idx][pos]
    for file_idx, file_texts in enumerate(texts_)
    for pos, _ in enumerate(file_texts)
]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the flattened samples as a test set; the fixed random_state
# makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(data_texts, data_labels, test_size=0.2, random_state=100)

## モデル構築

In [None]:
# Name of the pretrained Japanese BERT checkpoint; also passed to build_model below.
model_name = "cl-tohoku/bert-base-japanese"
# NOTE(review): the cl-tohoku checkpoints are documented for use with
# BertJapaneseTokenizer (MeCab word segmentation before WordPiece). Plain
# BertTokenizer may split Japanese text differently from how the model was
# pretrained — confirm, and switch if so (needs a MeCab binding installed).
tokenizer = transformers.BertTokenizer.from_pretrained(model_name)

### テキストをBERTの入力形式に変換

In [None]:
def to_features(texts, max_length):
    """Encode texts into the three fixed-length integer arrays fed to BERT.

    Args:
        texts: Sequence of strings, encoded one by one with the module-level
            ``tokenizer``.
        max_length: Value passed to the tokenizer as ``max_length``; it is also
            the number of columns of every returned array.

    Returns:
        ``[input_ids, attention_mask, token_type_ids]``, each an int32 array of
        shape ``(len(texts), max_length)``.
    """
    # The meaning of input_ids, attention_mask and token_type_ids is described
    # in the Hugging Face glossary (cf. https://huggingface.co/transformers/glossary.html).
    feature_keys = ("input_ids", "attention_mask", "token_type_ids")
    features = {
        key: np.zeros((len(texts), max_length), dtype="int32")
        for key in feature_keys
    }
    for row, text in enumerate(texts):
        encoded = tokenizer.encode_plus(text, max_length=max_length, pad_to_max_length=True)
        # Copy each encoded sequence into the matching row of its array.
        for key in feature_keys:
            features[key][row] = encoded[key]
    return [features[key] for key in feature_keys]

### Bertの単一モデルの構築

In [None]:
def build_model(model_name, num_classes, max_length):
    """Build and compile a Keras classifier on top of a pretrained BERT encoder.

    Args:
        model_name: Name or path handed to ``TFBertModel.from_pretrained``.
        num_classes: Number of units in the final softmax layer.
        max_length: Token length of each of the three input tensors.

    Returns:
        A compiled ``tf.keras.Model`` that takes ``[input_ids, attention_mask,
        token_type_ids]`` and outputs a softmax over ``num_classes``.
    """
    layers = tf.keras.layers

    def make_input():
        # Every BERT input is an int32 tensor of shape (max_length,).
        return layers.Input((max_length,), dtype=tf.int32)

    ids_in = make_input()
    mask_in = make_input()
    type_in = make_input()
    bert_inputs = [ids_in, mask_in, type_in]

    encoder = transformers.TFBertModel.from_pretrained(model_name)
    # Only the pooled output feeds the classifier; the per-token hidden states
    # are not used.
    _token_states, pooled = encoder(bert_inputs)

    # Classification head: flatten -> dropout -> dense(64, relu) -> dropout -> softmax.
    head = layers.Flatten()(pooled)
    head = layers.Dropout(0.1)(head)
    head = layers.Dense(64, activation='relu')(head)
    head = layers.Dropout(0.1)(head)
    probabilities = layers.Dense(num_classes, activation="softmax")(head)

    model = tf.keras.Model(inputs=bert_inputs, outputs=[probabilities])
    adam = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
    model.compile(
        optimizer=adam,
        loss="categorical_crossentropy",
        metrics=["acc"],
    )
    return model

### データセット/モデル構築

In [None]:
# Hyperparameters shared by feature extraction, model construction and training.
num_classes = 2
max_length = 128
batch_size = 16
epochs = 30

# Tokenize the training texts into the three BERT input arrays.
x_train = to_features(X_train, max_length)
# One-hot encode the integer labels to match the categorical_crossentropy loss
# used in build_model.
# NOTE(review): y_train is rebound in place, so re-running this cell would pass
# already one-hot labels back into to_categorical — re-run the split cell first.
y_train = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)

In [None]:
# Build the compiled BERT classifier and print its layer summary.
model = build_model(model_name, num_classes=num_classes, max_length=max_length)
model.summary()

### モデルの訓練

In [None]:
# Train the model; validation_split=0.2 sets aside a fraction of the training
# arrays for validation during fitting.
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2
)