# BERT でアマゾンレビューを分類してみる

In [1]:
import pandas as pd
import os, math
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from datasets import load_dataset

from transformers import (
    AutoConfig,
    AutoTokenizer,
    PretrainedConfig,
    TFAutoModelForSequenceClassification,
    TFTrainingArguments,
    set_seed,
)

# GPU 設定
physical_devices = tf.config.list_physical_devices('GPU')
if len(physical_devices) > 0:
    for device in physical_devices:
        tf.config.experimental.set_memory_growth(device, True)
        print('{} memory growth: {}'.format(device, tf.config.experimental.get_memory_growth(device)))
else:
    print("Not enough GPU hardware devices available")

PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU') memory growth: True


In [2]:
# load dataset
data_files = {"train": "amazon_reviews_train.csv", "validation": "amazon_reviews_test.csv"}
datasets = load_dataset("csv", data_files=data_files, cache_dir="tf_cache")

Using custom data configuration default-3dccccb3fde64d7f
Reusing dataset csv (tf_cache\csv\default-3dccccb3fde64d7f\0.0.0\2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0)


In [3]:
# cast label from int to float
from datasets import ClassLabel, Value
new_features = datasets["train"].features.copy()
new_features["label"] = Value('float64')
datasets["train"] = datasets["train"].cast(new_features)

new_features = datasets["validation"].features.copy()
new_features["label"] = Value('float64')
datasets["validation"] = datasets["validation"].cast(new_features)

# remove unnecessary column
datasets = datasets.map(lambda example: {'sentence1': example['sentence1']}, remove_columns=['Unnamed: 0'])

datasets["train"].features, datasets["validation"].features

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17647.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1961.0), HTML(value='')))




({'label': Value(dtype='float64', id=None),
  'sentence1': Value(dtype='string', id=None)},
 {'label': Value(dtype='float64', id=None),
  'sentence1': Value(dtype='string', id=None)})

In [4]:
# define tokenizer

bert_folder = "cl-tohoku/bert-base-japanese-v2"
# bert_folder = "cl-tohoku/bert-base-japanese-char"
# tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    bert_folder,
    cache_dir=None,
    revision="main",
    use_auth_token=None,
)

# ラベル変換
def build_label_convertor(labels:list):
    labels = np.unique(labels)
    labels = sorted(labels)
    label2id = {v:k for k, v in enumerate(labels)}
    return label2id

label2id = build_label_convertor(datasets["train"]["label"])
num_labels = len(label2id)


# config
config = AutoConfig.from_pretrained(
    bert_folder,
    num_labels=num_labels,
    cache_dir=None,
    revision="main",
    use_auth_token=None,
)
config.label2id = label2id

In [5]:
# tokenizeする
def preprocess_function(examples):
    # Tokenize the texts
    args = ((examples["sentence1"],))
    result = tokenizer(*args, max_length=max_seq_length, truncation=True)
    
    # Map labels to IDs
    if config.label2id is not None and "label" in examples:
        result["label"] = [(config.label2id[l] if l != -1 else -1) for l in examples["label"]]
    return result

max_seq_length = 128
batch_size = 10
processed_datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=False)
processed_datasets

HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'sentence1', 'token_type_ids'],
        num_rows: 17647
    })
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'sentence1', 'token_type_ids'],
        num_rows: 1961
    })
})

In [6]:
# データをtensorflowの形式に変換する
def convert_dataset_for_tensorflow(
    dataset, batch_size, dataset_mode="variable_batch", drop_remainder=True
):
    column_names = {col for col in dataset.column_names}
    non_label_column_names = [name for name in column_names if name not in ["label", 'token_type_ids', 'input_ids', 'attention_mask']]

    def densify_ragged_batch(features, label=None):
        features = {
            feature: ragged_tensor.to_tensor(shape=batch_shape[feature]) for feature, ragged_tensor in features.items()
        }
        if label is None:
            return features
        else:
            return features, label
    
    feature_keys = list(set(dataset.features.keys()) - set(non_label_column_names + ["label"]))
    if dataset_mode == "variable_batch":
        batch_shape = {key: None for key in feature_keys}
        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
    elif dataset_mode == "constant_batch":
        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
        batch_shape = {
            key: tf.concat(([batch_size], ragged_tensor.bounding_shape()[1:]), axis=0)
            for key, ragged_tensor in data.items()
        }
    else:
        raise ValueError("Unknown dataset mode!")
    
    if "label" in dataset.features:
        labels = tf.convert_to_tensor(np.array(dataset["label"]))
        tf_dataset = tf.data.Dataset.from_tensor_slices((data, labels))
    else:
        tf_dataset = tf.data.Dataset.from_tensor_slices(data)
        
    tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset))
    tf_dataset = tf_dataset.batch(batch_size=batch_size, drop_remainder=drop_remainder).map(densify_ragged_batch)
    return tf_dataset


tf_data = dict()
for key in processed_datasets:
    tf_data[key] = convert_dataset_for_tensorflow(processed_datasets[key], batch_size)
tf_data


{'train': <MapDataset shapes: ({token_type_ids: (10, None), attention_mask: (10, None), input_ids: (10, None)}, (10,)), types: ({token_type_ids: tf.int32, attention_mask: tf.int32, input_ids: tf.int32}, tf.float64)>,
 'validation': <MapDataset shapes: ({token_type_ids: (10, None), attention_mask: (10, None), input_ids: (10, None)}, (10,)), types: ({token_type_ids: tf.int32, attention_mask: tf.int32, input_ids: tf.int32}, tf.float64)>}

In [7]:
# モデル構築
model = TFAutoModelForSequenceClassification.from_pretrained(
    bert_folder,
    config=config,
    cache_dir=None,
    revision="main",
    use_auth_token=False,
)

import math
optimizer = tf.keras.optimizers.Adam(
    learning_rate=3e-5
)

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ["accuracy"]
model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-v2 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# train
from tensorflow.keras.callbacks import EarlyStopping
history = model.fit(
    tf_data["train"],
    validation_data=tf_data["validation"],
    epochs=5,
)
min_index = np.argmin(history.history["val_loss"])
print("val_loss:", history.history["val_loss"][min_index], "val_acc:", history.history["val_accuracy"][min_index])

Epoch 1/5
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
val_loss: 0.618066132068634 val_acc: 0.8183673620223999
