# Finetuning of the pretrained Japanese BERT model

Finetune the pretrained model to solve multi-class classification problems.  
This notebook requires the following objects:
- trained sentencepiece model (model and vocab files)
- pretrained Japanese BERT model

In [1]:
import configparser
import glob
import os
import pandas as pd
import subprocess
import sys
import tarfile 
from urllib.request import urlretrieve

# config.ini is expected one directory above the notebook's working directory.
CURDIR = os.getcwd()
CONFIGPATH = os.path.join(CURDIR, os.pardir, 'config.ini')

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns only the
# names of the files it actually parsed.  Check that result so a missing
# config.ini fails here with a clear message instead of surfacing later as a
# KeyError on the first section lookup.
loaded_config_files = config.read(CONFIGPATH)
if not loaded_config_files:
    raise FileNotFoundError('config file not found: {}'.format(CONFIGPATH))

# Last expression of the cell: show which file(s) were loaded.
loaded_config_files

['/work/notebook/../config.ini']

## Data preparing

You only need to execute the following cells once.

In [2]:
# All corpus-related settings live in the [FINETUNING-DATA] section.
finetuning_data_config = config['FINETUNING-DATA']

# URL the corpus archive is downloaded from.
FILEURL = finetuning_data_config['FILEURL']
# Local path the downloaded archive is saved to.
FILEPATH = finetuning_data_config['FILEPATH']
# Directory the archive is extracted into (config key: TEXTDIR).
EXTRACTDIR = finetuning_data_config['TEXTDIR']

Download and unzip data.  
The dataset is the livedoor ニュースコーパス (livedoor news corpus), available at https://www.rondhuit.com/download.html.

In [2]:
# Download the corpus archive to FILEPATH.
urlretrieve(FILEURL, FILEPATH)

# The archive is a gzip-compressed tarball.
mode = "r:gz"
# Use a context manager so the archive is closed even when extraction fails
# part-way; a plain open()/close() pair leaks the handle on error.
# NOTE(review): extractall() trusts the member paths stored in the archive, so
# only run this on an archive from a source you trust.
with tarfile.open(FILEPATH, mode) as tar:
    tar.extractall(EXTRACTDIR)

Data preprocessing.

In [3]:
def extract_txt(filename):
    """Return the text of one corpus file as a single string.

    The first two lines of the file (index 0: URL, index 1: timestamp) are
    skipped.  Every remaining line is stripped of surrounding whitespace,
    empty lines are dropped and the rest is concatenated without a separator.

    Args:
        filename: Path of the text file to read.

    Returns:
        The concatenated text, or an empty string when nothing but blank
        lines follows the two header lines.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which is not UTF-8 everywhere (e.g. Windows or a C locale).
    # Assumes the corpus files are UTF-8 encoded -- TODO confirm.
    with open(filename, encoding='utf-8') as text_file:
        stripped_lines = (line.strip() for line in text_file.readlines()[2:])
        return ''.join(line for line in stripped_lines if line)

In [4]:
# Every sub-directory of <EXTRACTDIR>/text is one category; plain files in
# that directory are ignored.  The names are kept in sorted order.
text_root = os.path.join(EXTRACTDIR, "text")
categories = sorted(
    entry
    for entry in os.listdir(text_root)
    if os.path.isdir(os.path.join(text_root, entry))
)

In [5]:
# Display the sorted category names collected above.
categories

['dokujo-tsushin',
 'it-life-hack',
 'kaden-channel',
 'livedoor-homme',
 'movie-enter',
 'peachy',
 'smax',
 'sports-watch',
 'topic-news']

In [6]:
# Translation table applied to every article body:
#   tab -> full-width space, newline and carriage return -> removed.
# Presumably this keeps each article on one line and in one column of the
# tab-separated files written later.
# Three-argument form of str.maketrans: characters of the first string map to
# the characters of the second, and every character of the third is deleted.
table = str.maketrans('\t', '　', '\n\r')

In [8]:
# Parallel lists: all_text[i] is the cleaned article body, all_label[i] is the
# category it was read from.
all_text, all_label = [], []

for category in categories:
    # Only files named "<category>*.txt" inside the category directory are read.
    pattern = os.path.join(EXTRACTDIR, "text", category, "{}*.txt".format(category))
    # Sorted so the order of the articles does not depend on the file system.
    for path in sorted(glob.glob(pattern)):
        all_text.append(extract_txt(path).translate(table))
        all_label.append(category)

In [9]:
# One row per article: the cleaned body and its category label.
df = pd.DataFrame(dict(text=all_text, label=all_label))

In [19]:
# df.head()

In [11]:
# Shuffle all rows; the fixed random_state makes the order reproducible.
shuffled = df.sample(frac=1, random_state=23)
# Discard the old index so rows are numbered 0..n-1 in their new order.
df = shuffled.reset_index(drop=True)

In [20]:
# df.head()

Save data as tsv files.  
test:dev:train = 2:2:6.

In [13]:
# Split boundaries: the first fifth of the rows is the test set, the second
# fifth is the dev set and the remaining rows are the training set.
n_rows = len(df)
test_end = n_rows // 5
dev_end = n_rows * 2 // 5

splits = [
    ("test.tsv", df[:test_end]),
    ("dev.tsv", df[test_end:dev_end]),
    ("train.tsv", df[dev_end:]),
]
# Each split is written as a tab-separated file (without the index column)
# into EXTRACTDIR.
for tsv_name, subset in splits:
    subset.to_csv(os.path.join(EXTRACTDIR, tsv_name), sep='\t', index=False)

## Finetune pre-trained model

In [4]:
# Checkpoint prefix of the pre-trained model; passed as --init_checkpoint to
# run_classifier.py in the next cell.
PRETRAINED_MODEL_PATH = '../model/model.ckpt-270000'
# Directory passed as --output_dir to run_classifier.py in the next cell.
FINETUNE_OUTPUT_DIR = '../model/livedoor_output'

In [5]:
# Run run_classifier.py as a shell command with training and evaluation
# enabled for the "livedoor" task.  {PRETRAINED_MODEL_PATH} and
# {FINETUNE_OUTPUT_DIR} are substituted by IPython with the values of the
# Python variables of the same name.
!python3 ../src/run_classifier.py \
  --task_name=livedoor \
  --do_train=true \
  --do_eval=true \
  --data_dir=../data/livedoor \
  --model_file=../model/wiki-ja.model \
  --vocab_file=../model/wiki-ja.vocab \
  --init_checkpoint={PRETRAINED_MODEL_PATH} \
  --max_seq_length=128 \
  --train_batch_size=32 \
  --learning_rate=2e-5 \
  --num_train_epochs=3.0 \
  --output_dir={FINETUNE_OUTPUT_DIR}

## Predict using the finetuned model

Let's predict test data using the finetuned model.  

In [2]:
import sys
sys.path.append("../src")

import tokenization_sentencepiece as tokenization
from run_classifier import LivedoorProcessor
from run_classifier import model_fn_builder
from run_classifier import file_based_input_fn_builder
from run_classifier import file_based_convert_examples_to_features

In [3]:
sys.path.append("../bert")

import modeling
import optimization
import tensorflow as tf

In [4]:
import configparser
import json
import glob
import os
import pandas as pd
import tempfile

def str_to_value(input_str):
    """Convert a config string to the most specific of str, int or float.

    Args:
        input_str: Raw string value, e.g. as read from a config file.

    Returns:
        - ``input_str`` unchanged when it is purely alphabetic,
        - an ``int`` when it parses as an integer (including signed values
          such as ``"-1"``),
        - a ``float`` when it parses as a floating point number,
        - ``input_str`` unchanged when it is none of the above (for example
          ``"gelu_new"`` or an empty string) instead of raising ValueError.
    """
    # Checked first so that words float() would accept, such as "nan" or
    # "inf", stay strings.
    if input_str.isalpha():
        return input_str
    # Try the narrowest numeric type first.  int() also accepts signed
    # values, which str.isdigit() rejects.
    for cast in (int, float):
        try:
            return cast(input_str)
        except ValueError:
            continue
    # Neither alphabetic nor numeric: keep the raw string.
    return input_str

# Convert the [BERT-CONFIG] section of config.ini into a dict of typed values.
bert_config_dict = {key: str_to_value(value) for key, value in config['BERT-CONFIG'].items()}

# BertConfig is built with from_json_file(), so the dict goes through a
# temporary JSON file.  The with-block closes the file once the config has
# been loaded instead of leaving the handle open for the rest of the session.
with tempfile.NamedTemporaryFile(mode='w+t', encoding='utf-8', suffix='.json') as bert_config_file:
    json.dump(bert_config_dict, bert_config_file)
    # Flush so the content is on disk before the file is read back by name.
    bert_config_file.flush()
    bert_config = modeling.BertConfig.from_json_file(bert_config_file.name)

In [5]:
import re

# Directory the fine-tuning run writes its checkpoints to.  This must be the
# path passed as --output_dir to run_classifier.py in the fine-tuning section
# (the previously hard-coded '/tmp/livedoor_output' did not match it).
FINETUNE_OUTPUT_DIR = '../model/livedoor_output'


def _checkpoint_step(data_file):
    """Return the step number in a 'model.ckpt-<step>...' file name, or -1 if absent."""
    match = re.search(r'model\.ckpt-(\d+)', os.path.basename(data_file))
    return int(match.group(1)) if match else -1


output_ckpts = glob.glob(os.path.join(FINETUNE_OUTPUT_DIR, "model.ckpt*data*"))
if not output_ckpts:
    # Fail with a clear message instead of an IndexError on an empty list.
    raise FileNotFoundError(
        'no checkpoint (model.ckpt*data*) found in {}'.format(FINETUNE_OUTPUT_DIR))

# Pick the checkpoint with the highest step.  Steps are compared as integers:
# plain string sorting would rank "model.ckpt-999" after "model.ckpt-1000".
latest_ckpt = max(output_ckpts, key=_checkpoint_step)
# Keep only the part before the ".data-XXXXX-of-XXXXX" shard suffix; that
# prefix is what is later used as init_checkpoint.
FINETUNED_MODEL_PATH = re.split(r'\.data-\d+-of-\d+', latest_ckpt)[0]

In [6]:
class FLAGS(object):
    '''Settings read by the prediction cells of this notebook.'''
    def __init__(self):
        # SentencePiece model / vocab files used to build the tokenizer.
        self.model_file = "../model/wiki-ja.model"
        self.vocab_file = "../model/wiki-ja.vocab"
        self.do_lower_case = True

        # Input data and how it is fed to the model.
        # NOTE(review): the fine-tuning command in this notebook uses
        # --max_seq_length=128; confirm that 512 is intended here.
        self.data_dir = "../data/livedoor"
        self.max_seq_length = 512
        self.predict_batch_size = 16

        # Checkpoint prefix found in the previous cell.
        self.init_checkpoint = FINETUNED_MODEL_PATH

        self.use_tpu = False
        self.output_dir = "/dummy"

        # Everything below is not used for prediction; the values only exist
        # so that the RunConfig can be created.
        self.master = None
        self.save_checkpoints_steps = self.iterations_per_loop = self.num_tpu_cores = 1
        (self.learning_rate,
         self.num_warmup_steps,
         self.num_train_steps,
         self.train_batch_size,
         self.eval_batch_size) = (0,) * 5

In [7]:
# Rebind the name FLAGS from the class to an instance of it.
# Guarded so that re-running this cell is harmless: once FLAGS is an instance,
# calling FLAGS() again would raise "TypeError: 'FLAGS' object is not callable".
if isinstance(FLAGS, type):
    FLAGS = FLAGS()

In [8]:
# Processor for the livedoor task, imported from run_classifier.py.
processor = LivedoorProcessor()
# Label names.  Their position in this list is used further below to map the
# argmax of the predicted probabilities back to a label name.
label_list = processor.get_labels()

In [9]:
# Tokenizer built from the model / vocab files configured in FLAGS.
tokenizer = tokenization.FullTokenizer(
    model_file=FLAGS.model_file,
    vocab_file=FLAGS.vocab_file,
    do_lower_case=FLAGS.do_lower_case,
)

# No TPU cluster resolver is supplied.
tpu_cluster_resolver = None
is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

# TPU-specific part of the run configuration, built separately for readability.
tpu_config = tf.contrib.tpu.TPUConfig(
    iterations_per_loop=FLAGS.iterations_per_loop,
    num_shards=FLAGS.num_tpu_cores,
    per_host_input_for_training=is_per_host,
)

run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    master=FLAGS.master,
    model_dir=FLAGS.output_dir,
    save_checkpoints_steps=FLAGS.save_checkpoints_steps,
    tpu_config=tpu_config,
)

Loaded a trained SentencePiece model.


In [10]:
# Build the model function.  The number of output classes follows the label
# list, and the weights come from FLAGS.init_checkpoint.  The learning-rate
# and step arguments are the placeholder values from FLAGS, which marks them
# as unused for prediction.
model_fn = model_fn_builder(
    bert_config=bert_config,
    num_labels=len(label_list),
    init_checkpoint=FLAGS.init_checkpoint,
    learning_rate=FLAGS.learning_rate,
    num_train_steps=FLAGS.num_train_steps,
    num_warmup_steps=FLAGS.num_warmup_steps,
    use_tpu=FLAGS.use_tpu,
    use_one_hot_embeddings=FLAGS.use_tpu)


# Wrap the model function in a TPUEstimator; use_tpu is False in FLAGS.
# Only predict_batch_size carries a real value: the train / eval batch sizes
# are the zero placeholders from FLAGS.
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=FLAGS.use_tpu,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=FLAGS.train_batch_size,
    eval_batch_size=FLAGS.eval_batch_size,
    predict_batch_size=FLAGS.predict_batch_size)

INFO:tensorflow:Using config: {'_model_dir': '/dummy', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f3ddd2f0e10>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_di

In [11]:
# Examples to predict, as returned by the processor for FLAGS.data_dir.
predict_examples = processor.get_test_examples(FLAGS.data_dir)

# Temporary file that receives the converted features.  It is deliberately
# left open: the input function below refers to it by name.
predict_file = tempfile.NamedTemporaryFile(mode='w+t', encoding='utf-8', suffix='.tf_record')
file_based_convert_examples_to_features(
    predict_examples,
    label_list,
    FLAGS.max_seq_length,
    tokenizer,
    predict_file.name,
)

# The remainder is dropped only when running on TPU.
predict_drop_remainder = bool(FLAGS.use_tpu)

predict_input_fn = file_based_input_fn_builder(
    input_file=predict_file.name,
    seq_length=FLAGS.max_seq_length,
    is_training=False,
    drop_remainder=predict_drop_remainder,
)

In [13]:
# NOTE(review): predict() appears to return a lazy iterable here -- the
# inference itself seems to run when `result` is consumed in the next cell;
# confirm against the Estimator documentation.
result = estimator.predict(input_fn=predict_input_fn)

In [14]:
# Consume all predictions into a list so they can be indexed and reused.
# It will take a few hours on CPU.

result = list(result)

In [40]:
# Peek at the first two predictions: each one is a dict holding a
# 'probabilities' array.
result[:2]

[{'probabilities': array([6.3093408e-04, 3.1677485e-04, 2.3901617e-04, 1.1296889e-03,
         7.7284197e-04, 8.3176029e-04, 7.1590225e-04, 9.9412304e-01,
         1.2399884e-03], dtype=float32)},
 {'probabilities': array([2.3374568e-04, 7.7296095e-04, 9.9691844e-01, 3.6862720e-04,
         2.6846604e-04, 2.7929764e-04, 5.5196742e-04, 2.6788114e-04,
         3.3855080e-04], dtype=float32)}]

Read test data set and add prediction results.

In [15]:
import pandas as pd

In [16]:
# Read the tab-separated test split from disk.
test_tsv_path = "../data/livedoor/test.tsv"
test_df = pd.read_csv(test_tsv_path, sep='\t')

In [17]:
# Index of the most probable class for every prediction ...
predicted_indices = [prediction['probabilities'].argmax() for prediction in result]
# ... mapped back to the label name at that position of label_list.
test_df['predict'] = [label_list[index] for index in predicted_indices]

In [21]:
# test_df.head()

In [42]:
# Accuracy: fraction of rows whose predicted label equals the true label.
# The mean of the boolean comparison is computed by pandas in one vectorized
# step instead of iterating over the Series with the built-in sum().
(test_df['label'] == test_df['predict']).mean()

0.8472505091649695

A little more detailed check using `sklearn.metrics`.

In [18]:
# Install with the interpreter of the running kernel: a bare `pip` found on
# PATH may belong to a different Python environment than the one this
# notebook imports from.
!{sys.executable} -m pip install scikit-learn

In [37]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [38]:
# Per-label precision / recall / F1.  print() is used because the report is a
# pre-formatted multi-line string.
print(classification_report(test_df['label'], test_df['predict']))

                precision    recall  f1-score   support

dokujo-tsushin       0.96      0.83      0.89       178
  it-life-hack       0.86      0.93      0.90       172
 kaden-channel       0.74      0.93      0.83       176
livedoor-homme       0.83      0.83      0.83        95
   movie-enter       1.00      0.65      0.78       158
        peachy       0.63      0.91      0.75       174
          smax       0.92      0.97      0.94       167
  sports-watch       0.94      0.99      0.96       190
    topic-news       0.98      0.54      0.70       163

     micro avg       0.85      0.85      0.85      1473
     macro avg       0.87      0.84      0.84      1473
  weighted avg       0.87      0.85      0.84      1473



In [39]:
# Confusion matrix of true labels (first argument) against predicted labels
# (second argument).
# NOTE(review): rows appear to be the true labels and columns the predicted
# ones, in the same label order as the report above -- the row sums of the
# recorded output match the per-label support; confirm against the
# scikit-learn documentation.
print(confusion_matrix(test_df['label'], test_df['predict']))

[[147   1   2   2   0  25   1   0   0]
 [  0 160   3   4   0   3   2   0   0]
 [  0   9 164   0   0   2   1   0   0]
 [  1   4   5  79   0   6   0   0   0]
 [  0   1   1   2 102  51   0   1   0]
 [  1   0   4   3   0 158   8   0   0]
 [  0   3   1   0   0   1 162   0   0]
 [  0   0   0   0   0   0   0 188   2]
 [  4   7  41   5   0   4   2  12  88]]
