# BERT: model fine-tuning

以 IMDb 資料集作為範例，呈現如何以外部資料集對 BERT model 進行 fine-tune。

此部分只做了training與testing，未使用到run_classifier.py內do_eval的功能

### Prerequisites

- 需先下載好 pretrain model checkpoint（bert_model.ckpt）、vocab.txt、bert_config.json 檔案並放置在 'bert_model' 資料夾內

## Import packages

In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.utils import shuffle

import tokenization
from run_classifier import *

  from ._conv import register_converters as _register_converters


## IMDB sentiment dataset

In [2]:
# Download the IMDB sentiment data from the Stanford AI Lab.
# source : http://ai.stanford.edu/~amaas/data/sentiment/

# Skip the download when an 'aclImdb' path already exists in the working directory.
if not os.path.exists('aclImdb'):
    # The `!` lines are IPython/Jupyter shell escapes (not plain Python):
    # fetch the archive, extract it, then delete the archive.
    !wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
    !tar zxf aclImdb_v1.tar.gz
    !rm aclImdb_v1.tar.gz

In [3]:
def load_file(dir_path):
    """Return the first line of every file found in ``dir_path``.

    Args:
        dir_path: Directory whose entries are each opened as a UTF-8 text file.

    Returns:
        A list with one string per file, ordered by file name. Each string is
        the file's first line (including its trailing newline, if any); an
        empty file contributes an empty string so the result always has one
        entry per file.
    """
    article_list = []
    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    for file_name in sorted(os.listdir(dir_path)):
        file_path = os.path.join(dir_path, file_name)
        # Explicit encoding so the result does not depend on the platform default.
        with open(file_path, 'r', encoding='utf-8') as f:
            # readline() yields '' for an empty file instead of raising, and
            # avoids loading the whole file just to keep its first line.
            article_list.append(f.readline())
    return article_list

def write_tf_record(output_file_name, label_list, vocab_file, x, y=None,
                    do_lower_case=True, max_seq_length=128):
    """Convert raw texts into BERT input features and write them to a TFRecord file.

    Args:
        output_file_name: Path of the TFRecord file to write.
        label_list: All class labels; forwarded to
            ``file_based_convert_examples_to_features``.
        vocab_file: Path to the vocabulary file used to build the tokenizer.
        x: Iterable of raw text strings (single-sentence inputs, no ``text_b``).
        y: Optional iterable of labels aligned with ``x``. When ``None``
            (unlabelled data), every example gets ``label_list[0]`` as a
            placeholder label.
        do_lower_case: Forwarded to ``tokenization.FullTokenizer``.
        max_seq_length: Forwarded to ``file_based_convert_examples_to_features``.
    """
    # `is None` rather than `!= None`: identity is the correct test and it does
    # not trigger element-wise comparison when y is an array-like.
    if y is None:
        # Each example still needs a label attribute; use a placeholder.
        # NOTE(review): assumes the converter requires a label from label_list
        # -- confirm against run_classifier.py.
        input_data = ((text, label_list[0]) for text in x)
    else:
        input_data = zip(x, y)

    examples = [
        InputExample(
            guid=i,
            text_a=tokenization.convert_to_unicode(text),
            text_b=None,
            label=label)
        for i, (text, label) in enumerate(input_data)
    ]

    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)

    file_based_convert_examples_to_features(
        examples, label_list, max_seq_length, tokenizer, output_file_name)
    

In [4]:
# Read the raw reviews: one string per review file.
pos_train_data = load_file('./aclImdb/train/pos')
neg_train_data = load_file('./aclImdb/train/neg')

pos_test_data = load_file('./aclImdb/test/pos')
neg_test_data = load_file('./aclImdb/test/neg')


# ---- settings for converting the data to tf_record files ----
do_lower_case = True
max_seq_length = 128
vocab_file = 'bert_model/vocab.txt'
train_file = 'tmp/train.tf_record'
test_file = 'tmp/test.tf_record'
label_list = ['pos', 'neg']
# --------------------------------------------------------------

# Build and shuffle the training set (texts and labels are shuffled together).
train_x = pos_train_data + neg_train_data
train_y = ['pos'] * len(pos_train_data) + ['neg'] * len(neg_train_data)
train_x, train_y = shuffle(train_x, train_y)

# The test set keeps its order: all positive reviews, then all negative ones.
test_x = pos_test_data + neg_test_data
test_y = ['pos'] * len(pos_test_data) + ['neg'] * len(neg_test_data)

# Make sure the output directories exist; exist_ok avoids the
# check-then-create race of `if not exists: mkdir`.
for record_file in (train_file, test_file):
    os.makedirs(os.path.dirname(record_file) or '.', exist_ok=True)

# Write train.tf_record. The tokenizer settings above are passed explicitly so
# that editing them actually takes effect.
write_tf_record(
    output_file_name=train_file,
    label_list=label_list,
    vocab_file=vocab_file,
    x=train_x,
    y=train_y,
    do_lower_case=do_lower_case,
    max_seq_length=max_seq_length)

# Write test.tf_record.
write_tf_record(
    output_file_name=test_file,
    label_list=label_list,
    vocab_file=vocab_file,
    x=test_x,
    y=test_y,
    do_lower_case=do_lower_case,
    max_seq_length=max_seq_length)

INFO:tensorflow:Writing example 0 of 25000
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: 0
INFO:tensorflow:tokens: [CLS] this movie is the first of the six in ##fa ##mous gu ##inea pi ##g movies and is one of the best . at the same time , it looks real ##istic and un ##real ##istic , just knowing that the movie is fa ##ke . the story is about a woman who got captured and is torture ##d in a lot of different ways . a man in the beginning of the story receives a letter without a return address , and it includes a manga video , showing the torture . the men who capture her are testing the limits of a human before they die . some scenes are shock ##ing , such as the eye ##ball scene and others are not shock [SEP]
INFO:tensorflow:input_ids: 101 10531 18379 10124 10105 10422 10108 10105 12449 10106 13369 110106 75980 78209 24109 10240 39129 10111 10124 10464 10108 10105 12504 119 10160 10105 11561 10635 117 10271 59148 13486 29025 10111 10119 42923 29025 117 12820 104862 10189 10105 

## Model training

In [5]:
# ---- settings: paths ----
vocab_file = 'bert_model/vocab.txt'
bert_config_file = 'bert_model/bert_config.json'
init_checkpoint = 'bert_model/bert_model.ckpt'  # passed to model_fn_builder below
data_dir = 'tmp/'
train_file = 'tmp/train.tf_record'
output_dir = 'tmp/'  # used as the estimator's model_dir below

# ---- settings: task ----
label_list = ['pos', 'neg']
# or just type 25000
num_train_examples = len(pos_train_data) + len(neg_train_data)

# ---- settings: hyper-parameters ----
max_seq_length = 128
train_batch_size = 32
eval_batch_size = 8
predict_batch_size = 8
num_train_epochs = 3
learning_rate = 2e-5
warmup_proportion = 0.1
iterations_per_loop = 1000
save_checkpoints_steps = 10000

# Total number of training steps, and the share of them used for warm-up.
steps_per_epoch = num_train_examples / train_batch_size
num_train_steps = int(steps_per_epoch * num_train_epochs)
num_warmup_steps = int(num_train_steps * warmup_proportion)

# ---- settings: TPU (disabled, use_tpu is False) ----
use_tpu = False
tpu_cluster_resolver = None
master = None
num_tpu_cores = None
# ---- end of settings ----


# The code below is mostly lifted from run_classifier.py.

bert_config = modeling.BertConfig.from_json_file(bert_config_file)

model_fn = model_fn_builder(
    bert_config=bert_config,
    num_labels=len(label_list),
    init_checkpoint=init_checkpoint,
    learning_rate=learning_rate,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=use_tpu,
    use_one_hot_embeddings=use_tpu)

# Assemble the run configuration in two steps: TPU options first, then the rest.
is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
tpu_config = tf.contrib.tpu.TPUConfig(
    iterations_per_loop=iterations_per_loop,
    num_shards=num_tpu_cores,
    per_host_input_for_training=is_per_host)
run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    master=master,
    model_dir=output_dir,
    save_checkpoints_steps=save_checkpoints_steps,
    tpu_config=tpu_config)

estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=use_tpu,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=train_batch_size,
    eval_batch_size=eval_batch_size,
    predict_batch_size=predict_batch_size)

# Input pipeline over the training TFRecord file.
train_input_fn = file_based_input_fn_builder(
    input_file=train_file,
    seq_length=max_seq_length,
    is_training=True,
    drop_remainder=True)

# Kept as the cell's last expression so the notebook displays its return value.
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

INFO:tensorflow:Using config: {'_model_dir': 'tmp/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 10000, '_save_checkpoints_secs': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f0ebdb55f28>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=None, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None), '_cluster': None}
INFO:tensorflow:_TPUContext: eval_on_tpu True
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Running train on CPU
INFO:tensorflow:*** Features ***
INFO:tensorflow:  na

<tensorflow.contrib.tpu.python.tpu.tpu_estimator.TPUEstimator at 0x7f0ebdb62198>

## Model testing

In [6]:
# ---- settings ----
bert_config_file = 'bert_model/bert_config.json'
predict_file = 'tmp/test.tf_record'
output_dir = 'tmp/'

# Pick up the newest checkpoint in output_dir instead of hard-coding a step
# number: 'model.ckpt-2343' only exists for batch size 32, 3 epochs and 25000
# training examples, and breaks as soon as any of those change.
init_checkpoint = tf.train.latest_checkpoint(output_dir)
if init_checkpoint is None:
    raise FileNotFoundError(
        "No checkpoint found in %r; run the training cell first." % output_dir)

label_list = ['pos', 'neg']

# model_fn_builder and TPUEstimator are given the training-related values
# below, but this cell only calls estimator.predict().
train_batch_size = 32
learning_rate = 5e-5
iterations_per_loop = 1000
save_checkpoints_steps = None
eval_batch_size = 8
predict_batch_size = 8
max_seq_length = 128
num_train_steps = 1
num_warmup_steps = 1

# TPU is disabled (use_tpu is False).
tpu_cluster_resolver = None
master = None
num_tpu_cores = None
use_tpu = False
# ---- end of settings ----

# The code below is mostly lifted from run_classifier.py.

bert_config = modeling.BertConfig.from_json_file(bert_config_file)

model_fn = model_fn_builder(
  bert_config=bert_config,
  num_labels=len(label_list),
  init_checkpoint=init_checkpoint,
  learning_rate=learning_rate,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  use_tpu=use_tpu,
  use_one_hot_embeddings=use_tpu)


is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
  cluster=tpu_cluster_resolver,
  master=master,
  model_dir=output_dir,
  save_checkpoints_steps=save_checkpoints_steps,
  tpu_config=tf.contrib.tpu.TPUConfig(
      iterations_per_loop=iterations_per_loop,
      num_shards=num_tpu_cores,
      per_host_input_for_training=is_per_host))

estimator = tf.contrib.tpu.TPUEstimator(
  use_tpu=use_tpu,
  model_fn=model_fn,
  config=run_config,
  train_batch_size=train_batch_size,
  eval_batch_size=eval_batch_size,
  predict_batch_size=predict_batch_size)


# Keep every test example: the final partial batch must not be dropped.
predict_drop_remainder = False
predict_input_fn = file_based_input_fn_builder(
    input_file=predict_file,
    seq_length=max_seq_length,
    is_training=False,
    drop_remainder=predict_drop_remainder)

result = estimator.predict(input_fn=predict_input_fn)

# Write out the predictions: one line per example, tab-separated values.
output_predict_file = os.path.join(output_dir, "test_results.tsv")
with tf.gfile.GFile(output_predict_file, "w") as writer:
  tf.logging.info("***** Predict results *****")
  for prediction in result:
    output_line = "\t".join(
        str(class_probability) for class_probability in prediction) + "\n"
    writer.write(output_line)

INFO:tensorflow:Using config: {'_model_dir': 'tmp/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f0e48128828>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=None, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None), '_cluster': None}
INFO:tensorflow:_TPUContext: eval_on_tpu True
INFO:tensorflow:***** Predict results *****
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Running infer on CPU
INFO:tenso

## Let's see the result


In [7]:
# Load the tab-separated prediction file (no header row).

import pandas as pd

dat = pd.read_csv('tmp/test_results.tsv', sep='\t', header=None)

# Predicted class id = index of the largest value in each row.
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0;
# the .values attribute returns the same NumPy array.
prediction = np.argmax(dat.values, axis=1)

  


In [8]:
# Display the last rows of the prediction table as a quick sanity check.
dat.tail()

Unnamed: 0,0,1
24995,0.004216,0.995784
24996,0.003066,0.996934
24997,0.74622,0.25378
24998,0.041623,0.958377
24999,0.004703,0.995297


### Get answer from test.tf_record file

為了呈現tf_record檔案應該如何被讀取，這邊我們直接從test.tf_record讀取正確的label_ids(事實上也可以從資料前處理端直接將正確答案拿過來)。

In [9]:
# Read the ground-truth label ids back from test.tf_record using
# TensorFlow's queue-based TFRecord reader.
import tensorflow as tf

seq_length = 128
out_label = []  # collects one label id per record, in file order (shuffle=False)

# TFRecord file to read
filename = 'tmp/test.tf_record'
# Build the filename queue (a single pass over the file, no shuffling)
filename_queue = tf.train.string_input_producer([filename], 
                                                 shuffle=False, 
                                                 num_epochs=1)

# Feature schema used to parse each serialized example.
name_to_features = {
  "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
  "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
  "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
  "label_ids": tf.FixedLenFeature([], tf.int64),
}

# Record reader
reader = tf.TFRecordReader()
key, serialized_example = reader.read(filename_queue)

# Parse one serialized example into a dict of tensors
data_features = tf.parse_single_example(
            serialized_example,
            features=name_to_features)
    
with tf.Session() as sess:
    # Initialization is required before reading
    # NOTE(review): presumably the local initializer is needed because of
    # num_epochs=1 above -- confirm against the TensorFlow docs.
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    
    # Create the thread coordinator
    coord = tf.train.Coordinator()
    
    # Start the queue runners so that reading from the file begins
    threads = tf.train.start_queue_runners(coord=coord)

    count = 0  # number of records read so far
    
    try:
        # Each sess.run() call fetches and parses one record.
        while not coord.should_stop():
            [d] = sess.run([data_features])

            out_label.append(d['label_ids'])
            count += 1

    except tf.errors.OutOfRangeError:
        # Treated here as the normal end-of-input signal.
        print('Done!')

    finally:
        # Always ask the queue-runner threads to stop at the end
        coord.request_stop()
    
    # Wait for the queue-runner threads to finish.
    coord.join(threads)

Done!


### confusion matrix and accuracy

In [10]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the true label ids against the predicted class ids.
cm = confusion_matrix(y_true=out_label, y_pred=prediction)
print(cm)

# Accuracy = correctly classified examples (the diagonal) / all examples.
num_correct = np.diag(cm).sum()
num_total = cm.sum()
print('accuracy : %.4f' % (num_correct / num_total))

[[10909  1591]
 [ 2020 10480]]
accuracy : 0.8556
