# Bert : model fine-tune (modify run_classifier)

以Imdb資料集作為範例，呈現如何以外部資料集做BERT model的fine-tune。此版本將直接修改processor class以及run_classifier.py內容，使得訓練可以直接在終端機下執行。

### Prerequisites

- 需先下載好pretrain model的checkpoint (bert_model.ckpt)、vocab.txt、bert_config.json檔案並放置在'bert_model'資料夾內

## Import packages

In [1]:
import os
import numpy as np
import pandas as pd

# Move the working directory up one level. The relative paths used in the
# later cells (aclImdb, bert_model/, tmp/, run_classifier_modified.py) are
# resolved from there -- presumably the repository root.
os.chdir('../')

  from ._conv import register_converters as _register_converters


## IMDB sentiment dataset

In [2]:
# download data from stanford AI Lab
# source : http://ai.stanford.edu/~amaas/data/sentiment/

# Skip the download when an 'aclImdb' entry already exists in the working directory.
if not os.path.exists('aclImdb'):
    # IPython shell escapes: fetch the archive, unpack it, then delete the tarball.
    !wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
    !tar zxf aclImdb_v1.tar.gz
    !rm aclImdb_v1.tar.gz

## run_classifier修改部分

需要修改的部分有兩個，首先我們需要針對imdb資料制定一個適合的Processor(主要還是在於讀入檔案與標記的方式)。另外，在main(_) 底下我們也需要將自訂的Processor名稱輸入，在執行時才呼叫的到。

以下將修改的部分列出，另外也可參考run_classifier_modified.py檔案(內容已直接改好)

- 注意：由於我們需要手動將資料做隨機打亂，在此使用scikit learn的shuffle函數(`from sklearn.utils import shuffle`)，若無安裝scikit learn的環境需先安裝。

In [5]:

### 手動自訂Processor

class sProcessor(DataProcessor):
  """Processor for the IMDB sentiment data set (aclImdb directory layout).

  `data_dir` is expected to contain `train/pos`, `train/neg`, `test/pos` and
  `test/neg` sub-directories, each holding one review per file.
  """

  def load_file(self, dir_path):
    """Reads the first line of every file in `dir_path`.

    Args:
      dir_path: Directory whose entries are each opened as a UTF-8 text file.

    Returns:
      A list of strings, one per file, in sorted file-name order. An empty
      file contributes an empty string.
    """
    article_list = []
    # os.listdir() returns entries in arbitrary order. Sorting makes the
    # example order reproducible, so predictions written by one run can be
    # lined up with labels written by another run.
    for file_name in sorted(os.listdir(dir_path)):
      # Decode explicitly as UTF-8 rather than relying on the locale default.
      with open(os.path.join(dir_path, file_name), 'r', encoding='utf-8') as f:
        # readline() returns '' for an empty file, where readlines()[0]
        # would raise IndexError; it also avoids reading the whole file.
        article_list.append(f.readline())
    return article_list

  def _load_split(self, data_dir, split):
    """Returns parallel (texts, labels) lists for `split`, pos first then neg."""
    pos_data = self.load_file(os.path.join(data_dir, split, 'pos'))
    neg_data = self.load_file(os.path.join(data_dir, split, 'neg'))
    texts = pos_data + neg_data
    labels = ['pos'] * len(pos_data) + ['neg'] * len(neg_data)
    return texts, labels

  def get_train_examples(self, data_dir):
    """See base class."""
    train_x, train_y = self._load_split(data_dir, 'train')
    # The files are loaded grouped by class, so shuffle texts and labels
    # together before training.
    train_x, train_y = shuffle(train_x, train_y)

    return self._create_examples(zip(train_x, train_y), "train")

  def get_dev_examples(self, data_dir):
    """See base class.

    Only train and test splits are available, so the test split doubles as
    the dev set. Because the examples returned by `get_test_examples` carry a
    placeholder label, the true labels of the test split have to be read back
    from the TFRecord written for these dev examples.
    """
    dev_x, dev_y = self._load_split(data_dir, 'test')

    return self._create_examples(zip(dev_x, dev_y), "dev")

  def get_test_examples(self, data_dir):
    """See base class."""
    test_x, _ = self._load_split(data_dir, 'test')
    # Placeholder labels only; _create_examples() forces "pos" for the
    # test set anyway.
    test_y = ['pos'] * len(test_x)

    return self._create_examples(zip(test_x, test_y), "test")

  def get_labels(self):
    """See base class."""
    return ["pos", "neg"]

  def _create_examples(self, lines, set_type):
    """Creates `InputExample`s from (text, label) pairs.

    Args:
      lines: Iterable of indexable items; item[0] is the text and item[1]
        the label (item[1] is not read when `set_type` is "test").
      set_type: Name of the split; used as the guid prefix.

    Returns:
      A list of `InputExample`, one per item in `lines`.
    """
    examples = []
    # The data has no header row, so the first item is not skipped.
    for (i, line) in enumerate(lines):
      guid = "%s-%s" % (set_type, i)
      text_a = tokenization.convert_to_unicode(line[0])
      # Test examples always get the placeholder label "pos".
      label = "pos" if set_type == "test" else line[1]
      examples.append(
          InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples


### 找到def main(_) 底下的 processors 並增加自訂的processor

# Maps a task name to the processor class that loads its data.
processors = {
  "cola": ColaProcessor,
  "mnli": MnliProcessor,
  "mrpc": MrpcProcessor,
  "xnli": XnliProcessor,
  # Passing IMDB as task_name selects this processor. The key is lower-case,
  # so presumably task_name is lower-cased before the lookup -- confirm in main().
  "imdb": sProcessor,
}


## Model training and evaluate

In [6]:
# Fine-tune on the IMDB data and evaluate in the same run; outputs go to tmp/.
# max_seq_length=128 here has to agree with seq_length in the later cell that
# parses tmp/eval.tf_record.
!python run_classifier_modified.py \
  --task_name=IMDB \
  --do_train=true \
  --do_eval=true \
  --data_dir=aclImdb \
  --vocab_file=bert_model/vocab.txt \
  --bert_config_file=bert_model/bert_config.json \
  --init_checkpoint=bert_model/bert_model.ckpt \
  --max_seq_length=128 \
  --train_batch_size=32 \
  --learning_rate=2e-5 \
  --num_train_epochs=3.0 \
  --output_dir=tmp

  from ._conv import register_converters as _register_converters
INFO:tensorflow:Using config: {'_model_dir': 'tmp', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f66fef6acc0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None), '_cluster': None}
INFO:tensorflow:_TPUContext: eval_on_tpu True
INFO:tensorflow:Writing example 0 of 25000
INFO:tensorflow:*** E

## Model testing

In [8]:
# Prediction-only run, initialised from a fine-tuned checkpoint in tmp/.
# NOTE(review): step 2343 is presumably the last training step
# (25000 examples * 3 epochs / batch size 32 ~= 2343) -- confirm against the
# checkpoint files actually present in tmp/.
!python run_classifier_modified.py \
  --task_name=IMDB \
  --do_predict=true \
  --data_dir=aclImdb \
  --vocab_file=bert_model/vocab.txt \
  --bert_config_file=bert_model/bert_config.json \
  --init_checkpoint=tmp/model.ckpt-2343 \
  --max_seq_length=128 \
  --output_dir=tmp

  from ._conv import register_converters as _register_converters
INFO:tensorflow:Using config: {'_model_dir': 'tmp', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ffa6b725f60>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None), '_cluster': None}
INFO:tensorflow:_TPUContext: eval_on_tpu True
INFO:tensorflow:Writing example 0 of 25000
INFO:tensorflow:*** E

## Let's see the result


In [9]:
# Load the per-class probabilities written by the prediction run.

import pandas as pd
import numpy as np

# The file has no header row: one line per test example, tab-separated,
# one probability column per label.
dat = pd.read_csv('tmp/test_results.tsv', sep='\t', header=None)

# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0;
# .values yields the same ndarray on old and new pandas alike.
# The predicted label id is the column index of the largest probability.
prediction = np.argmax(dat.values, axis=1)

  import sys


In [10]:
# Show the last five rows of the prediction table.
# Presumably column 0 is 'pos' and column 1 is 'neg', following get_labels() -- confirm.
dat.tail()

Unnamed: 0,0,1
24995,0.003486,0.996514
24996,0.002624,0.997376
24997,0.387072,0.612928
24998,0.147487,0.852513
24999,0.002469,0.997531


### Get answer from eval.tf_record file

其實在此的答案會與eval的結果相同，但為了走完整個流程我們還是繼續看下去。

In [12]:
import tensorflow as tf

# Length of the fixed-size features below; has to agree with the
# --max_seq_length (128) used when the record file was written.
seq_length = 128
# Collects the label id of every record, in file order.
out_label = []

# TFRecord file to read
filename = 'tmp/eval.tf_record'
# Build the filename queue: a single, unshuffled pass over the file
filename_queue = tf.train.string_input_producer([filename], 
                                                 shuffle=False, 
                                                 num_epochs=1)

# Feature spec used to parse each serialized example; label_ids is a scalar.
name_to_features = {
  "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
  "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
  "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
  "label_ids": tf.FixedLenFeature([], tf.int64),
}

# Record reader
reader = tf.TFRecordReader()
key, serialized_example = reader.read(filename_queue)

# Parse one record into the tensors described by name_to_features
data_features = tf.parse_single_example(
            serialized_example,
            features=name_to_features)
    
with tf.Session() as sess:
    # Initialization is required (num_epochs is tracked in a local variable)
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    
    # Create the thread coordinator
    coord = tf.train.Coordinator()
    
    # Start the queue runners so the file queue begins reading
    threads = tf.train.start_queue_runners(coord=coord)

    try:
        # Pull one record per iteration until the queue is exhausted.
        while not coord.should_stop():
            [d] = sess.run([data_features])

            out_label.append(d['label_ids'])

    except tf.errors.OutOfRangeError:
        # Raised once the single epoch over the file has been consumed.
        print('Done!')

    finally:
        # Always ask the queue-runner threads to stop
        coord.request_stop()
    
    # Wait for the queue-runner threads to finish.
    coord.join(threads)

Done!


### Confusion matrix and accuracy

同樣可以看到正確率與上個版本的訓練結果差不多(因為參數設定值皆相同)

In [13]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_true=out_label, y_pred=prediction)
print(cm)

# Accuracy = correctly classified examples (the diagonal) / all examples.
n_correct = np.trace(cm)
n_total = np.sum(cm)
print('accuracy : %.4f' % (n_correct / n_total))

[[10731  1769]
 [ 1832 10668]]
accuracy : 0.8560
