In [1]:
import numpy as np
import pandas as pd
import os
import pickle as pkl
import json

In [2]:
params_dict_path = "args.json"
with open(params_dict_path, "r", encoding="utf-8") as f:
    params_dict = json.load(f)
from collections import namedtuple
params_tuple = namedtuple("params_tuple" ,params_dict.keys())
for k, v in params_dict.items():
    exec("params_tuple.{}={}".format(k, "'{}'".format(v) if type(v) == type("") else v))

In [3]:
params_tuple

__main__.params_tuple

In [4]:
def show_file_first_rows(file_path, rows= 100):
    line_list = []
    with open(file_path, "r", encoding="utf-8") as f:
        cnt = 0
        while True:
            line = f.readline()
            if not line or cnt >= rows:
                break
            line_list.append("{}\n".format(line.strip()))
            cnt += 1
    return "".join(line_list)

def identify_sep(file_path):
    first_lines_str = show_file_first_rows(file_path)
    lines = first_lines_str.split("\n")
    spliters = ["\t", " "]
    sep_cnt_dict = dict(map(lambda sep: (sep ,sum(map(lambda l: len(l.split(sep)), lines))), spliters))
    return list(map(lambda t2: t2[0],sorted(sep_cnt_dict.items(), key = lambda t2: -1 * t2[-1])))[0]

import sys, pickle, os, random
def produce_tag2label(input_series):
    input_series = pd.Series(input_series).dropna()
    return dict(map(lambda t2: (t2[1], t2[0]) ,enumerate(input_series.unique().tolist())))

def read_corpus_by_pd(corpus_path):
    sep = identify_sep(corpus_path)
    return pd.read_csv(corpus_path, header = None, delimiter=sep, skip_blank_lines=False)

def retrieve_sep_nest_list(corpus_df):
    #na_line_where = np.where(pd.isna(corpus_df.iloc[:, -1]))[0]
    #na_line_where = np.where(pd.isna(corpus_df.iloc[:, 0]))[0]
    na_line_where = np.where(corpus_df.apply(lambda x: np.any(pd.isna(x)), axis = 1))
    assert type(na_line_where) == type((0,))
    na_line_where = na_line_where[0]
    #assert 0 not in na_line_where and (corpus_df.shape[0] - 1) not in na_line_where
    assert 0 not in na_line_where
    nest_sep_indice_list = []
    for idx in range(corpus_df.shape[0]):
        if not nest_sep_indice_list:
            nest_sep_indice_list.append([idx])
        else:
            if idx not in na_line_where:
                nest_sep_indice_list[-1].append(idx)
            else:
                nest_sep_indice_list.append([])
    nest_sep_indice_list = list(filter(len, nest_sep_indice_list))
    return nest_sep_indice_list

def read_corpus_by_pd_nest_list(corpus_path):
    #### output format [([...], [...]), ] or [([...], [...], [...], [...]), ]
    corpus_df = read_corpus_by_pd(corpus_path)
    assert corpus_df.shape[1] in (4, 2)
    nest_sep_list = retrieve_sep_nest_list(corpus_df)
    for sliced_df in map(lambda inner_list: corpus_df.iloc[inner_list, :], nest_sep_list):
        ele = list(zip(*sliced_df.values.tolist()))
        yield ele
        
def vocab_build(vocab_path, corpus_path, min_count):
    """

    :param vocab_path:
    :param corpus_path:
    :param min_count:
    :return:
    """
    #data = read_corpus(corpus_path)
    data = list(read_corpus_by_pd_nest_list(corpus_path))
    word2id = {}
    #for sent_, tag_ in data:
    for t in data:
        if len(data[0]) == 2:
            sent_, tag_ = t
        elif len(data[0]) == 4:
            sent_, _, _, tag_ = t
        else:
            1 / 0
        for word in sent_:
            if word.isdigit():
                word = '<NUM>'
            elif ('\u0041' <= word <='\u005a') or ('\u0061' <= word <='\u007a'):
                #word = '<ENG>'
                pass
            if word not in word2id:
                word2id[word] = [len(word2id)+1, 1]
            else:
                word2id[word][1] += 1
    low_freq_words = []
    for word, [word_id, word_freq] in word2id.items():
        if word_freq < min_count and word != '<NUM>' and word != '<ENG>':
            low_freq_words.append(word)
    for word in low_freq_words:
        del word2id[word]

    new_id = 1
    for word in word2id.keys():
        word2id[word] = new_id
        new_id += 1
    word2id['<UNK>'] = new_id
    word2id['<PAD>'] = 0

    print(len(word2id))
    with open(vocab_path, 'wb') as fw:
        pickle.dump(word2id, fw)

def sentence2id(sent, word2id):
    """

    :param sent:
    :param word2id:
    :return:
    """
    sentence_id = []
    for word in sent:
        if word.isdigit():
            word = '<NUM>'
        elif ('\u0041' <= word <= '\u005a') or ('\u0061' <= word <= '\u007a'):
            #word = '<ENG>'
            pass
        if word not in word2id:
            word = '<UNK>'
        sentence_id.append(word2id[word])
    return sentence_id

def words_as_char2id(word, char2id):
    word_id = []
    for char in word:
        if char not in char2id:
            char = '<UNK>'
        word_id.append(char2id[char])
    return word_id

def read_dictionary(vocab_path):
    """

    :param vocab_path:
    :return:
    """
    vocab_path = os.path.join(vocab_path)
    with open(vocab_path, 'rb') as fr:
        word2id = pickle.load(fr)
    print('vocab_size:', len(word2id))
    return word2id


def random_embedding(vocab, embedding_dim):
    """

    :param vocab:
    :param embedding_dim:
    :return:
    """
    embedding_mat = np.random.uniform(-0.25, 0.25, (len(vocab), embedding_dim))
    embedding_mat = np.float32(embedding_mat)
    return embedding_mat

def pad_sequences(sequences, pad_mark=0):
    """

    :param sequences:
    :param pad_mark:
    :return:
    """
    max_len = max(map(lambda x : len(x), sequences))
    seq_list, seq_len_list = [], []
    for seq in sequences:
        seq = list(seq)
        seq_ = seq[:max_len] + [pad_mark] * max(max_len - len(seq), 0)
        seq_list.append(seq_)
        seq_len_list.append(min(len(seq), max_len))
    return seq_list, seq_len_list

def pad_char_sequences(sequences, char2id):
    """

    :param sequences:
    :param pad_mark:
    :return:
    """
    #### sequences [batch, seq_len, word_len,]
    assert "<PAD>" in char2id
    pad_mark = char2id["<PAD>"]
    
    #max_len = max(map(lambda x : len(x), sequences))
    max_seq_len = max(map(lambda x : len(x), sequences))
    max_word_len = max(map(lambda char_nest_list: max(map(lambda char_list: len(char_list), char_nest_list)), sequences))
    #print(max_seq_len, max_word_len)
    
    #seq_list, seq_len_list = [], []
    #### [B, S, W], [B,], [B, S]
    char_list, seq_len_list, word_len_nest_list = [], [], []
    for seq in sequences:
        #### [...]
        seq = list(seq)
        
        char_list.append([])
        word_len_nest_list.append([])
        for word in seq:
            word_len_nest_list[-1].append(min(len(word), max_word_len))
            word_ = word[:max_word_len] + [pad_mark] * max(max_word_len - len(word), 0)
            char_list[-1].append(word_)
        
        seq_len_list.append(len(char_list[-1]))
        for _ in range(max(max_seq_len - len(seq), 0)):
            char_list[-1].append([pad_mark] * max_word_len)
            word_len_nest_list[-1].append(0)
        #seq_ = seq[:max_len] + [pad_mark] * max(max_len - len(seq), 0)
        #seq_list.append(seq_)
        #seq_len_list.append(min(len(seq), max_len))
    #return seq_list, seq_len_list
    return char_list, seq_len_list, word_len_nest_list

def char2id_build(word2id_pkl_path):
    from functools import reduce
    with open(word2id_pkl_path, "rb") as f:
        word2id_dict = pkl.load(f)
    char2id_dict = dict(map(lambda t2: (t2[1], t2[0]), enumerate(list(reduce(lambda a, b: a.union(b) ,map(lambda word: set(list(word)),word2id_dict.keys()))))))
    padding_idx = len(char2id_dict)
    empty_idx = len(char2id_dict) + 1
    char2id_dict["<PAD>"] = padding_idx
    char2id_dict["<UNK>"] = empty_idx
    assert len(char2id_dict) == max(char2id_dict.values()) + 1
    assert min(char2id_dict.values()) == 0
    print("char size {}".format(len(char2id_dict)))
    return char2id_dict

multi_col_train_data = read_corpus_by_pd("BERT-NER/data/train.txt")
feature_0_to_label = produce_tag2label(multi_col_train_data[1])
feature_1_to_label = produce_tag2label(multi_col_train_data[2])
tag2label = produce_tag2label(multi_col_train_data[3])

def batch_yield_multi(data, batch_size, vocab, feature_0_to_label, feature_1_to_label, tag2label, shuffle=False):
    """

    :param data:
    :param batch_size:
    :param vocab:
    :param tag2label:
    :param shuffle:
    :return:
    """
    if shuffle:
        random.shuffle(data)

    #seqs, labels = [], []
    seqs, feature_0, feature_1, labels = [], [], [], []

    for (sent_, feat0_, feat1_, tag_) in data:
        sent_ = sentence2id(sent_, vocab)
        label0_ = [feature_0_to_label[tag] for tag in feat0_]
        label1_ = [feature_1_to_label[tag] for tag in feat1_]
        label_ = [tag2label[tag] for tag in tag_]

        if len(seqs) == batch_size:
            #yield seqs, labels
            yield seqs, feature_0, feature_1, labels
            #seqs, labels = [], []
            seqs, feature_0, feature_1, labels = [], [], [], []
    
        seqs.append(sent_)
        feature_0.append(label0_)
        feature_1.append(label1_)
        labels.append(label_)
        
    if len(seqs) != 0:
        yield seqs, feature_0, feature_1, labels
        
def batch_yield_multi_with_chars(data, batch_size, vocab, char2id, feature_0_to_label, feature_1_to_label, tag2label, shuffle=False):
    """

    :param data:
    :param batch_size:
    :param vocab:
    :param tag2label:
    :param shuffle:
    :return:
    """
    if shuffle:
        random.shuffle(data)

    #seqs, labels = [], []
    #seqs, feature_0, feature_1, labels = [], [], [], []
    chars ,seqs, feature_0, feature_1, labels = [], [], [], [], []

    for (sent_, feat0_, feat1_, tag_) in data:
        char_ = list(map(lambda word: words_as_char2id(word, char2id), sent_))
        
        sent_ = sentence2id(sent_, vocab)
        label0_ = [feature_0_to_label[tag] for tag in feat0_]
        label1_ = [feature_1_to_label[tag] for tag in feat1_]
        label_ = [tag2label[tag] for tag in tag_]

        if len(seqs) == batch_size:
            #yield seqs, labels
            #yield seqs, feature_0, feature_1, labels
            yield  chars, seqs, feature_0, feature_1, labels
            #seqs, labels = [], []
            #seqs, feature_0, feature_1, labels = [], [], [], []
            chars ,seqs, feature_0, feature_1, labels = [], [], [], [], []
        
        chars.append(char_)
        
        seqs.append(sent_)
        feature_0.append(label0_)
        feature_1.append(label1_)
        labels.append(label_)
        
    if len(seqs) != 0:
        #yield seqs, feature_0, feature_1, labels
        yield  chars, seqs, feature_0, feature_1, labels
        
import logging, sys, argparse


def str2bool(v):
    # copy from StackOverflow
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')


def get_entity(tag_seq, char_seq):
    PER = get_PER_entity(tag_seq, char_seq)
    LOC = get_LOC_entity(tag_seq, char_seq)
    ORG = get_ORG_entity(tag_seq, char_seq)
    return PER, LOC, ORG


def get_PER_entity(tag_seq, char_seq):
    length = len(char_seq)
    PER = []
    for i, (char, tag) in enumerate(zip(char_seq, tag_seq)):
        if tag == 'B-PER':
            if 'per' in locals().keys():
                PER.append(per)
                del per
            per = char
            if i+1 == length:
                PER.append(per)
        if tag == 'I-PER':
            per += char
            if i+1 == length:
                PER.append(per)
        if tag not in ['I-PER', 'B-PER']:
            if 'per' in locals().keys():
                PER.append(per)
                del per
            continue
    return PER


def get_LOC_entity(tag_seq, char_seq):
    length = len(char_seq)
    LOC = []
    for i, (char, tag) in enumerate(zip(char_seq, tag_seq)):
        if tag == 'B-LOC':
            if 'loc' in locals().keys():
                LOC.append(loc)
                del loc
            loc = char
            if i+1 == length:
                LOC.append(loc)
        if tag == 'I-LOC':
            loc += char
            if i+1 == length:
                LOC.append(loc)
        if tag not in ['I-LOC', 'B-LOC']:
            if 'loc' in locals().keys():
                LOC.append(loc)
                del loc
            continue
    return LOC


def get_ORG_entity(tag_seq, char_seq):
    length = len(char_seq)
    ORG = []
    for i, (char, tag) in enumerate(zip(char_seq, tag_seq)):
        if tag == 'B-ORG':
            if 'org' in locals().keys():
                ORG.append(org)
                del org
            org = char
            if i+1 == length:
                ORG.append(org)
        if tag == 'I-ORG':
            org += char
            if i+1 == length:
                ORG.append(org)
        if tag not in ['I-ORG', 'B-ORG']:
            if 'org' in locals().keys():
                ORG.append(org)
                del org
            continue
    return ORG


def get_logger(filename):
    logger = logging.getLogger('logger')
    logger.setLevel(logging.DEBUG)
    logging.basicConfig(format='%(message)s', level=logging.DEBUG)
    handler = logging.FileHandler(filename)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
    logging.getLogger().addHandler(handler)
    return logger



In [5]:
import tensorflow as tf
import numpy as np
import os, argparse, time, random
#from model import BiLSTM_CRF
#from utils import str2bool, get_logger, get_entity
#from data import read_corpus, read_dictionary, tag2label, random_embedding


## Session configuration
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # default: 0
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.per_process_gpu_memory_fraction = 0.2  # need ~700MB GPU memory

args = params_tuple
args.mode = "train"
args.mode

vocab_path = "multi_data_path/word2id.pkl"
import shutil
if os.path.exists(vocab_path):
    os.remove(vocab_path)
    #shutil.rmtree(vocab_path)
if not os.path.exists(vocab_path.split("/")[0]):
    os.mkdir(vocab_path.split("/")[0])

#corpus_path = "/home/svjack/temp_dir/BERT-NER/data/train.txt"
corpus_path = "BERT-NER/data/train.txt"
min_count = 0
vocab_build(vocab_path=vocab_path, corpus_path=corpus_path, min_count=min_count)

train_path = corpus_path
#test_path = "/home/svjack/temp_dir/BERT-NER/data/test.txt"
test_path = "BERT-NER/data/test.txt"
word2id_pkl_path = "multi_data_path/word2id.pkl"
word2id = read_dictionary(word2id_pkl_path)

char2id = char2id_build(word2id_pkl_path)
args.train_data = "multi_data_path"

22927
vocab_size: 22927
char size 87


In [6]:
args.char_embedding_dim = 100
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')
char_embeddings = random_embedding(char2id, args.char_embedding_dim)

## read corpus and get training data
if args.mode != 'demo':
    #train_path = os.path.join('.', args.train_data, 'train_data')
    #test_path = os.path.join('.', args.test_data, 'test_data')
    #train_data = read_corpus(train_path)
    #test_data = read_corpus(test_path); test_size = len(test_data)
    #train_data = read_corpus(train_path)
    train_data = list(read_corpus_by_pd_nest_list(train_path))
    #test_data = read_corpus(test_path); test_size = len(test_data)
    test_data = list(read_corpus_by_pd_nest_list(test_path)); test_size = len(test_data)

## paths setting
paths = {}
timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
output_path = os.path.join('.', args.train_data+"_save", timestamp)
if not os.path.exists(output_path): os.makedirs(output_path)
summary_path = os.path.join(output_path, "summaries")
paths['summary_path'] = summary_path
if not os.path.exists(summary_path): os.makedirs(summary_path)
model_path = os.path.join(output_path, "checkpoints/")
if not os.path.exists(model_path): os.makedirs(model_path)
ckpt_prefix = os.path.join(model_path, "model")
paths['model_path'] = ckpt_prefix
result_path = os.path.join(output_path, "results")
paths['result_path'] = result_path
if not os.path.exists(result_path): os.makedirs(result_path)
log_path = os.path.join(result_path, "log.txt")
paths['log_path'] = log_path
get_logger(log_path).info(str(args))

<class '__main__.params_tuple'>


In [8]:
paths

{'summary_path': './multi_data_path_save/1597451905/summaries',
 'model_path': './multi_data_path_save/1597451905/checkpoints/model',
 'result_path': './multi_data_path_save/1597451905/results',
 'log_path': './multi_data_path_save/1597451905/results/log.txt'}

In [9]:
!cp /home/svjack/temp_dir/zh-NER-TF/*.pl .

In [10]:
import os

def conlleval(label_predict, label_path, metric_path):
    """

    :param label_predict:
    :param label_path:
    :param metric_path:
    :return:
    """
    eval_perl = "./conlleval_rev.pl"
    with open(label_path, "w") as fw:
        line = []
        for sent_result in label_predict:
            for char, tag, tag_ in sent_result:
                tag = '0' if tag == 'O' else tag
                char = char.encode("utf-8")
                line.append("{} {} {}\n".format(char, tag, tag_))
            line.append("\n")
        fw.writelines(line)
    os.system("perl {} < {} > {}".format(eval_perl, label_path, metric_path))
    with open(metric_path) as fr:
        metrics = [line.strip() for line in fr]
    return metrics

In [11]:
import numpy as np
import os, time, sys
import tensorflow as tf
from tensorflow.contrib.rnn import LSTMCell
from tensorflow.contrib.crf import crf_log_likelihood
from tensorflow.contrib.crf import viterbi_decode

In [12]:
class BiLSTM_CRF_MULTI_CHAR(object):
    def __init__(self, args, embeddings, char_embeddings, feature_0_to_label, feature_1_to_label, tag2label, vocab, char2id, paths, config):
        #self.max_char_capacity = 20
        self.char2id = char2id
        
        self.batch_size = args.batch_size
        self.epoch_num = args.epoch
        self.hidden_dim = args.hidden_dim
        self.embeddings = embeddings
        
        self.char_embeddings = char_embeddings
        
        self.filters_list = args.filters_list
        self.kernel_list = args.kernel_list
        assert len(self.filters_list) == len(self.kernel_list)
        
        self.CRF = args.CRF
        self.update_embedding = args.update_embedding
        self.dropout_keep_prob = args.dropout
        self.optimizer = args.optimizer
        self.lr = args.lr
        self.clip_grad = args.clip
        self.tag2label = tag2label
        self.num_tags = len(tag2label)
        self.vocab = vocab
        self.shuffle = args.shuffle
        self.model_path = paths['model_path']
        self.summary_path = paths['summary_path']
        self.logger = get_logger(paths['log_path'])
        self.result_path = paths['result_path']
        self.config = config
        
        self.feature_0_to_label = feature_0_to_label
        self.feature_1_to_label = feature_1_to_label
        
    def build_graph(self):
        self.add_placeholders()
        self.lookup_layer_op()
        self.biLSTM_layer_op()
        self.softmax_pred_op()
        self.loss_op()
        self.trainstep_op()
        self.init_op()
    
    def add_placeholders(self):
        self.char_ids = tf.placeholder(tf.int32, shape = [None, None, None], name = "char_ids")
        
        self.word_ids = tf.placeholder(tf.int32, shape=[None, None], name="word_ids")
        self.labels = tf.placeholder(tf.int32, shape=[None, None], name="labels")
        self.sequence_lengths = tf.placeholder(tf.int32, shape=[None], name="sequence_lengths")

        self.dropout_pl = tf.placeholder(dtype=tf.float32, shape=[], name="dropout")
        self.lr_pl = tf.placeholder(dtype=tf.float32, shape=[], name="lr")
        
        self.feature0 = tf.placeholder(tf.int32, shape=[None, None], name="feature0")
        self.feature1 = tf.placeholder(tf.int32, shape=[None, None], name="feature1")

    def lookup_layer_op(self):
        with tf.variable_scope("words"):
            _word_embeddings = tf.Variable(self.embeddings,
                                           dtype=tf.float32,
                                           trainable=self.update_embedding,
                                           name="_word_embeddings")
            word_embeddings = tf.nn.embedding_lookup(params=_word_embeddings,
                                                     ids=self.word_ids,
                                                     name="word_embeddings")
            self.word_embeddings =  tf.nn.dropout(word_embeddings, self.dropout_pl)
        
        with tf.variable_scope("chars"):
            '''
            _char_embeddings = tf.Variable(self.embeddings,
                                           dtype=tf.float32,
                                           trainable=self.update_embedding,
                                           name="_char_embeddings")
            '''
            #char_embeddings
            _char_embeddings = tf.Variable(self.char_embeddings,
                                           dtype=tf.float32,
                                           trainable=self.update_embedding,
                                           name="_char_embeddings")
            char_embeddings = tf.nn.embedding_lookup(params=_char_embeddings,
                                                     ids=self.char_ids,
                                                     name="char_embeddings")
            #### [B, S, W, dim]
            self.char_embeddings =  tf.nn.dropout(char_embeddings, self.dropout_pl)
        
    def produce_char_output(self, char_embeddings, filters, kernel_size, var_scope = None):
        assert type(var_scope) == type("")
        with tf.variable_scope(var_scope):
            conv2d_layer = tf.layers.conv2d(char_embeddings, kernel_size=kernel_size, filters=filters, padding="same", name = "char_conv")      
    
            conv2d_layer_t = tf.transpose(conv2d_layer, [0, 2, 3, 1])
            max2d_layer = tf.layers.MaxPooling2D(pool_size=3, strides=2, padding = "same", name = "char_max")(conv2d_layer_t)
            max2d_layer_t = tf.transpose(max2d_layer, [0, 3, 1, 2])
            char_output_max = tf.reduce_max(tf.reshape(max2d_layer_t, [tf.shape(max2d_layer_t)[0], tf.shape(max2d_layer_t)[1], -1]), axis = -1, name = "char_reduce_max")
            char_expand_max = tf.expand_dims(char_output_max, -1)
            
            char_output_min = tf.reduce_min(tf.reshape(max2d_layer_t, [tf.shape(max2d_layer_t)[0], tf.shape(max2d_layer_t)[1], -1]), axis = -1, name = "char_reduce_min")
            char_expand_min = tf.expand_dims(char_output_min, -1)
            
            char_output_mean = tf.reduce_max(tf.reshape(max2d_layer_t, [tf.shape(max2d_layer_t)[0], tf.shape(max2d_layer_t)[1], -1]), axis = -1, name = "char_reduce_mean")
            char_expand_mean = tf.expand_dims(char_output_mean, -1)
            
            return tf.concat([char_expand_max, char_expand_min, char_expand_mean], axis = -1)
        
    def biLSTM_layer_op(self):
        with tf.variable_scope("bi-lstm"):
            cell_fw = LSTMCell(self.hidden_dim)
            cell_bw = LSTMCell(self.hidden_dim)
            #### word-embed [B, L, N]
            #### feat_0 feat_1 [B, L]
            
            '''
            filters = 3
            kernel_size = 3
            #fake_char_embedding = tf.convert_to_tensor(np.random.random([B, S, W, dim]))
            
            conv2d_layer = tf.layers.conv2d(self.char_embeddings, kernel_size=kernel_size, filters=filters, padding="same", name = "char_conv")      
    
            conv2d_layer_t = tf.transpose(conv2d_layer, [0, 2, 3, 1])
            max2d_layer = tf.layers.MaxPooling2D(pool_size=3, strides=2, padding = "same", name = "char_max")(conv2d_layer_t)
            max2d_layer_t = tf.transpose(max2d_layer, [0, 3, 1, 2])
            char_output = tf.reduce_max(tf.reshape(max2d_layer_t, [tf.shape(max2d_layer_t)[0], tf.shape(max2d_layer_t)[1], -1]), axis = -1, name = "char_reduce_max")
            char_expand = tf.expand_dims(char_output, -1)
            '''
            char_output_list = []
            for iidx, var_scope_name in map(lambda idx: (idx ,"char_output_{}".format(idx)), range(len(args.filters_list))):
                #### char_embeddings, filters, kernel_size, var_scope
                filters, kernel_size = self.filters_list[iidx], self.kernel_list[iidx]
                char_output_list.append(self.produce_char_output(self.char_embeddings ,filters, kernel_size, var_scope = var_scope_name))
            char_expand = tf.concat(char_output_list, axis = -1)

            feat0_expand = tf.expand_dims(self.feature0, -1)
            feat0_expand = tf.cast(feat0_expand, tf.float32)
            feat1_expand = tf.expand_dims(self.feature1, -1)
            feat1_expand = tf.cast(feat1_expand, tf.float32)
            #### [B, S, concat-d]
            #inputs = tf.concat([self.word_embeddings, feat0_expand, feat1_expand], axis = -1, name = "bind_inputs")
            inputs = tf.concat([char_expand ,self.word_embeddings, feat0_expand, feat1_expand], axis = -1, name = "bind_inputs")
            
            '''
            (output_fw_seq, output_bw_seq), _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=cell_fw,
                cell_bw=cell_bw,
                inputs=self.word_embeddings,
                sequence_length=self.sequence_lengths,
                dtype=tf.float32)
            '''
            (output_fw_seq, output_bw_seq), _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=cell_fw,
                cell_bw=cell_bw,
                inputs=inputs,
                sequence_length=self.sequence_lengths,
                dtype=tf.float32)
            
            output = tf.concat([output_fw_seq, output_bw_seq], axis=-1)
            output = tf.nn.dropout(output, self.dropout_pl)

        with tf.variable_scope("proj"):
            W = tf.get_variable(name="W",
                                shape=[2 * self.hidden_dim, self.num_tags],
                                initializer=tf.contrib.layers.xavier_initializer(),
                                dtype=tf.float32)

            b = tf.get_variable(name="b",
                                shape=[self.num_tags],
                                initializer=tf.zeros_initializer(),
                                dtype=tf.float32)

            s = tf.shape(output)
            output = tf.reshape(output, [-1, 2*self.hidden_dim])
            pred = tf.matmul(output, W) + b

            self.logits = tf.reshape(pred, [-1, s[1], self.num_tags])
            
    def loss_op(self):
        if self.CRF:
            log_likelihood, self.transition_params = crf_log_likelihood(inputs=self.logits,
                                                                   tag_indices=self.labels,
                                                                   sequence_lengths=self.sequence_lengths)
            self.loss = -tf.reduce_mean(log_likelihood)

        else:
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits,
                                                                    labels=self.labels)
            mask = tf.sequence_mask(self.sequence_lengths)
            losses = tf.boolean_mask(losses, mask)
            self.loss = tf.reduce_mean(losses)

        tf.summary.scalar("loss", self.loss)

    def softmax_pred_op(self):
        if not self.CRF:
            self.labels_softmax_ = tf.argmax(self.logits, axis=-1)
            self.labels_softmax_ = tf.cast(self.labels_softmax_, tf.int32)
    
    def trainstep_op(self):
        with tf.variable_scope("train_step"):
            self.global_step = tf.Variable(0, name="global_step", trainable=False)
            if self.optimizer == 'Adam':
                optim = tf.train.AdamOptimizer(learning_rate=self.lr_pl)
            elif self.optimizer == 'Adadelta':
                optim = tf.train.AdadeltaOptimizer(learning_rate=self.lr_pl)
            elif self.optimizer == 'Adagrad':
                optim = tf.train.AdagradOptimizer(learning_rate=self.lr_pl)
            elif self.optimizer == 'RMSProp':
                optim = tf.train.RMSPropOptimizer(learning_rate=self.lr_pl)
            elif self.optimizer == 'Momentum':
                optim = tf.train.MomentumOptimizer(learning_rate=self.lr_pl, momentum=0.9)
            elif self.optimizer == 'SGD':
                optim = tf.train.GradientDescentOptimizer(learning_rate=self.lr_pl)
            else:
                optim = tf.train.GradientDescentOptimizer(learning_rate=self.lr_pl)

            grads_and_vars = optim.compute_gradients(self.loss)
            
            grads_and_vars_clip = [[tf.clip_by_value(g, -self.clip_grad, self.clip_grad), v] for g, v in grads_and_vars]
            self.train_op = optim.apply_gradients(grads_and_vars_clip, global_step=self.global_step)

    def init_op(self):
        self.init_op = tf.global_variables_initializer()

    def add_summary(self, sess):
        """

        :param sess:
        :return:
        """
        self.merged = tf.summary.merge_all()
        self.file_writer = tf.summary.FileWriter(self.summary_path, sess.graph)
            
    def train(self, train, dev):
        """

        :param train:
        :param dev:
        :return:
        """
        saver = tf.train.Saver(tf.global_variables())

        with tf.Session(config=self.config) as sess:
            sess.run(self.init_op)
            self.add_summary(sess)

            for epoch in range(self.epoch_num):
                self.run_one_epoch(sess, train, dev, self.tag2label, epoch, saver)

    def test(self, test):
        saver = tf.train.Saver()
        with tf.Session(config=self.config) as sess:
            self.logger.info('=========== testing ===========')
            saver.restore(sess, self.model_path)
            label_list, seq_len_list = self.dev_one_epoch(sess, test)
            self.evaluate(label_list, seq_len_list, test)
    
    def get_feed_dict(self, chars ,seqs, feat0, feat1, labels=None, lr=None, dropout=None):
        """

        :param seqs:
        :param labels:
        :param lr:
        :param dropout:
        :return: feed_dict
        """
        char_ids, seq_len_list, word_len_nest_list = pad_char_sequences(chars, self.char2id)
        
        word_ids, seq_len_list = pad_sequences(seqs, pad_mark=0)
        feat0_, _ = pad_sequences(feat0, pad_mark=-1)
        feat1_, _ = pad_sequences(feat1, pad_mark=-1)
        
        #feed_dict = {self.word_ids: word_ids,
        #             self.sequence_lengths: seq_len_list}
        '''
        feed_dict = {self.word_ids: word_ids,
                     self.sequence_lengths: seq_len_list,
                    self.feature0: feat0_,
                     self.feature1: feat1_,
                    }
        '''
        #print(np.asarray(char_ids).shape)
        #print("-"*100)
        
        feed_dict = {
            self.char_ids: char_ids,
            self.word_ids: word_ids,
                     self.sequence_lengths: seq_len_list,
                    self.feature0: feat0_,
                     self.feature1: feat1_,
                    }
        
        if labels is not None:
            labels_, _ = pad_sequences(labels, pad_mark=0)
            feed_dict[self.labels] = labels_
        if lr is not None:
            feed_dict[self.lr_pl] = lr
        if dropout is not None:
            feed_dict[self.dropout_pl] = dropout

        return feed_dict, seq_len_list
    
    #def predict_one_batch(self, sess, seqs, feat0, feat1):
    def predict_one_batch(self, sess, chars, seqs, feat0, feat1):
        """

        :param sess:
        :param seqs:
        :return: label_list
                 seq_len_list
        """
        #feed_dict, seq_len_list = self.get_feed_dict(seqs, feat0, feat1, dropout=1.0)
        feed_dict, seq_len_list = self.get_feed_dict(chars ,seqs, feat0, feat1, dropout=1.0)
        
        if self.CRF:
            logits, transition_params = sess.run([self.logits, self.transition_params],
                                                 feed_dict=feed_dict)
            label_list = []
            for logit, seq_len in zip(logits, seq_len_list):
                viterbi_seq, _ = viterbi_decode(logit[:seq_len], transition_params)
                label_list.append(viterbi_seq)
            return label_list, seq_len_list

        else:
            label_list = sess.run(self.labels_softmax_, feed_dict=feed_dict)
            return label_list, seq_len_list
    
    def demo_one(self, sess, sent):
        """

        :param sess:
        :param sent: 
        :return:
        """
        label_list = []
        '''
        for seqs, labels in batch_yield(sent, self.batch_size, self.vocab, self.tag2label, shuffle=False):
        '''
        #for seqs, feat0, feat1, labels in batch_yield_multi(sent, self.batch_size, self.vocab, self.feature_0_to_label, self.feature_1_to_label, self.tag2label, shuffle=False):
        for chars ,seqs, feat0, feat1, labels in batch_yield_multi_with_chars(sent, self.batch_size, self.vocab, self.char2id, self.feature_0_to_label, self.feature_1_to_label, self.tag2label, shuffle=False):
                    
            label_list_, _ = self.predict_one_batch(sess, chars, seqs, feat0, feat1)
            label_list.extend(label_list_)
        label2tag = {}
        for tag, label in self.tag2label.items():
            label2tag[label] = tag if label != 0 else label
        tag = [label2tag[label] for label in label_list[0]]
        return tag
    
    def run_one_epoch(self, sess, train, dev, tag2label, epoch, saver):
        """

        :param sess:
        :param train:
        :param dev:
        :param tag2label:
        :param epoch:
        :param saver:
        :return:
        """
        num_batches = (len(train) + self.batch_size - 1) // self.batch_size

        start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        #batches = batch_yield(train, self.batch_size, self.vocab, self.tag2label, shuffle=self.shuffle)
        #batches = batch_yield_multi(train, self.batch_size, self.vocab, self.feature_0_to_label, self.feature_1_to_label, self.tag2label, shuffle=self.shuffle)
        batches = batch_yield_multi_with_chars(train, self.batch_size, self.vocab, self.char2id, self.feature_0_to_label, self.feature_1_to_label, self.tag2label, shuffle=self.shuffle)
        
        #for step, (seqs, labels) in enumerate(batches):
        #for step, (seqs, feat0, feat1, labels) in enumerate(batches):
        for step, (chars ,seqs, feat0, feat1, labels) in enumerate(batches):
            sys.stdout.write(' processing: {} batch / {} batches.'.format(step + 1, num_batches) + '\r')
            step_num = epoch * num_batches + step + 1
            '''
            feed_dict, _ = self.get_feed_dict(seqs, labels, self.lr, self.dropout_keep_prob)
            '''
            #feed_dict, _ = self.get_feed_dict(seqs, feat0, feat1, labels, self.lr, self.dropout_keep_prob)
            feed_dict, _ = self.get_feed_dict(chars ,seqs, feat0, feat1, labels, self.lr, self.dropout_keep_prob)
            _, loss_train, summary, step_num_ = sess.run([self.train_op, self.loss, self.merged, self.global_step],
                                                         feed_dict=feed_dict)
            #break
            if step + 1 == 1 or (step + 1) % 300 == 0 or step + 1 == num_batches:
                self.logger.info(
                    '{} epoch {}, step {}, loss: {:.4}, global_step: {}'.format(start_time, epoch + 1, step + 1,
                                                                                loss_train, step_num))

            self.file_writer.add_summary(summary, step_num)
            
            #print((step + 1, num_batches))
            if step + 1 == num_batches:
                saver.save(sess, self.model_path, global_step=step_num)
        
        saver.save(sess, self.model_path, global_step=step_num)
        self.logger.info('===========validation / test===========')
        label_list_dev, seq_len_list_dev = self.dev_one_epoch(sess, dev)
        self.evaluate(label_list_dev, seq_len_list_dev, dev, epoch)

    def dev_one_epoch(self, sess, dev):
        """

        :param sess:
        :param dev:
        :return:
        """
        label_list, seq_len_list = [], []
        #for seqs, labels in batch_yield(dev, self.batch_size, self.vocab, self.tag2label, shuffle=False):
        #for seqs, feat0, feat1, labels in batch_yield_multi(dev, self.batch_size, self.vocab, self.feature_0_to_label, self.feature_1_to_label, self.tag2label, shuffle=False):
        for chars ,seqs, feat0, feat1, labels in batch_yield_multi_with_chars(dev, self.batch_size, self.vocab, self.char2id, self.feature_0_to_label, self.feature_1_to_label, self.tag2label, shuffle=False):

            #label_list_, seq_len_list_ = self.predict_one_batch(sess, seqs)
            label_list_, seq_len_list_ = self.predict_one_batch(sess, chars, seqs, feat0, feat1)
            label_list.extend(label_list_)
            seq_len_list.extend(seq_len_list_)
        return label_list, seq_len_list

    def evaluate(self, label_list, seq_len_list, data, epoch=None):
        """

        :param label_list:
        :param seq_len_list:
        :param data:
        :param epoch:
        :return:
        """
        label2tag = {}
        for tag, label in self.tag2label.items():
            label2tag[label] = tag if label != 0 else label

        model_predict = []
        #for label_, (sent, tag) in zip(label_list, data):
        for label_, (sent, _, _, tag) in zip(label_list, data):
            tag_ = [label2tag[label__] for label__ in label_]
            sent_res = []
            if  len(label_) != len(sent):
                print(sent)
                print(len(label_))
                print(tag)
            for i in range(len(sent)):
                sent_res.append([sent[i], tag[i], tag_[i]])
            model_predict.append(sent_res)
        #print(model_predict)
        epoch_num = str(epoch+1) if epoch != None else 'test'
        label_path = os.path.join(self.result_path, 'label_' + epoch_num)
        metric_path = os.path.join(self.result_path, 'result_metric_' + epoch_num)
        for _ in conlleval(model_predict, label_path, metric_path):
            self.logger.info(_)


In [13]:
#!ls /home/svjack/temp_dir/zh-NER-TF/multi_data_path_save/1597370783/checkpoints

In [16]:
args.epoch = 5

In [17]:
args.filters_list = [3, 5]
args.kernel_list = [3, 5]
model = BiLSTM_CRF_MULTI_CHAR(args, embeddings, char_embeddings, feature_0_to_label, feature_1_to_label, tag2label, word2id, char2id, paths, config = config)
model.build_graph()


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


From <ipython-input-12-ef96684abe84>:67: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.


From <ipython-input-12-ef96684abe84>:108: LSTMCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.


Instructions for updating:
Use `tf.keras.layers.Conv2D` instead.


From <ipython-input-12-ef96684abe84>:90: conv2d (from tensorflow.python.layers.convolutional) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.keras.layers.Conv2D` instead.


Instructions for updating:
Please use `layer.__call__` method instead.


From /home/svjack/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow_core/python/layers/convolutional.py:424: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `layer.__call__` method instead.


Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API


From <ipython-input-12-ef96684abe84>:154: bidirectional_dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API


Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API


From /home/svjack/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow_core/python/ops/rnn.py:464: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API


Instructions for updating:
Please use `layer.add_weight` method instead.


From /home/svjack/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow_core/python/ops/rnn_cell_impl.py:958: Layer.add_variable (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `layer.add_weight` method instead.


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


From /home/svjack/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow_core/python/ops/rnn_cell_impl.py:962: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


From /home/svjack/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow_core/python/ops/rnn.py:244: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.




The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [18]:
print("train data: {}".format(len(train_data)))
model.train(train=train_data, dev=test_data)

train data: 16122
 processing: 1 batch / 252 batches.

2020-08-15 08:43:59 epoch 1, step 1, loss: 29.01, global_step: 1


 processing: 252 batch / 252 batches.

2020-08-15 08:43:59 epoch 1, step 252, loss: 1.818, global_step: 252
processed 50154 tokens with 5648 phrases; found: 5446 phrases; correct: 3863.
accuracy:  94.68%; precision:  70.93%; recall:  68.40%; FB1:  69.64
LOC: precision:  82.72%; recall:  75.48%; FB1:  78.93  1522
MISC: precision:  69.32%; recall:  53.42%; FB1:  60.34  541
ORG: precision:  59.94%; recall:  62.25%; FB1:  61.08  1725
PER: precision:  72.07%; recall:  73.90%; FB1:  72.98  1658


 processing: 1 batch / 252 batches.

2020-08-15 08:49:34 epoch 2, step 1, loss: 1.968, global_step: 253


 processing: 252 batch / 252 batches.

2020-08-15 08:49:34 epoch 2, step 252, loss: 0.8452, global_step: 504
processed 50154 tokens with 5648 phrases; found: 5664 phrases; correct: 4475.
accuracy:  96.37%; precision:  79.01%; recall:  79.23%; FB1:  79.12
LOC: precision:  91.26%; recall:  80.10%; FB1:  85.31  1464
MISC: precision:  68.04%; recall:  66.10%; FB1:  67.05  682
ORG: precision:  75.98%; recall:  74.47%; FB1:  75.22  1628
PER: precision:  76.08%; recall:  88.93%; FB1:  82.01  1890


 processing: 1 batch / 252 batches.

2020-08-15 08:55:12 epoch 3, step 1, loss: 0.9204, global_step: 505


 processing: 252 batch / 252 batches.

2020-08-15 08:55:12 epoch 3, step 252, loss: 0.4852, global_step: 756
processed 50154 tokens with 5648 phrases; found: 5637 phrases; correct: 4552.
accuracy:  96.58%; precision:  80.75%; recall:  80.59%; FB1:  80.67
LOC: precision:  89.76%; recall:  83.57%; FB1:  86.56  1553
MISC: precision:  70.48%; recall:  69.37%; FB1:  69.92  691
ORG: precision:  81.61%; recall:  72.43%; FB1:  76.75  1474
PER: precision:  76.50%; recall:  90.79%; FB1:  83.03  1919


 processing: 1 batch / 252 batches.

2020-08-15 09:00:59 epoch 4, step 1, loss: 0.4359, global_step: 757


 processing: 252 batch / 252 batches.

2020-08-15 09:00:59 epoch 4, step 252, loss: 0.2464, global_step: 1008
processed 50154 tokens with 5648 phrases; found: 5630 phrases; correct: 4593.
accuracy:  96.72%; precision:  81.58%; recall:  81.32%; FB1:  81.45
LOC: precision:  93.05%; recall:  83.51%; FB1:  88.03  1497
MISC: precision:  69.91%; recall:  69.52%; FB1:  69.71  698
ORG: precision:  81.87%; recall:  75.02%; FB1:  78.29  1522
PER: precision:  76.63%; recall:  90.66%; FB1:  83.06  1913


 processing: 1 batch / 252 batches.

2020-08-15 09:05:44 epoch 5, step 1, loss: 0.2535, global_step: 1009


 processing: 84 batch / 252 batches.

KeyboardInterrupt: 

In [19]:
model.model_path

'./multi_data_path_save/1597451905/checkpoints/model'

In [20]:
model_dict = {}
model_dict["tag2label"] , model_dict["feature_0_to_label"] , model_dict["feature_1_to_label"] \
, model_dict["vocab"] , model_dict["char2id"] = \
model.tag2label, model.feature_0_to_label, model.feature_1_to_label, \
model.vocab, model.char2id
model_dict_path = "model_dict"
if os.path.exists(model_dict_path):
    os.remove(model_dict_path)
with open(model_dict_path, "wb") as f:
    pkl.dump(model_dict, f)

In [50]:
paths['model_path'] = "/home/svjack/temp_dir/colab-model/multi_data_path_save/1597451905/checkpoints/model-1008"

In [51]:
tf.reset_default_graph()

In [52]:
args.filters_list = [3, 5]
args.kernel_list = [3, 5]
model = BiLSTM_CRF_MULTI_CHAR(args, embeddings, char_embeddings, feature_0_to_label, feature_1_to_label, tag2label, word2id, char2id, paths, config = config)
model.build_graph()

In [53]:
model.test(test_data)



INFO:tensorflow:Restoring parameters from /home/svjack/temp_dir/colab-model/multi_data_path_save/1597451905/checkpoints/model-1008


Restoring parameters from /home/svjack/temp_dir/colab-model/multi_data_path_save/1597451905/checkpoints/model-1008
processed 50154 tokens with 5648 phrases; found: 5630 phrases; correct: 4593.
accuracy:  96.72%; precision:  81.58%; recall:  81.32%; FB1:  81.45
LOC: precision:  93.05%; recall:  83.51%; FB1:  88.03  1497
MISC: precision:  69.91%; recall:  69.52%; FB1:  69.71  698
ORG: precision:  81.87%; recall:  75.02%; FB1:  78.29  1522
PER: precision:  76.63%; recall:  90.66%; FB1:  83.06  1913


In [48]:
#tf.reset_default_graph()

In [55]:
saver = tf.train.Saver(tf.global_variables())
with tf.Session(config=model.config) as sess:
    #sess.run(model.init_op)
    saver.restore(sess, model.model_path)
    model.add_summary(sess)

    for epoch in range(model.epoch_num):
        #model.run_one_epoch(sess, train, dev, model.tag2label, epoch, saver)
        model.run_one_epoch(sess, train_data, test_data, model.tag2label, epoch, saver)
        break


INFO:tensorflow:Restoring parameters from /home/svjack/temp_dir/colab-model/multi_data_path_save/1597451905/checkpoints/model-1008


Restoring parameters from /home/svjack/temp_dir/colab-model/multi_data_path_save/1597451905/checkpoints/model-1008


 processing: 1 batch / 252 batches.

2020-08-15 10:10:29 epoch 1, step 1, loss: 0.1953, global_step: 1


 processing: 252 batch / 252 batches.

2020-08-15 10:10:29 epoch 1, step 252, loss: 0.656, global_step: 252
processed 50154 tokens with 5648 phrases; found: 5645 phrases; correct: 4652.
accuracy:  96.83%; precision:  82.41%; recall:  82.37%; FB1:  82.39
LOC: precision:  89.61%; recall:  86.39%; FB1:  87.97  1608
MISC: precision:  69.10%; recall:  72.93%; FB1:  70.96  741
ORG: precision:  80.08%; recall:  76.22%; FB1:  78.10  1581
PER: precision:  83.56%; recall:  88.62%; FB1:  86.01  1715
