In [1]:
import codecs
import pickle
import math
import jieba
jieba.initialize()

import re
import os
import numpy as np
import random
import tensorflow as tf
from tensorflow.contrib.crf import crf_log_likelihood
from tensorflow.contrib.crf import viterbi_decode
from collections import defaultdict, namedtuple


Building prefix dict from the default dictionary ...
Loading model from cache /scratch/local/jieba.cache
Loading model cost 0.761 seconds.
Prefix dict has been built successfully.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# 1. Raw Data preprocessing

    1) Load the data from file.
    2) Convert IOB tagging into IOBES tagging.
    3) Split the data into training, testing and development (evaluation) sets.
    4) Create item-to-id and id-to-item dictionaries.
    5) Convert Chinese characters and tags into id sequences.
    6) Divide the data into fixed-size batches and pad every sample with 0 up to the longest sentence in its batch.

In [2]:
# read sentences from file
def load_data(file_path):
    """Read a tagged corpus into a list of sentences.

    The file is expected to hold one "<token> <tag>" pair per line, with
    blank lines separating sentences. Lines that do not split into exactly
    two whitespace-separated fields are ignored.

    Args:
        file_path: path of the UTF-8 encoded corpus file.

    Returns:
        A list of sentences; each sentence is a list of ``[token, tag]``
        lists (mutable, so the tags can later be rewritten in place).
    """
    sentences = []
    sent = []
    # The context manager guarantees the handle is closed, even on a decode error.
    with codecs.open(file_path, 'r', 'utf8') as corpus:
        for line in corpus:
            line = line.rstrip()  # drop the newline and any trailing white space
            if not line:
                # A blank line terminates the current sentence (if there is one).
                if sent:
                    sentences.append(sent)
                    sent = []
                continue
            word_tag = line.split()  # split token and tag
            if len(word_tag) == 2:
                sent.append(word_tag)
    # A file that does not end with a blank line still has a pending sentence.
    if sent:
        sentences.append(sent)
    return sentences


# convert IOB tags to IOBES tags
def convert_to_iobes_tags(sentences):
    """Rewrite the IOB tags of every sentence to IOBES tags, in place.

    * ``O`` is unchanged.
    * ``B-x`` stays ``B-x`` when the next tag starts with ``I``, otherwise it
      becomes ``S-x`` (single-token entity).
    * ``I-x`` stays ``I-x`` when the next tag starts with ``I``, otherwise it
      becomes ``E-x`` (last token of an entity).

    A tag that is none of the above is reported and kept unchanged so that
    the remaining tags of the sentence stay aligned with their tokens.

    Args:
        sentences: list of sentences, each a list of mutable ``[token, tag]``
            lists; the last element of each pair is overwritten.
    """
    for sent in sentences:
        iob_tags = [word_tag[-1] for word_tag in sent]  # IOB tags of this sentence
        last_index = len(iob_tags) - 1
        iobes_tags = []
        for i, tag in enumerate(iob_tags):
            prefix = tag.split('-')[0]
            # True when the entity continues on the following token.
            continues = i < last_index and iob_tags[i + 1].split('-')[0] == 'I'
            if tag == 'O':
                iobes_tags.append(tag)
            elif prefix == 'B':
                iobes_tags.append(tag if continues else tag.replace('B-', 'S-'))
            elif prefix == 'I':
                iobes_tags.append(tag if continues else tag.replace('I-', 'E-'))
            else:
                print('ERROR: INVALID IOB TAGGING!')
                # Keep the original tag: appending nothing would shift every
                # following tag onto the wrong token in the zip below.
                iobes_tags.append(tag)
        for word, iobes_tag in zip(sent, iobes_tags):  # replace tags
            word[-1] = iobes_tag
            
            
# split sentences into train, test, dev
def split_data(sentences):
    """Cut the corpus into consecutive train / test / dev slices.

    The first 70% of the sentences form the training set; the remainder is
    divided 60% / 40% into the test set and the development set. No
    shuffling takes place, the original order is kept.

    Returns:
        A ``(train_sentences, test_sentences, dev_sentences)`` tuple.
    """
    train_end = int(len(sentences) * 0.7)  # index where the training slice stops
    held_out = sentences[train_end:]
    test_end = int(len(held_out) * 0.6)  # index (within held_out) where the test slice stops
    return sentences[:train_end], held_out[:test_end], held_out[test_end:]


# creating dictionaries from unique chinese characters to unique id
def create_char_id_convert_dict(sentences):
    """Build the character <-> id lookup tables.

    Ids 0 and 1 are always reserved for ``<PAD>`` and ``<UNK>``; the real
    characters follow, ordered by decreasing frequency and then by the
    character itself so the result is deterministic.

    Args:
        sentences: list of sentences, each a list of ``[char, tag]`` pairs.

    Returns:
        ``(id_to_char, char_to_id)`` dictionaries.
    """
    char_dict = {}  # frequency of every distinct character
    for sent in sentences:
        for word in sent:
            char = word[0]
            char_dict[char] = char_dict.get(char, 0) + 1

    # Reserve the first two ids explicitly rather than through sentinel
    # frequencies: a character seen more often than a fixed sentinel would
    # otherwise take id 0, which is the value used for padding.
    specials = ['<PAD>', '<UNK>']
    for special in specials:
        char_dict.pop(special, None)

    # sort characters by frequency (highest to smallest), ties by character
    ranked = sorted(char_dict.items(), key=lambda x: (-x[1], x[0]))
    vocabulary = specials + [char for char, _ in ranked]

    # create two dictionaries: find char by id, or find id by char
    id_to_char = dict(enumerate(vocabulary))
    char_to_id = {value: key for key, value in id_to_char.items()}
    return id_to_char, char_to_id


# creating dictionaries from unique tag to unique id
def create_tag_id_convert_dict(sentences):
    """Build the tag <-> id lookup tables.

    Tags are numbered from 0 in order of decreasing frequency; tags with the
    same frequency are ordered alphabetically.

    Args:
        sentences: list of sentences, each a list of ``[char, tag]`` pairs.

    Returns:
        ``(id_to_tag, tag_to_id)`` dictionaries.
    """
    # Pull the tag column out first, then count occurrences of each tag.
    tag_rows = [[word[1] for word in sent] for sent in sentences]
    counts = {}
    for row in tag_rows:
        for label in row:
            counts[label] = counts.get(label, 0) + 1

    # Most frequent first; the tag itself breaks ties.
    ranking = sorted(counts.items(), key=lambda pair: (-pair[1], pair[0]))

    id_to_tag = {}
    tag_to_id = {}
    for position, (label, _) in enumerate(ranking):
        id_to_tag[position] = label
        tag_to_id[label] = position
    return id_to_tag, tag_to_id


# Generate formatted data (characters, char ids, phrase features, tag ids) for every sentence
def get_formated_data(sentences, char_to_id, tag_to_id):
    """Turn tagged sentences into the numeric samples used by the model.

    Args:
        sentences: list of sentences, each a list of ``[char, tag]`` pairs.
        char_to_id: character -> id table; characters missing from it are
            looked up as ``'<UNK>'``.
        tag_to_id: tag -> id table.

    Returns:
        One ``[chars, char_ids, phrase_features, tag_ids]`` list per sentence.
        The phrase feature of a character is 0 when jieba returns it as a
        one-character token, otherwise 1 / 2 / 3 for the first / inner / last
        character of its token.
    """
    samples = []
    for sent in sentences:
        sent_chars = [word[0] for word in sent]

        # Map every character to its id, falling back to the unknown token.
        chars_id = []
        for char in sent_chars:
            key = char if char in char_to_id else '<UNK>'
            chars_id.append(char_to_id[key])

        # Segment the re-assembled sentence with jieba and mark where each
        # character sits inside its token.
        phrase_feature = []
        for token in jieba.cut("".join(sent_chars)):
            if len(token) == 1:
                phrase_feature.append(0)
                continue
            marks = [2] * len(token)  # inner characters
            marks[0] = 1  # first character of the token
            marks[-1] = 3  # last character of the token
            phrase_feature.extend(marks)

        tags_id = [tag_to_id[word[-1]] for word in sent]
        samples.append([sent_chars, chars_id, phrase_feature, tags_id])
    return samples


# Divide the data into batches and pad every sample to the longest sentence of its batch
def generate_batch_data_with_padding(data, bcount):
    """Group samples into batches of at most ``bcount`` and pad them with 0.

    The samples are first sorted by sentence length (short to long) so that
    sentences of similar length share a batch; within a batch every list is
    right-padded with 0 up to the longest sentence of that batch.

    Args:
        data: list of ``[chars, char_ids, phrase_features, tag_ids]`` samples.
        bcount: maximum number of samples per batch.

    Returns:
        A list of batches, each
        ``[padded_chars, padded_char_ids, padded_phrase_features, padded_tag_ids]``.
        An empty ``data`` yields an empty list.
    """
    batches = []
    batch_count = int(math.ceil(len(data) / bcount))  # number of batches
    # Sort the samples that were passed in (not a module-level dataset), so
    # every split is batched from its own data.
    sorted_len_data = sorted(data, key=lambda x: len(x[0]))
    for i in range(batch_count):
        batch = sorted_len_data[(i * bcount):((i + 1) * bcount)]
        pad_sents = []    # characters after padding
        pad_chars = []    # character ids after padding
        pad_phrases = []  # phrase features after padding
        pad_tags = []     # tag ids after padding
        max_length = max(len(sample[0]) for sample in batch)  # longest sentence in this batch
        for sent, char, phrase, tag in batch:
            pad_array = [0] * (max_length - len(sent))  # zeros needed to reach max_length
            pad_sents.append(sent + pad_array)
            pad_chars.append(char + pad_array)
            pad_phrases.append(phrase + pad_array)
            pad_tags.append(tag + pad_array)
        batches.append([pad_sents, pad_chars, pad_phrases, pad_tags])
    return batches



# Data processing pipeline: load -> re-tag -> split -> build vocabularies ->
# convert to ids -> batch. The statements below run in this order because
# convert_to_iobes_tags mutates `sentences` in place before it is split.
folder_patch = "./dataset/"  # dataset folder
data_path = folder_patch + "data.txt" # path of the tagged corpus

sentences = load_data(data_path) # load data
print(sentences[0]) # first sentence with its original IOB tags

convert_to_iobes_tags(sentences) # convert to IOBES tags (in place)
print(sentences[0]) # same sentence after the conversion

train_sentences, test_sentences, dev_sentences = split_data(sentences) # split data
print("The number of sentences of trainning data is", len(train_sentences))
print("The number of sentences of testing data is", len(test_sentences))
print("The number of sentences of development data is", len(dev_sentences))

# Build the character <-> id dictionaries from the training split only.
id_to_char, char_to_id = create_char_id_convert_dict(train_sentences) 
# Build the tag <-> id dictionaries, also from the training split only.
# NOTE(review): get_formated_data indexes tag_to_id directly, so a tag that
# appears only in the test/dev split would raise KeyError below.
id_to_tag, tag_to_id = create_tag_id_convert_dict(train_sentences)
print("The number of unique Chinese characters is:", len(char_to_id))
print("The number of unique tag characters is:", len(tag_to_id))

train_data = get_formated_data(train_sentences, char_to_id, tag_to_id) # formatted training data
test_data = get_formated_data(test_sentences, char_to_id, tag_to_id) # formatted testing data
dev_data = get_formated_data(dev_sentences, char_to_id, tag_to_id) # formatted development data
print(train_data[0])

with open(folder_patch + 'dict.pkl', "wb") as out_file:  # dump the four dictionaries for evaluation
    pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], out_file)

# Generate padded batches: 20 samples per training batch, 100 per dev/test batch.
train_batch_data = generate_batch_data_with_padding(train_data, 20) 
dev_batch_data = generate_batch_data_with_padding(dev_data, 100)
test_batch_data = generate_batch_data_with_padding(test_data, 100)

epoch_iterations = len(train_batch_data) # one epoch visits every training batch once
print("The number of steps per epoch is", epoch_iterations)


[['因', 'O'], ['此', 'O'], ['，', 'O'], ['这', 'O'], ['次', 'O'], ['政', 'O'], ['府', 'O'], ['危', 'O'], ['机', 'O'], ['终', 'O'], ['于', 'O'], ['得', 'O'], ['到', 'O'], ['化', 'O'], ['解', 'O'], ['，', 'O'], ['对', 'O'], ['俄', 'B-LOC'], ['罗', 'I-LOC'], ['斯', 'I-LOC'], ['来', 'O'], ['说', 'O'], ['是', 'O'], ['值', 'O'], ['得', 'O'], ['庆', 'O'], ['幸', 'O'], ['的', 'O'], ['。', 'O']]
[['因', 'O'], ['此', 'O'], ['，', 'O'], ['这', 'O'], ['次', 'O'], ['政', 'O'], ['府', 'O'], ['危', 'O'], ['机', 'O'], ['终', 'O'], ['于', 'O'], ['得', 'O'], ['到', 'O'], ['化', 'O'], ['解', 'O'], ['，', 'O'], ['对', 'O'], ['俄', 'B-LOC'], ['罗', 'I-LOC'], ['斯', 'E-LOC'], ['来', 'O'], ['说', 'O'], ['是', 'O'], ['值', 'O'], ['得', 'O'], ['庆', 'O'], ['幸', 'O'], ['的', 'O'], ['。', 'O']]
The number of sentences of trainning data is 19472
The number of sentences of testing data is 5007
The number of sentences of development data is 3339
The number of unique Chinese characters is: 4277
The number of unique tag characters is: 13
[['因', '此', '，', '这', '次', '政', '府'

# 2. Build the model and set up hyperparameters

In [3]:
# Model configuration.
# These hyperparameters may be modified; Model.__hyper copies them onto the model instance.
learning_rate = 0.001  # learning rate handed to the Adam optimizer in Model.__opt
channel_char = 100  # width of the character embedding
channel_seg =20  # width of the segmentation-feature embedding
channel_lstm = 100  # number of output channels of every convolution in Model.__dilated
len_tags = len(tag_to_id)  # number of distinct tags in the training split
len_char = len(char_to_id)  # vocabulary size, including <PAD> and <UNK>

# build model
class Model(object):
    def __init__(self,mode):
#         mode type, check whether the model is running for training or not
        if mode == 'training':
            self.model_training = True
        else:
            print('Mode Errors!Please Choose Correct Training Mode')
        self.__main_setup() # model initializing
        
    def __main_setup(self):
        self.__hyper() #set up hyperparameters
        self.__placeholder() #build tensor holder
        self.__parameters() #initializing
        self.__layers() #create model
        self.__opt() #optimizer
        
    def __layers(self):
        self.__embedding() #embedding layers
        self.__dilated() # iterated dilated cnn 
        self.__loss() 
        
    def __hyper(self):
        self.learningR = learning_rate #learning rate 
#         embedding dimention
        self.channel_char = channel_char  
        self.channel_seg = channel_seg
        self.channel_lstm = channel_lstm
        self.len_tags = len_tags # number of tags
        self.len_chars = len_char #unique Chinese char
        self.output_channel = 0
        
    def __placeholder(self):
        """Create the input placeholders and the non-trainable bookkeeping variables.

        No shape is given for any placeholder; the shapes are taken from the
        arrays fed in each_step.
        """
        self.gt = tf.placeholder(dtype=tf.int32) # ground-truth tag ids (fed only when training)
        self.f1_evaluate = tf.Variable(dtype=tf.float32,initial_value=0.0, trainable=False) # best F1 score on the evaluation data
        self.f1_test = tf.Variable(dtype=tf.float32,initial_value=0.0, trainable=False) # same, for the test data
        self.whole_steps = tf.Variable(dtype=tf.int32,initial_value=0, trainable=False) # training step counter, handed to apply_gradients in __opt
        self.cn_char = tf.placeholder(dtype=tf.int32) # input sentences as character ids
        self.cn_segment = tf.placeholder(dtype=tf.int32) # input Chinese segmentation features
        self.dropout = tf.placeholder(dtype=tf.float32) # value passed to tf.nn.dropout on the embeddings (each_step feeds 1.0 or 0.6)
        
    def __parameters(self):
        """Derive sequence lengths, dynamic shapes and fixed architecture settings."""
        self.len_segment = 4  # segmentation features take the values 0, 1, 2, 3
        # Count the non-zero character ids in each row; this equals the real
        # sentence length only if 0 is used exclusively for padding.
        length = tf.reduce_sum(tf.sign(tf.abs(self.cn_char)), reduction_indices=1)
        self.lengths = tf.cast(length, tf.int32)
        self.batch_size = tf.shape(self.cn_char)[0] # batch size (dynamic)
        self.num_steps = tf.shape(self.cn_char)[-1] # num_steps: padded number of characters per sentence (dynamic)
        self.layers = [1,1,2] # dilation rate of each convolution inside one block
        # Second argument of the tf.nn.dropout call on the CNN output in __dilated.
        # NOTE(review): this is a Python constant fixed at construction time, so it
        # also applies when each_step runs with training=False - confirm intended.
        self.flag_drop = 0.5
        if self.model_training == False:
            self.flag_drop = 1.0
        self.num_filter = 3 # kernel width of every convolution
        # NOTE(review): tf.global_variables() is evaluated here, before __layers and
        # __opt run, so variables created later are presumably not tracked by this
        # saver - confirm against the checkpoints actually written.
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
        self.channel_embedding = self.channel_char + self.channel_seg  # char channels + segmentation channels
        self.iterations = 4 # how many times the dilated block is repeated
        
    def __embedding(self):        
        """Look up character and segmentation embeddings and concatenate them.

        The result, after dropout, is stored in ``self.embedding_cns`` with
        ``channel_char + channel_seg`` channels on the last axis.
        """
        # Two embedding tables, one per input feature.
        # NOTE(review): 'Embedding_2' is opened inside the 'Embedding_1' name scope;
        # the nesting looks accidental but is left untouched - confirm.
        with tf.name_scope('Embedding_1'):  # 1. the unique Chinese characters
            char_embeddings = tf.get_variable('Embeddings_1',shape=[self.len_chars, self.channel_char],
                initializer=tf.random_normal_initializer(mean=0.0, stddev=0.05, seed=None),
                dtype=tf.float32,trainable=True)
            feature_1 = tf.nn.embedding_lookup(char_embeddings, self.cn_char)
            with tf.name_scope('Embedding_2'): # 2. the position of the character inside its segmented word
                seg_embeddings = tf.get_variable('Embeddings_2',shape=[self.len_segment, self.channel_seg],
                    initializer=tf.random_normal_initializer(mean=0.0, stddev=0.05, seed=None),
                    dtype=tf.float32,trainable=True)
                feature_2 = tf.nn.embedding_lookup(seg_embeddings, self.cn_segment)
            # Join both features along the channel (last) axis.
            self.embedding_cns = tf.concat([feature_1,feature_2], axis=-1)
        # Apply dropout, driven by the self.dropout placeholder fed in each_step.
        self.embedding_cns = tf.nn.dropout(self.embedding_cns, self.dropout)
    
    def __dilated(self):
        """Build the iterated dilated CNN and the per-character tag scores.

        An initial convolution maps the embeddings to ``channel_lstm``
        channels. A block of ``len(self.layers)`` dilated convolutions is then
        applied ``self.iterations`` times; the output of the last layer of
        every iteration is kept, and the kept outputs are concatenated and
        projected to ``len_tags`` scores. The result is stored in
        ``self.result``.
        """
        # Insert a height axis of size 1 so the 2-D convolution ops can be used:
        # shape(?, ?, 120) -> shape(?, 1, ?, 120) with the default channel sizes.
        core_input = tf.expand_dims(self.embedding_cns, 1)
        with tf.variable_scope("Core_nets"):
            initialed_weight = tf.get_variable("Idcnn_filter",shape=[1, self.num_filter, self.channel_embedding,
                       self.channel_lstm],initializer=tf.random_normal_initializer(mean=0.0, stddev=0.05, seed=None))
            core_Input = tf.nn.conv2d(core_input, initialed_weight, strides=[1, 1, 1, 1],  padding="SAME",name="core_input")
            output = []   # last-layer output of every iteration
            channels = 0  # total number of channels collected in `output`
            for j in range(self.iterations):  
                for i in range(len(self.layers)):
                    dilated_rate = self.layers[i]
                    if i == (len(self.layers) - 1):
                        last_layer = True
                    else:
                        last_layer = False
                    # The scope name depends only on the layer index i and uses
                    # AUTO_REUSE, so every iteration j reuses the same
                    # Weights/Biases for layer i.
                    with tf.variable_scope("Dilated-Conv-%d" % i, reuse=tf.AUTO_REUSE):
                        weights = tf.get_variable(name='Weights',shape=[1, self.num_filter, self.channel_lstm,self.channel_lstm],
                            initializer=tf.random_normal_initializer(mean=0.0, stddev=0.05, seed=None))
                        # No initializer is passed here, so the default of tf.get_variable applies.
                        biases = tf.get_variable(name='Biases',shape=[self.channel_lstm])
                        c = tf.nn.atrous_conv2d(core_Input,weights, rate=dilated_rate, padding="SAME") # dilated convolution
                        c = tf.nn.bias_add(c, biases)
                        c = tf.nn.relu(c)
                        if last_layer:
                            channels += self.channel_lstm
                            output.append(c)
                        # Feed this layer's output into the next layer / iteration.
                        core_Input = c
            # Concatenate the kept outputs (one per iteration) along the channel axis.
            output_last = tf.concat(values=output,axis=3)
            # NOTE(review): flag_drop is a constant chosen at construction time, not the
            # self.dropout placeholder - confirm this dropout should stay active when
            # each_step is called with training=False.
            output_last = tf.nn.dropout(output_last, self.flag_drop)
            # Drop the height axis of size 1 that was inserted above.
            output_last = tf.squeeze(output_last, [1])
            # Flatten batch and time into one axis for the dense projection.
            output_last = tf.reshape(output_last, [-1, channels])
            self.output_channel = channels
            with tf.variable_scope("Fully"):
                with tf.variable_scope("Unit"):
                    weight = tf.get_variable("Weight", shape=[self.output_channel, self.len_tags],
                                        dtype=tf.float32, initializer=tf.random_normal_initializer(mean=0.0, stddev=0.05, seed=None))
                    bias = tf.get_variable("Bias",  initializer=tf.constant(0.0001, shape=[self.len_tags]))
                    # matmul(x, w) + b
                    result = tf.nn.xw_plus_b(output_last, weight, bias)
            # Restore the (batch, num_steps, len_tags) layout: one score per tag for every character.
            self.result =  tf.reshape(result, [-1, self.num_steps, self.len_tags])

    def __loss(self):
        """Build the CRF negative log-likelihood loss in ``self.error``.

        The tag set is extended by one extra tag (index ``len_tags``) that is
        only allowed at an artificial first time step prepended to every
        sentence; the transition matrix is therefore
        ``(len_tags + 1) x (len_tags + 1)``.
        """
        with tf.variable_scope("CRF"):
            small = -1000.0  # score that effectively forbids a tag
            # Scores of the prepended step: every real tag gets `small`, the extra tag gets 0.
            initial_units = tf.concat([small * tf.ones(shape=[self.batch_size, 1, self.len_tags]), tf.zeros(shape=[self.batch_size, 1, 1])], axis=-1)
            # Score column of the extra tag at all real steps: always `small`.
            pad_units = tf.cast(small * tf.ones([self.batch_size, self.num_steps, 1]), tf.float32)
            temp = tf.concat([self.result, pad_units], axis=-1)  # append the extra-tag column
            temp = tf.concat([initial_units, temp], axis=1)  # prepend the artificial step
            # Prepend the extra tag id to the gold sequences to match the prepended step.
            gt = tf.concat([tf.cast(self.len_tags*tf.ones([self.batch_size, 1]), tf.int32), self.gt], axis=-1)
            self.transition = tf.get_variable("transit",shape=[self.len_tags + 1, self.len_tags + 1],
                initializer=tf.random_normal_initializer(mean=0.0, stddev=0.05, seed=None))
            # Sequence lengths grow by one because of the prepended step;
            # self.transition is rebound to the transition parameters returned by the call.
            likelihood, self.transition = crf_log_likelihood(inputs=temp,tag_indices=gt,transition_params=self.transition,
                sequence_lengths=self.lengths+1)
            # Mean negative log-likelihood over the batch.
            self.error = tf.reduce_mean(likelihood*(-1))
            
    def __opt(self):
        """Create the Adam training op (``self.optimize``) with gradient clipping."""
        with tf.variable_scope("optimizer"):
            self.optimizer = tf.train.AdamOptimizer(self.learningR)
            # Clip every gradient element to [-4, 4] to avoid gradient explosion.
            gradients = self.optimizer.compute_gradients(self.error)
            limited_gradients = [[tf.clip_by_value(gra, -4, 4), va] for gra, va in gradients]
            # self.whole_steps is passed as the step counter of apply_gradients.
            self.optimize = self.optimizer.apply_gradients(limited_gradients, self.whole_steps)
            
    def evaluate(self, sess, batch_data, id_to_tag):
        transition = self.transition.eval()
        report = []
        for batch in batch_data:
            cn_sentences = batch[0]
            tags = batch[-1] #true tag
            lengths, scores = self.each_step(sess, False, batch)
            batch_paths = self.viterbi(scores, lengths, transition)
            for i in range(len(cn_sentences)):
                output = []
                sentence = cn_sentences[i][:lengths[i]]
                gt = convert_iobes_to_iob_tags([id_to_tag[int(x)] for x in tags[i][:lengths[i]]])
                predict = convert_iobes_to_iob_tags([id_to_tag[int(x)] for x in batch_paths[i][:lengths[i]]])
                for cn_char, gt, predict in zip(sentence, gt, predict):
                    output.append(" ".join([cn_char, gt, predict]))
                report.append(output)
        return report
    
    def viterbi(self, units, lengths, array):
        # viterbi Algorithm
        paths = []
        begin = np.asarray([[-1000.0]*self.len_tags +[0]])
        for val, temp_len in zip(units, lengths):
            val = val[:temp_len]
            pad = np.ones([temp_len, 1]) * (-1000.0)
            units = np.concatenate([val, pad], axis=1)
            units = np.concatenate([begin, units], axis=0)
            path, _ = viterbi_decode(units, array)
            paths.append(path[1:])
        return paths
    
    def each_step(self, sess, training, batch):
        _, cn_char, cn_segment, tags = batch
        temp_dict = {
            self.cn_char: np.asarray(cn_char),# char id
            self.cn_segment: np.asarray(cn_segment), #segmentation
            self.dropout: 1.0, # val or test do not need to dropout
        }
        if training:
            temp_dict[self.gt] = np.asarray(tags) #GT
            temp_dict[self.dropout] = 0.6
            whole_steps, error, _ = sess.run(
                [self.whole_steps, self.error, self.optimize],
                temp_dict)
            return whole_steps, error
        else:
            lengths, units = sess.run([self.lengths, self.result], temp_dict)
            return lengths, units