In [1]:
import codecs
import pickle
import math
import jieba
jieba.initialize()

Building prefix dict from the default dictionary ...
Loading model from cache /scratch/local/jieba.cache
Loading model cost 0.673 seconds.
Prefix dict has been built successfully.


# 1. Data preprocessing

    1) Load data from file. 
    2) Convert IOB tagging into IOBES tagging. 
    3) Split data into training data, testing data and evaluation data.
    4) Create item-to-id and id-to-item dictionaries for characters and tags.
    5) Convert Chinese characters and tags into id sequences.

In [2]:
# read sentences from file
def load_data(file_path):
    """Read a tagged corpus file into a list of sentences.

    Every non-blank line is split on whitespace and kept only when it yields
    exactly two fields (token and tag). Blank lines separate sentences.

    Args:
        file_path: Path of the UTF-8 encoded data file.

    Returns:
        A list of sentences; each sentence is a list of ``[token, tag]`` lists.
    """
    sentences = []
    sent = []
    # The context manager guarantees the file handle is closed after reading.
    with codecs.open(file_path, 'r', 'utf8') as data_file:
        for line in data_file:
            line = line.rstrip()  # drop the newline and any trailing whitespace
            if not line:
                if sent:  # a blank line closes the current sentence
                    sentences.append(sent)
                    sent = []
            else:
                word_tag = line.split()  # split token and tag
                if len(word_tag) == 2:
                    sent.append(word_tag)
    # Keep the last sentence when the file does not end with a blank line.
    if sent:
        sentences.append(sent)
    return sentences


# convert IOB tags to IOBES tags
def convert_to_iobes_tags(sentences):
    """Convert the IOB tags of every sentence to IOBES tags, in place.

    Rules applied per tag (the prefix is the text before the first '-'):
      * 'O' is kept unchanged.
      * 'B-x' stays 'B-x' when the next tag has prefix 'I', otherwise it
        becomes 'S-x' (single-token entity).
      * 'I-x' stays 'I-x' when the next tag has prefix 'I', otherwise it
        becomes 'E-x' (last token of an entity).
      * Any other tag is reported on stdout and kept unchanged so that the
        remaining tags of the sentence stay aligned with their tokens.

    Args:
        sentences: List of sentences, each a list of mutable ``[token, tag]``
            lists. The last element of every pair is overwritten.

    Returns:
        None. ``sentences`` is modified in place.
    """
    for sent in sentences:
        iob_tags = [word_tag[-1] for word_tag in sent]  # IOB tags of the sentence
        iobes_tags = []
        for i, tag in enumerate(iob_tags):
            prefix = tag.split('-')[0]
            # True when the entity continues on the following token.
            next_is_inside = (i + 1 < len(iob_tags)
                              and iob_tags[i + 1].split('-')[0] == 'I')
            if tag == 'O':
                iobes_tags.append(tag)
            elif prefix == 'B':
                # count=1: only rewrite the prefix, never the entity label.
                iobes_tags.append(tag if next_is_inside else tag.replace('B-', 'S-', 1))
            elif prefix == 'I':
                iobes_tags.append(tag if next_is_inside else tag.replace('I-', 'E-', 1))
            else:
                print('ERROR: INVALID IOB TAGGING!')
                # Keep the original tag; skipping it would shift every later
                # tag of this sentence onto the wrong token.
                iobes_tags.append(tag)
        for word, iobes_tag in zip(sent, iobes_tags):  # write the new tags back
            word[-1] = iobes_tag
            
            
# split sentences into train, test, dev
def split_data(sentences, train_ratio=0.7, test_ratio=0.6):
    """Split sentences sequentially into train, test and dev subsets.

    The first ``train_ratio`` share of ``sentences`` becomes the training set.
    Of the remaining sentences, the first ``test_ratio`` share becomes the
    test set and the rest becomes the development set. Order is preserved and
    no shuffling is performed.

    Args:
        sentences: Sequence of sentences to split.
        train_ratio: Fraction of all sentences used for training (default 0.7).
        test_ratio: Fraction of the non-training remainder used for testing
            (default 0.6).

    Returns:
        A tuple ``(train_sentences, test_sentences, dev_sentences)``.

    Raises:
        ValueError: If a ratio lies outside the closed interval [0, 1].
    """
    for name, ratio in (('train_ratio', train_ratio), ('test_ratio', test_ratio)):
        if not 0 <= ratio <= 1:
            raise ValueError('%s must be between 0 and 1, got %r' % (name, ratio))
    train_div = int(len(sentences) * train_ratio)  # index where the training set ends
    train_sentences = sentences[:train_div]
    remaining_sentences = sentences[train_div:]
    test_div = int(len(remaining_sentences) * test_ratio)  # index where the test set ends
    test_sentences = remaining_sentences[:test_div]
    dev_sentences = remaining_sentences[test_div:]
    return train_sentences, test_sentences, dev_sentences


# creating dictionaries from unique chinese characters to unique id
def create_char_id_convert_dict(sentences):
    """Build id<->character lookup tables from the given sentences.

    Ids 0 and 1 are always reserved for the special tokens '<PAD>' and
    '<UNK>'. All other characters follow, ordered by descending frequency and
    then by the character itself for ties.

    Args:
        sentences: List of sentences, each a list of ``[char, tag]`` pairs.

    Returns:
        A tuple ``(id_to_char, char_to_id)`` of dictionaries.
    """
    special_tokens = ['<PAD>', '<UNK>']  # padding token, unknown-character token

    # Count how often every character occurs.
    char_dict = {}
    for sent in sentences:
        for word in sent:
            char = word[0]
            char_dict[char] = char_dict.get(char, 0) + 1

    # The special tokens get fixed ids below. Relying on artificially large
    # frequencies would break as soon as a real character occurs that often.
    for token in special_tokens:
        char_dict.pop(token, None)

    # Sort characters by frequency (highest first), ties by character.
    sorted_char_dict = sorted(char_dict.items(), key=lambda x: (-x[1], x[0]))
    vocabulary = special_tokens + [char for char, _ in sorted_char_dict]

    # Two lookup tables: id -> char and char -> id.
    id_to_char = dict(enumerate(vocabulary))
    char_to_id = {char: index for index, char in id_to_char.items()}
    return id_to_char, char_to_id


# creating dictionaries from unique tag to unique id
def create_tag_id_convert_dict(sentences):
    """Build id<->tag lookup tables from the given sentences.

    Tags are numbered from 0 in order of descending frequency; tags with the
    same frequency are ordered by the tag string.

    Args:
        sentences: List of sentences, each a list of ``[char, tag]`` pairs.

    Returns:
        A tuple ``(id_to_tag, tag_to_id)`` of dictionaries.
    """
    # Tally how many times each tag occurs.
    counts = {}
    for sent in sentences:
        for pair in sent:
            label = pair[1]
            counts[label] = counts.get(label, 0) + 1

    # Most frequent tag first; equal counts fall back to the tag string.
    ranked = list(counts.items())
    ranked.sort(key=lambda entry: (-entry[1], entry[0]))

    # Two lookup tables: id -> tag and tag -> id.
    id_to_tag = dict(enumerate(label for label, _ in ranked))
    tag_to_id = {label: idx for idx, label in id_to_tag.items()}
    return id_to_tag, tag_to_id


# Generate formatted data (characters, character ids, phrase features, tag ids) for each sentence
def get_formated_data(sentences, char_to_id, tag_to_id):
    """Turn tagged sentences into model-ready feature lists.

    For every sentence four parallel items are produced:
      1. the characters themselves,
      2. their ids from ``char_to_id`` (characters missing from the mapping
         use the id of '<UNK>'),
      3. a phrase feature derived from the tokens returned by ``jieba.cut``
         on the joined characters: 0 for a one-character token, otherwise 1
         for the first character, 2 for middle characters and 3 for the last
         character of the token,
      4. the tag ids from ``tag_to_id``.

    Args:
        sentences: List of sentences, each a list of ``[char, tag]`` pairs.
        char_to_id: Mapping from character to id; must contain '<UNK>' when
            a sentence holds a character that is not in the mapping.
        tag_to_id: Mapping from tag to id.

    Returns:
        A list with one ``[chars, char_ids, phrase_feature, tag_ids]`` entry
        per sentence.
    """
    results = []
    for sent in sentences:
        sent_chars = [pair[0] for pair in sent]

        # Map characters to ids, falling back to '<UNK>' only when needed.
        chars_id = []
        for ch in sent_chars:
            key = ch if ch in char_to_id else '<UNK>'
            chars_id.append(char_to_id[key])

        # Encode each character's position inside its segmented token.
        phrase_feature = []
        for token in jieba.cut("".join(sent_chars)):
            size = len(token)
            if size == 1:
                phrase_feature.append(0)  # single-character token
                continue
            codes = [2] * size  # middle characters
            codes[0] = 1        # first character of the token
            codes[-1] = 3       # last character of the token
            phrase_feature += codes

        tags_id = [tag_to_id[pair[-1]] for pair in sent]
        results.append([sent_chars, chars_id, phrase_feature, tags_id])
    return results



# data processing
# Preprocessing pipeline: load -> IOBES conversion -> split -> build
# vocabularies -> format -> persist the lookup dictionaries.
folder_patch = "./dataset/"  # dataset folder (relative to the notebook)
data_path = folder_patch + "data.txt" # path of the tagged corpus file

sentences = load_data(data_path) # list of sentences, each a list of [char, tag]
print(sentences[0]) 

convert_to_iobes_tags(sentences) # rewrites the tags of `sentences` in place
print(sentences[0]) 

train_sentences, test_sentences, dev_sentences = split_data(sentences) # sequential split, no shuffling 
print("The number of sentences of trainning data is", len(train_sentences))
print("The number of sentences of testing data is", len(test_sentences))
print("The number of sentences of development data is", len(dev_sentences))

# Character <-> id dictionaries, built from the training split only
id_to_char, char_to_id = create_char_id_convert_dict(train_sentences) 
# Tag <-> id dictionaries, built from the training split only
id_to_tag, tag_to_id = create_tag_id_convert_dict(train_sentences)
print("The number of unique Chinese characters is:", len(char_to_id))
print("The number of unique tag characters is:", len(tag_to_id))

# Each entry is [chars, char_ids, phrase_feature, tag_ids]
train_data = get_formated_data(train_sentences, char_to_id, tag_to_id) # formatted training data
test_data = get_formated_data(test_sentences, char_to_id, tag_to_id) # formatted testing data
dev_data = get_formated_data(dev_sentences, char_to_id, tag_to_id) # formatted development data
print(train_data[0])

# Persist the four lookup dictionaries as one pickled list, in this order:
# char_to_id, id_to_char, tag_to_id, id_to_tag.
with open(folder_patch + 'dict.pkl', "wb") as out_file:
    pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], out_file)


[['因', 'O'], ['此', 'O'], ['，', 'O'], ['这', 'O'], ['次', 'O'], ['政', 'O'], ['府', 'O'], ['危', 'O'], ['机', 'O'], ['终', 'O'], ['于', 'O'], ['得', 'O'], ['到', 'O'], ['化', 'O'], ['解', 'O'], ['，', 'O'], ['对', 'O'], ['俄', 'B-LOC'], ['罗', 'I-LOC'], ['斯', 'I-LOC'], ['来', 'O'], ['说', 'O'], ['是', 'O'], ['值', 'O'], ['得', 'O'], ['庆', 'O'], ['幸', 'O'], ['的', 'O'], ['。', 'O']]
[['因', 'O'], ['此', 'O'], ['，', 'O'], ['这', 'O'], ['次', 'O'], ['政', 'O'], ['府', 'O'], ['危', 'O'], ['机', 'O'], ['终', 'O'], ['于', 'O'], ['得', 'O'], ['到', 'O'], ['化', 'O'], ['解', 'O'], ['，', 'O'], ['对', 'O'], ['俄', 'B-LOC'], ['罗', 'I-LOC'], ['斯', 'E-LOC'], ['来', 'O'], ['说', 'O'], ['是', 'O'], ['值', 'O'], ['得', 'O'], ['庆', 'O'], ['幸', 'O'], ['的', 'O'], ['。', 'O']]
The number of sentences of trainning data is 19472
The number of sentences of testing data is 5007
The number of sentences of development data is 3339
The number of unique Chinese characters is: 4277
The number of unique tag characters is: 13
[['因', '此', '，', '这', '次', '政', '府'