# <span style='color:orange'> Modelling Chinese Word Segmentation as a Sequence-to-Sequence Prediction Problem </span>

# <span style='color:Green'> Testing Pre-Trained Models Notebook</span>

# Load the model

In [1]:
from keras.models import load_model
import cPickle as pickle
import numpy as np

Using TensorFlow backend.


In [67]:
# Load the pre-trained network to test. The commented-out lines are alternative
# saved models; change which line is active to test a different one.
# model = load_model('/home/asa224/Desktop/students_less_asa224/Test Folder on Less/checkpoints/model.seq2seq_nl_09-0.90.hdf5')
model = load_model('/home/asa224/Desktop/students_less_asa224/Test Folder on Less/model_timestamps_13_epoch1-10.h5')
# model = load_model('/home/asa224/Desktop/students_less_asa224/Test Folder on Less/checkpoints_deeper_net/model.seq2seq_nl_deeper_04-0.89.hdf5')

In [68]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 13, 200)           1084000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 13, 300)           601200    
_________________________________________________________________
lstm_2 (LSTM)                (None, 13, 300)           721200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 13, 300)           0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 13, 4)             1204      
Total params: 2,407,604
Trainable params: 2,407,604
Non-trainable params: 0
_________________________________________________________________


# Helper functions

In [47]:
# Stand-in for the newline character: the word-file generators write this symbol
# wherever a line of the source text ended, and the output writer emits '\n'
# whenever it meets the symbol again.
SPECIAL_SYMBOL = u'\u02e0'

In [48]:
# Labels are kept as a list of (character, label) tuples rather than a dict:
# the same character occurs many times, and a dict would keep only its last label.
def generateTupleListAccToSentences(filename='/local-scratch/asa224/wseg_simplified_cn.txt'):
    """
    Label every character of a segmented text file, producing one sequence per line.

    Each line is split into words on single spaces and every character receives
    a position label:

        0 - first character of a multi-character word
        1 - inner character of a multi-character word
        2 - last character of a multi-character word
        3 - single-character word

    INPUT: `filename` is the path of the UTF-8 encoded, space-segmented text file.

    OUTPUT: a list of lists of (character, label) tuples. There is one inner list
    per line of the file, followed by one trailing empty list.

    If you want to use this data as training data for LSTM, you have to pad the
    sequences since they are not of the same length.
    """
    data = []
    with open(filename, 'rb') as f:
        for line in f:
            # bytes.decode behaves the same on Python 2 and 3, unlike unicode().
            line = line.decode('utf-8').replace('\n', '')
            sentence = []
            for word in line.split(' '):
                if len(word) == 1:
                    sentence.append((word[0], 3))
                    continue
                last = len(word) - 1
                for i, character in enumerate(word):
                    if i == 0:  # this is the first letter
                        sentence.append((character, 0))
                    elif i == last:  # this is the last letter
                        sentence.append((character, 2))
                    else:  # this is somewhere in the middle
                        sentence.append((character, 1))
            data.append(sentence)

    # The result has always ended with an empty list; it is kept so that
    # existing callers see the same shape.
    data.append([])
    return data
    
def generateWordFile(filename='/local-scratch/asa224/wseg_simplified_cn.txt',
                     output_path='/local-scratch/word_file_1M'):
    """
    Generate a word file, similar to the count_1w.txt file provided by Prof. Anoop.

    Every line of the input is split into words on single spaces. Each word is
    written to the output as "<word>\\t0\\n", and after the words of a line one
    extra entry holding SPECIAL_SYMBOL is written to mark where the line ended.

    The output file is the input expected by generateTupleList().

    INPUT:
        filename    - path of the UTF-8 encoded, space-segmented training text file.
        output_path - path of the word file to write. The default is the location
                      this function has always written to; note that it differs
                      from the default path read by generateTupleList().
    """
    # Both files are managed by the `with` statement so they are closed (and the
    # output flushed) even when decoding or writing fails part-way through.
    with open(filename, 'rb') as f, open(output_path, 'wb') as word_file_1M:
        for line in f:
            # bytes.decode behaves the same on Python 2 and 3, unlike unicode().
            words = line.decode('utf-8').replace('\n', '').split(' ')

            # add the newline back using a special symbol
            words.append(SPECIAL_SYMBOL)
            for word in words:
                word_file_1M.write((word + u'\t0\n').encode('utf-8'))

def generateInputWordFile(filename='/local-scratch/asa224/input',
                          output_path='./input_word_file'):
    """
    Generate a word file, similar to the count_1w.txt file provided by Prof. Anoop.

    THIS GENERATES WORD FILE FOR INPUT TEXT.

    Unlike generateWordFile(), the text is not split into words: every character
    of a line (including any spaces) becomes its own "<character>\\t0\\n" entry,
    and each newline character is replaced by SPECIAL_SYMBOL before writing.

    The output file is the input expected by generateTupleList().

    INPUT:
        filename    - path of the UTF-8 encoded text file to be segmented.
        output_path - path of the word file to write; defaults to the location
                      this function has always written to.
    """
    # Both files are managed by the `with` statement so they are closed (and the
    # output flushed) even when decoding or writing fails part-way through.
    with open(filename, 'rb') as f, open(output_path, 'wb') as word_file_1M:
        for line in f:
            # bytes.decode behaves the same on Python 2 and 3, unlike unicode().
            # replace the newline character with the special unicode symbol
            line = line.decode('utf-8').replace('\n', SPECIAL_SYMBOL)
            for character in line:
                word_file_1M.write((character + u'\t0\n').encode('utf-8'))

def generateTupleList(filename='/local-scratch/asa224/word_file_1M'):
    """
    Label every character of a word file and return one flat list.

    Each line of the file has the form "<word>\\t<count>". Every character of the
    word receives a position label:

        0 - first character of a multi-character word
        1 - inner character of a multi-character word
        2 - last character of a multi-character word
        3 - single-character word

    INPUT: `filename` is a WORD FILE generated by generateWordFile() or
    generateInputWordFile().

    OUTPUT: a list of (character, label) tuples, in file order.

    Use this function in conjunction with nGramSequenceGenerator(labelledlist, n)
    to create a training set with constant sequence size, which does not require
    paddings.
    """
    label = []
    with open(filename, 'rb') as f:
        for line in f:
            # Decode before splitting so the parsing works on text under both
            # Python 2 and 3. Split on the LAST tab only: the word itself may be
            # a tab character.
            word, count = line.decode('utf-8').rsplit(u'\t', 1)
            # making sure the parsing is going fine
            assert int(count) == 0

            if len(word) == 1:
                label.append((word[0], 3))
                continue
            last = len(word) - 1
            for i, character in enumerate(word):
                if i == 0:  # this is the first letter
                    label.append((character, 0))
                elif i == last:  # this is the last letter
                    label.append((character, 2))
                else:  # this is somewhere in the middle
                    label.append((character, 1))
    return label
    
def nGramSequenceGenerator(labelledlist, n):
    """
    Cut `labelledlist` into consecutive, non-overlapping sequences of size "n".

    Takes as input the label list of tuples generated by generateTupleList().
    Only full sequences are returned: when the length of the list is not a
    multiple of n, the trailing remainder is left out.
    """
    # Floor division keeps `count` an integer under Python 3 (and under
    # `from __future__ import division`), where `/` would give a float.
    count = len(labelledlist) // n
    return [labelledlist[i * n:(i + 1) * n] for i in range(count)]

def nGramSequenceGeneratorForInput(labelledlist, n):
    """
    Cut `labelledlist` into sequences of size "n" without losing the tail.

    Takes as input the label list of tuples generated by generateTupleList().
    The list is first cut into consecutive, non-overlapping sequences of size n;
    then one more sequence holding the LAST n items is appended, so the items
    left over after the full sequences are still covered.

    Note that the extra sequence overlaps the preceding one, and when the length
    is an exact multiple of n it repeats the last full sequence.
    """
    # Floor division keeps `count` an integer under Python 3 (and under
    # `from __future__ import division`), where `/` would give a float.
    count = len(labelledlist) // n
    ngrammedlist = [labelledlist[i * n:(i + 1) * n] for i in range(count)]

    # Clamp the start at 0: for a list shorter than n, a negative start would
    # count from the end and silently drop the leading items.
    tail_start = max(0, len(labelledlist) - n)
    ngrammedlist.append(labelledlist[tail_start:])
    return ngrammedlist

# Prepare test data for prediction

In [69]:
# Write the input text to './input_word_file', one character per line.
generateInputWordFile(filename='../../data/input')

In [70]:
# Read the word file back as a flat list of (character, label) tuples. Every
# entry of the file is a single character, so every label is 3 here.
tuples = generateTupleList(filename='./input_word_file')

In [71]:
tuples

[(u'\u6cd5', 3),
 (u'\u6b63', 3),
 (u'\u7814', 3),
 (u'\u7a76', 3),
 (u'\u4ece', 3),
 (u'\u6ce2', 3),
 (u'\u9ed1', 3),
 (u'\u64a4', 3),
 (u'\u519b', 3),
 (u'\u8ba1', 3),
 (u'\u5212', 3),
 (u'\u02e0', 3),
 (u'\u65b0', 3),
 (u'\u534e', 3),
 (u'\u793e', 3),
 (u'\u5df4', 3),
 (u'\u9ece', 3),
 (u'\uff19', 3),
 (u'\u6708', 3),
 (u'\uff11', 3),
 (u'\u65e5', 3),
 (u'\u7535', 3),
 (u'\uff08', 3),
 (u'\u8bb0', 3),
 (u'\u8005', 3),
 (u'\u5f20', 3),
 (u'\u6709', 3),
 (u'\u6d69', 3),
 (u'\uff09', 3),
 (u'\u02e0', 3),
 (u'\u6cd5', 3),
 (u'\u56fd', 3),
 (u'\u56fd', 3),
 (u'\u9632', 3),
 (u'\u90e8', 3),
 (u'\u957f', 3),
 (u'\u83b1', 3),
 (u'\u5965', 3),
 (u'\u5854', 3),
 (u'\u5c14', 3),
 (u'\uff11', 3),
 (u'\u65e5', 3),
 (u'\u8bf4', 3),
 (u'\uff0c', 3),
 (u'\u6cd5', 3),
 (u'\u56fd', 3),
 (u'\u6b63', 3),
 (u'\u5728', 3),
 (u'\u7814', 3),
 (u'\u7a76', 3),
 (u'\u4ece', 3),
 (u'\u6ce2', 3),
 (u'\u9ed1', 3),
 (u'\u64a4', 3),
 (u'\u519b', 3),
 (u'\u7684', 3),
 (u'\u8ba1', 3),
 (u'\u5212', 3),
 (u'\u3002', 3

In [72]:
# Cut the flat list into windows of 13 tuples, matching the 13 time steps shown
# in the model summary. Only full windows are produced: a trailing remainder of
# fewer than 13 tuples is not included.
final_input = nGramSequenceGenerator(tuples, n=13)

In [73]:
final_input[-1]

[(u'\u5c9b', 3),
 (u'\u5c40', 3),
 (u'\u52bf', 3),
 (u'\u7ee7', 3),
 (u'\u7eed', 3),
 (u'\u505a', 3),
 (u'\u51fa', 3),
 (u'\u5efa', 3),
 (u'\u8bbe', 3),
 (u'\u6027', 3),
 (u'\u52aa', 3),
 (u'\u529b', 3),
 (u'\u3002', 3)]

In [74]:
parent_path = '/home/asa224/Desktop/students_less_asa224/Test Folder on Less/'

In [75]:
# orig_dict maps a character to its integer index; ret_dict maps the index back
# to the character.
# NOTE: pickle.load can execute arbitrary code, so only load dictionaries that
# come from a trusted source.
# `with` closes each file as soon as it has been read.
with open(parent_path + "orig_dict.p", "rb") as dict_file:
    orig_dict = pickle.load(dict_file)
with open(parent_path + "ret_dict.p", "rb") as dict_file:
    ret_dict = pickle.load(dict_file)

In [76]:
final_input[-2]

[(u'\u65b9', 3),
 (u'\u5e0c', 3),
 (u'\u671b', 3),
 (u'\u6709', 3),
 (u'\u5173', 3),
 (u'\u5404', 3),
 (u'\u65b9', 3),
 (u'\u4e3a', 3),
 (u'\u7f13', 3),
 (u'\u548c', 3),
 (u'\u671d', 3),
 (u'\u9c9c', 3),
 (u'\u534a', 3)]

In [77]:
# Convert every character of every sequence to its integer index.
# A character missing from orig_dict is replaced by a randomly chosen known
# index, and `count` records how many such replacements happened.
# NOTE(review): the replaced character is lost, so the output writer will print
# the random substitute instead of the original character.

# Build the candidate list once; wrapping in list() also makes it a real
# sequence under Python 3, where dict.values() is only a view.
known_indices = list(orig_dict.values())

x = []
count = 0
for sequence in final_input:  # iterate over the whole dataset
    indices = []
    for item in sequence:  # iterate over the current sentence
        try:
            indices.append(orig_dict[item[0]])
        except KeyError:
            indices.append(np.random.choice(known_indices))
            count += 1
    x.append(indices)

# x has always ended with an empty list (the prediction cell skips it with
# x[:-1]); keep that shape.
x.append([])

In [78]:
count

4

In [79]:
ret_dict[orig_dict[SPECIAL_SYMBOL]]

u'\u02e0'

In [80]:
x[-2]

[1196, 35, 810, 501, 375, 326, 27, 88, 242, 257, 879, 162, 3]

# Start the prediction process, and write data to output file

In [83]:
# Predict a label for every character and write the segmented text.
# Label ids: 0 = beginning of a word, 1 = middle, 2 = end, 3 = single-character word.
# `prev` holds the label of the previously written character and is carried
# across sequences, since the fixed-size windows cut through words and lines.
# NOTE(review): when an "end" label follows an "end"/"single" label or a line
# break, no trailing space is written but prev still becomes 2, so a following
# "beginning" character is written without a separating space. Confirm whether
# that is intended.
prev = 0
forw = 0  # not used in this cell; kept in case another cell reads it
space = ' '.encode('utf-8')
newline = '\n'.encode('utf-8')

# `with` guarantees the file is flushed and closed even if prediction fails.
with open('./output_BLSTM', 'wb') as out_file:
    for seq in x[:-1]:  # the last entry of x is an empty list
        # get the class label
        pred_labels = model.predict_classes(np.array(seq).reshape(1, len(seq)))

        for char, label in zip(seq, pred_labels[0]):
            if ret_dict[char] == SPECIAL_SYMBOL:
                # the special symbol stands for a line break in the input
                out_file.write(newline)
                prev = 3
                continue

            encoded = ret_dict[char].encode('utf-8')

            if label == 0:  # beg
                # previous one was beg or mid: put prefix space, a new word is starting
                if prev in (0, 1):
                    out_file.write(space + encoded)
                # previous one was end or single: no prefix space is written
                elif prev in (2, 3):
                    out_file.write(encoded)
                prev = 0

            elif label == 1:  # mid
                # continue the word, whatever the previous label was
                out_file.write(encoded)
                prev = 1

            elif label == 2:  # end
                # previous one was beg or mid: add trailing space, word just ended
                if prev in (0, 1):
                    out_file.write(encoded + space)
                # previous one was end or single: write the character only
                elif prev in (2, 3):
                    out_file.write(encoded)
                prev = 2

            elif label == 3:  # single letter
                # a single letter is always followed by a space; it also gets a
                # leading space unless the previous one was end or single
                if prev in (2, 3):
                    out_file.write(encoded + space)
                else:
                    out_file.write(space + encoded + space)
                prev = 3





















