# <span style='color:orange'> Modelling Chinese Word Segmentation as Sequence to Sequence Prediction Problem </span>

# Generate data for sequence to sequence modelling

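The first function below returns the structure `data`, which contains each sentence as an individual list of (character, label) pairs. The label encodes the character's position in its word: 0 = first character, 1 = middle character, 2 = last character, 3 = single-character word.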

In [1]:
import numpy as np
import collections
from sklearn.model_selection import train_test_split
import keras
from keras.utils.np_utils import to_categorical
from keras.preprocessing import sequence

Using TensorFlow backend.


In [2]:
# we open the count file, get the word and assign labels to each character
# cannot use dictionaries, since the same character may appear again and overwrites the value at its place in dict.
def generateTupleListAccToSentences(filename='/local-scratch/asa224/wseg_simplified_cn.txt'):
    """
    Label every character of a space-segmented corpus, producing one sequence per line.

    Each line of the input file is decoded as UTF-8, stripped of its newline and split
    on single spaces into words. Every character is paired with a position label:

        0 - first character of a multi-character word
        1 - interior character of a multi-character word
        2 - last character of a multi-character word
        3 - the only character of a single-character word

    Parameters
    ----------
    filename : str
        Path to the segmented training text (UTF-8, words separated by spaces).

    Returns
    -------
    list of list of (character, label) tuples
        One inner list per line of the file, followed by one trailing empty list.
        Blank lines also produce empty inner lists, so callers that need non-empty
        sequences have to filter them out.

    If you want to use this data as training data for LSTM, you have to pad the sequences
    since they are not of the same length.
    """
    data = []
    # Read bytes and decode explicitly: lines are split on '\n' only, and the code
    # runs on both Python 2 and Python 3 (the `unicode` builtin is Python 2 only).
    with open(filename, 'rb') as f:
        for raw_line in f:
            line = raw_line.decode('utf-8').replace('\n', '')
            sentence = []
            for word in line.split(' '):
                # Consecutive spaces yield empty "words"; they contribute nothing.
                last = len(word) - 1
                for i, character in enumerate(word):
                    if last == 0:    # single-character word
                        tag = 3
                    elif i == 0:     # first character
                        tag = 0
                    elif i == last:  # last character
                        tag = 2
                    else:            # somewhere in the middle
                        tag = 1
                    sentence.append((character, tag))
            data.append(sentence)

    # The result has always ended with one empty list; keep it so that downstream
    # sample counts (and the train/test split boundary) do not change.
    data.append([])
    return data
    
def generateTupleList(filename='/local-scratch/asa224/word_file_1M'):
    """
    Label every character of a word-per-line file and return one flat list.

    Each line of the file must have the form ``word<TAB>count``; the count field is
    asserted to parse as the integer 0, which serves as a check that the line was
    split correctly. Characters receive the same position labels as in
    generateTupleListAccToSentences:

        0 - first character of a multi-character word
        1 - interior character of a multi-character word
        2 - last character of a multi-character word
        3 - the only character of a single-character word

    Parameters
    ----------
    filename : str
        Path to the UTF-8 encoded, tab-separated word file.

    Returns
    -------
    list of (character, label) tuples
        All characters of all words, in file order.

    Raises
    ------
    ValueError
        If a line does not contain exactly one tab, or the count is not an integer.
    AssertionError
        If the count field is not 0.

    Use this function in conjunction with nGramSequenceGenerator(labelledlist, n) to create a
    training set with constant sequence size, which does not require paddings.
    """
    label = []
    # Decode the whole line before splitting so the same code works on Python 2
    # and Python 3 (bytes cannot be split on a str separator on Python 3).
    with open(filename, 'rb') as f:
        for raw_line in f:
            word, count = raw_line.decode('utf-8').split('\t')

            # making sure the parsing is going fine
            assert int(count) == 0

            last = len(word) - 1
            for i, character in enumerate(word):
                if last == 0:    # single-character word
                    tag = 3
                elif i == 0:     # first character
                    tag = 0
                elif i == last:  # last character
                    tag = 2
                else:            # somewhere in the middle
                    tag = 1
                label.append((character, tag))

    return label
    
def nGramSequenceGenerator(labelledlist, n):
    """
    Cut a flat list of (character, label) tuples into consecutive chunks of size "n".

    Parameters
    ----------
    labelledlist : list
        The flat label list, e.g. the output of generateTupleList().
    n : int
        Length of every generated sequence.

    Returns
    -------
    list of list
        len(labelledlist) // n chunks, each of exactly n items. Trailing items that
        do not fill a complete chunk are dropped.
    """
    # Floor division: plain "/" yields a float on Python 3, which range() rejects.
    count = len(labelledlist) // n
    return [labelledlist[i * n:(i + 1) * n] for i in range(count)]

# Build initial integer embeddings

In [3]:
# Flatten the per-sentence (character, label) tuples into one list of characters.
# NOTE(review): `data` is not assigned in any visible cell; presumably it is the
# result of generateTupleListAccToSentences() -- confirm so Restart & Run All works.
all_chars = [pair[0] for sentence in data for pair in sentence]

In [4]:
# Total number of characters collected from `data`, repeats included.
len(all_chars)

41617387

In [5]:
def build_dataset(words):
    """
    Assign an integer id to every distinct item of `words`.

    Ids are handed out in the order returned by Counter.most_common(), so the most
    frequent item receives id 0.

    Returns a pair (dictionary, reverse_dictionary): the first maps item -> id,
    the second maps id -> item.
    """
    ranked = collections.Counter(words).most_common()
    dictionary = {word: idx for idx, (word, _) in enumerate(ranked)}
    reverse_dictionary = {idx: word for word, idx in dictionary.items()}
    return dictionary, reverse_dictionary

In [6]:
# Build the character -> id lookup (orig_dict) and its inverse (ret_dict).
orig_dict, ret_dict = build_dataset(all_chars)
# Free the flat character list; no later cell reads it. Because of this `del`,
# re-running this cell alone raises NameError until all_chars is rebuilt.
del all_chars

In [7]:
# Sanity check: mapping a character to its id and back should return the same character.
ret_dict[orig_dict[u'\u6743']]

u'\u6743'

In [8]:
# Vocabulary size: the number of distinct characters that received an id.
len(orig_dict)

5419

## Create sequences suitable for training

In [9]:
# Split every labelled sentence into a list of character ids (x) and the
# parallel list of position labels (y).
x = [[orig_dict[pair[0]] for pair in sentence] for sentence in data]
y = [[pair[1] for pair in sentence] for sentence in data]
# Both lists end with one extra empty sequence, so they hold len(data) + 1 entries.
# NOTE(review): the extra entry only affects the sample count and split boundary;
# empty sequences are filtered out after the train/test split.
x.append([])
y.append([])

In [10]:
# Release the labelled corpus; the following cells work from x and y only.
del data

In [11]:
# Hold out 20% of the sequences for testing. shuffle=False is passed, so the split
# is made without shuffling the sentences first.
# NOTE(review): random_state presumably has no effect when shuffle=False -- confirm
# against the scikit-learn documentation.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, shuffle=False, random_state=42)

In [12]:
# Report the shape of each split.
print('Size of the Dataset\n')
for template, split in (
    ('X_TRAIN: {}', X_train),
    ('Y_TRAIN: {}', y_train),
    ('X_TEST: {}', X_test),
    ('Y_TEST: {}', y_test),
):
    print(template.format(np.shape(split)))

Size of the Dataset

X_TRAIN: (816287,)
Y_TRAIN: (816287,)
X_TEST: (204072,)
Y_TEST: (204072,)


## Convert the labels to categorical. 

This is what they look like now

In [13]:
# Show the label sequence of the first training sentence (nothing if there is none).
for first_labels in y_train[:1]:
    print(first_labels)

[0, 2, 0, 2, 3, 3, 0, 2, 0, 2, 3, 3, 0, 1, 1, 2, 0, 2, 0, 2, 3, 0, 1, 2, 0, 1, 2, 0, 2, 3, 3, 3, 3, 0, 2, 3, 3, 0, 1, 1, 1, 2, 0, 2, 0, 2, 3, 3, 3]


### Start conversion

Testing labels <br>
There are empty lists in the test set; let's remove them.

In [14]:
# Drop zero-length sequences from every split. Inputs and labels are filtered
# separately; they stay aligned because x[i] and y[i] always have the same length.
X_train = [seq for seq in X_train if len(seq) > 0]
y_train = [seq for seq in y_train if len(seq) > 0]

X_test = [seq for seq in X_test if len(seq) > 0]
y_test = [seq for seq in y_test if len(seq) > 0]

Training labels

In [15]:
# Replace each training label sequence, in place, by its one-hot matrix (4 classes).
# This cell is not idempotent: running it twice would re-encode the one-hot rows.
for idx, labels in enumerate(y_train):
    y_train[idx] = to_categorical(labels, num_classes=4)

In [16]:
# Replace each test label sequence, in place, by its one-hot matrix (4 classes).
# This cell is not idempotent: running it twice would re-encode the one-hot rows.
for idx, labels in enumerate(y_test):
    y_test[idx] = to_categorical(labels, num_classes=4)

This is how it looks now

In [17]:
# Show the one-hot label matrix of the first training sentence (nothing if there is none).
for first_labels in y_train[:1]:
    print(first_labels)

[[ 1.  0.  0.  0.]
 [ 0.  0.  1.  0.]
 [ 1.  0.  0.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  0.  1.]
 [ 0.  0.  0.  1.]
 [ 1.  0.  0.  0.]
 [ 0.  0.  1.  0.]
 [ 1.  0.  0.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  0.  1.]
 [ 0.  0.  0.  1.]
 [ 1.  0.  0.  0.]
 [ 0.  1.  0.  0.]
 [ 0.  1.  0.  0.]
 [ 0.  0.  1.  0.]
 [ 1.  0.  0.  0.]
 [ 0.  0.  1.  0.]
 [ 1.  0.  0.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  0.  1.]
 [ 1.  0.  0.  0.]
 [ 0.  1.  0.  0.]
 [ 0.  0.  1.  0.]
 [ 1.  0.  0.  0.]
 [ 0.  1.  0.  0.]
 [ 0.  0.  1.  0.]
 [ 1.  0.  0.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  0.  1.]
 [ 0.  0.  0.  1.]
 [ 0.  0.  0.  1.]
 [ 0.  0.  0.  1.]
 [ 1.  0.  0.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  0.  1.]
 [ 0.  0.  0.  1.]
 [ 1.  0.  0.  0.]
 [ 0.  1.  0.  0.]
 [ 0.  1.  0.  0.]
 [ 0.  1.  0.  0.]
 [ 0.  0.  1.  0.]
 [ 1.  0.  0.  0.]
 [ 0.  0.  1.  0.]
 [ 1.  0.  0.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  0.  1.]
 [ 0.  0.  0.  1.]
 [ 0.  0.  0.  1.]]


LSTM layers expect 3-dimensional input and output with the dimensions [samples, timesteps, features]. Here each sentence is one sample, padded to a fixed number of timesteps; the input has one feature per timestep (the character id) and the output has four features per timestep (the one-hot position label).

Let's reshape the training and testing data

In [18]:
# Length of the longest training sequence.
# NOTE(review): the padding cell hard-codes maxlen=100 and does not read maxval.
maxval = max(len(seq) for seq in X_train)

In [19]:
# Bring every sequence to a fixed length of 100 timesteps, filling with -1.
# -1 is also the mask_value of the model's Masking layer, so padded inputs can be
# masked out. NOTE(review): pad_sequences presumably pads and truncates at the
# front by default -- confirm against the Keras documentation.
# The four arrays are rebound one statement at a time, so each unpadded list can
# be released before the next split is padded.
X_train = sequence.pad_sequences(X_train, value=-1, maxlen=100)
y_train = sequence.pad_sequences(y_train, value=-1, maxlen=100)

X_test = sequence.pad_sequences(X_test, value=-1, maxlen=100)
y_test = sequence.pad_sequences(y_test, value=-1, maxlen=100)

In [20]:
# Arrange inputs and labels as [samples, timesteps, features]:
# one feature (the character id) for X, four (the one-hot label) for y.
n_timesteps = 100

X_train = X_train.reshape((X_train.shape[0], n_timesteps, 1))
y_train = y_train.reshape((y_train.shape[0], n_timesteps, 4))

X_test = X_test.reshape((X_test.shape[0], n_timesteps, 1))
y_test = y_test.reshape((y_test.shape[0], n_timesteps, 4))

In [21]:
# Confirm the reshaped training input is (samples, n_timesteps, 1).
X_train.shape

(816287, 100, 1)

TODO: Add embedding layer <br>
TODO: Increase LSTM units <br>
TODO: Decrease the sequence size <br>
TODO: Decrease the sequence size from the training set generation itself <br>

In [30]:
from keras.models import Sequential
from keras.layers import LSTM, TimeDistributed, Dense, Masking, Embedding
# Sequence-labelling network: a Masking layer that skips timesteps equal to -1,
# two stacked 200-unit LSTMs that return the full sequence, and a softmax over
# the 4 position labels applied at every timestep.
model = Sequential([
    Masking(mask_value=-1, input_shape=(n_timesteps, 1)),
    LSTM(200, return_sequences=True),
    LSTM(200, return_sequences=True),
    TimeDistributed(Dense(4, activation='softmax')),
])
model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_7 (Masking)          (None, 100, 1)            0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 100, 200)          161600    
_________________________________________________________________
lstm_6 (LSTM)                (None, 100, 200)          320800    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 100, 4)            804       
Total params: 483,204
Trainable params: 483,204
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for one epoch and evaluate on the held-out split after the epoch.
# validation_data is passed as a tuple (x_val, y_val), the form documented by Keras.
history = model.fit(X_train, y_train, epochs=1,
          batch_size=100, verbose=1, validation_data=(X_test, y_test))

Train on 816287 samples, validate on 204070 samples
Epoch 1/1


In [None]:
# Continue training the same model for one more epoch (weights carry over from
# the previous fit call); the metrics of this epoch are kept in history2.
# validation_data is passed as a tuple (x_val, y_val), the form documented by Keras.
history2 = model.fit(X_train, y_train, epochs=1,
          batch_size=100, verbose=1, validation_data=(X_test, y_test))

Train on 816287 samples, validate on 204070 samples
Epoch 1/1