# <span style='color:orange'> Modelling Chinese Word Segmentation as a Sequence-to-Sequence Prediction Problem </span>

# Generate data for sequence to sequence modelling

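This builds the structure "data", which contains each sentence as an individual list of (character, label) pairs. The label encodes the position of the character inside its word: 0 = first character, 1 = middle character, 2 = last character, 3 = single-character word.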

In [None]:
import numpy as np
import collections
from sklearn.model_selection import train_test_split
import keras
from keras.utils.np_utils import to_categorical
from keras.preprocessing import sequence

In [None]:
# Read the pre-segmented corpus (one sentence per line, words separated by
# single spaces) and tag every character with its position inside its word:
#   0 = first character of a multi-character word
#   1 = character somewhere in the middle
#   2 = last character of a multi-character word
#   3 = single-character word
# `data` is a list of sentences; each sentence is a list of (character, label)
# tuples. A dict cannot be used here, since the same character may appear
# again and would overwrite the value stored for it.

filepath = '/local-scratch/asa224/wseg_simplified_cn.txt'
# filepath = '/mnt/D82A1A8F2A1A6B30/wseg_simplified_cn.txt'
data = []
with open(filepath, 'rb') as f:
    for line in f:
        # bytes.decode works on both Python 2 and Python 3, whereas the
        # `unicode` builtin only exists on Python 2. Strip '\r' as well as
        # '\n' so Windows line endings are not labelled as characters.
        line = line.decode('utf-8').rstrip('\r\n')
        words = line.split(' ')

        # Build each sentence in its own list and append it once complete, so
        # no placeholder empty list is left dangling after the last line.
        sentence = []
        for word in words:
            if len(word) == 1:
                sentence.append((word[0], 3))
            else:
                # Empty tokens (e.g. from consecutive spaces) add nothing.
                last_index = len(word) - 1
                for i, character in enumerate(word):
                    if i == 0:  # this is the first letter
                        sentence.append((character, 0))
                    elif i == last_index:  # this is the last letter
                        sentence.append((character, 2))
                    else:  # this is somewhere in the middle
                        sentence.append((character, 1))
        data.append(sentence)

# Number of lines (sentences) read; the `with` block has already closed `f`.
count = len(data)

# Build initial integer embeddings

In [None]:
# Flatten the corpus into one list of characters (labels dropped), keeping
# sentence order and character order within each sentence.
all_chars = [character for sentence in data for character, _label in sentence]

In [None]:
# Total number of characters in the corpus, counting repeats.
len(all_chars)

In [None]:
def build_dataset(words):
    """Build integer-id lookup tables for the distinct items in `words`.

    Ids are handed out in descending order of frequency, so the most common
    item receives id 0.

    Args:
        words: iterable of hashable items (single characters in this
            notebook).

    Returns:
        A `(dictionary, reverse_dictionary)` tuple, where `dictionary` maps
        item -> id and `reverse_dictionary` maps id -> item.
    """
    ranked = collections.Counter(words).most_common()
    dictionary = {word: index for index, (word, _) in enumerate(ranked)}
    reverse_dictionary = {index: word for word, index in dictionary.items()}
    return dictionary, reverse_dictionary

In [None]:
# orig_dict: character -> integer id, ret_dict: integer id -> character.
orig_dict, ret_dict = build_dataset(all_chars)
# The flat character list is not used again below, so release it.
del all_chars

In [None]:
# Sanity check: mapping a character to its id and back should display the
# same character again.
ret_dict[orig_dict[u'\u6743']]

In [None]:
# Number of distinct characters, i.e. the vocabulary size.
len(orig_dict)

## Create sequences suitable for training

In [None]:
# Turn every sentence into two parallel sequences:
#   x[i][j] - integer id of the j-th character of sentence i
#   y[i][j] - position label (0-3) of that same character
# Building them directly from `data` keeps x and y aligned one-to-one with
# the sentences and avoids leaving an extra empty sample at the end.
x = [[orig_dict[character] for character, _ in sentence] for sentence in data]
y = [[label for _, label in sentence] for sentence in data]

In [None]:
# The raw (character, label) corpus is not referenced in the cells below;
# release it now that x and y exist.
del data

In [None]:
# Hold out 20% of the sentences as the test set. With shuffle=False the split
# is not randomised.
# NOTE(review): random_state presumably has no effect when shuffle=False -
# confirm against the scikit-learn documentation.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, shuffle=False, random_state=42)

In [None]:
# Report the size of each split.
# NOTE(review): the splits are still lists of variable-length sequences here,
# so np.shape is expected to report only the number of sentences - TODO
# confirm; recent NumPy versions may reject ragged input.
print('Size of the Dataset\n')
print('X_TRAIN: {}'.format(np.shape(X_train)))
print('Y_TRAIN: {}'.format(np.shape(y_train)))

print('X_TEST: {}'.format(np.shape(X_test)))
print('Y_TEST: {}'.format(np.shape(y_test)))

## Convert the labels to categorical. 

This is what they look like now

In [None]:
# Show only the first training label sequence (nothing is printed if the
# training set is empty).
for first_labels in y_train[:1]:
    print(first_labels)

### Start conversion

Training labels

In [None]:
# One-hot encode every training label sequence over the 4 position classes.
y_train = [to_categorical(labels, num_classes=4) for labels in y_train]

Testing labels <br>
The test set may contain empty lists (for example from blank lines or placeholder entries at the end of the data), so let's remove them before converting.

In [None]:
# Discard empty sequences from the test inputs and from the test labels.
X_test = [seq for seq in X_test if len(seq) > 0]
y_test = [labels for labels in y_test if len(labels) > 0]

In [None]:
# One-hot encode every test label sequence over the 4 position classes.
y_test = [to_categorical(labels, num_classes=4) for labels in y_test]

This is how it looks now

In [None]:
# Show only the first training label sequence, now in one-hot form (nothing
# is printed if the training set is empty).
for first_labels in y_train[:1]:
    print(first_labels)

Firstly, we must reshape the input and output sequences to be 3-dimensional to meet the expectations of the LSTM. The expected structure has the dimensions [samples, timesteps, features]. Here each sentence is one sample, padded to a fixed number of timesteps, with one input feature per timestep (the character's integer id) and a 4-way one-hot label per timestep.

Let's reshape the training and testing data

In [None]:
# Length of the longest training sequence.
maxval = max(len(seq) for seq in X_train)

In [None]:
# Bring every input and label sequence to exactly 100 timesteps, filling the
# missing positions with -1.
# NOTE(review): sequences longer than 100 are presumably truncated by
# pad_sequences using its default settings - confirm which end is cut.
pad_kwargs = dict(value=-1, maxlen=100)

X_train = sequence.pad_sequences(X_train, **pad_kwargs)
y_train = sequence.pad_sequences(y_train, **pad_kwargs)

X_test = sequence.pad_sequences(X_test, **pad_kwargs)
y_test = sequence.pad_sequences(y_test, **pad_kwargs)

In [None]:
# Give the arrays the [samples, timesteps, features] layout used by the
# model below: one feature per timestep for the inputs, four (one-hot) values
# per timestep for the labels.
n_timesteps = 100

X_train = X_train.reshape((X_train.shape[0], n_timesteps, 1))
y_train = y_train.reshape((y_train.shape[0], n_timesteps, 4))

X_test = X_test.reshape((X_test.shape[0], n_timesteps, 1))
y_test = y_test.reshape((y_test.shape[0], n_timesteps, 4))

In [None]:
# Display the final shape of the training inputs after the reshape.
X_train.shape

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, TimeDistributed, Dense, Masking

# Sequence-labelling network: two stacked LSTMs that emit an output at every
# timestep, followed by a per-timestep softmax over the 4 position classes.
model = Sequential([
    # mask_value matches the -1 used to pad the sequences.
    Masking(mask_value=-1, input_shape=(n_timesteps, 1)),
    LSTM(20, return_sequences=True),
    LSTM(20, return_sequences=True),
    TimeDistributed(Dense(4, activation='softmax')),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adadelta',
    metrics=['acc'],
)
model.summary()

In [None]:
# Train for a single epoch in batches of 100 sentences, evaluating on the
# held-out split. validation_data is passed as an (inputs, targets) tuple,
# which is the form documented by Keras; a list is not reliably accepted
# across Keras versions.
history = model.fit(X_train, y_train, epochs=1,
                    batch_size=100, verbose=1,
                    validation_data=(X_test, y_test))