## Loading the data, padding (based on 2.0)

In [1]:
import sys
import os
import numpy as np
import torch


##### I added Start Of Sentence token `<SOS>` and End Of Sentence token `<EOS>`

In [2]:
# Label data
#
# Start of Sentence: SOS: 0
# End of Sentence:   EOS: 1
# Word Start:             2
# Word End:               3
def read_chinese_data(inputfilename):
    with open(inputfilename, "r") as inputfile:
        sentences = []
        collection_words = []
        collection_labels = []
        for line in inputfile:
            if line[0] == '#':
                continue
            columns = line.split()
            if columns == []:
                sent = "<SOS>" + ''.join(collection_words) + "<EOS>"
                sentences.append((sent, [0] + collection_labels + [1]))
                collection_words = []
                collection_labels = []
                continue
            collection_words.append(columns[1])
            collection_labels += [2] + ([3] * (len(columns[1]) - 1))

    return sentences

In [3]:
train_sentences = read_chinese_data(
    '/scratch/lt2316-h20-resources/zh_gsd-ud-train.conllu')


In [4]:
test_sentences = read_chinese_data(
    '/scratch/lt2316-h20-resources/zh_gsd-ud-test.conllu')


##### `PAD` index is zero, `<EOS>` and `<SOS>` at the end of word to index dictionary

In [5]:
import re


def index_chars(sentences):
    regx = r"(^<SOS>|<EOS><SOS>|<EOS>$)"
    megasentence = re.sub(regx, "", "".join(sentences))
    char_list = set()
    for c in megasentence:
        char_list.add(c)
    char_list = ["PAD"] + list(char_list) + ["<EOS>"] + ["<SOS>"]
    return char_list, {char_list[x]: x for x in range(len(char_list))}


In [6]:
int_index, char_index = index_chars(
    [x[0] for x in train_sentences + test_sentences])


In [7]:
# int_index: List of char in the train document
# char_index: char to index dict; index of a char in int_index.
for i in range(10):
    print(int_index[i], char_index[int_index[i]])

PAD 0
妍 1
敗 2
舉 3
蓓 4
蜜 5
輿 6
閻 7
悔 8
漳 9


In [8]:
# convert sequence of chars to the respective sequence of indecies
def convert_sentence(sentence, index):
    sent = sentence[5:-5]  # remove start and end word
    sos = sentence[:5]
    eos = sentence[-5:]
    return [index[sos]] + [index[x] for x in sent] + [index[eos]]

In [9]:
def pad_lengths(sentences, max_length, padding=0):
    return [x + ([padding] * (max_length - len(x))) for x in sentences]

In [10]:
import random

rand_id = random.randint(0, len(train_sentences) - 1)
sample_sent = train_sentences[rand_id][0]
sample_label = train_sentences[rand_id][1]
sent_convert = convert_sentence(sample_sent, char_index)

print("Sentence Text")
print(sample_sent)
print()
print("Sentence Segmentation Label")
print(sample_label)
print()
print("Sentence in int")
print(sent_convert)
print()
print("Sentence Padded")
pad_len = len(sent_convert) + 4
print(pad_lengths([sent_convert], pad_len))

Sentence Text
<SOS>索菲兩人逃出來後前往美國大使館受阻，於是躲到巴黎市中心的一個公園。<EOS>

Sentence Segmentation Label
[0, 2, 3, 2, 2, 2, 2, 3, 2, 2, 3, 2, 3, 2, 3, 2, 2, 3, 2, 2, 3, 2, 3, 2, 3, 2, 2, 3, 2, 2, 2, 2, 3, 2, 1]

Sentence in int
[3649, 3636, 1511, 170, 3346, 339, 2254, 3552, 1551, 212, 1558, 2408, 458, 1348, 3139, 3478, 118, 1503, 531, 704, 2493, 375, 3105, 1162, 673, 744, 146, 3085, 2705, 1643, 222, 571, 2347, 2544, 3648]

Sentence Padded
[[3649, 3636, 1511, 170, 3346, 339, 2254, 3552, 1551, 212, 1558, 2408, 458, 1348, 3139, 3478, 118, 1503, 531, 704, 2493, 375, 3105, 1162, 673, 744, 146, 3085, 2705, 1643, 222, 571, 2347, 2544, 3648, 0, 0, 0, 0]]


In [11]:
def create_dataset(x, device="cpu"):
    converted = [(convert_sentence(x1[0], char_index), x1[1]) for x1 in x]
    X, y = zip(*converted)
    lengths = [len(x2) for x2 in X]
    padded_X = pad_lengths(X, max(lengths))
    Xt = torch.LongTensor(padded_X).to(device)
    padded_y = pad_lengths(y, max(lengths), padding=-1)
    yt = torch.LongTensor(padded_y).to(device)
    lengths_t = torch.LongTensor(lengths).to(device)
    return Xt, lengths_t, yt

In [12]:
train_X_tensor, train_lengths_tensor, train_y_tensor = create_dataset(
    train_sentences, torch.device("cuda:1"))
test_X_tensor, test_lengths_tensor, test_y_tensor = create_dataset(
    test_sentences, torch.device("cuda:1"))

In [13]:
print("Train X:\n", train_X_tensor[rand_id])

Train X:
 tensor([3649, 3636, 1511,  170, 3346,  339, 2254, 3552, 1551,  212, 1558, 2408,
         458, 1348, 3139, 3478,  118, 1503,  531,  704, 2493,  375, 3105, 1162,
         673,  744,  146, 3085, 2705, 1643,  222,  571, 2347, 2544, 3648,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0

In [14]:
print("\nTrain Y:\n", train_y_tensor[rand_id])


Train Y:
 tensor([ 0,  2,  3,  2,  2,  2,  2,  3,  2,  2,  3,  2,  3,  2,  3,  2,  2,  3,
         2,  2,  3,  2,  3,  2,  3,  2,  2,  3,  2,  2,  2,  2,  3,  2,  1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1], device='cuda:1')


## Batching

In [15]:
class Batcher:
    def __init__(self, X, lengths, y, device, batch_size=50, max_iter=None):
        self.X = X
        self.lengths = lengths.to(torch.device("cpu"))
        self.y = y
        self.device = device
        self.batch_size = batch_size
        self.max_iter = max_iter
        self.curr_iter = 0

    def __iter__(self):
        return self

    def __next__(self):
        if self.curr_iter == self.max_iter:
            raise StopIteration
        permutation = torch.randperm(self.X.size()[0], device=self.device)
        permX = self.X[permutation]
        permlengths = self.lengths[permutation]
        permy = self.y[permutation]
        splitX = torch.split(permX, self.batch_size)
        splitlengths = torch.split(permlengths, self.batch_size)
        splity = torch.split(permy, self.batch_size)

        self.curr_iter += 1
        return splitX, splitlengths, splity

In [16]:
batches = Batcher(train_X_tensor,
                  train_lengths_tensor,
                  train_y_tensor,
                  torch.device('cuda:1'),
                  max_iter=100)


In [17]:
X0, l0, y0 = next(batches)
print(train_X_tensor.size())
print(len(X0), len(l0), len(y0))
print(X0[0].size())
print(l0[0].size())
print(y0[0].size())

torch.Size([3997, 184])
80 80 80
torch.Size([50, 184])
torch.Size([50])
torch.Size([50, 184])


## Modeling

##### I Added an Encoder layer (LSTM) under the word segmentation layer.
##### I Removed the `log_softmax` from the model as I will goint to use `CrossEntropyLoss`.
##### I moved the embedding layer to a separate class.
##### I added a layer for char generation `self.generator` above the encoder layer.
##### I added a layer for word segmentation `self.word_seg` above the encoder layer.

In [18]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [19]:
class SingleDualObj(nn.Module):
    def __init__(self, vocab_size, emb_size, enc_h_size, seg_h_size,
                 gen_h_size):
        super().__init__()

        self.embedding = SeqEmbd(vocab_size, emb_size)
        self.encoder = SeqEncoder(emb_size, enc_h_size)
        self.word_seg = Segmenter(enc_h_size, seg_h_size)
        self.generator = CharGenerator(enc_h_size, gen_h_size, vocab_size - 1)

    def forward(self, x, lengths):

        embs = self.embedding(x)
        encdd = self.encoder(embs, lengths)
        gen_trn_seq = encdd.clone()[:, :-1]
        gen_logits = self.generator(gen_trn_seq, lengths - 1)
        seg_logits = self.word_seg(encdd, lengths)

        return gen_logits, seg_logits

In [20]:
class SeqEmbd(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super().__init__()

        self.emb = nn.Embedding(vocab_size, emb_size, 0)

    def forward(self, x):
        embs = self.emb(x)

        return embs

In [21]:
class SeqEncoder(nn.Module):
    def __init__(self, input_size, enc_h_size):
        super().__init__()

        self.lstm = nn.LSTM(input_size, enc_h_size, batch_first=True)

    def forward(self, x, lengths):
        packed = pack_padded_sequence(x,
                                      lengths,
                                      batch_first=True,
                                      enforce_sorted=False)
        output1, _ = self.lstm(packed)
        unpacked, _ = pad_packed_sequence(output1, batch_first=True)

        return unpacked

In [22]:
class Segmenter(nn.Module):
    def __init__(self, input_size, seg_h_size):
        super().__init__()

        self.lstm = nn.LSTM(input_size, seg_h_size, batch_first=True)
        self.fc =  nn.Sequential(nn.Sigmoid(), nn.Linear(seg_h_size, 4))

    def forward(self, x_embd, lengths):
        packed = pack_padded_sequence(x_embd,
                                      lengths,
                                      batch_first=True,
                                      enforce_sorted=False)
        last_out, _ = self.lstm(packed)
        unpacked, _ = pad_packed_sequence(last_out, batch_first=True)
        logits = self.fc(unpacked)

        return logits


##### The char generation layer consists of an LSTM layer and a classifier consisting of three Linear layers with a `Tanh` activation function in between. I normalized the output of the LSTM layer, the first and second Linear layers to enhance the performance. 

In [23]:
class CharGenerator(nn.Module):
    def __init__(self, input_size, gen_h_size, vocab_size):
        super().__init__()

        self.sequence = nn.LSTM(input_size,
                                gen_h_size,
                                num_layers=1,
                                batch_first=True)

        h_size1 = (gen_h_size + vocab_size) // 2
        h_size2 = (h_size1 + vocab_size) // 3
        self.clf1 = nn.Linear(gen_h_size, h_size1)
        self.clf2 = nn.Linear(h_size1, h_size2)
        self.fc = nn.Linear(h_size2, vocab_size)

        self.bn_enc = nn.BatchNorm1d(gen_h_size)
        self.bn_clf1 = nn.BatchNorm1d(h_size1)
        self.bn_clf2 = nn.BatchNorm1d(h_size2)

        self.act1 = nn.Tanh()
        self.act2 = nn.Tanh()

    def forward(self, x_embd, lengths):
        packed = pack_padded_sequence(x_embd,
                                      lengths,
                                      batch_first=True,
                                      enforce_sorted=False)
        output, _ = self.sequence(packed)
        output_unpacked, _ = pad_packed_sequence(output, batch_first=True)
        output_norm = self.bn_enc(output_unpacked.permute(0, 2, 1))

        clf1 = self.clf1(output_norm.permute(0, 2, 1))
        clf1_norm = self.act1(self.bn_clf1(clf1.permute(0, 2, 1)))

        clf2 = self.clf2(clf1_norm.permute(0, 2, 1))
        clf2_norm = self.act1(self.bn_clf2(clf2.permute(0, 2, 1)))

        return self.fc(clf2_norm.permute(0, 2, 1))

In [24]:
import torch.optim as optim

##### Each task (word segmentation and char generation) has its loss function.
##### To enhance the performance, the word segmentation layer is frozen until the char generation layer nearly converges to a solution. As the training loop may take a long time, the unfreezing occurs at loop number 450. This freezing/unfreezing approach allows the base sequence layer `self.encoder` to come with meaningful weights when training the word segmentation layer. Also, it prevents the gradients from the word segmentation layer from being a source of noise for the base sequence layer `self.encoder`.

In [25]:
def train(X,
          lengths,
          y,
          vocab_size,
          emb_size,
          enc_h_size,
          seg_h_size,
          gen_h_size,
          batch_size,
          epochs,
          device,
          model=None):

    batches = Batcher(X,
                      lengths,
                      y,
                      device,
                      batch_size=batch_size,
                      max_iter=epochs)

    if not model:
        m = SingleDualObj(vocab_size, emb_size, enc_h_size,
                          seg_h_size, gen_h_size).to(device)
    else:
        m = model

    loss_seg = nn.CrossEntropyLoss(ignore_index=-1)
    loss_gen = nn.CrossEntropyLoss(ignore_index=0)

    epoch = 0
    # freeze segmentation layer
    for layer in m.word_seg.children():
        for parameter in layer.parameters():
            parameter.requires_grad = False
    
    optimizer = optim.Adam(m.parameters(), lr=0.001)

    for batch in batches:

        tot_loss = 0
        for X, ls, y in zip(*batch):
            optimizer.zero_grad()

            # Unfreeze segmentation layer
            if epoch == 449:
                for layer in m.word_seg.children():
                    for parameter in layer.parameters():
                        parameter.requires_grad = True

            o_gen, o_seg = m(X, ls)
            l = loss_gen(o_gen.permute(0, 2, 1), X[:, 1:max(ls)])
            if epoch >= 449:
                l += loss_seg(o_seg.permute(0, 2, 1), y[:, :max(ls)])
            
            tot_loss += l.item()
            l.backward()
            optimizer.step()

        if (epoch + 1) % 25 == 0 or epoch == 0:
            print("Total loss in epoch {:<3d} is {:.3f}.".format(
                epoch, tot_loss))

        epoch += 1

    # print("Total loss in epoch {:<2d} is {:.3f}.".format(epoch - 1, tot_loss))
    return m

In [26]:
kwargs = {
    "vocab_size": len(int_index),
    "emb_size": 300,
    "enc_h_size": 200,
    "seg_h_size": 150,
    "gen_h_size": 700,
    "batch_size": 64,
    "epochs": 500,
    "device": torch.device("cuda:1")
}

In [27]:
model = train(train_X_tensor, train_lengths_tensor, train_y_tensor, **kwargs)


Total loss in epoch 0   is 385.279.
Total loss in epoch 24  is 20.162.
Total loss in epoch 49  is 18.570.
Total loss in epoch 74  is 53.748.
Total loss in epoch 99  is 18.481.
Total loss in epoch 124 is 18.579.
Total loss in epoch 149 is 18.107.
Total loss in epoch 174 is 19.980.
Total loss in epoch 199 is 17.854.
Total loss in epoch 224 is 17.788.
Total loss in epoch 249 is 17.272.
Total loss in epoch 274 is 17.017.
Total loss in epoch 299 is 37.176.
Total loss in epoch 324 is 16.206.
Total loss in epoch 349 is 15.990.
Total loss in epoch 374 is 15.949.
Total loss in epoch 399 is 15.516.
Total loss in epoch 424 is 15.257.
Total loss in epoch 449 is 67.710.
Total loss in epoch 474 is 15.518.
Total loss in epoch 499 is 15.130.


## Evaluation

##### Use two separates lists to hold the ground truth  and the predections for each task. 

In [28]:
from sklearn.metrics import accuracy_score, classification_report
from torch.nn import functional as F

In [29]:
model.eval()
loss = nn.CrossEntropyLoss(ignore_index=0)

probs = []
gen_preds = []
gen_gtruth = []
seg_preds = []
seg_gtruth = []
test_lengs = test_lengths_tensor.to(torch.device("cpu"))
for i, X_test in enumerate(test_X_tensor):
    with torch.no_grad():
        output = model(X_test.view(1, -1), test_lengs[i].view(-1))
        probs.append(
            loss(output[0].permute(0, 2, 1),
                 X_test[1:test_lengs[i]].view(1, -1)).item())

    gen_prediction = torch.argmax(F.log_softmax(output[0], 2), 2)
    seg_prediction = torch.argmax(F.log_softmax(output[1], 2), 2)

    gen_preds.extend(gen_prediction.float().view(-1).tolist())
    gen_gtruth.extend(X_test[1:test_lengs[i]].float().tolist())

    seg_preds.extend(seg_prediction.float().view(-1).tolist())
    seg_gtruth.extend(test_y_tensor[i][:test_lengs[i]].float().tolist())


In [30]:
from statistics import mean

accuracy = accuracy_score(gen_gtruth, gen_preds)

print("Next character prediction Report:\n")
print(f"Accuracy: {accuracy:.2%}")


print(f"Min: {min(probs):.3f}\nMax: {max(probs):.3f}\nAVG: {mean(probs):.3f}")

Next character prediction Report:

Accuracy: 16.54%
Min: 1.204
Max: 16.440
AVG: 9.823


In [31]:
accuracy = accuracy_score(seg_gtruth, seg_preds)

print("Word Segmentation Report:\n")
print(classification_report(seg_gtruth, seg_preds))
print(f"Accuracy: {accuracy:.2%}")

Word Segmentation Report:

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       500
         1.0       1.00      1.00      1.00       500
         2.0       0.92      0.94      0.93     12012
         3.0       0.89      0.87      0.88      7194

    accuracy                           0.92     20206
   macro avg       0.95      0.95      0.95     20206
weighted avg       0.92      0.92      0.92     20206

Accuracy: 91.68%
