## Loading the data, padding (based on 2.0)

In [1]:
import sys
import os
import numpy as np
import torch


##### I added Start Of Sentence token `<SOS>` and End Of Sentence token `<EOS>`

In [2]:
# Label data
#
# Start of Sentence: SOS: 0
# End of Sentence:   EOS: 1
# Word Start:             2
# Word End:               3
def read_chinese_data(inputfilename):
    with open(inputfilename, "r") as inputfile:
        sentences = []
        collection_words = []
        collection_labels = []
        for line in inputfile:
            if line[0] == '#':
                continue
            columns = line.split()
            if columns == []:
                sent = "<SOS>" + ''.join(collection_words) + "<EOS>"
                sentences.append((sent, [0] + collection_labels + [1]))
                collection_words = []
                collection_labels = []
                continue
            collection_words.append(columns[1])
            collection_labels += [2] + ([3] * (len(columns[1]) - 1))

    return sentences

In [3]:
train_sentences = read_chinese_data(
    '/scratch/lt2316-h20-resources/zh_gsd-ud-train.conllu')


In [4]:
test_sentences = read_chinese_data(
    '/scratch/lt2316-h20-resources/zh_gsd-ud-test.conllu')


##### `PAD` index is zero, `<EOS>` and `<SOS>` at the end of word to index dictionary

In [5]:
import re


def index_chars(sentences):
    regx = r"(^<SOS>|<EOS><SOS>|<EOS>$)"  # remove added token from sentences
    megasentence = re.sub(regx, "", "".join(sentences))
    char_list = set()
    for c in megasentence:
        char_list.add(c)
    # add EOS and SOS at the end of index list
    char_list = ["PAD"] + list(char_list) + ["<EOS>"] + ["<SOS>"]
    return char_list, {char_list[x]: x for x in range(len(char_list))}


In [6]:
int_index, char_index = index_chars(
    [x[0] for x in train_sentences + test_sentences])


In [7]:
# int_index: List of char in the train document
# char_index: char to index dict; index of a char in int_index.
for i in range(10):
    print(int_index[i], char_index[int_index[i]])

PAD 0
望 1
錳 2
遼 3
孫 4
皋 5
談 6
下 7
嶽 8
圃 9


In [8]:
# convert sequence of chars to the respective sequence of indecies
def convert_sentence(sentence, index):
    sent = sentence[5:-5]  # remove start and end tokens then add them
    sos = sentence[:5]
    eos = sentence[-5:]
    return [index[sos]] + [index[x] for x in sent] + [index[eos]]

In [9]:
def pad_lengths(sentences, max_length, padding=0):
    return [x + ([padding] * (max_length - len(x))) for x in sentences]

In [10]:
import random

rand_id = random.randint(0, len(train_sentences) - 1)
sample_sent = train_sentences[rand_id][0]
sample_label = train_sentences[rand_id][1]
sent_convert = convert_sentence(sample_sent, char_index)

print("Sentence Text")
print(sample_sent)
print()
print("Sentence Segmentation Label")
print(sample_label)
print()
print("Sentence in int")
print(sent_convert)
print()
print("Sentence Padded")
pad_len = len(sent_convert) + 4
print(pad_lengths([sent_convert], pad_len))

Sentence Text
<SOS>活化歷史建築諮詢委員會在2010年9月完成評審建議書，經發展局局長接納及批准，由嘉道理農場暨植物園公司建議的「綠滙學苑」方案獲選。<EOS>

Sentence Segmentation Label
[0, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 2, 2, 3, 3, 3, 2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 2, 2, 2, 3, 2, 2, 3, 2, 3, 2, 2, 3, 2, 2, 2, 3, 3, 2, 3, 2, 2, 3, 2, 2, 3, 2, 3, 2, 2, 2, 3, 2, 3, 2, 2, 3, 2, 3, 2, 1]

Sentence in int
[3649, 3412, 1658, 1013, 268, 578, 1624, 2396, 3321, 1024, 63, 1476, 2827, 611, 3023, 1379, 3023, 2890, 1956, 2985, 559, 2772, 2341, 2621, 578, 1450, 3526, 3521, 1952, 1972, 2456, 2829, 2829, 472, 2137, 1258, 1799, 2239, 2284, 3521, 2831, 3230, 525, 2007, 1366, 3551, 1998, 343, 571, 346, 1617, 1790, 578, 1450, 798, 3604, 2480, 2866, 2223, 2893, 407, 1857, 2575, 1284, 1581, 64, 3648]

Sentence Padded
[[3649, 3412, 1658, 1013, 268, 578, 1624, 2396, 3321, 1024, 63, 1476, 2827, 611, 3023, 1379, 3023, 2890, 1956, 2985, 559, 2772, 2341, 2621, 578, 1450, 3526, 3521, 1952, 1972, 2456, 2829, 2829, 472, 2137, 1258, 1799, 2239, 2284, 3521, 2831, 3230, 525, 2007, 13

In [11]:
def create_dataset(x, device="cpu"):
    converted = [(convert_sentence(x1[0], char_index), x1[1]) for x1 in x]
    X, y = zip(*converted)
    lengths = [len(x2) for x2 in X]
    padded_X = pad_lengths(X, max(lengths))
    Xt = torch.LongTensor(padded_X).to(device)
    padded_y = pad_lengths(y, max(lengths), padding=-1)
    yt = torch.LongTensor(padded_y).to(device)
    lengths_t = torch.LongTensor(lengths).to(device)
    return Xt, lengths_t, yt

In [12]:
train_X_tensor, train_lengths_tensor, train_y_tensor = create_dataset(
    train_sentences, torch.device("cuda:1"))
test_X_tensor, test_lengths_tensor, test_y_tensor = create_dataset(
    test_sentences, torch.device("cuda:1"))

In [13]:
print("Train X:\n", train_X_tensor[rand_id])

Train X:
 tensor([3649, 3412, 1658, 1013,  268,  578, 1624, 2396, 3321, 1024,   63, 1476,
        2827,  611, 3023, 1379, 3023, 2890, 1956, 2985,  559, 2772, 2341, 2621,
         578, 1450, 3526, 3521, 1952, 1972, 2456, 2829, 2829,  472, 2137, 1258,
        1799, 2239, 2284, 3521, 2831, 3230,  525, 2007, 1366, 3551, 1998,  343,
         571,  346, 1617, 1790,  578, 1450,  798, 3604, 2480, 2866, 2223, 2893,
         407, 1857, 2575, 1284, 1581,   64, 3648,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0

In [14]:
print("\nTrain Y:\n", train_y_tensor[rand_id])


Train Y:
 tensor([ 0,  2,  3,  2,  3,  2,  3,  2,  3,  2,  3,  2,  2,  2,  3,  3,  3,  2,
         2,  2,  2,  3,  2,  3,  2,  3,  2,  2,  2,  2,  3,  2,  2,  3,  2,  3,
         2,  2,  3,  2,  2,  2,  3,  3,  2,  3,  2,  2,  3,  2,  2,  3,  2,  3,
         2,  2,  2,  3,  2,  3,  2,  2,  3,  2,  3,  2,  1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1], device='cuda:1')


## Batching

In [15]:
class Batcher:
    def __init__(self, X, lengths, y, device, batch_size=50, max_iter=None):
        self.X = X
        self.lengths = lengths.to(torch.device("cpu"))
        self.y = y
        self.device = device
        self.batch_size = batch_size
        self.max_iter = max_iter
        self.curr_iter = 0

    def __iter__(self):
        return self

    def __next__(self):
        if self.curr_iter == self.max_iter:
            raise StopIteration
        permutation = torch.randperm(self.X.size()[0], device=self.device)
        permX = self.X[permutation]
        permlengths = self.lengths[permutation]
        permy = self.y[permutation]
        splitX = torch.split(permX, self.batch_size)
        splitlengths = torch.split(permlengths, self.batch_size)
        splity = torch.split(permy, self.batch_size)

        self.curr_iter += 1
        return splitX, splitlengths, splity

In [16]:
batches = Batcher(train_X_tensor,
                  train_lengths_tensor,
                  train_y_tensor,
                  torch.device('cuda:1'),
                  max_iter=100)


In [17]:
X0, l0, y0 = next(batches)
print(train_X_tensor.size())
print(len(X0), len(l0), len(y0))
print(X0[0].size())
print(l0[0].size())
print(y0[0].size())

torch.Size([3997, 184])
80 80 80
torch.Size([50, 184])
torch.Size([50])
torch.Size([50, 184])


## Modeling

##### I Added an Encoder layer (LSTM) under the word segmentation layer
##### I Removed the `log_softmax` from the model as I will goint to use `CrossEntropyLoss`

In [18]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [19]:
class SingleDualObj(nn.Module):
    def __init__(self, vocab_size, emb_size, enc_h_size, seg_h_size):
        super().__init__()

        self.emb = nn.Embedding(vocab_size, emb_size, 0)
        self.encoder = SeqEncoder(emb_size, enc_h_size)
        self.word_seg = Segmenter(enc_h_size, seg_h_size)

    def forward(self, x, lengths):

        embs = self.emb(x)
        encdd = self.encoder(embs, lengths)
        seg_logits = self.word_seg(encdd, lengths)

        return seg_logits

In [20]:
class SeqEncoder(nn.Module):
    def __init__(self, input_size, enc_h_size):
        super().__init__()

        self.lstm = nn.LSTM(input_size, enc_h_size, batch_first=True)

    def forward(self, x, lengths):
        packed = pack_padded_sequence(x,
                                      lengths,
                                      batch_first=True,
                                      enforce_sorted=False)
        output1, _ = self.lstm(packed)
        unpacked, _ = pad_packed_sequence(output1, batch_first=True)

        return unpacked

In [21]:
class Segmenter(nn.Module):
    def __init__(self, input_size, seg_h_size):
        super().__init__()

        self.lstm = nn.LSTM(input_size, seg_h_size, batch_first=True)
        self.sig1 = nn.Tanh()
        self.lin = nn.Linear(seg_h_size, 4)

    def forward(self, x_embd, lengths):
        packed = pack_padded_sequence(x_embd,
                                      lengths,
                                      batch_first=True,
                                      enforce_sorted=False)
        output1, _ = self.lstm(packed)
        unpacked, _ = pad_packed_sequence(output1, batch_first=True)
        output2 = self.sig1(unpacked)
        output3 = self.lin(output2)

        return output3


In [22]:
import torch.optim as optim

In [23]:
def train(X,
          lengths,
          y,
          vocab_size,
          emb_size,
          enc_h_size,
          seg_h_size,
          batch_size,
          epochs,
          device,
          model=None):

    batches = Batcher(X,
                      lengths,
                      y,
                      device,
                      batch_size=batch_size,
                      max_iter=epochs)

    if not model:
        m = SingleDualObj(vocab_size, emb_size, enc_h_size,
                          seg_h_size).to(device)
    else:
        m = model

    loss = nn.CrossEntropyLoss(ignore_index=-1)
    optimizer = optim.Adam(m.parameters(), lr=0.005)

    epoch = 0
    for batch in batches:

        tot_loss = 0
        for X, ls, y in zip(*batch):
            optimizer.zero_grad()

            o = m(X, ls)
            l = loss(o.permute(0, 2, 1), y[:, :max(ls)])
            tot_loss += l.item()
            l.backward()
            optimizer.step()

        if epoch % 4 == 0:
            print("Total loss in epoch {:<2d} is {:.3f}.".format(
                epoch, tot_loss))

        epoch += 1

    print("Total loss in epoch {:<2d} is {:.3f}.".format(epoch - 1, tot_loss))
    return m

In [24]:
kwargs = {
    "vocab_size": len(int_index),
    "emb_size": 300,
    "enc_h_size": 200,
    "seg_h_size": 150,
    "batch_size": 50,
    "epochs": 30,
    "device": torch.device("cuda:1")
}

In [25]:
model = train(train_X_tensor, train_lengths_tensor, train_y_tensor, **kwargs)


Total loss in epoch 0  is 29.987.
Total loss in epoch 4  is 3.383.
Total loss in epoch 8  is 1.349.
Total loss in epoch 12 is 0.726.
Total loss in epoch 16 is 0.763.
Total loss in epoch 20 is 0.577.
Total loss in epoch 24 is 0.539.
Total loss in epoch 28 is 0.547.
Total loss in epoch 29 is 0.725.


## Evaluation

In [30]:
from sklearn.metrics import accuracy_score, classification_report
from torch.nn import functional as F

In [33]:
model.eval()

preds = []
gtruth = []
test_lengs = test_lengths_tensor.to(torch.device("cpu"))
for i, X_test in enumerate(test_X_tensor):
    with torch.no_grad():
        output = model(X_test.view(1, -1), test_lengs[i].view(-1))
    prediction = torch.argmax(F.log_softmax(output, 2), 2)

    preds.extend(prediction[:test_lengs[i]].float().view(-1).tolist())
    gtruth.extend(test_y_tensor[i][:test_lengs[i]].float().tolist())

In [34]:
accuracy = accuracy_score(gtruth, preds)
print(classification_report(gtruth, preds))
print(f"Accuracy: {accuracy:.2%}")

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       500
         1.0       1.00      1.00      1.00       500
         2.0       0.95      0.96      0.95     12012
         3.0       0.93      0.91      0.92      7194

    accuracy                           0.94     20206
   macro avg       0.97      0.97      0.97     20206
weighted avg       0.94      0.94      0.94     20206

Accuracy: 94.33%
