## Loading the data, padding (based on 2.0)

In [1]:
import sys
import os
import numpy as np
import torch


In [2]:
def read_chinese_data(inputfilename):
    with open(inputfilename, "r") as inputfile:
        sentences = []
        collection_words = []
        collection_labels = []
        for line in inputfile:
            if line[0] == '#':
                continue
            columns = line.split()
            #print(words)
            if columns == []:
                sentences.append(
                    (''.join(collection_words), collection_labels))
                collection_words = []
                collection_labels = []
                continue
            collection_words.append(columns[1])
            collection_labels += [1] + ([0] * (len(columns[1]) - 1))

    return sentences

In [3]:
train_sentences = read_chinese_data(
    '/scratch/lt2316-h20-resources/zh_gsd-ud-train.conllu')


In [4]:
test_sentences = read_chinese_data(
    '/scratch/lt2316-h20-resources/zh_gsd-ud-test.conllu')


In [5]:
def index_chars(sentences):
    megasentence = ''.join(sentences)
    char_list = set()
    for c in megasentence:
        char_list.add(c)
    char_list = [0] + list(char_list)
    return char_list, {char_list[x]: x for x in range(len(char_list))}


In [6]:
int_index, char_index = index_chars(
    [x[0] for x in train_sentences + test_sentences])


In [7]:
# int_index: List of char in the train document
# char_index: char to index dict; index of a char in int_index.
for i in range(10):
    print(int_index[i], char_index[int_index[i]])

0 0
靼 1
忒 2
邵 3
酉 4
渣 5
延 6
賃 7
辰 8
赤 9


In [8]:
# convert sequence of chars to the respective sequence of indecies
def convert_sentence(sentence, index):
    return [index[x] for x in sentence]

In [9]:
def pad_lengths(sentences, max_length, padding=0):
    return [x + ([padding] * (max_length - len(x))) for x in sentences]

In [10]:
def create_dataset(x, device="cpu"):
    converted = [(convert_sentence(x1[0], char_index), x1[1]) for x1 in x]
    X, y = zip(*converted)
    lengths = [len(x2) for x2 in X]
    padded_X = pad_lengths(X, max(lengths))
    Xt = torch.LongTensor(padded_X).to(device)
    padded_y = pad_lengths(y, max(lengths), padding=-1)
    yt = torch.LongTensor(padded_y).to(device)
    lengths_t = torch.LongTensor(lengths).to(device)
    return Xt, lengths_t, yt

In [11]:
train_X_tensor, train_lengths_tensor, train_y_tensor = create_dataset(
    train_sentences, "cuda:2")
test_X_tensor, test_lengths_tensor, test_y_tensor = create_dataset(
    test_sentences, "cuda:2")

In [12]:
print("Train X:\n", train_X_tensor[0])

Train X:
 tensor([1553,  312, 2836, 1507, 1707,  311, 3311, 1133, 1044,  308, 2221, 2932,
        1458, 1707, 2566, 1731, 3214, 1643, 1278, 1109, 1765, 1091, 3311, 2595,
        1909, 2339, 1091, 1021,  701, 2774,  324, 1707,  144, 2928, 2024, 2595,
         648, 3353, 1091, 1144, 2274, 1707, 2566, 1033, 2253, 3398,  100, 1707,
        3557, 1569, 2932,  882, 1091, 2286, 3311, 1967,  843, 1396,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0

In [13]:
print("\nTrain Y:\n", train_y_tensor[0])


Train Y:
 tensor([ 1,  0,  1,  0,  1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  1,  0,  1,
         0,  1,  0,  1,  1,  1,  1,  0,  1,  1,  0,  1,  0,  1,  1,  1,  1,  1,
         1,  0,  1,  1,  0,  1,  1,  1,  0,  0,  0,  1,  1,  0,  1,  0,  1,  1,
         0,  1,  0,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1], device='cuda:2')


## Batching

In [14]:
class Batcher:
    def __init__(self, X, lengths, y, device, batch_size=50, max_iter=None):
        self.X = X
        self.lengths = lengths.to("cpu")
        self.y = y
        self.device = device
        self.batch_size = batch_size
        self.max_iter = max_iter
        self.curr_iter = 0

    def __iter__(self):
        return self

    def __next__(self):
        if self.curr_iter == self.max_iter:
            raise StopIteration
        permutation = torch.randperm(self.X.size()[0], device=self.device)
        permX = self.X[permutation]
        permlengths = self.lengths[permutation]
        permy = self.y[permutation]
        splitX = torch.split(permX, self.batch_size)
        splitlengths = torch.split(permlengths, self.batch_size)
        splity = torch.split(permy, self.batch_size)

        self.curr_iter += 1
        return splitX, splitlengths, splity

In [15]:
batches = Batcher(train_X_tensor,
                  train_lengths_tensor,
                  train_y_tensor,
                  torch.device('cuda:2'),
                  max_iter=100)


In [16]:
X0, l0, y0 = next(batches)
print(train_X_tensor.size())
print(len(X0), len(l0), len(y0))
print(X0[0].size())
print(l0[0].size())
print(y0[0].size())

torch.Size([3997, 182])
80 80 80
torch.Size([50, 182])
torch.Size([50])
torch.Size([50, 182])


## Modeling

In [17]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [18]:
class Segmenter(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super().__init__()

        self.vocab_size = vocab_size
        self.emb_size = emb_size

        self.emb = nn.Embedding(self.vocab_size, self.emb_size, 0)
        self.lstm = nn.LSTM(self.emb_size, 150, batch_first=True)
        self.sig1 = nn.Sigmoid()
        self.lin = nn.Linear(150, 2)
        self.softmax = nn.LogSoftmax(2)

    def forward(self, x, lengths):
        embs = self.emb(x)
        packed = pack_padded_sequence(embs,
                                      lengths,
                                      batch_first=True,
                                      enforce_sorted=False)
        output1, _ = self.lstm(packed)
        unpacked, _ = pad_packed_sequence(output1, batch_first=True)
        output2 = self.sig1(unpacked)
        output3 = self.lin(output2)
        return self.softmax(output3)


In [19]:
import torch.optim as optim

In [20]:
def train(X,
          lengths,
          y,
          vocab_size,
          emb_size,
          batch_size,
          epochs,
          device,
          model=None):

    batches = Batcher(X,
                      lengths,
                      y,
                      device,
                      batch_size=batch_size,
                      max_iter=epochs)

    if not model:
        m = Segmenter(vocab_size, emb_size).to(device)
    else:
        m = model

    loss = nn.NLLLoss(ignore_index=-1)
    optimizer = optim.Adam(m.parameters(), lr=0.005)

    epoch = 0
    for batch in batches:

        tot_loss = 0
        for X, ls, y in zip(*batch):
            optimizer.zero_grad()

            o = m(X, ls)
            l = loss(o.permute(0, 2, 1), y[:, :max(ls)])
            tot_loss += l.item()
            l.backward()
            optimizer.step()

        if epoch % 4 == 0:
            print("Total loss in epoch {:<2d} is {:.3f}.".format(
                epoch, tot_loss))

        epoch += 1

    print("Total loss in epoch {:<2d} is {:.3f}.".format(epoch - 1, tot_loss))
    return m

In [21]:
model = train(train_X_tensor, train_lengths_tensor, train_y_tensor,
              len(int_index), 200, 50, 30, "cuda:2")


Total loss in epoch 0  is 35.846.
Total loss in epoch 4  is 9.246.
Total loss in epoch 8  is 4.132.
Total loss in epoch 12 is 2.154.
Total loss in epoch 16 is 2.076.
Total loss in epoch 20 is 1.237.
Total loss in epoch 24 is 0.277.
Total loss in epoch 28 is 0.129.
Total loss in epoch 29 is 0.116.


## Evaluation

In [22]:
from sklearn.metrics import accuracy_score, classification_report

In [23]:
model.eval()

preds = []
gtruth = []
for i, X_test in enumerate(test_X_tensor):
    output = model(X_test.view(1, -1), [test_lengths_tensor[i].item()])
    prediction = torch.argmax(output, 2)

    preds.extend(prediction[:test_lengths_tensor[i]].float().view(-1).tolist())
    gtruth.extend(test_y_tensor[i][:test_lengths_tensor[i]].float().tolist())

In [24]:
accuracy = accuracy_score(gtruth, preds)
print(classification_report(gtruth, preds))
print(f"Accuracy: {accuracy:.2%}")

              precision    recall  f1-score   support

         0.0       0.90      0.89      0.90      7194
         1.0       0.93      0.94      0.94     12012

    accuracy                           0.92     19206
   macro avg       0.92      0.92      0.92     19206
weighted avg       0.92      0.92      0.92     19206

Accuracy: 92.27%
