In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
gpu = torch.device('cuda')
cpu = torch.device('cpu')

word_to_ix = {}
tag_to_ix = {}
ix_to_tag = {}

In [4]:
START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 20
HIDDEN_DIM = 20

# Model Customize

In [5]:
def argmax(vector):
    _, idx = torch.max(vector, 1)
    return idx.item()

In [6]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype = torch.long)

In [7]:
# @vec: in GPU environment
# @outputs: in GPU environment
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]

    max_score_boardcast = max_score.view(1, -1).expand(1, vec.size()[1])

    exp = torch.exp(vec - max_score_boardcast)

    sum = torch.sum(exp)

    log = torch.log(sum)

    outputs = max_score + log

    return outputs

In [8]:
class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim).to(gpu)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers = 1, bidirectional = True).to(gpu)
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size).to(gpu)

        transition_temp = nn.Parameter(torch.randn(self.tagset_size, self.tagset_size))
        transition_temp.data[tag_to_ix[START_TAG], :] = -10000
        transition_temp.data[:, tag_to_ix[STOP_TAG]] = - 10000
        self.transition = transition_temp.to(gpu)

        self.hidden = self.init_hidden()

    def init_hidden(self):
        h = torch.randn(2, 1, self.hidden_dim)
        c = torch.randn(2, 1, self.hidden_dim)

        return (h.to(gpu), c.to(gpu))
    
    # @sentence: in GPU
    # lstm_feats: in GPU
    def lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence)
        embeds = embeds.reshape(len(sentence), 1, -1)

        lstm_out, self.hidden = self.lstm(embeds, self.hidden)

        lstm_forward = lstm_out[:, :, : self.hidden_dim]
        lstm_backward = lstm_out[:, :, self.hidden_dim : ]

        combine = torch.cat([lstm_forward.unsqueeze(0), lstm_backward.unsqueeze(0)], 
                            dim = 0)
        lstm_out = torch.mean(combine, dim = 0)

        lstm_out = lstm_out.reshape((lstm_out.shape[0], self.hidden_dim))
        lstm_feats = self.hidden2tag(lstm_out)

        return lstm_feats
    
    # @feats: in CPU
    # @tags: in CPU
    def score_sentence(self, feats, tags):
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype= torch.long), tags])

        transition_cpu = self.transition.to(cpu)
        for i, feat in enumerate(feats):
            score = score + transition_cpu[tags[i + 1], tags[i]] + feat[tags[i + 1]]

        score = score + transition_cpu[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score
        
    # @feats: in GPU because we do not use this very much
    # @return_value: in CPU
    def viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(gpu)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars

        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                next_tag_var = forward_var + self.transition[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transition[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.

        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score.to(cpu), best_path

    # The parameters is in GPU
    # @feats: (num_word, num_tag)
    def all_case_score(self, feats):
        init_alphas = torch.full((1, self.tagset_size), -10000).to(gpu)

        init_alphas[0][self.tag_to_ix[START_TAG]] = 0

        forward_var = init_alphas

        for feat in feats:
            alphas_t = []
            for next_tag in range(self.tagset_size):
                emit_score = feat[next_tag].reshape(1, -1).expand(1, self.tagset_size)

                trans_score = self.transition[next_tag].view(1, -1)
                next_tag_var = forward_var + trans_score + emit_score

                alphas_t.append(log_sum_exp(next_tag_var).reshape(1))

            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transition[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

                
    # @sentence: in CPU
    # @tags: in CPU
    def neg_log_likelihood(self, sentence, tags):
        feats = self.lstm_features(sentence.to(gpu))
        feats_cpu = feats.to(cpu)

        forward_score = self.all_case_score(feats).to(cpu)
        gold_score = self.score_sentence(feats_cpu, tags)

        return forward_score - gold_score
    
    def forward(self, sentence):
        lstm_feats = self.lstm_features(sentence.to(gpu))

        score, tag_seq = self.viterbi_decode(lstm_feats)

        return score, tag_seq

# Read training set and test set

In [9]:
# seq_list: A list of list tokens
# tag_list: A list of list tags
def read_data(filename):
    csv_reader = pd.read_excel(filename, header=1)
    
    seqs = []
    tags = []

    for _,row in csv_reader.iterrows():
        seqs.append(row[0].split(','))
        tags.append(row[1].split(','))

    return (seqs, tags)

In [10]:
def train_data(filename):
    seq_list, tag_list = read_data(filename)
    
    training_data = []
    for words, tag in zip(seq_list, tag_list):
      
        if(len(words) != len(tag)):
            continue
        training_data.append((words, tag))

    return training_data

In [11]:
def test_data(filename):
    seq_list, tag_list = read_data(filename)
    testing_data = []

    for seqs, tags in zip(seq_list, tag_list):
        word_verify = [x in word_to_ix for x in seqs]
        tag_verify = [x in tag_to_ix for x in tags]

        if all(word_verify) and all(tag_verify):
            testing_data.append((seqs, tags))

    print(len(testing_data))
    return testing_data

In [12]:
training_data = train_data('/content/drive/MyDrive/Dataset/Training_Bahnar/Training_set.xlsx')

# Split only 20 sentence for test training data
# training_data = training_data[:20]

for sentence, tags in training_data:
    for word, tag in zip(sentence, tags):

        if word not in word_to_ix:
            # Function: WORD -> INDEX
            word_to_ix[word] = len(word_to_ix)
        
        if tag not in tag_to_ix:
            # Function TAG -> INDEX
            tag_to_ix[tag] = len(tag_to_ix)

            # Function INDEX -> TAG
            ix_to_tag[len(tag_to_ix) - 1] = tag

tag_to_ix['<START>'] = len(tag_to_ix)
tag_to_ix['<STOP>'] = len(tag_to_ix)

print(f"Number of sentence: {len(training_data)}")

Number of sentence: 100


In [13]:
print(training_data[0][0])
print(training_data[0][1])

['đôi', 'bĭ', 'alê̆', 'đơ̆i', 'năr', 'lơnga', 'lu bơ̆n', 'jơh', 'lêch', 'sơnglŏng']
['N', 'R', 'V', 'R', 'N', 'V', 'P', 'R', 'V', 'N']


# Training Section

In [14]:
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01, weight_decay= 1e-4)

In [20]:
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    precheck_tags = training_data[0][1]

    score, predict_tags = model(precheck_sent.to(gpu))

    print(score)
    print('Predict tag:\t', [ix_to_tag[tag] for tag in predict_tags])
    print("True tag:\t", precheck_tags)

tensor(27.0021)
Predict tag:	 ['F', 'X', 'QUESTION', 'Np', 'N', 'QUESTION', 'Np', 'N', 'V', 'A']
True tag:	 ['N', 'R', 'V', 'R', 'N', 'V', 'P', 'R', 'V', 'N']


In [21]:
for epoch in range(100):
    totalLoss = 0
    for sentence, tags in training_data:
        optimizer.zero_grad()

        sentence = prepare_sequence(sentence, word_to_ix)
        tags = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)

        loss = model.neg_log_likelihood(sentence, tags)

        loss.backward()
        optimizer.step()

        with torch.no_grad():
            totalLoss +=  loss.item()
    print(f"End epoch {epoch} with loss {totalLoss}")

End epoch 0 with loss 7258.371290206909
End epoch 1 with loss 6085.455087661743
End epoch 2 with loss 5336.983615875244
End epoch 3 with loss 4661.423761367798
End epoch 4 with loss 4054.0585193634033
End epoch 5 with loss 3523.1279430389404
End epoch 6 with loss 3069.406349182129
End epoch 7 with loss 2709.944366455078
End epoch 8 with loss 2400.266975402832
End epoch 9 with loss 2153.0834922790527
End epoch 10 with loss 1932.2214584350586
End epoch 11 with loss 1761.8711280822754
End epoch 12 with loss 1602.4198112487793
End epoch 13 with loss 1466.6021614074707
End epoch 14 with loss 1328.7052383422852
End epoch 15 with loss 1216.3342247009277
End epoch 16 with loss 1122.8028373718262
End epoch 17 with loss 1038.800765991211
End epoch 18 with loss 950.663501739502
End epoch 19 with loss 900.2775993347168
End epoch 20 with loss 807.7732315063477
End epoch 21 with loss 763.1480331420898
End epoch 22 with loss 698.6418228149414
End epoch 23 with loss 658.9648361206055
End epoch 24 with

# Testing Section

In [28]:
with torch.no_grad():

    testing_data = test_data('/content/drive/MyDrive/Dataset/Training_Bahnar/Testing_set.xlsx')

    for words, tags in testing_data:
      precheck_sent = prepare_sequence(words, word_to_ix)
      
      predict_tags = model(precheck_sent)[1]

      count = 0
      for tag, predict_tag in zip(tags, predict_tags):
          if(tag_to_ix[tag] == predict_tag):
              count += 1

      print(f"{count}/{len(tags)}")

      # break

20
9/10
10/10
9/10
9/10
9/11
15/15
15/15
7/7
9/9
10/10
8/10
10/10
12/14
12/14
11/11
16/16
11/11
12/13
13/16
13/16


# Saving Model

In [29]:
# Save the entire model
torch.save(model, 'model_complete.pth')