In [None]:
# Mount Google Drive (Colab runtime) so the data and model paths under /content/drive resolve.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import re
import gensim
from gensim.models import Word2Vec, KeyedVectors
import torch
import torch.nn as nn
torch.manual_seed(1)
import numpy as np
import torch.optim as optim

from sklearn.model_selection import train_test_split

In [None]:
# Report the GPU when one is visible; on a CPU-only runtime print a notice instead of raising.
if torch.cuda.is_available():
  print(torch.cuda.get_device_name(0))
else:
  print("CUDA is not available; running on CPU")

Tesla T4


### **Prepare data for training word2vec from scratch**

In [None]:
# Word pattern: one or more ASCII letters / underscores, anchored at the start by match().
reg = re.compile(r"[A-Za-z_]+")
# A purely numeric token must not count as a word.
reg.match("7") is None

True

In [None]:
# Build the word2vec training corpus from the tab-separated training file.
# Sentences are separated by a blank line; only the first column (the token) of each line is used.
# Result: `br_input` (list of token lists) plus one space-joined sentence per line written to disk.
datapath = '/content/drive/My Drive/nlp/vlsp 2018/data/train2.txt'
brown_outpath = '/content/drive/My Drive/nlp/vlsp 2018/data/brown_input_rmv.txt'
reg = re.compile(r"[A-Za-z_]+")    # token starts with an ASCII letter or underscore -> keep, lower-cased
reg_num = re.compile(r"[0-9_/]+")  # token starts with a digit, "_" or "/" -> replaced by "$$$$"
# NOTE(review): match() is anchored at the first character, so a token starting with anything
# else (punctuation, but also non-ASCII letters such as "đ") matches neither pattern and is
# dropped from the corpus entirely. Confirm this is intended before changing the patterns.

with open(datapath, 'rb') as f:
  text = f.read().decode("utf-8")

br_input = []
# The context manager closes the output file even if processing raises; UTF-8 is set explicitly
# so writing does not depend on the platform's default encoding.
with open(brown_outpath, 'w', encoding='utf-8') as outfile:
  for sent in text.split("\n\n"):
    w_sent = []
    for line in sent.split('\n'):
      token = line.split('\t')[0]  # first column only
      if reg.match(token) is not None:
        w_sent.append(token.lower())
      elif reg_num.match(token) is not None:
        w_sent.append("$$$$")
    outfile.write("%s\n" % " ".join(w_sent))
    br_input.append(w_sent)

In [None]:
# Untrained skip-gram (sg=1) word2vec model with 200-dimensional vectors, a context window
# of 5, 10 negative samples, and every token kept (min_count=1).
w2v_model = Word2Vec(
    size=200,
    window=5,
    min_count=1,
    sg=1,
    negative=10,
    sample=1e-1,
    alpha=0.01,
    min_alpha=0.0001,
)

In [None]:
# Build the model vocabulary from the tokenised corpus prepared above.
w2v_model.build_vocab(br_input)

In [None]:
# Train for 10 epochs on the same corpus; total_examples is taken from the model's corpus_count.
w2v_model.train(br_input, total_examples=w2v_model.corpus_count, epochs=10, report_delay=1)

(2916980, 2916980)

In [None]:
# Qualitative check of the embeddings: nearest neighbours of "bà".
w2v_model.wv.most_similar(positive="bà")

[('anh', 0.8594333529472351),
 ('bé', 0.8481131792068481),
 ('chị', 0.8467159271240234),
 ('mẹ', 0.8396469950675964),
 ('yingluck', 0.8348666429519653),
 ('con', 0.8320862054824829),
 ('lời', 0.8315073847770691),
 ('cô', 0.8233706951141357),
 ('cụ', 0.8202928900718689),
 ('vợ', 0.8182156085968018)]

In [None]:
# Qualitative check of the embeddings: nearest neighbours of "xã".
w2v_model.wv.most_similar(positive="xã")

[('huyện', 0.9597066044807434),
 ('thôn', 0.888346791267395),
 ('thị_trấn', 0.8658595085144043),
 ('hà_tĩnh', 0.8563859462738037),
 ('phường', 0.8525856733322144),
 ('bắc_ninh', 0.8458022475242615),
 ('quận', 0.8443245887756348),
 ('tỉnh', 0.8420199155807495),
 ('nghệ_an', 0.8245270252227783),
 ('thanh_hoá', 0.8220493793487549)]

In [None]:
# Qualitative check of the embeddings: nearest neighbours of "việt_nam".
w2v_model.wv.most_similar(positive="việt_nam")

[('nhật_bản', 0.831634521484375),
 ('trung_quốc', 0.8109543323516846),
 ('hàn_quốc', 0.8017451763153076),
 ('châu_á', 0.7991105318069458),
 ('bóng_đá', 0.7960209846496582),
 ('quân_đội', 0.7810149788856506),
 ('xuất_sắc', 0.7781849503517151),
 ('nội_dung', 0.7760263681411743),
 ('châu_âu', 0.7749448418617249),
 ('lào', 0.7748377323150635)]

In [None]:
# Qualitative check of the embeddings: nearest neighbours of the multi-syllable token "mùa_thị_sinh".
w2v_model.wv.most_similar(positive="mùa_thị_sinh")

[('hồ_thị_lệ_hà', 0.9975352883338928),
 ('dương_bích_nguyệt', 0.997015118598938),
 ('nguyễn_thị_thu_hằng', 0.9968255758285522),
 ('okimoto-kaewtathip', 0.996513307094574),
 ('hứa_thị_phấn', 0.9964773058891296),
 ('nguyễn_huy_hoàng', 0.9963103532791138),
 ('nguyễn_phương_đông', 0.996132493019104),
 ('hường_văn_minh', 0.996028482913971),
 ('khúc_thị_hoa_phượng', 0.9959829449653625),
 ('trần_văn_lĩnh', 0.9959742426872253)]

In [None]:
def text2idx(sentences):
  """
    Convert tokenised sentences into lists of word2vec vocabulary indices.

    sentences: iterable of token lists; every token must be in the w2v_model vocabulary
               (a missing token raises KeyError).
    returns: list of lists of int, parallel to `sentences`.
  """
  return [[w2v_model.wv.vocab[w].index for w in sent] for sent in sentences]

# Index-encode the whole word2vec training corpus.
idx_corpus = text2idx(br_input)

### **Get data for training word2vec BiLSTM-CRF**

In [None]:
# Load the raw training file into `text` as a single string.
# The file is read as bytes and decoded as UTF-8, so line endings are kept exactly as stored.
datapath= '/content/drive/My Drive/nlp/vlsp 2018/data/train2.txt'
with open(datapath, 'rb') as f:
  text = f.read().decode('utf-8')

In [None]:
# Parse the raw corpus into (words, tags) pairs and build the tag <-> id mappings.
# Sentences are separated by a blank line; each token line is "word<TAB>tag<TAB>pos".
training_data = []
tag_to_idx = {}
idx_to_tag = {}
cnt = 0  # next free tag id; the START/STOP tags are numbered from this value afterwards
for sent in text.split('\n\n'):
  words = []
  tags = []
  for line in sent.split('\n'):
    r = line.split('\t')
    if len(r) != 3:
      # Blank or malformed line: skip it instead of re-appending the previous token/tag.
      continue
    w, t = r[0], r[1]  # the third column (POS) is not used
    words.append(w)
    tags.append(t)
    if t not in tag_to_idx:
      tag_to_idx[t] = cnt
      idx_to_tag[cnt] = t
      cnt += 1
  if words:
    # Chunks without any valid token line (e.g. trailing blank lines) are not sentences.
    training_data.append((words, tags))

In [None]:
# Sentinel tags used by the CRF layer; they take the two ids following the corpus tags.
START_TAG = "<START>"
STOP_TAG = "<STOP>"
tag_to_idx.update({START_TAG: cnt, STOP_TAG: cnt + 1})
idx_to_tag.update({cnt: START_TAG, cnt + 1: STOP_TAG})

In [None]:
# vec is a single-row tensor of shape 1 x tagset_size
def argmax(vec):
  """Return, as a Python int, the column index of the largest entry of a 1 x N tensor."""
  return torch.max(vec, 1)[1].item()

def log_sum_exp(vec):
  """
    Numerically stable log-sum-exp of a 1 x N tensor.

    log_sum_exp(x) = log(sum_i(exp(x_i)))
                   = max(x) + log(sum_i(exp(x_i - max(x))))
    Subtracting the maximum before exponentiating keeps exp() from overflowing.
    reference: https://nhigham.com/2021/01/05/what-is-the-log-sum-exp-function/
  """
  _, peak_idx = torch.max(vec, 1)
  peak = vec[0, peak_idx.item()]
  shifted = vec - peak.view(1, -1).expand(1, vec.size()[1])
  return peak + torch.log(torch.sum(torch.exp(shifted)))

In [None]:
class BiLSTM_CRF(nn.Module):
  """
    Bidirectional LSTM encoder with a linear-chain CRF output layer for sequence tagging.

    One sentence is processed at a time (batch size 1). Word vectors come from a
    pretrained embedding matrix, a single-layer bidirectional LSTM produces per-token
    features, a linear layer maps them to per-tag emission scores, and a learned
    transition matrix scores moves between consecutive tags.

    vocab_size: stored on the instance only; the embedding table is sized from
                pretrained_embedding.
    tag_to_idx: dict mapping tag string -> integer id; must contain START_TAG and STOP_TAG.
    pretrained_embedding: 2-D array-like of shape (num_words, embedding_dim).
    hidden_dim: width of the concatenated BiLSTM output. Each direction gets
                hidden_dim // 2 units and the output is viewed as hidden_dim wide,
                so an even value is expected.
  """

  def __init__(self, vocab_size, tag_to_idx, pretrained_embedding, hidden_dim):
    super(BiLSTM_CRF, self).__init__()
    self.vocab_size = vocab_size
    self.tag_to_idx = tag_to_idx
    self.hidden_dim = hidden_dim
    self.tagset_size = len(tag_to_idx)

    # Neural layers. from_pretrained is called with its defaults
    # (PyTorch documents the default as freezing the embedding weights).
    weight = torch.FloatTensor(pretrained_embedding)
    self.word_embeds = nn.Embedding.from_pretrained(weight)
    self.lstm = nn.LSTM(weight.shape[1], hidden_dim//2,
                        num_layers=1, bidirectional=True)
    self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

    # CRF parameters: transitions[i][j] is the score of moving from tag j to tag i.
    self.transitions = nn.Parameter(torch.randn(self.tagset_size, self.tagset_size))
    # Nothing may transition into START, and nothing may transition out of STOP.
    self.transitions.data[tag_to_idx[START_TAG], :] = -10000
    self.transitions.data[:, tag_to_idx[STOP_TAG]] = -10000

    # Initial LSTM state; it is re-drawn for every sentence in _get_lstm_features.
    self.hidden = self.init_hidden()

  def init_hidden(self):
    """
      Draw a random initial (h_0, c_0) pair for the BiLSTM.

      Each tensor has shape (2, 1, hidden_dim // 2): two directions, batch size 1.
      The state is random, so repeated passes over the same sentence start from
      different states. The tensors are created on the device of the model parameters.
    """
    device = self.transitions.device
    return (torch.randn(2, 1, self.hidden_dim//2, device=device),
            torch.randn(2, 1, self.hidden_dim//2, device=device))

  def _forward_alg(self, feats):
    """
      Forward algorithm: log of the summed exp-scores of all tag sequences.

      feats: emission scores of one sentence, one row of tagset_size scores per token.
      returns: scalar tensor (the log partition function).
    """
    forward_var = torch.full((1, self.tagset_size), -10000.0, device=feats.device)
    # Only START has a non-negligible score before the first token.
    forward_var[0][self.tag_to_idx[START_TAG]] = 0.0

    for feat in feats:  # one time step per token
      alphas = []
      for next_tag in range(self.tagset_size):
        # The emission score of next_tag is the same whatever the previous tag was.
        emit_score = feat[next_tag].view(1, -1).expand(1, self.tagset_size)
        # Row next_tag holds the scores of moving from every previous tag into next_tag.
        trans_score = self.transitions[next_tag].view(1, -1)
        alpha_arr = forward_var + trans_score + emit_score
        alphas.append(log_sum_exp(alpha_arr).view(1))

      forward_var = torch.cat(alphas).view(1, -1)  # list of 1-element tensors -> 1 x tagset_size

    # Use the instance's own mapping rather than a module-level global.
    terminal_var = forward_var + self.transitions[self.tag_to_idx[STOP_TAG]]
    alpha = log_sum_exp(terminal_var)
    return alpha

  def _get_lstm_features(self, sentence):
    """
      Run embedding -> BiLSTM -> linear layer for one sentence.

      sentence: 1-D LongTensor of word indices.
      returns: tensor of shape (len(sentence), tagset_size) with emission scores.
    """
    self.hidden = self.init_hidden()
    embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)

    # lstm_out holds the per-token outputs; self.hidden is replaced by the final state.
    lstm_out, self.hidden = self.lstm(embeds, self.hidden)
    lstm_out = lstm_out.view(len(sentence), self.hidden_dim)

    lstm_feats = self.hidden2tag(lstm_out)
    return lstm_feats

  def _score_sentence(self, feats, tags):
    """
      Score one given tag sequence: sum of transition and emission scores,
      including the START -> first tag and last tag -> STOP transitions.

      feats: emission scores, one row per token.
      tags: 1-D LongTensor of gold tag ids, one per token.
      returns: tensor of shape (1,).
    """
    score = torch.zeros(1, device=feats.device)
    # Prepend START so that tags[i] is the tag preceding token i.
    start = torch.tensor([self.tag_to_idx[START_TAG]], dtype=torch.long, device=tags.device)
    tags = torch.cat([start, tags])

    for i, feat in enumerate(feats):
      score = score + \
            self.transitions[tags[i+1], tags[i]] + \
            feat[tags[i+1]]

    score = score + self.transitions[self.tag_to_idx[STOP_TAG], tags[-1]]
    return score

  def _viterbi_decode(self, feats):
    """
      Viterbi decoding: find the highest-scoring tag sequence.

      feats: emission scores, one row per token.
      returns: (path_score, best_path) where best_path is a list of tag ids, one per token.
    """
    backpointers = []
    forward_var = torch.full((1, self.tagset_size), -10000.0, device=feats.device)
    forward_var[0][self.tag_to_idx[START_TAG]] = 0.0

    for feat in feats:
      bptrs = []   # best previous tag for each tag at this step
      alphas = []  # score of the best path ending in each tag, before emission
      for next_tag in range(self.tagset_size):
        trans_score = self.transitions[next_tag]
        scores = forward_var + trans_score
        best_tag_id = argmax(scores)
        bptrs.append(best_tag_id)
        alphas.append(scores[0][best_tag_id].view(1))

      # Emissions do not depend on the previous tag, so they are added after the max.
      forward_var = (torch.cat(alphas) + feat).view(1, -1)
      backpointers.append(bptrs)

    terminal_var = forward_var + \
                self.transitions[self.tag_to_idx[STOP_TAG]]
    best_tag_id = argmax(terminal_var)
    path_score = terminal_var[0][best_tag_id]

    # Follow the back pointers from the last token to the first.
    best_path = [best_tag_id]
    for bpters_t in reversed(backpointers):
      best_tag_id = bpters_t[best_tag_id]
      best_path.append(best_tag_id)

    # The walk ends on the START tag, which is not part of the output.
    start = best_path.pop()
    assert  start == self.tag_to_idx[START_TAG]
    best_path.reverse()
    return path_score, best_path

  def neg_log_likelihood(self, sentence, tags):
    """
      Training loss for one sentence: log partition function minus the gold path score.

      sentence: 1-D LongTensor of word indices.
      tags: 1-D LongTensor of gold tag ids.
    """
    feats = self._get_lstm_features(sentence)
    forward_score = self._forward_alg(feats)  # log partition function
    gold_score = self._score_sentence(feats, tags)
    return forward_score - gold_score

  def forward(self,sentence):
    """Predict tags for one sentence; returns (path_score, list of tag ids)."""
    lstm_feats = self._get_lstm_features(sentence)
    score, tag_seq = self._viterbi_decode(lstm_feats)
    return score, tag_seq

  

In [None]:
# Spot-check the index -> word mapping of the trained vocabulary.
w2v_model.wv.index2word[1454]

'tiếp_nhận'

In [None]:
EMBEDDING_DIM = 200
def word2idx(w):
  """
    Return the word2vec vocabulary index of `w`, compared case-insensitively.

    Words missing from the vocabulary fall back to the index of the '$$$$' placeholder token.
  """
  vocab = w2v_model.wv.vocab
  key = w.lower()
  lookup = key if key in vocab else '$$$$'
  return vocab[lookup].index


In [None]:
def prepare_sequence(seq):
  """
    Encode a sentence (sequence of word strings) as a 1-D LongTensor of
    vocabulary indices, using word2idx for each word.
  """
  return torch.tensor(list(map(word2idx, seq)), dtype=torch.long)

In [None]:
# Number of rows in the trained embedding matrix.
torch.FloatTensor(w2v_model.wv.vectors).shape[0]

15709

In [None]:
# Example: index-encode the words of the first training sentence.
prepare_sequence(training_data[0][0])

tensor([1454,   84,    0,  150,    0,    0,    0,   36,  866,    0,  392,  551,
        1726,  303,  915,    2,    0,  582,  456, 1778, 1919,   37,    0, 2698,
           0,   15,  880, 3006,    0,    0,  392,  551,  801,  668,   70,  616,
          76,    0,  107,   24,    0,   26,    0,    0,  775,  269,    0,  582,
        2698,  103,  953,    1,  582, 2698,   66, 1852,    0,  765,   76,  106,
           0,   84,    0,  150,    0,    0])

In [None]:
# Dummy labels, one per sentence, passed to train_test_split next to training_data; the split labels are discarded.
y_trivial = [0]*len(training_data)

In [None]:
# Hold out 20% of the sentences for validation; the fixed seed keeps the split reproducible.
train_data, val_data, _, _ = train_test_split(
    training_data,
    y_trivial,
    test_size=0.2,
    random_state=26,
)

In [None]:
# Tagger on top of the trained word2vec vectors, optimised with plain SGD plus weight decay.
HIDDEN_DIM = 32
model = BiLSTM_CRF(
    vocab_size=len(w2v_model.wv.vocab),
    tag_to_idx=tag_to_idx,
    pretrained_embedding=w2v_model.wv.vectors,
    hidden_dim=HIDDEN_DIM,
)
optimizer = optim.SGD(params=model.parameters(), lr=0.01, weight_decay=1e-4)

In [None]:
# Smoke test before training: decode the first sentence with the untrained model and
# print the length of the predicted tag path.
with torch.no_grad():
  precheck_sent = prepare_sequence(training_data[0][0])
  # Gold tag ids of the same sentence; built here but not used by this cell.
  precheck_tags = torch.tensor([tag_to_idx[t] for t in training_data[0][1]], dtype=torch.long)
  print(len(model(precheck_sent)[1]))

66


In [None]:
# Make sure prepare_sequence from earlier in the LSTM section is loaded
# Train for 10 epochs in the fixed order of train_data, one sentence per SGD step.
# After every epoch: decode the validation sentences, report the entity metrics, save a checkpoint.
# The validation lists are derived from val_data here so the cell does not depend on variables
# that are only created by a later cell.
# NOTE: `evaluate` (defined in the Evaluation section) must already exist in the kernel.
val_sentences = [pair[0] for pair in val_data]
val_gold_tags = [pair[1] for pair in val_data]

for epoch in range(10):
    for i, (sentence, tags) in enumerate(train_data, start=1):
        # PyTorch accumulates gradients, so clear them before each sentence.
        model.zero_grad()
        if (i % 100) == 0: print(i)  # progress counter

        # Encode the words and gold tags as index tensors.
        sentence_in = prepare_sequence(sentence)
        targets = torch.tensor([tag_to_idx[t] for t in tags], dtype=torch.long)

        # CRF negative log-likelihood, backward pass, parameter update.
        loss = model.neg_log_likelihood(sentence_in, targets)
        loss.backward()
        optimizer.step()

    # Validation: Viterbi-decode every held-out sentence without tracking gradients.
    with torch.no_grad():
        preY_val = [model(prepare_sequence(sent))[1] for sent in val_sentences]
    # Map predicted tag ids back to tag strings for the span-based evaluation.
    y_pred = [[idx_to_tag[nt] for nt in n_sent] for n_sent in preY_val]
    evaluate(y_pred, val_gold_tags)
    torch.save(model.state_dict(), '/content/drive/My Drive/nlp/vlsp 2018/model/word2vecBiLSTM')

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
NE_true:  883 	 NE_sys:  1314 	 NE_ref:  4112
F1_score:  0.32546995945447843
Recall on Nested Entities:  0.28852459016393445
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400


In [None]:
import random

In [None]:
# Make sure prepare_sequence from earlier in the LSTM section is loaded
# Train for 10 more epochs, reshuffling train_data in place before every epoch.
# After every epoch: decode the validation sentences, report the entity metrics, save a checkpoint.
# The validation lists are derived from val_data here so the cell does not depend on variables
# that are only created by a later cell.
# NOTE: `evaluate` (defined in the Evaluation section) must already exist in the kernel.
val_sentences = [pair[0] for pair in val_data]
val_gold_tags = [pair[1] for pair in val_data]

for epoch in range(10):
    random.shuffle(train_data)
    for i, (sentence, tags) in enumerate(train_data, start=1):
        # PyTorch accumulates gradients, so clear them before each sentence.
        model.zero_grad()
        if (i % 100) == 0: print(i)  # progress counter

        # Encode the words and gold tags as index tensors.
        sentence_in = prepare_sequence(sentence)
        targets = torch.tensor([tag_to_idx[t] for t in tags], dtype=torch.long)

        # CRF negative log-likelihood, backward pass, parameter update.
        loss = model.neg_log_likelihood(sentence_in, targets)
        loss.backward()
        optimizer.step()

    # Validation: Viterbi-decode every held-out sentence without tracking gradients.
    with torch.no_grad():
        preY_val = [model(prepare_sequence(sent))[1] for sent in val_sentences]
    # Map predicted tag ids back to tag strings for the span-based evaluation.
    y_pred = [[idx_to_tag[nt] for nt in n_sent] for n_sent in preY_val]
    evaluate(y_pred, val_gold_tags)
    torch.save(model.state_dict(), '/content/drive/My Drive/nlp/vlsp 2018/model/word2vecBiLSTM_shuffle')

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
NE_true:  1414 	 NE_sys:  1906 	 NE_ref:  4112
F1_score:  0.46992356264539714
Recall on Nested Entities:  0.4819672131147541
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400


### **Evaluation**

In [None]:
# Separate the validation pairs into word sequences (X_val) and gold tag sequences (Y_val).
X_val = [pair[0] for pair in val_data]
Y_val = [pair[1] for pair in val_data]

In [None]:
# Preview a few validation sentences instead of dumping the whole list into the notebook output.
X_val[:3]

In [None]:
# Viterbi-decode every validation sentence; element [1] of the model output is the tag-id path.
with torch.no_grad():
    preY_val = [model(prepare_sequence(sent))[1] for sent in X_val]

In [None]:
# Translate the predicted tag ids back into tag strings, sentence by sentence.
y_pred = [[idx_to_tag[nt] for nt in n_sent] for n_sent in preY_val]

In [None]:
# Preview a few predicted tag sequences instead of dumping the whole list into the notebook output.
y_pred[:3]

In [None]:
def isOpen(tag):
  """
    Return True when `tag` opens a new entity, i.e. the tag string starts with 'B'.

    An empty tag string is treated as not opening anything instead of raising IndexError.
  """
  return tag.startswith('B')

def isCloseOf(tag, open_tag):
  """
    Decide whether the current token's `tag` ends the entity that was opened by `open_tag`.

    Tags are space-separated label stacks (e.g. 'B-LOC I-ORG'); the number of labels is
    the nesting depth. The open entity is closed when:
      - the current tag is "O";
      - the current tag is shallower than the opening tag;
      - both have the same depth and the current tag either starts a new entity ('B')
        or carries a different type (characters 2..4) than the opening tag.
    A deeper current tag never closes the open entity.
  """
  if tag == "O":
    return True
  depth, open_depth = len(tag.split()), len(open_tag.split())
  if depth < open_depth:
    return True
  if depth > open_depth:
    return False
  return tag[0] == 'B' or tag[2:5] != open_tag[2:5]

In [None]:
def getEntSpanSet(sent):
  """
    Extract entity spans from one sentence's tag sequence.

    sent: list of tag strings, one per token (e.g. 'O', 'B-ORG', 'B-LOC I-ORG').
    returns: (ent_spans, nent_spans), two sets of (type, begin, end) tuples with `end`
             exclusive. `type` is characters 2..4 of the opening tag, so longer type names
             are truncated to three characters. nent_spans holds the spans that were closed
             while a nesting was active, i.e. more than one entity had been open at once.

    Entities still open at the end of the sentence are closed at len(sent) and follow the
    same nesting rule as entities closed mid-sentence.
  """
  stack = []  # currently open entities as (opening tag, begin index), innermost last
  ent_spans = set()
  nent_spans = set()
  isNested = False
  for i, w in enumerate(sent):
    # More than one open entity means everything closed from here on belongs to a nesting.
    if len(stack) > 1: isNested = True

    # Close every open entity that the current tag terminates, innermost first.
    while stack and isCloseOf(w, stack[-1][0]):
      open_tag, begin = stack.pop()
      span = (open_tag[2:5], begin, i)
      ent_spans.add(span)
      if isNested: nent_spans.add(span)

    # The nesting ends only once every open entity has been closed.
    if not stack: isNested = False
    if isOpen(w):
      stack.append((w, i))

  # Flush entities that run to the end of the sentence, keeping the nested bookkeeping.
  if len(stack) > 1: isNested = True
  while stack:
    open_tag, begin = stack.pop()
    span = (open_tag[2:5], begin, len(sent))
    ent_spans.add(span)
    if isNested: nent_spans.add(span)
  return ent_spans, nent_spans
# Worked example: a tag sequence with several nested entities and one entity that runs to the end.
sent = ['O', 'O', 'O', 'B-ORG', 'I-ORG', 'B-LOC I-ORG', 'B-ORG', 'B-LOC I-ORG', 'I-ORG', 'B-ORG', 'B-LOC I-ORG', 'B-ORG', 'B-LOC I-ORG', 'B-PER I-ORG', 'O', 'O', 'B-PER']
getEntSpanSet(sent)

({('LOC', 5, 6),
  ('LOC', 7, 8),
  ('LOC', 10, 11),
  ('LOC', 12, 13),
  ('ORG', 3, 6),
  ('ORG', 6, 9),
  ('ORG', 9, 11),
  ('ORG', 11, 14),
  ('PER', 13, 14),
  ('PER', 16, 17)},
 {('LOC', 5, 6),
  ('LOC', 7, 8),
  ('LOC', 10, 11),
  ('LOC', 12, 13),
  ('ORG', 3, 6),
  ('ORG', 6, 9),
  ('ORG', 9, 11),
  ('ORG', 11, 14),
  ('PER', 13, 14)})

In [None]:
def evaluateSent(sent_pred, sent_true):
  """
    Compare predicted and gold entity spans of one sentence.

    returns the 5-tuple (ne_ref, ne_sys, ne_true, nen_ref, nen_true):
      ne_ref   - number of gold entity spans
      ne_sys   - number of predicted entity spans
      ne_true  - predicted spans that are also gold spans
      nen_ref  - number of gold nested spans
      nen_true - predicted spans that are also gold nested spans
  """
  predicted, _ = getEntSpanSet(sent_pred)
  gold, gold_nested = getEntSpanSet(sent_true)
  ne_true = len(predicted & gold)
  nen_true = len(predicted & gold_nested)
  return len(gold), len(predicted), ne_true, len(gold_nested), nen_true

In [None]:
def evaluate(y_pred, y_true):
  """
    Corpus-level entity evaluation.

    y_pred, y_true: parallel lists of tag-string sequences, one per sentence; y_pred must
                    have at least len(y_true) entries.
    Prints the span counts, the F1 score and the recall on nested entities.
    returns: (f1, NEN_recall).

    Any ratio whose denominator is zero (no predicted entities, no gold entities, no gold
    nested entities) is reported as 0.0 instead of raising ZeroDivisionError.
  """
  NE_ref = 0    # gold entity spans
  NE_sys = 0    # predicted entity spans
  NE_true = 0   # predicted spans that match a gold span
  NEN_ref = 0   # gold nested spans
  NEN_true = 0  # predicted spans that match a gold nested span

  for s in range(len(y_true)):
    sent_pred, sent_true = y_pred[s], y_true[s]
    ne_ref, ne_sys, ne_true, nen_ref, nen_true = evaluateSent(sent_pred, sent_true)
    NE_true += ne_true
    NE_sys += ne_sys
    NE_ref += ne_ref
    NEN_ref += nen_ref
    NEN_true += nen_true

  print("NE_true: ", NE_true, "\t NE_sys: ", NE_sys, "\t NE_ref: ", NE_ref)
  p = NE_true / NE_sys if NE_sys else 0.0
  r = NE_true / NE_ref if NE_ref else 0.0
  f1 = 2*p*r / (p+r) if (p+r) else 0.0
  print("F1_score: ", f1)
  NEN_recall = NEN_true/NEN_ref if NEN_ref else 0.0
  print("Recall on Nested Entities: ", NEN_recall)
  return f1, NEN_recall

In [None]:
# Score the predicted validation tags against the gold tags.
evaluate(y_pred, Y_val)

NE_true:  1214 	 NE_sys:  1792 	 NE_ref:  4112
F1_score:  0.41124661246612465
Recall on Nested Entities:  0.3704918032786885


(0.41124661246612465, 0.3704918032786885)