# Import Necessary Libraries

In [2]:
import collections
import numpy as np
import nltk
from nltk.corpus import treebank
from nltk.chunk import tree2conlltags
nltk.download('treebank')
nltk.download('universal_tagset')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [15]:
import pandas as pd
df = pd.concat([pd.DataFrame(list(treebank.tagged_sents())[3]),pd.DataFrame(list(treebank.tagged_sents(tagset='universal'))[3])[1]] , axis = 1)
df.columns = ['Words'  , 'Normal' , 'Universal']
df

Unnamed: 0,Words,Normal,Universal
0,A,DT,DET
1,form,NN,NOUN
2,of,IN,ADP
3,asbestos,NN,NOUN
4,once,RB,ADV
5,used,VBN,VERB
6,*,-NONE-,X
7,*,-NONE-,X
8,to,TO,PRT
9,make,VB,VERB


In [None]:
# get the corpus
sentences=list(treebank.tagged_sents())

#ne = nltk.ne_chunk(sentences[1])
#iob = tree2conlltags(ne)
iob = [tree2conlltags(nltk.ne_chunk(sent)) for sent in  sentences]
iob[:2]

[[('Pierre', 'NNP', 'B-PERSON'),
  ('Vinken', 'NNP', 'B-ORGANIZATION'),
  (',', ',', 'O'),
  ('61', 'CD', 'O'),
  ('years', 'NNS', 'O'),
  ('old', 'JJ', 'O'),
  (',', ',', 'O'),
  ('will', 'MD', 'O'),
  ('join', 'VB', 'O'),
  ('the', 'DT', 'O'),
  ('board', 'NN', 'O'),
  ('as', 'IN', 'O'),
  ('a', 'DT', 'O'),
  ('nonexecutive', 'JJ', 'O'),
  ('director', 'NN', 'O'),
  ('Nov.', 'NNP', 'O'),
  ('29', 'CD', 'O'),
  ('.', '.', 'O')],
 [('Mr.', 'NNP', 'B-PERSON'),
  ('Vinken', 'NNP', 'B-PERSON'),
  ('is', 'VBZ', 'O'),
  ('chairman', 'NN', 'O'),
  ('of', 'IN', 'O'),
  ('Elsevier', 'NNP', 'B-ORGANIZATION'),
  ('N.V.', 'NNP', 'O'),
  (',', ',', 'O'),
  ('the', 'DT', 'O'),
  ('Dutch', 'NNP', 'B-GPE'),
  ('publishing', 'VBG', 'O'),
  ('group', 'NN', 'O'),
  ('.', '.', 'O')]]

In [None]:
sentences_ner = [[(item[0],item[2]) for item in sent] for sent in iob]
sentences_ner[:2]

[[('Pierre', 'B-PERSON'),
  ('Vinken', 'B-ORGANIZATION'),
  (',', 'O'),
  ('61', 'O'),
  ('years', 'O'),
  ('old', 'O'),
  (',', 'O'),
  ('will', 'O'),
  ('join', 'O'),
  ('the', 'O'),
  ('board', 'O'),
  ('as', 'O'),
  ('a', 'O'),
  ('nonexecutive', 'O'),
  ('director', 'O'),
  ('Nov.', 'O'),
  ('29', 'O'),
  ('.', 'O')],
 [('Mr.', 'B-PERSON'),
  ('Vinken', 'B-PERSON'),
  ('is', 'O'),
  ('chairman', 'O'),
  ('of', 'O'),
  ('Elsevier', 'B-ORGANIZATION'),
  ('N.V.', 'O'),
  (',', 'O'),
  ('the', 'O'),
  ('Dutch', 'B-GPE'),
  ('publishing', 'O'),
  ('group', 'O'),
  ('.', 'O')]]

In [None]:
#Splitting the data for train and test
import random
random.shuffle(sentences_ner)
split_num_train = int(len(sentences_ner)*0.8)
split_num_valid = int(len(sentences_ner)*0.9)
train_data = sentences_ner[0:split_num_train]
valid_data = sentences_ner[split_num_train:split_num_valid]
test_data  = sentences_ner[split_num_valid:]

# Build a Vocabulary

In [None]:
def build_vocab(corpus, freq):
  # get the unique words and tags
  words=[]
  tags=[]
  for sent in corpus:
    for tokens in sent:
      words.append(tokens[0])
      tags.append(tokens[1])
  tag_cols=list(set(tags))
  tag_cols.sort()
  
  # count the word freqency
  word_counts=collections.Counter(words).most_common() # sort the word in dictionary by their frequencies.
  idx=0
  vocab={}
  for word, counts in word_counts:
    if counts>freq: # Set a boundray to add only words with frequency greater than specefied frequency.
      vocab[word]=idx
      idx+=1
  vocab['UNK']=idx #add an Unknown tag at the end of the vocab.
  return vocab, words, tag_cols

In [None]:
# build vocab
vocab, words, tag_cols=build_vocab(corpus=train_data, freq=3)
tag_cols.insert(0, 'START') 
tag_cols.insert(len(tag_cols), 'END')

In [None]:
tag_cols = tag_cols[:7] + [tag_cols[13]] + tag_cols[7:13] + [tag_cols[14]]
tag_cols

['START',
 'B-FACILITY',
 'B-GPE',
 'B-GSP',
 'B-LOCATION',
 'B-ORGANIZATION',
 'B-PERSON',
 'O',
 'I-FACILITY',
 'I-GPE',
 'I-GSP',
 'I-LOCATION',
 'I-ORGANIZATION',
 'I-PERSON',
 'END']

# Calculate Transition Matrix

In [None]:
def compute_transition_matrix(corpus, tag_cols):
  # get the tags
  tags_in_line=[]
  for sent in corpus:
    tags_per_line=[]
    for tokens in sent:
      tags_per_line.append(tokens[1])
    tags_in_line.append(tags_per_line)
  #print(tags_in_line)
  # compute the transition counts matrix
  cor_matrix=np.zeros((len(tag_cols), len(tag_cols)))
  for tags in tags_in_line:
    for i in range(len(tags)):
      if i==0:
        idx_x=tag_cols.index('START')
        idx_y=tag_cols.index(tags[i])
      elif i==len(tags)-1:
        idx_x=tag_cols.index(tags[i])
        idx_y=tag_cols.index('END')
      else:
        idx_x=tag_cols.index(tags[i])     # an arbitrary index
        idx_y=tag_cols.index(tags[i+1])   # next index
      cor_matrix[idx_x][idx_y]+=1
  return cor_matrix


  # compute the transition **counts matrix**


In [None]:
import pandas as pd

trans_matrix=compute_transition_matrix(corpus=train_data, tag_cols=tag_cols)
pd.DataFrame(trans_matrix , columns = tag_cols , index = tag_cols )

Unnamed: 0,START,B-FACILITY,B-GPE,B-GSP,B-LOCATION,B-ORGANIZATION,B-PERSON,O,I-FACILITY,I-GPE,I-GSP,I-LOCATION,I-ORGANIZATION,I-PERSON,END
START,0.0,1.0,290.0,4.0,0.0,72.0,319.0,2445.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-FACILITY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0
B-GPE,0.0,0.0,4.0,0.0,1.0,6.0,4.0,1028.0,0.0,197.0,0.0,0.0,0.0,0.0,3.0
B-GSP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
B-LOCATION,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,22.0,0.0,0.0,0.0
B-ORGANIZATION,0.0,0.0,0.0,0.0,0.0,6.0,1.0,645.0,0.0,0.0,0.0,0.0,519.0,0.0,1.0
B-PERSON,0.0,0.0,0.0,0.0,0.0,2.0,0.0,479.0,0.0,0.0,0.0,0.0,0.0,834.0,2.0
O,0.0,33.0,1146.0,24.0,24.0,935.0,1132.0,64775.0,0.0,0.0,0.0,0.0,0.0,0.0,3123.0
I-FACILITY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
I-GPE,0.0,0.0,0.0,0.0,0.0,2.0,2.0,200.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0


  # compute the emission **counts matrix**


In [None]:
def compute_emission_matrix(corpus, vocab, tag_cols):
  # compute the emission counts matrix
  cor_matrix=np.zeros((len(tag_cols), len(vocab.keys())))
  for sent in corpus:
    for tokens in sent:
      idx_x=tag_cols.index(tokens[1])
      if tokens[0] in vocab.keys():
        idx_y=vocab[tokens[0]]
      else:
        idx_y=vocab['UNK']
      cor_matrix[idx_x][idx_y]+=1
  return cor_matrix

In [None]:
# get emission table
emission_matrix=compute_emission_matrix(corpus=train_data, vocab=vocab, tag_cols=tag_cols)

In [None]:
pd.DataFrame(emission_matrix , index = tag_cols)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2539,2540,2541,2542,2543,2544,2545,2546,2547,2548
START,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-FACILITY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
B-GPE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,384.0
B-GSP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0
B-LOCATION,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
B-ORGANIZATION,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,459.0
B-PERSON,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,632.0
O,3817.0,3233.0,3058.0,1854.0,1725.0,1499.0,1286.0,1213.0,873.0,862.0,...,4.0,4.0,4.0,4.0,4.0,0.0,4.0,4.0,4.0,9673.0
I-FACILITY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
I-GPE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0


# Now Convert to Probability

## Transition probability

In [None]:
def estimate_transition_prob(y_pos, y_pre, trans_matrix, tag_cols, beta=0):
  idx_x=tag_cols.index(y_pre)
  idx_y=tag_cols.index(y_pos)
  p=(trans_matrix[idx_x][idx_y]+beta)/(np.sum(trans_matrix[idx_x])+len(tag_cols)*beta)
  return p

## Emission Probability

In [None]:
def estimate_emission_prob(c_word, c_tag , emission_matrix, vocab, tag_cols, alpha=0):
  idx_x=tag_cols.index(c_tag)
  if c_word in vocab.keys():
    idx_y=vocab[c_word]
  else:
    idx_y=vocab['UNK']
  p=(emission_matrix[idx_x][idx_y]+alpha)/(np.sum(emission_matrix[idx_x])+alpha*len(vocab.keys()))
  return p

## Define Viterbi Algorithm

In [None]:
def viterbi(sentence, vocab, tag_cols, trans_matrix, emission_matrix, alpha, beta):
  v=np.zeros((len(sentence)+1, len(tag_cols)-2)) # Hold best probabilities
  b=np.zeros((len(sentence)+1, len(tag_cols)-2)) # Hold the indexes of best probabilities.
  s=np.zeros((1, len(tag_cols)-2))
  ### calculate s(y0, START), v(x0) ###
  for k in range(1, len(tag_cols)-1): # Ignoring Tag sequence starting with letter B.
    tp=estimate_transition_prob(tag_cols[k], 'START', trans_matrix=trans_matrix, 
                                tag_cols=tag_cols, beta=beta)
    ep=estimate_emission_prob(sentence[0], tag_cols[k], emission_matrix, vocab, tag_cols, alpha)
    v[0 , k-1]=np.log(tp)+np.log(ep) # log (tp * ep)
    b[0 , k-1]=tag_cols.index('START')

  ### calculate s(END, yi), v(END) ###
  for k in range(1, len(tag_cols)-1):
    for kk in range(1, len(tag_cols)-1):
      tp=estimate_transition_prob('END', tag_cols[kk], trans_matrix, tag_cols, beta)
      ep=1
      s[0 , kk-1]=np.log(tp)+np.log(ep)

    v[len(sentence) , k-1]=np.max(v[len(sentence)-1]+s[0])      # Max Probability 
    b[len(sentence) , k-1]=np.argmax(v[len(sentence)-1]+s[0])+1 # Max Index
  

  ### calculate s(yi, yi_1), v(xi) ###
  for m in range(1, len(sentence)):
    for k in range(1, len(tag_cols)-1):
      for kk in range(1, len(tag_cols)-1):

        tp=estimate_transition_prob(tag_cols[k], tag_cols[kk], trans_matrix, tag_cols, beta)
        ep=estimate_emission_prob(sentence[m], tag_cols[k], emission_matrix, vocab, tag_cols, alpha)

        s[0 , kk-1]=np.log(tp)+np.log(ep)

      v[m , k-1]=np.max(v[m-1]+s[0])
      b[m , k-1]=np.argmax(v[m-1]+s[0])+1    # plus 1 to align with the index in tag_cols
    

    
  # get the predict tags
  m_idx=np.array(np.arange(1, len(sentence)))
  m_idx=m_idx[::-1]
  y_m=[]
  y_m.append(tag_cols[int(b[len(sentence)][0])])
  for i, m in enumerate(m_idx):
    b_last=tag_cols.index(y_m[i])
    b_now=b[m][int(b_last)-1]
    y_m.append(tag_cols[int(b_now)])
  y_m.reverse() 
  return y_m, v, b

In [None]:
from sklearn.metrics import f1_score, precision_score , recall_score , accuracy_score

In [None]:
! pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l[K     |███████▌                        | 10 kB 18.0 MB/s eta 0:00:01[K     |███████████████                 | 20 kB 21.4 MB/s eta 0:00:01[K     |██████████████████████▌         | 30 kB 25.5 MB/s eta 0:00:01[K     |██████████████████████████████  | 40 kB 9.5 MB/s eta 0:00:01[K     |████████████████████████████████| 43 kB 1.3 MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16180 sha256=2fae444c190c7f9bd3984ca529ac0c0f4db44167b06e2c7e1662bc28f8b4a7db
  Stored in directory: /root/.cache/pip/wheels/05/96/ee/7cac4e74f3b19e3158dce26a20a1c86b3533c43ec72a549fd7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
from seqeval.metrics import f1_score , accuracy_score , precision_score , recall_score
def get_dev_acc(corpus, alpha, beta, vocab, tag_cols, trans_matrix, emission_matrix):
  P = []
  T = []
  acc_num=0.
  total_num=0.
  for sent in corpus:
    words=[]
    label=[]
    for tokens in sent:
      words.append(tokens[0])
      label.append(tokens[1])
    preds, v, b=viterbi(sentence=words, vocab=vocab, tag_cols=tag_cols, 
                        trans_matrix=trans_matrix, emission_matrix=emission_matrix, 
                        alpha=alpha, beta=beta)
    
    #print(list(zip(preds, label)))
    #print(v)
    #print(b)
    T.append(label)
    P.append(preds)
  
  return f1_score(T , P ) , accuracy_score(T , P) , precision_score(T , P) , recall_score ( T , P)#f1_score(y_test , y_pred , average = 'macro') , precision_score(y_test , y_pred , average = 'macro') , recall_score(y_test , y_pred ,  average = 'macro') , accuracy_score(y_test , y_pred ) #acc_num/total_num

# Evaluate on dev set

In [None]:
AC = np.zeros((3 , 3))
for i,a in enumerate(np.linspace(0.03 , 0.06 , 3)):
  for j,b in enumerate(np.linspace(3 , 6 , 3)):
    AC[i , j] =get_dev_acc(corpus = valid_data, alpha=a, beta=b, vocab=vocab, tag_cols=tag_cols, 
                    trans_matrix=trans_matrix, emission_matrix=emission_matrix)[0]
    print(f'alpha={a}, beta={b}, overal_f1={AC[i , j]}')

alpha=0.03, beta=3.0, overal_f1=0.3322033898305085
alpha=0.03, beta=4.5, overal_f1=0.3257372654155496
alpha=0.03, beta=6.0, overal_f1=0.32797858099062915
alpha=0.045, beta=3.0, overal_f1=0.3317535545023697
alpha=0.045, beta=4.5, overal_f1=0.32685867381111855
alpha=0.045, beta=6.0, overal_f1=0.3272971160295104
alpha=0.06, beta=3.0, overal_f1=0.3255503669112742
alpha=0.06, beta=4.5, overal_f1=0.32530120481927716
alpha=0.06, beta=6.0, overal_f1=0.3241795043536504


In [None]:
AC.argmax() #=> alpha = 0.03 , beta = 3

0

# Find the accuracy on the test data

In [None]:
f1 , accuracy , precision , recall = get_dev_acc(corpus = test_data, alpha=0.03, beta=3, vocab=vocab, tag_cols=tag_cols, 
                trans_matrix=trans_matrix, emission_matrix=emission_matrix)
print(f'alpha={0.03}, beta={3}, overal_accuracy={accuracy} , overal_precision={precision} , overall recall = {recall} , overall F1-score = {f1}')

alpha=0.03, beta=3, overal_accuracy=0.8991695163654128 , overal_precision=0.28440366972477066 , overall recall = 0.4343257443082312 , overall F1-score = 0.3437283437283437
