### Benchmarking the performance of tokenizers
- Spacy
- Underthesea
- VnCoreNLP

In [5]:
# Data format: one list per sentence, each token stored as [word, pos, entity]
import pprint
from time import time as timer

import pandas as pd
from conllu import parse

from tokenizer_wrappers.spacy import Spacy_tokenize
from tokenizer_wrappers.underthesea import Underthesea_tokenize
from tokenizer_wrappers.vncorenlp import VncoreNLP_tokenize

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Schema of the results table: the method name followed by its scores and
# the total tagging time.
result_columns = [
  "Method",
  "Word Segmentation",
  "POS Tagging",
  "Entity Recognition",
  "Tagging Time",
]

# Every column starts with its own empty list, so the frame has no rows yet.
data = {column: [] for column in result_columns}

df = pd.DataFrame(data)

#### Open the CoNLL-U file
- [Universal Dependencies - Vietnamese](https://github.com/UniversalDependencies/UD_Vietnamese-VTB)

In [7]:
# Read the whole dev split into memory as a single UTF-8 string.
with open("UD_Vietnamese-VTB/vi_vtb-ud-dev.conllu", encoding="utf-8") as conllu_file:
  text = conllu_file.read()

# Parse the raw CoNLL-U text; the result is indexed below as
# sentences[sentence][token].
sentences = parse(text)

# Show the first token of the first sentence to check which fields are present.
first_token = sentences[0][0]
pprint.pprint(first_token)

{'deprel': 'advcl',
 'deps': None,
 'feats': None,
 'form': 'Trả lời',
 'head': 6,
 'id': 1,
 'lemma': 'trả lời',
 'misc': None,
 'upos': 'VERB',
 'xpos': 'V'}


In [8]:
# Raw sentence strings, taken from each parsed sentence's "text" metadata.
sents: list[str] = [tokenlist.metadata["text"] for tokenlist in sentences]

# Per sentence, one [form, xpos, entity] triple per token. The entity slot is
# always the empty string here.
groundtruth = [
  [[item["form"], item["xpos"], ""] for item in tokenlist]
  for tokenlist in sentences
]

# All sentences concatenated, each one followed by a single space.
text = "".join(sent + " " for sent in sents)

In [9]:
# Benchmark every tokenizer wrapper on the same sentences and store one result
# row per tool. Rows are collected first and the table is rebuilt once at the
# end, so re-running this cell does not append duplicate rows to df.
benchmark_rows = []

for t in [Spacy_tokenize(), Underthesea_tokenize(), VncoreNLP_tokenize()]:
  count = 0  # ground-truth tokens seen; denominator of every accuracy
  wordcount = 0  # tokens whose word matches the ground truth
  poscount = 0  # tokens whose POS tag matches the ground-truth xpos
  sercount = 0  # tokens whose entity label matches the ground truth

  time = 0  # seconds spent inside t.tokenize() only

  for index, sent in enumerate(sents):
    gt_tokens = groundtruth[index]

    start = timer()
    predict = t.tokenize(sent)
    time += timer() - start
    count += len(gt_tokens)

    # Tokens are compared position by position, so a sentence is scored only
    # when the prediction has exactly as many tokens as the ground truth.
    # Otherwise every token of that sentence counts as wrong.
    if len(predict) == len(gt_tokens):
      for item, gt in zip(predict, gt_tokens):  # item = [word, pos, entity]
        if item[0] == gt[0]:
          wordcount += 1
        if item[1] == gt[1]:
          poscount += 1
        # NOTE(review): the ground truth built earlier in this notebook stores
        # "" as every entity label, so this only counts empty predicted
        # labels -- confirm that is the intended metric.
        if item[2] == gt[2]:
          sercount += 1

  # Accuracy = correctly predicted tokens / total ground-truth tokens.
  # An empty corpus yields 0.0 instead of raising ZeroDivisionError.
  wordsegacc = wordcount / count if count else 0.0
  posacc = poscount / count if count else 0.0
  seracc = sercount / count if count else 0.0

  benchmark_rows.append([t.info(), wordsegacc, posacc, seracc, time])

# Reuse the column layout of the existing (empty) results frame.
df = pd.DataFrame(benchmark_rows, columns=df.columns)

  _C._set_default_tensor_type(t)


VnCoreNLP model folder . already exists! Please load VnCoreNLP from this folder!
2025-01-08 16:20:23 INFO  WordSegmenter:24 - Loading Word Segmentation model
2025-01-08 16:20:23 INFO  PosTagger:23 - Loading POS Tagging model
2025-01-08 16:20:25 INFO  DependencyParser:32 - Loading Dependency Parsing model


In [10]:
# Display the results table (one row per benchmarked tokenizer).
df

Unnamed: 0,Method,Word Segmentation,POS Tagging,Entity Recognition,Tagging Time
0,PyVi,0.4333,0.228576,0.545868,15.567256
1,Underthesea,0.50172,0.222384,0.524769,8.921412
2,VnCoreNLP,0.624761,0.278343,0.637681,3.133295
