In [1]:
# Mount Google Drive at /content/drive so later cells can reach files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install tokenizers
!pip install sentencepiece
!pip install sacremoses
!pip install transformers
!pip install langid

Collecting tokenizers
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.13.3
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sacremoses
  Building wheel for sacrem

In [3]:
# Switch to the trankit checkout on Drive. The training cell uses the relative paths
# ./Trankit_Data and ./Saved_Models, and presumably relies on this directory for
# `import trankit` as well (the sys.path.append alternative is commented out).
%cd '/content/drive/MyDrive/trankit-master'

/content/drive/MyDrive/trankit-master


In [4]:
# List the working directory as a sanity check that the expected folders are present.
%ls

[0m[01;34mdocs[0m/      LICENSE      README.md      setup.py  [01;34mTrankit_Data[0m/
[01;34mexamples[0m/  MANIFEST.in  [01;34mSaved_Models[0m/  [01;34mtrankit[0m/


In [12]:
import sys
# sys.path.append('/content/drive/MyDrive/trankit-master')

from trankit import tpipeline
from trankit.tpipeline import TPipeline as tpip
from trankit.iterators.tagger_iterators import TaggerDataset
import os
import re

def lines_to_relations(true_lines):
    """Group CoNLL-U token lines into per-sentence lists of relations.

    Every token line is split on tabs and reduced to ``[ID, HEAD, DEPREL]``
    (columns 0, 6 and 7). A blank line closes the current sentence. Relations
    whose label is ``'No_rel'`` or ``'root'`` are dropped when a sentence is
    closed, so a sentence may be stored as an empty list.

    Args:
        true_lines: Iterable of raw text lines, e.g. the result of ``readlines()``.

    Returns:
        A list with one entry per sentence; each entry is a list of
        ``[id, head, deprel]`` string triples.
    """
    ignored_labels = ('No_rel', 'root')

    def _close(sentence):
        # Drop the labels that are not scored before storing the sentence.
        return [rel for rel in sentence if rel[2] not in ignored_labels]

    relations_list = []
    relations_oneline = []
    for line in true_lines:
        stripped = line.strip()
        if not stripped:
            # Sentence boundary. Stripping first also accepts '\r\n' and
            # whitespace-only separators, which used to raise IndexError.
            relations_list.append(_close(relations_oneline))
            relations_oneline = []
        elif stripped.startswith('#'):
            # CoNLL-U comment line (e.g. "# text = ..."): carries no token columns.
            continue
        else:
            fields = stripped.split('\t')
            relations_oneline.append([fields[0], fields[6], fields[7]])

    # Input that does not end with a blank line still has an open sentence;
    # flush it instead of silently losing it.
    if relations_oneline:
        relations_list.append(_close(relations_oneline))

    return relations_list

def comps_from_relations(relations):
    """Cut a flat sequence of relations into components.

    Items are accumulated in order; as soon as an item contains ``'Comp_root'``
    the accumulated run (that item included) is emitted as one component and a
    new run starts. Items that follow the last ``'Comp_root'`` item are not
    part of any component and are discarded.

    Args:
        relations: Sequence of relations; each item must support ``in``.

    Returns:
        A list of components, each a list of the original items.
    """
    components = []
    pending = []
    for item in relations:
        pending.append(item)
        if 'Comp_root' in item:
            components.append(pending)
            pending = []
    return components

def spans_from_comps(comps_list):
    """Reduce every component string to the digit runs it contains.

    For each string, a trailing ``,<word characters>`` suffix is removed first;
    every run of digits in what remains is then collected in order.

    Args:
        comps_list: List of lists of strings.

    Returns:
        A list shaped like ``comps_list`` in which each string is replaced by
        the list of digit strings extracted from it.
    """
    # Raw-string patterns: '\w' and '\d' in a plain literal are invalid escape
    # sequences and trigger a SyntaxWarning on recent Python versions.
    trailing_label = re.compile(r',\w+$')
    digit_run = re.compile(r'\d+')
    return [
        [digit_run.findall(trailing_label.sub('', comp)) for comp in comps]
        for comps in comps_list
    ]

def metric(true_lines, pred_lines, setting_name):
    """Score predicted relations against gold relations and print a summary.

    Both inputs are parsed with ``lines_to_relations`` and compared sentence by
    sentence, in order. A predicted relation is correct when the identical
    ``[id, head, deprel]`` triple occurs in the gold sentence. Components are
    built with ``comps_from_relations``; a predicted component counts as an
    exact match when an identical component occurs in the gold sentence.

    Args:
        true_lines: Lines of the gold CoNLL-U file.
        pred_lines: Lines of the predicted CoNLL-U file.
        setting_name: Label used in the printed report.

    Returns:
        ``[precision, recall, f1, accuracy, exact_match]`` as percentages
        rounded to two decimals, where accuracy is
        ``correct / (predicted + gold - correct)`` and exact_match is matched
        predicted components divided by gold components. The printed report
        shows every value except accuracy.

    Raises:
        IndexError: If the prediction contains more sentences than the gold data.
    """
    true_relations = lines_to_relations(true_lines)
    pred_relations = lines_to_relations(pred_lines)
    correct = 0
    predict_count = 0
    true_count = 0
    em = 0
    tot_comps = 0
    for i, pred_relation_oneline in enumerate(pred_relations):
        true_relation_oneline = true_relations[i]

        # Components are compared on the comma-joined string form of each triple.
        tr_comps = comps_from_relations([','.join(rel) for rel in true_relation_oneline])
        pr_comps = comps_from_relations([','.join(rel) for rel in pred_relation_oneline])
        em += sum(1 for comp in pr_comps if comp in tr_comps)
        tot_comps += len(tr_comps)

        correct += sum(1 for rel in pred_relation_oneline if rel in true_relation_oneline)
        predict_count += len(pred_relation_oneline)
        true_count += len(true_relation_oneline)

    if correct == 0:
        p = 0
        r = 0
    else:
        p = correct / predict_count
        r = correct / true_count
    if p == 0 or r == 0:
        f1 = 0
    else:
        f1 = 2 * p * r / (p + r)

    # Guard the remaining ratios: with no relations at all, or no gold
    # components, the denominators are zero and the score is reported as 0.
    union = predict_count + true_count - correct
    a = 1.0 * correct / union if union else 0.0
    em_per = em / tot_comps if tot_comps else 0.0

    metrics_list = [100*p, 100*r, 100*f1, 100*a, 100*em_per]
    metrics = [round(i,2) for i in metrics_list]
    print(f'Results for {setting_name} are:\n')
    print(f'Precision: {metrics[0]}\nRecall: {metrics[1]}\nF1: {metrics[2]}\nExact match: {metrics[4]}\n')
    return metrics

def train_and_test(setting_name):
  """Train a posdep tagger for one data setting and score it on its test files.

  Reads ``train.conllu`` / ``dev.conllu`` from ``./Trankit_Data/<setting_name>``,
  trains with the trankit ``TPipeline`` and saves under
  ``./Saved_Models/<setting_name>``. Each test file (``test.conllu``, then
  ``outofDomain.conllu``) is then run through the trained tagger, the prediction
  file is renamed so the next run does not overwrite it, and the result is
  scored with ``metric``.

  NOTE(review): this relies on trankit private members (``_config``,
  ``_eval_posdep``) and on the prediction file appearing at
  ``<save_dir>/xlm-roberta-base/customized-mwt-ner/preds/tagger.test.conllu.epoch--1``
  after ``_eval_posdep`` -- confirm against the installed trankit version.

  Args:
      setting_name: Name of the sub-folder of ``./Trankit_Data`` to use.

  Returns:
      A list with one ``metric`` result per test file, in the order above.
  """
  save_dir = './Saved_Models/'+setting_name
  data_dir = './Trankit_Data/'+setting_name
  train_file = data_dir+'/train.conllu'
  dev_file = data_dir+'/dev.conllu'
  test_files = [data_dir+'/test.conllu', data_dir+'/outofDomain.conllu']
  trainer = tpip(
      training_config={
      'category': 'customized-mwt-ner', # pipeline category
      'task': 'posdep', # task name
      'save_dir': save_dir, # directory for saving trained model
      'train_conllu_fpath': train_file, # annotations file in CONLLU format  for training
      'dev_conllu_fpath': dev_file # annotations file in CONLLU format for development
      # 'max_epoch': 1
      }
  )

  # start training
  trainer.train()

  # Common prefix of the prediction file and of its per-test-set renamed copy.
  preds_prefix = os.path.join(save_dir,'xlm-roberta-base/customized-mwt-ner/preds/tagger.test')
  all_metrics = []
  # Iterate over the files themselves so the loop cannot drift from the list length.
  for x, gold_conllu in enumerate(test_files):
    test_set = TaggerDataset(
        config=trainer._config,
        gold_conllu=gold_conllu,
        input_conllu=gold_conllu,
        evaluate=False
    )
    test_set.numberize()
    batch_size = trainer._config.batch_size
    # Ceiling division: one extra batch when the last one is only partly filled.
    test_batch_num = len(test_set) // batch_size + (len(test_set) % batch_size != 0)
    trainer._eval_posdep(data_set=test_set, batch_num=test_batch_num,
                         name='test', epoch=-1)

    # Give the predictions a name that is unique per test set and setting.
    pred_conllu = preds_prefix+str(x)+' '+setting_name+'.conllu.epoch--1'
    os.rename(preds_prefix+'.conllu.epoch--1', pred_conllu)

    with open(gold_conllu) as t, open(pred_conllu) as p:
      true_lines = t.readlines()
      pred_lines = p.readlines()
    all_metrics.append(metric(true_lines,pred_lines,setting_name+str(x)))

  return all_metrics

Data_folder = './Trankit_Data'
# Every immediate sub-folder of Data_folder is one experimental setting.
# Listing the folder directly (instead of walking the whole tree) keeps nested
# directories from being mistaken for settings; hidden folders such as
# .ipynb_checkpoints are skipped, and sorting makes the run order deterministic.
for setting_name in sorted(os.listdir(Data_folder)):
  if setting_name.startswith('.'):
    continue
  if not os.path.isdir(os.path.join(Data_folder, setting_name)):
    continue
  print(setting_name)
  train_and_test(setting_name)

With Context Coarse
Setting up training config...
Loaded 11000 entries from ./Trankit_Data/With Context Coarse/train.conllu
Loaded 2000 entries from ./Trankit_Data/With Context Coarse/dev.conllu
******************************
Posdep tagger: Epoch: 0


Train 0: 688it [01:12,  9.53it/s]
dev 0: 100%|█████████████████████████████| 125/125 [00:07<00:00, 17.76it/s]


Saving adapter weights to ... ./Saved_Models/With Context Coarse/xlm-roberta-base/customized-mwt-ner/customized-mwt-ner.tagger.mdl (11.48 MB)
------------------------------ Best dev CoNLLu score: epoch 0------------------------------
Metric     | Precision |    Recall |  F1 Score | AligndAcc
-----------+-----------+-----------+-----------+-----------
Tokens     |    100.00 |    100.00 |    100.00 |
Sentences  |    100.00 |    100.00 |    100.00 |
Words      |    100.00 |    100.00 |    100.00 |
UPOS       |    100.00 |    100.00 |    100.00 |    100.00
XPOS       |    100.00 |    100.00 |    100.00 |    100.00
UFeats     |    100.00 |    100.00 |    100.00 |    100.00
AllTags    |    100.00 |    100.00 |    100.00 |    100.00
Lemmas     |      0.00 |      0.00 |      0.00 |      0.00
UAS        |    100.00 |    100.00 |    100.00 |    100.00
LAS        |     82.71 |     82.71 |     82.71 |     82.71
CLAS       |    100.00 |    100.00 |    100.00 |    100.00
MLAS       |    100.00 |    

test -1: 100%|███████████████████████████| 184/184 [00:10<00:00, 17.77it/s]


Results for With Context Coarse0 are:

Precision: 56.7
Recall: 51.86
F1: 54.17
Exact match: 16.18

Loaded 1139 entries from ./Trankit_Data/With Context Coarse/outofDomain.conllu


test -1: 100%|█████████████████████████████| 72/72 [00:02<00:00, 31.94it/s]


Results for With Context Coarse1 are:

Precision: 57.4
Recall: 52.65
F1: 54.92
Exact match: 17.03

With Context Finegrain
Setting up training config...
Loaded 11000 entries from ./Trankit_Data/With Context Finegrain/train.conllu
Loaded 2000 entries from ./Trankit_Data/With Context Finegrain/dev.conllu
******************************
Posdep tagger: Epoch: 0


Train 0:  56%|███████████████            | 382/687 [00:41<00:34,  8.72it/s]

KeyboardInterrupt: ignored