In [1]:
import json
from pprint import pprint
from tqdm import tqdm
# !pip install -U nervaluate
from nervaluate import Evaluator

from exciton.nlp.named_entity_recognition import Exciton_NER
from exciton.nlp.named_entity_recognition.utils import clean_result

In [2]:
def process_spans(input_data):
    """Convert character-span entity annotations into token-level BIO labels.

    The text is cut into alternating outside ("O") and entity segments using
    the character offsets stored in each entity's ``span``. Every segment is
    whitespace-tokenized; tokens of an entity segment are tagged
    ``B-<label>`` (first token) or ``I-<label>`` (remaining tokens).

    Spans are processed in order of their start offset. If a span overlaps
    text already consumed by an earlier span, only its non-overlapping tail is
    labelled; spans that are empty or fully covered are ignored, so no
    character of the text is ever emitted twice.

    Args:
        input_data: Mapping with a ``"text"`` string and a
            ``"named_entities"`` list of dicts, each holding ``"span"``
            (start and end character offsets into the text) and ``"label"``.
            The mapping is not modified.

    Returns:
        A list of ``{"token": str, "label": str}`` dicts in text order.
    """
    text = input_data["text"]
    # Sort into a local list instead of writing the sorted list back into the
    # caller's record.
    entities = sorted(input_data["named_entities"], key=lambda ent: ent["span"][0])

    segments = []
    cursor = 0  # offset of the first character not yet assigned to a segment
    for ent in entities:
        start, end = ent["span"][0], ent["span"][1]
        # Never step back into text an earlier span already consumed.
        start = max(start, cursor)
        if end <= start:
            # Zero-length span, or one fully covered by a previous entity:
            # it contributes no tokens and must not split the surrounding text.
            continue
        segments.append((text[cursor:start], "O"))
        segments.append((text[start:end], ent["label"]))
        cursor = end
    # Whatever follows the last entity is outside text.
    segments.append((text[cursor:], "O"))

    tokens = []
    for segment_text, label in segments:
        for position, word in enumerate(segment_text.split()):
            if label == "O":
                tag = "O"
            elif position == 0:
                tag = "B-" + label
            else:
                tag = "I-" + label
            tokens.append({"token": word, "label": tag})
    return tokens

In [3]:
# Instantiate the Exciton NER model from the local path below and place it on
# device "cuda:0" (presumably requires a CUDA-capable GPU — confirm).
model = Exciton_NER(path_to_model="/tmp/exciton_xlmroberta/", device="cuda:0")

In [4]:
# Load the test split: one JSON record per line.
data = []
# Be explicit about the encoding so decoding does not depend on the platform
# locale (the records may contain non-ASCII text).
with open(
    "/home/tshi/exciton/datasets/nlp/named_entity_recognition/exciton_xlm_v1/test.jsonl",
    "r",
    encoding="utf-8",
) as fp:
    for line in fp:
        if not line.strip():
            # Tolerate blank lines instead of failing inside json.loads.
            continue
        itm = json.loads(line)
        # Rebuild the raw text from the tokens and keep the tokens under "etokens".
        itm["text"] = " ".join(itm["tokens"])
        itm["etokens"] = itm["tokens"]
        data.append(itm)
print(len(data))

# Gold label sequences: one list of BIO tags per record.
gold = [
    [tok["label"] for tok in process_spans(clean_result(record))]
    for record in tqdm(data)
]
# Predicted label sequences, produced the same way from the model output.
# NOTE(review): process_spans tokenizes around span boundaries, so a predicted
# span that ends mid-word can yield a different token count than the gold
# sequence for the same record — confirm the evaluator tolerates that.
pred = [
    [tok["label"] for tok in process_spans(prediction)]
    for prediction in tqdm(model.predict(data))
]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2400/2400 [00:00<00:00, 31646.96it/s]

2400



100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2400/2400 [00:00<00:00, 78933.65it/s]


In [5]:
# Score the predicted label sequences against the gold ones with nervaluate,
# restricted to the three tags listed below.
evaluator = Evaluator(
    gold,
    pred,
    tags=["LOC", "PER", "ORG"],
    loader="list",
)
results, results_by_tag = evaluator.evaluate()
pprint(results)

{'ent_type': {'actual': 4026,
              'correct': 2974,
              'f1': 0.7499684781238178,
              'incorrect': 264,
              'missed': 667,
              'partial': 0,
              'possible': 3905,
              'precision': 0.7386984600099354,
              'recall': 0.7615877080665813,
              'spurious': 788},
 'exact': {'actual': 4026,
           'correct': 2649,
           'f1': 0.668011600050435,
           'incorrect': 589,
           'missed': 667,
           'partial': 0,
           'possible': 3905,
           'precision': 0.657973174366617,
           'recall': 0.6783610755441741,
           'spurious': 788},
 'partial': {'actual': 4026,
             'correct': 2649,
             'f1': 0.7422771403353929,
             'incorrect': 0,
             'missed': 667,
             'partial': 589,
             'possible': 3905,
             'precision': 0.7311227024341779,
             'recall': 0.7537772087067862,
             'spurious': 788},
 'stric

In [6]:
# Spot check on a single Spanish headline. In the recorded output the model
# splits "Oviedo" into two entities in the middle of the word.
text = [
    "La presencia femenina en el homenaje a los socios del Oviedo: De aquella en el Tartiere todo eran paisanos",
]
pprint(model.predict(text))

[{'named_entities': [{'label': 'ORG', 'span': [54, 58], 'text': 'Ovie'},
                     {'label': 'LOC', 'span': [58, 60], 'text': 'do'}],
  'text': 'La presencia femenina en el homenaje a los socios del Oviedo: De '
          'aquella en el Tartiere todo eran paisanos'}]


In [7]:
# Spot check on a single Chinese sentence. The recorded output contains no
# entities for it.
text = [
    "小明明天要去山西。",
]
pprint(model.predict(text))

[{'named_entities': [], 'text': '小明明天要去山西。'}]


In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
# Load the tokenizer from its local directory.
tokenizer = AutoTokenizer.from_pretrained(
    "/home/tshi/exciton/models/nlp/named_entity_recognition/xlm_roberta_large_ner_hrl/tokenizer"
)
# Load the token-classification model, asking it to also return hidden states
# and attentions.
# NOTE: this rebinds `model`, a name an earlier cell uses for an Exciton_NER
# instance.
model = AutoModelForTokenClassification.from_pretrained(
    "/home/tshi/exciton/models/nlp/named_entity_recognition/xlm_roberta_large_ner_hrl/models",
    output_hidden_states=True,
    output_attentions=True,
)
# Bundle model and tokenizer into a transformers "ner" pipeline.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

In [5]:
import torch
from torch.autograd import Variable

# Encode one sentence into token ids and add a batch dimension of size 1.
input_data = tokenizer.encode("I like apple")
# torch.autograd.Variable is deprecated; a plain int64 tensor is equivalent.
input_var = torch.tensor([input_data], dtype=torch.long)
with torch.no_grad():
    # Run the forward pass once and reuse its first output for both prints,
    # instead of calling the model twice.
    logits = model(input_var)[0]
    print(logits.size())
    print(logits)

torch.Size([1, 5, 9])
tensor([[[12.5734, -8.5643, -9.2152, -2.2894, -1.4954, -1.2884,  0.6909,
          -2.6070, -2.1160],
         [14.5701, -3.6156, -3.8085, -2.7521, -1.6829, -2.1126, -1.9309,
          -2.1725, -2.2054],
         [14.2506, -3.2643, -3.2314, -3.3014, -1.1430, -2.0232, -1.3108,
          -3.0898, -1.8969],
         [12.0747, -4.1659, -3.7890, -1.6128, -2.3243,  1.9635, -0.9602,
          -3.6512, -4.4395],
         [11.9663, -8.5594, -9.1297, -1.4407, -0.7283,  0.2500, -0.0907,
          -0.7336, -2.7137]]])


In [16]:
# NOTE(review): in this notebook XLMHRL_NER is only imported in a later cell,
# so on a fresh top-to-bottom run this cell would presumably raise NameError —
# confirm the intended cell order.
model = XLMHRL_NER()
# Bare last expression: the notebook displays the prediction for one sentence.
model.predict("小明明天要去山西。")

[{'entity': 'B-LOC',
  'score': 0.99992675,
  'index': 5,
  'word': '山西',
  'start': 6,
  'end': 8}]

In [23]:
from pprint import pprint
from exciton.nlp.named_entity_recognition import XLMHRL_NER

In [30]:
# Re-create the XLMHRL_NER model, this time on device "cuda:0"
# (presumably requires a CUDA-capable GPU — confirm).
model = XLMHRL_NER(device="cuda:0")

In [33]:
# Multilingual spot check: one English, one Chinese and one Korean input,
# predicted together as a single batch.
sentences = [
    "Elon Mask will go to New York.",
    "马斯克明天要去纽约。特斯拉股价上涨。",
    "홍천두촌면 홍운화씨, 두촌면에 어려운 이웃돕기 성금 전달",
]
results = model.predict(sentences)
pprint(results)

[{'named_entities': [{'label': 'PER', 'span': [0, 9], 'text': 'Elon Mask'},
                     {'label': 'LOC', 'span': [21, 29], 'text': 'New York'}],
  'text': 'Elon Mask will go to New York.'},
 {'named_entities': [{'label': 'PER', 'span': [0, 0], 'text': ''},
                     {'label': 'PER', 'span': [0, 3], 'text': '马斯克'},
                     {'label': 'LOC', 'span': [7, 9], 'text': '纽约'},
                     {'label': 'ORG', 'span': [10, 13], 'text': '特斯拉'}],
  'text': '马斯克明天要去纽约。特斯拉股价上涨。'},
 {'named_entities': [{'label': 'LOC', 'span': [0, 5], 'text': '홍천두촌면'},
                     {'label': 'PER', 'span': [6, 9], 'text': '홍운화'},
                     {'label': 'LOC', 'span': [12, 15], 'text': '두촌면'}],
  'text': '홍천두촌면 홍운화씨, 두촌면에 어려운 이웃돕기 성금 전달'}]


In [17]:
# import json

# labels = ['O', 'B-MISC', 'I-MISC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
# fout = open("/home/tshi/exciton/models/nlp/named_entity_recognition/xlm_roberta_large_ner_hrl/models/labels.json", "w")
# json.dump(labels, fout)
# fout.close()