In [6]:
from datasets import load_dataset
# Load every split of the 'docred' dataset into a DatasetDict.
# NOTE(review): trust_remote_code=True permits the dataset repository's own
# loading script to run locally — confirm the source is trusted before re-running.
dataset = load_dataset('docred', trust_remote_code=True)
# Print the split names, feature columns and row counts.
print(dataset)

DatasetDict({
    validation: Dataset({
        features: ['title', 'sents', 'vertexSet', 'labels'],
        num_rows: 998
    })
    test: Dataset({
        features: ['title', 'sents', 'vertexSet', 'labels'],
        num_rows: 1000
    })
    train_annotated: Dataset({
        features: ['title', 'sents', 'vertexSet', 'labels'],
        num_rows: 3053
    })
    train_distant: Dataset({
        features: ['title', 'sents', 'vertexSet', 'labels'],
        num_rows: 101873
    })
})


In [7]:
import pandas as pd
from ipydatagrid import DataGrid
import itertools  # referenced by get_info() in a later cell

# Build a DataFrame from the 'train_annotated' split.
ds = pd.DataFrame(dataset['train_annotated'])

# Wrap the frame in an ipydatagrid DataGrid. The line that would display it is
# commented out below, and `grid` is not read again in the visible cells.
grid = DataGrid(ds)
#grid

12212


In [11]:
# Rebind `ds` to the 'train_distant' split; later cells that index `ds`
# (e.g. ds.iloc[...]) therefore read this split.
# NOTE(review): this overwrites the `ds` built from 'train_annotated' in an
# earlier cell, so the meaning of `ds` depends on which cell ran last.
ds = pd.DataFrame(dataset['train_distant'])
# Emits the plain-text (truncated) repr of the whole frame.
print(ds)

                                            title  \
0                       Worker-Peasant Red Guards   
1                       Jewish art music movement   
2       Norwalk-La Mirada Unified School District   
3                                  Chunibala Devi   
4                                 Ouane Rattikone   
...                                           ...   
101868                           Last Flag Flying   
101869                               Hillman Minx   
101870                               Knute Nelson   
101871           Who Do You Think You Are? (book)   
101872                           Neal Lane Bridge   

                                                    sents  \
0       [[The, Worker, -, Peasant, Red, Guards, (, WPR...   
1       [[The, Jewish, art, music, movement, began, at...   
2       [[The, Norwalk, -, La, Mirada, Unified, School...   
3       [[Chunibala, Devi, ,, born, circ, .], [1875, —...   
4       [[Major, -, General, Ouane, Rattikone, (, Ouan... 

In [12]:
# Inspect the first row to see the per-document fields
# (title, sents, vertexSet, labels).
print(ds.iloc[0])

title                                Worker-Peasant Red Guards
sents        [[The, Worker, -, Peasant, Red, Guards, (, WPR...
vertexSet    [[{'name': 'Worker-Peasant Red Guards', 'sent_...
labels       {'head': [4, 6, 6, 7, 7, 7, 9, 9, 11, 11, 11],...
Name: 0, dtype: object


In [13]:
def get_info(docred_instance):
    """Unpack one DocRED record into a flat tuple.

    Args:
        docred_instance: Mapping-like record (a dict or a DataFrame row)
            exposing the keys 'title', 'sents', 'vertexSet' and 'labels'.
            'labels' must itself expose 'head', 'tail', 'relation_id',
            'relation_text' and 'evidence'.

    Returns:
        An 8-tuple ``(title, text, entities, head, tail, relation_id,
        relation_text, evidence)``. ``text`` joins the items of each element
        of 'sents' with single spaces and joins those strings with newlines;
        ``entities`` is 'vertexSet' flattened by one level into a list.

    Raises:
        KeyError: If any of the keys listed above is missing.
    """
    doc_title = docred_instance['title']

    # Tokens -> sentence strings -> one newline-separated document string.
    text = '\n'.join(' '.join(tokens) for tokens in docred_instance['sents'])

    # Remove exactly one level of nesting from the vertex set.
    flat_entities = list(itertools.chain.from_iterable(docred_instance['vertexSet']))

    # Pull the label columns in the order callers expect them in the tuple.
    labels = docred_instance['labels']
    label_fields = tuple(
        labels[key]
        for key in ('head', 'tail', 'relation_id', 'relation_text', 'evidence')
    )

    return (doc_title, text, flat_entities, *label_fields)

In [14]:
from transformers import AutoTokenizer, pipeline

# Checkpoint id shared by the tokenizer and the pipeline below; it is also
# passed to merge_result() in a later cell.
model_name = 'dslim/distilbert-NER'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# 'ner' pipeline built from the same checkpoint and the tokenizer loaded above.
ner_pipeline = pipeline('ner', model=model_name, tokenizer=tokenizer)

In [16]:
import importlib
import _NER
# Reload before the from-import so that edits to the local _NER module are
# picked up without restarting the kernel and the names below bind to the
# freshly reloaded functions.
importlib.reload(_NER)
from _NER import merge_result, combine_entities

# Row of `ds` used for the side-by-side comparison.
EXAMPLE_INDEX = 9

# Only the first three fields of get_info()'s tuple are needed here. Slicing
# (rather than unpacking the remainder into `_` five times) leaves IPython's
# `_` last-output variable untouched.
title, text, original_entities = get_info(ds.iloc[EXAMPLE_INDEX])[:3]

print(text)

# Run the NER pipeline on the document text, then post-process with the _NER
# helpers (presumably merging sub-token predictions into entities — see _NER).
result = ner_pipeline(text)
merged_result = merge_result(result, model_name)
distilbert_entities = combine_entities(merged_result)

# Print both entity lists under their headings, dataset annotations first.
for heading, entities in (
    ('ORIGINAL:', original_entities),
    ('DISTILBERT:', distilbert_entities),
):
    print(heading)
    for e in entities:
        print(e)

Lakshmana ( , IAST : lakṣmaṇa , lit .
he who has the signs of fortune ) , also spelled as Laxman or Lakhan , is the younger brother of Rama and his aide in the Hindu epic , the Ramayana .
He is also known by other names- Saumitra ( , IAST : saumitra , lit .
son of Sumitra ) , Ramanuja ( , IAST : rāmānuja , lit .
younger brother of Rama ) and Bharatanuja ( , IAST : bharatānuja , lit .
younger brother of Bharata ) or Laxman .
Lakshmana is the twin brother of Shatrughna .
According to the Valmiki Ramayana , Lakshmana is one quarter ( 25 % ) component of the manifestation of Lord Vishnu and is considered to be an avatar of Vishnu .
However some puranas of later times regard him as the avatar of Shesha , the thousand - headed serpent associated with Lord Vishnu , the supreme deity in Hinduism .
ORIGINAL:
{'name': 'Lakshmana', 'sent_id': 0, 'pos': [0, 1], 'type': 'MISC'}
{'name': 'Laxman', 'sent_id': 1, 'pos': [12, 13], 'type': 'PER'}
{'name': 'Laxman', 'sent_id': 5, 'pos': [6, 7], 'type': '