In [16]:
from datasets import load_dataset
import pandas as pd

# Fetch every DocRED split through the Hugging Face `datasets` loader.
dataset = load_dataset('docred', trust_remote_code=True)

# Materialise the two training splits as pandas DataFrames (same order as
# before: 'train_annotated' first, then 'train_distant').
annotated, distant = (
    pd.DataFrame(dataset[split_name])
    for split_name in ('train_annotated', 'train_distant')
)

In [17]:
# Show the loaded DatasetDict: the available splits with their feature names
# and row counts.
print(dataset)

DatasetDict({
    validation: Dataset({
        features: ['title', 'sents', 'vertexSet', 'labels'],
        num_rows: 998
    })
    test: Dataset({
        features: ['title', 'sents', 'vertexSet', 'labels'],
        num_rows: 1000
    })
    train_annotated: Dataset({
        features: ['title', 'sents', 'vertexSet', 'labels'],
        num_rows: 3053
    })
    train_distant: Dataset({
        features: ['title', 'sents', 'vertexSet', 'labels'],
        num_rows: 101873
    })
})


In [None]:
'''
Data Format:
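(source: https://github.com/thunlp/DocRED/blob/master/data/README.md)
Note: the layout below is the raw JSON format from that README. In the DataFrames used in
this notebook, 'labels' is instead a dict of parallel lists with (at least) the keys
'head', 'tail', 'relation_id' and 'relation_text'.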
{
  'title',
  'sents':     [
                  [word in sent 0], # list of lists of words forming sentences
                  [word in sent 1]
               ]
  'vertexSet': [
                  [
                    { 'name': mention_name, # name of entity mention
                      'sent_id': mention in which sentence, --> index of the sentence where the mention occurs
                      'pos': position of mention in a sentence,  --> start and end position (indices) of the mention in the sentence
                      'type': NER_type} # the NER type, e.g. PER, TIME, MISC
                    {another mention}
                  ],
                  [another entity]
                ]
  'labels':   [
                {
                  'h': idx of head entity in vertexSet,
                  't': idx of tail entity in vertexSet,
                  'r': relation,
                  'evidence': evidence sentences' id --> the sentences from which the relation is supported
                }
              ]
}'''

In [30]:
# Reload the local _RE module so edits to its source are picked up without
# restarting the kernel, then re-import join_text so the name is bound to the
# reloaded definition (the order of these four lines matters).
import importlib
import _RE
importlib.reload(_RE)
from _RE import join_text

# 'sents' of the first row of the annotated training split.
sents = annotated.iloc[0]['sents']
# NOTE(review): join_text is defined in _RE and not visible here. The printed
# output suggests it joins tokens with spaces, one sentence per line; confirm
# this and what fancy=False switches off.
text = join_text(sents, fancy=False)
print(text)

Zest Airways , Inc. operated as AirAsia Zest ( formerly Asian Spirit and Zest Air ) , was a low - cost airline based at the Ninoy Aquino International Airport in Pasay City , Metro Manila in the Philippines .
It operated scheduled domestic and international tourist services , mainly feeder services linking Manila and Cebu with 24 domestic destinations in support of the trunk route operations of other airlines .
In 2013 , the airline became an affiliate of Philippines AirAsia operating their brand separately .
Its main base was Ninoy Aquino International Airport , Manila .
The airline was founded as Asian Spirit , the first airline in the Philippines to be run as a cooperative .
On August 16 , 2013 , the Civil Aviation Authority of the Philippines ( CAAP ) , the regulating body of the Government of the Republic of the Philippines for civil aviation , suspended Zest Air flights until further notice because of safety issues .
Less than a year after AirAsia and Zest Air 's strategic allian

In [77]:
def make_triplets(vertexSet, labels):
    '''
    Build, print and return the <head, relation, tail> triplets of one document.

    Args:
        vertexSet: sequence of entities; each entity is a sequence of mention
            dicts that contain at least a 'name' key.
        labels: mapping holding parallel sequences under the keys 'head',
            'tail', 'relation_id' and 'relation_text'. The 'head' and 'tail'
            values are indices into vertexSet.

    Returns:
        A list with one entry per label, each of the form
        [head_names, [relation_id, relation_text], tail_names], where
        head_names / tail_names are the lists of mention names of the
        referenced entities. The list is empty when there are no labels.

    Raises:
        IndexError: if a head/tail index does not exist in vertexSet.
        KeyError: if a mention lacks 'name' or labels lacks a required key.
    '''
    # Mention names per entity, kept in vertexSet order so head/tail indices
    # can be used directly.
    names = [[mention['name'] for mention in entity] for entity in vertexSet]

    triplets = [
        [names[head_idx], [relation_id, relation_text], names[tail_idx]]
        for head_idx, relation_id, relation_text, tail_idx in zip(
            labels['head'],
            labels['relation_id'],
            labels['relation_text'],
            labels['tail'],
        )
    ]

    # Keep echoing every triplet, as the function always did.
    for triplet in triplets:
        print(triplet)

    # Previously nothing was returned even though the docstring promised it.
    return triplets

In [78]:
# Select the first annotated document explicitly so this cell runs on a fresh
# kernel instead of relying on `vertexSet` / `labels` left behind by another
# cell executed out of order.
vertexSet = annotated['vertexSet'][0]
labels = annotated.iloc[0]['labels']

# make_triplets prints every triplet itself; bind whatever it returns so the
# notebook does not additionally echo a return value as the cell output.
triplets = make_triplets(vertexSet, labels)

# The first triplet here makes no sense

[['Zest Airways, Inc.', 'Asian Spirit and Zest Air', 'AirAsia Zest', 'AirAsia Zest'], ['P159', 'headquarters location'], ['Pasay City']]
[['Zest Airways, Inc.', 'Asian Spirit and Zest Air', 'AirAsia Zest', 'AirAsia Zest'], ['P17', 'country'], ['Philippines', 'Philippines', 'Republic of the Philippines']]
[['Zest Air', 'Zest Air'], ['P17', 'country'], ['Philippines', 'Philippines', 'Republic of the Philippines']]
[['Pasay City'], ['P17', 'country'], ['Philippines', 'Philippines', 'Republic of the Philippines']]
[['Pasay City'], ['P131', 'located in the administrative territorial entity'], ['Metro Manila']]
[['Philippines', 'Philippines', 'Republic of the Philippines'], ['P150', 'contains administrative territorial entity'], ['Metro Manila']]
[['Manila', 'Manila'], ['P17', 'country'], ['Philippines', 'Philippines', 'Republic of the Philippines']]
[['Metro Manila'], ['P150', 'contains administrative territorial entity'], ['Pasay City']]
[['Metro Manila'], ['P131', 'located in the administ

In [60]:
print(annotated.columns)

# Row label / position of the document to inspect.
N = 61

# Print each entry of the document's vertexSet on its own line.
vertexSet = annotated['vertexSet'].loc[N]
for entity_mentions in vertexSet:
    print (entity_mentions)

# Print every key of the document's labels together with its value.
labels = annotated['labels'].iloc[N]
for field in labels:
    print(field, labels[field])

# The first triplet here makes no sense

Index(['title', 'sents', 'vertexSet', 'labels'], dtype='object')
[{'name': 'Jackie Beat', 'sent_id': 0, 'pos': [0, 2], 'type': 'PER'}, {'name': 'Kent Fuher', 'sent_id': 0, 'pos': [21, 23], 'type': 'PER'}, {'name': 'Jackie Beat', 'sent_id': 4, 'pos': [0, 2], 'type': 'PER'}, {'name': 'Beat', 'sent_id': 1, 'pos': [0, 1], 'type': 'PER'}, {'name': 'Beat', 'sent_id': 2, 'pos': [3, 4], 'type': 'PER'}, {'name': 'Beat', 'sent_id': 5, 'pos': [0, 1], 'type': 'PER'}, {'name': 'Beat', 'sent_id': 6, 'pos': [0, 1], 'type': 'PER'}, {'name': 'Beat', 'sent_id': 3, 'pos': [0, 1], 'type': 'PER'}]
[{'name': 'July 24, 1965', 'sent_id': 0, 'pos': [4, 8], 'type': 'TIME'}]
[{'name': 'Flawless', 'sent_id': 1, 'pos': [19, 20], 'type': 'MISC'}]
[{'name': 'Adam & Steve', 'sent_id': 1, 'pos': [22, 25], 'type': 'MISC'}]
[{'name': 'Dance Off', 'sent_id': 1, 'pos': [36, 38], 'type': 'MISC'}]
[{'name': 'Sex and the City', 'sent_id': 2, 'pos': [7, 11], 'type': 'MISC'}]
[{'name': 'Hype', 'sent_id': 2, 'pos': [23, 24], 't

In [68]:
import json

# Metadata files to pretty-print in place; the commented-out entries are
# currently skipped.
filenames = [
    #'docred_metadata/char2id.json',
    #'docred_metadata/ner2id.json',
    #'docred_metadata/rel2id.json',
    #'docred_metadata/word2id.json',
    'docred_metadata/rel_info.json'
    ]

for filename in filenames:
    # Read with an explicit UTF-8 encoding so parsing does not depend on the
    # platform's default text encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Overwrite the same file with the same data, indented by 4 spaces.
    # The file is only reopened for writing after it has been parsed
    # successfully above.
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)

In [13]:
import json

# Load the relation metadata with an explicit UTF-8 encoding so the result
# does not depend on the platform's default text encoding.
with open('docred_metadata/rel_info.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# One numbered line per entry: "<n>. <key>: <value>".
print('Number of classes:', len(data))
for i, key in enumerate(data):
    print(f"{i+1}. {key}: {data[key]}")

Number of classes: 96
1. P6: head of government
2. P17: country
3. P19: place of birth
4. P20: place of death
5. P22: father
6. P25: mother
7. P26: spouse
8. P27: country of citizenship
9. P30: continent
10. P31: instance of
11. P35: head of state
12. P36: capital
13. P37: official language
14. P39: position held
15. P40: child
16. P50: author
17. P54: member of sports team
18. P57: director
19. P58: screenwriter
20. P69: educated at
21. P86: composer
22. P102: member of political party
23. P108: employer
24. P112: founded by
25. P118: league
26. P123: publisher
27. P127: owned by
28. P131: located in the administrative territorial entity
29. P136: genre
30. P137: operator
31. P140: religion
32. P150: contains administrative territorial entity
33. P155: follows
34. P156: followed by
35. P159: headquarters location
36. P161: cast member
37. P162: producer
38. P166: award received
39. P170: creator
40. P171: parent taxon
41. P172: ethnic group
42. P175: performer
43. P176: manufacturer
4