In [2]:
from datasets import load_dataset
import pandas as pd

# Load every split of DocRED. trust_remote_code=True permits the dataset's own
# loading script to execute, so only use it with a source you trust.
dataset = load_dataset('docred', trust_remote_code=True)

# Build one DataFrame per training split, in the order the names are listed.
annotated, distant = (
    pd.DataFrame(dataset[split_name])
    for split_name in ('train_annotated', 'train_distant')
)

In [61]:
# Inspect the loaded DatasetDict: split names, feature columns and row counts.
print(dataset)

DatasetDict({
    validation: Dataset({
        features: ['title', 'sents', 'vertexSet', 'labels'],
        num_rows: 998
    })
    test: Dataset({
        features: ['title', 'sents', 'vertexSet', 'labels'],
        num_rows: 1000
    })
    train_annotated: Dataset({
        features: ['title', 'sents', 'vertexSet', 'labels'],
        num_rows: 3053
    })
    train_distant: Dataset({
        features: ['title', 'sents', 'vertexSet', 'labels'],
        num_rows: 101873
    })
})


In [None]:
'''
Data Format:
(source: https://github.com/thunlp/DocRED/blob/master/data/README.md)
{
  'title',
  'sents':     [
                  [word in sent 0], # list of lists of words forming sentences
                  [word in sent 1]
               ]
  'vertexSet': [
                  [
                    { 'name': mention_name, # name of entity mention
                      'sent_id': mention in which sentence, # index of the sentence where the mention occurs
                      'pos': position of mention in a sentence,  # start and end position (indices) of the mention in the sentence
                      'type': NER_type} #the NER type, e.g. PERSON, LOCATION
                    {another mention}
                  ], 
                  [another entity]
                ]
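  # NOTE: the Hugging Face 'docred' loader used in this notebook returns 'labels' as a dict of
  # parallel lists keyed 'head', 'tail', 'relation_id', 'relation_text' and 'evidence',
  # rather than the list of {'h', 't', 'r', 'evidence'} records described below.
  'labels':   [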
                {
                  'h': idx of head entity in vertexSet,
                  't': idx of tail entity in vertexSet,
                  'r': relation,
                  'evidence': evidence sentences' id
                }
              ]
}'''

In [60]:
# Sentences
# join_text is a project-local helper from the _RE module; its exact output format
# is not visible here — presumably it merges the token lists into one text (TODO confirm).
from _RE import join_text

# Take the 'sents' field of the document at row 3 of the annotated training split.
sents = annotated.iloc[3]['sents']

# Each entry of sents is a sequence of word tokens; print one space-joined sentence per line.
for sent in sents:
    print(' '.join(sent))

# Print a marker line, then whatever join_text returns for the same document.
print('JOINED:')
print(join_text(sents))

Pacific Fair is a major shopping centre in Broadbeach Waters on the Gold Coast , Queensland , Australia .
It was Queensland 's largest regional shopping centre until 2006 .
Pacific Fair was developed by Hooker Retail Developments and opened in 1977 on what was swampland with 96 specialty stores and two anchor tenants .
Since then , Pacific Fair has undergone numerous expansions and has grown to have more than 300 specialty stores and four anchor tenants .
In January 2014 , work began on a major redevelopment project to meet the predicted regional growth on the Gold Coast .
Prior to the redevelopment , the shopping centre had four main major stores including a four - level Myer , Kmart , Target , Coles and Toys ' R ' Us .
Daimaru operated in the centre before its Australian withdrawal , albeit briefly .
It also had a 12-screen Birch Carroll and Coyle Cinema ( re - opened as Event Cinemas in late 2015 ) .
Pacific Fair is a major public transport interchange on the Gold Coast , serviced b

In [9]:
# Relation labels of the first annotated document, printed one field per line
# as "<field name> <values>".
labels = annotated.iloc[0]['labels']
for field, values in labels.items():
    print(field, values)

head [0, 0, 12, 2, 2, 4, 5, 3, 3, 3, 1, 1, 10]
tail [2, 4, 4, 4, 3, 3, 4, 2, 4, 4, 2, 4, 4]
relation_id ['P159', 'P17', 'P17', 'P17', 'P131', 'P150', 'P17', 'P150', 'P131', 'P17', 'P131', 'P17', 'P17']
relation_text ['headquarters location', 'country', 'country', 'country', 'located in the administrative territorial entity', 'contains administrative territorial entity', 'country', 'contains administrative territorial entity', 'located in the administrative territorial entity', 'country', 'located in the administrative territorial entity', 'country', 'country']
evidence [[0], [2, 4, 7], [6, 7], [0], [0], [0], [0, 3], [0], [0, 3], [0, 3], [0, 3], [0, 3], [4]]


In [65]:
import json

# Mapping from characters (plus the "BLANK"/"UNK" entries) to integer ids.
filename = 'docred_metadata/char2id.json'

# Name the encoding explicitly: JSON text is UTF-8, and this vocabulary contains
# non-ASCII characters, so the platform's locale default must not be relied upon.
with open(filename, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Pretty-print the mapping for inspection.
print(json.dumps(data, indent=4))

# Rewrite the same file in place with 4-space indentation. json.dump keeps its
# default ensure_ascii=True, so non-ASCII characters are stored as \uXXXX escapes.
with open(filename, 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4)

{
    "BLANK": 0,
    "UNK": 1,
    "t": 2,
    "h": 3,
    "e": 4,
    "w": 5,
    "o": 6,
    "r": 7,
    "k": 8,
    "-": 9,
    "p": 10,
    "a": 11,
    "s": 12,
    "n": 13,
    "d": 14,
    "g": 15,
    "u": 16,
    "(": 17,
    ";": 18,
    ")": 19,
    ",": 20,
    "l": 21,
    "'": 22,
    "m": 23,
    "i": 24,
    "y": 25,
    "f": 26,
    "c": 27,
    ".": 28,
    "v": 29,
    "b": 30,
    "j": 31,
    "1": 32,
    "4": 33,
    "9": 34,
    "5": 35,
    "2": 36,
    "0": 37,
    "6": 38,
    "z": 39,
    "/": 40,
    "q": 41,
    "3": 42,
    "x": 43,
    "8": 44,
    ":": 45,
    "7": 46,
    "\u2014": 47,
    "\u00e9": 48,
    "\"": 49,
    "\u014d": 50,
    "\u1e63": 51,
    "\u0101": 52,
    "%": 53,
    "\u0645": 54,
    "\u0648": 55,
    "\u0627": 56,
    "\u0631": 57,
    "\u00a3": 58,
    "$": 59,
    "\u0142": 60,
    "\u015b": 61,
    "\u0119": 62,
    "\u2019": 63,
    "&": 64,
    "\u00e8": 65,
    "\u00ed": 66,
    "\u00f1": 67,
    "\u00e1": 68,
    "\u0131": 