In [14]:
import ujson as json
data = {}
with open("dataset/docred/relation_sents.txt", "r") as f:
    for item in f.readlines():
        item = item.split('-')
        data[item[0].strip()] = item[1].strip()

with open("dataset/docred/relation_sents.json", 'w') as f:
    json.dump(data, f)

In [35]:
import ujson as json
import pandas as pd
from transformers import AutoTokenizer
with open("dataset/docred/rel_info.json", "r") as f:
	rel2name = json.load(f)
with open("meta/rel2id.json", "r") as f:
    rel2id = json.load(f)
with open("dataset/docred/relation_sents.json", "r") as f:
    name2example = json.load(f)

rel2description = {}
df = pd.read_csv("dataset/docred/wikidata-properties.csv", encoding="utf-8")
for index, row in df.iterrows():
    rel2description[row["ID"]] = row["description"]

In [36]:
df

Unnamed: 0,ID,label,description,aliases,Data type,Count
0,P0,no relation,there is no relation between the two entities,"no_relation, NA",,
1,P6,head of government,"head of the executive power of this town, city...","president, chancellor, mayor, prime minister, ...",WikibaseItem,35697
2,P10,video,"relevant video. For images, use the property P...","animation, media, gif, trailer (Commons)",CommonsMedia,5586
3,P14,traffic sign,"graphic symbol describing the item, used at th...","road sign, highway shield, shield, highway mar...",CommonsMedia,19614
4,P15,route map,image of route map at Wikimedia Commons,"schema, railroad map, railway map, highway map...",CommonsMedia,22902
...,...,...,...,...,...,...
9338,P10058,IRIS UNIBO author ID,"identifier for an author in IRIS UNIBO, the op...","IRIS UNIBO author identifier, IRIS Università ...",ExternalId,2647
9339,P10059,Philosophica ID,identifier for an entry in the Philosophica on...,Philosophica identifier,ExternalId,80
9340,P10060,Castforward ID,identifier for a person on the Castforward web...,Castforward identifier,ExternalId,60
9341,P10061,Baidu Scholar journal ID,identifier for journals in Baidu Scholar,,ExternalId,26


In [54]:
rel_informations = [(rel2id[d], d, rel2name[d], rel2description[d], name2example[rel2name[d]]) for d in rel2name]
rel_informations.sort(key=lambda x: x[0])
no_relation = (0, "P0", "no relation", "there is no relation between the two entities", "Banana and beef are no relation.")
rel_informations = [no_relation, ] + rel_informations

In [55]:
columns = ['id', 'rel', 'name', 'description', 'example']
df = pd.DataFrame(columns = columns, data = rel_informations)
df.to_csv('rel_informations.csv', encoding='utf-8')

In [62]:
df.loc[93]["description"]

'this person, idea, etc. is informed by that other person, idea, etc., e.g. “Heidegger was influenced by Aristotle”'

In [79]:
tokenizer = AutoTokenizer.from_pretrained("/data1/jiantingtang/.cache/huggingface/hub/models--roberta-large/snapshots/716877d372b884cad6d419d828bac6c85b3b18d9/")
rels = [tokenizer.encode(item[2] + ": " + item[3]) for item in rel_informations]
rels_len_pad = max([len(r) for r in rels])
input_mask = [[1] * len(item) + [0] * (rels_len_pad - len(item)) for item in rels]
input_ids = [item + [tokenizer.sep_token_id]*(rels_len_pad - len(item)) for item in rels]

# input_ids = [tokenizer.convert_tokens_to_ids(item) for item in rels]
# input_ids = [tokenizer.build_inputs_with_special_tokens(item) for item in input_ids]

In [84]:
print([len(d) for d in input_mask])

[74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74]


In [81]:
save_data = {"input_ids": input_ids,  "input_mask": input_mask}

In [83]:
import torch
torch.save(save_data, "rel_name_description.pt")

In [28]:
rel_informations

[(0,
  'P0',
  'no relation',
  'there is no relation between the two entities',
  'Banana and beef are no relation.'),
 (1,
  'P17',
  'country',
  'sovereign state of this item (not to be used for human beings)',
  'Greenland is an autonomous territory of Denmark.'),
 (2,
  'P131',
  'located in the administrative territorial entity',
  'the item is located on the territory of the following administrative entity. Use P276 for specifying locations that are non-administrative places and for items about events. Use P1382 if the item falls only partially into the administrative entity.',
  'Paris is located in the Île'),
 (3,
  'P27',
  'country of citizenship',
  'the object is a country that recognizes the subject as its citizen',
  'Elon Musk has South African citizenship.'),
 (4,
  'P150',
  'contains administrative territorial entity',
  '(list of) direct subdivisions of an administrative territorial entity',
  'Los Angeles County contains the city of Los Angeles.'),
 (5,
  'P577',


In [37]:
tokenizer = AutoTokenizer.from_pretrained("roberta-large")

In [38]:
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [41]:
id_rels = [(docred_rel2id[d], d, data[d]) for d in data]
id_rels.sort(key=lambda x: x[0])
id_rels = [(0, "", 'no relation')] + id_rels
rels = [["*"]+tokenizer.tokenize(item[2])+["*"] for item in id_rels]
rels, rel_pos = [], []
for item in id_rels:
    rel_pos.append(len(rels)+1)
    rels.extend(["*"]+tokenizer.tokenize(item[2])+["*"])

input_ids = tokenizer.convert_tokens_to_ids(rels)
input_ids = tokenizer.build_inputs_with_special_tokens(input_ids)

In [52]:
print(len(id_rels))
print(len(input_ids))

97
453


In [44]:
print(id_rels)
print(rels)
print(rel_pos)
print(input_ids)

[(0, '', 'no relation'), (1, 'P17', 'country'), (2, 'P131', 'located in the administrative territorial entity'), (3, 'P27', 'country of citizenship'), (4, 'P150', 'contains administrative territorial entity'), (5, 'P577', 'publication date'), (6, 'P175', 'performer'), (7, 'P569', 'date of birth'), (8, 'P570', 'date of death'), (9, 'P161', 'cast member'), (10, 'P264', 'record label'), (11, 'P527', 'has part'), (12, 'P361', 'part of'), (13, 'P495', 'country of origin'), (14, 'P19', 'place of birth'), (15, 'P571', 'inception'), (16, 'P54', 'member of sports team'), (17, 'P102', 'member of political party'), (18, 'P463', 'member of'), (19, 'P3373', 'sibling'), (20, 'P40', 'child'), (21, 'P30', 'continent'), (22, 'P50', 'author'), (23, 'P1441', 'present in work'), (24, 'P1001', 'applies to jurisdiction'), (25, 'P69', 'educated at'), (26, 'P26', 'spouse'), (27, 'P607', 'conflict'), (28, 'P57', 'director'), (29, 'P159', 'headquarters location'), (30, 'P22', 'father'), (31, 'P400', 'platform')

In [53]:
import torch
a = torch.load("dataset/docred/rel_info.json.roberta-large.pt")

In [57]:
len(a["input_ids"])

453

In [63]:
rel_informations

[(0,
  'P0',
  'no relation',
  'there is no relation between the two entities',
  'Banana and beef are no relation.'),
 (1,
  'P17',
  'country',
  'sovereign state of this item (not to be used for human beings)',
  'Greenland is an autonomous territory of Denmark.'),
 (2,
  'P131',
  'located in the administrative territorial entity',
  'the item is located on the territory of the following administrative entity. Use P276 for specifying locations that are non-administrative places and for items about events. Use P1382 if the item falls only partially into the administrative entity.',
  'Paris is located in the Île'),
 (3,
  'P27',
  'country of citizenship',
  'the object is a country that recognizes the subject as its citizen',
  'Elon Musk has South African citizenship.'),
 (4,
  'P150',
  'contains administrative territorial entity',
  '(list of) direct subdivisions of an administrative territorial entity',
  'Los Angeles County contains the city of Los Angeles.'),
 (5,
  'P577',


In [65]:
rel = "publication date"
des = 'date or point in time when a work was first published or released'

In [71]:
prompt = """You will be provided with a relation type delimited by <rel></rel> and \
a description for the relation type delimited by <des></des>. \
You should understand the meaning of the relation type and write a sentence that contains the relation type.
- The sentence should contain one subject and one object and they should be delimited by *.
- The subject-object relation in the sentence should be consistent with the given relation type.
- The sentence should not contains other relation type.
- The sentence should be as short as possible.
You can follow such steps:
- step1: define the subject.
- step2: define the object.
- step3: write a sentence containing the subject and object.
----------------
Here are three examples:
input:
<rel>place of birth</rel>
<des>most specific known (e.g. city instead of country, or hospital instead of city) birth location of a person, animal or fictional character</des>
output:
<subject>Woolsthorpe<subject>
<object>Newton<object>
<sent>* Newton * was born in * Woolsthorpe *, England.<sent>

input:
<rel>mother</rel>
<des>female parent of the subject. For stepmother, use "stepparent" (P3448)</des>
output:
<subject>Tina Knowles<subject>
<object>Beyonce<object>
<sent>* Beyonce * 's mother is * Tina Knowles * .<sent>

input:
<rel>follows</rel>
<des>immediately prior item in a series of which the subject is a part [if the subject has replaced the preceding item, e.g. political offices, use "replaces" (P1365)]</des>
output:
<subject>Barack Obama<subject>
<object>George W. Bush<object>
<sent>* Barack Obama * follows * George W. Bush * as the President of the United States.<sent>
----------------
input:
<rel>{}</rel>
<des>{}</des>
output:
"""

In [73]:
print(prompt.format(rel, des))

You will be provided with a relation type delimited by <rel></rel> and a description for the relation type delimited by <des></des>. You should understand the meaning of the relation type and write a sentence that contains the relation type.
- The sentence should contain one subject and one object and they should be delimited by *.
- The subject-object relation in the sentence should be consistent with the given relation type.
- The sentence should not contains other relation type.
- The sentence should be as short as possible.
You can follow such steps:
- step1: define the subject
- step2: define the object
- step3: write a sentence containing the subject and object
----------------
Here are three examples:
input:
<rel>place of birth</rel>
<des>most specific known (e.g. city instead of country, or hospital instead of city) birth location of a person, animal or fictional character</des>
output:
<subject>Woolsthorpe<subject>
<object>Newton<object>
<sent>* Newton * was born in * Woolstho