# Convert RE-TACRED to UniRel Training Format

## Reads Re-TACRED Dataset and Transforms records to the UniRel Format below
```
{
    "text": "In Queens , North Shore Towers , near the Nassau border , supplanted a golf course , and housing replaced a gravel quarry in Douglaston .",
    "id": "valid_0",
    "relation_list": [
        {
            "subject": "Douglaston",
            "object": "Queens",
            "predicate": "/location/neighborhood/neighborhood_of",
            "subj_tok_span": [26, 28],
            "obj_tok_span": [1, 2]
        }
    ]
}
```

## Create Data Directory and Relationship Maps

In [6]:
import json

from pathlib import Path

# Re-TACRED relation labels in class-id order: a label's position in this
# tuple is its integer id (index 0 is "no_relation"). Both lookup tables
# below are derived from this single list so they cannot drift apart.
_RELATION_LABELS = (
    "no_relation",
    "org:founded_by",
    "per:identity",
    "org:alternate_names",
    "per:children",
    "per:origin",
    "per:countries_of_residence",
    "per:employee_of",
    "per:title",
    "org:city_of_branch",
    "per:religion",
    "per:age",
    "per:date_of_death",
    "org:website",
    "per:stateorprovinces_of_residence",
    "org:top_members/employees",
    "org:number_of_employees/members",
    "org:members",
    "org:country_of_branch",
    "per:spouse",
    "org:stateorprovince_of_branch",
    "org:political/religious_affiliation",
    "org:member_of",
    "per:siblings",
    "per:stateorprovince_of_birth",
    "org:dissolved",
    "per:other_family",
    "org:shareholders",
    "per:parents",
    "per:charges",
    "per:schools_attended",
    "per:cause_of_death",
    "per:city_of_death",
    "per:stateorprovince_of_death",
    "org:founded",
    "per:country_of_death",
    "per:country_of_birth",
    "per:date_of_birth",
    "per:cities_of_residence",
    "per:city_of_birth",
)

# Integer class id -> relation label.
id2label = dict(enumerate(_RELATION_LABELS))

# Relation label -> integer class id (exact inverse of id2label, same order).
label2id = {label: idx for idx, label in id2label.items()}

# Output directory for the converted files; created (with parents) if missing.
data_dir = Path("./data/retacred")
data_dir.mkdir(parents=True, exist_ok=True)

# Relation vocabulary: the label -> id mapping serialized as JSON.
rel2id_file = data_dir / 'rel2id.json'
rel2id_file.write_text(json.dumps(label2id), encoding='utf8')

# Entity types are not used by this model, so a single default type is written.
ent2id_file = data_dir / "ent2id.json"
ent2id_file.write_text('{"DEFAULT": 0}', encoding='utf8')

14

## Load the Dataset for the Re-TACRED version
- Note: Re-TACRED is a somewhat corrected version of TACRED
- Re-TACRED still has many invalid triples but is a more accurate indicator of performance
- Also only a subset of columns are required for UniRel format (e.g. entity types not used)

In [7]:
import datasets

# Columns kept for the conversion; every other column is discarded.
cols = [
    'id',
    'token',
    'subj_start',
    'subj_end',
    'obj_start',
    'obj_end',
    'relation'
]

# Load the "re-tacred" configuration from the local TACRED JSON directory and
# immediately narrow every split to the columns listed above.
re_ds = datasets.load_dataset(
    "DFKI-SLT/tacred",
    name="re-tacred",
    data_dir='../dataset/tacred/json',
    num_proc=8,
    trust_remote_code=True,
).select_columns(cols)

# Display the resulting splits and their row counts.
re_ds

DatasetDict({
    train: Dataset({
        features: ['id', 'token', 'subj_start', 'subj_end', 'obj_start', 'obj_end', 'relation'],
        num_rows: 58465
    })
    test: Dataset({
        features: ['id', 'token', 'subj_start', 'subj_end', 'obj_start', 'obj_end', 'relation'],
        num_rows: 13418
    })
    validation: Dataset({
        features: ['id', 'token', 'subj_start', 'subj_end', 'obj_start', 'obj_end', 'relation'],
        num_rows: 19584
    })
})

In [45]:
# Longest example, counted in tokens, within the training split.
# The same check on the other splits was noted as 95 (validation) and 96 (test).
df = re_ds["train"].to_pandas()
token_counts = df["token"].map(len)
token_counts.max()

96

## Transform Function

In [8]:
def to_unirel(df, labels=None):
    """Convert a Re-TACRED split held in a DataFrame into UniRel records.

    Args:
        df: DataFrame with the columns ``token`` (sequence of token strings),
            ``subj_start``/``subj_end`` and ``obj_start``/``obj_end`` (token
            offsets used as Python slice bounds on ``token``) and
            ``relation`` (integer class id). Any other column, such as
            ``id``, is passed through unchanged. The frame is not modified.
        labels: Optional mapping from relation id to predicate string.
            Defaults to the module-level ``id2label``.

    Returns:
        A list with one dict per row containing the pass-through columns plus
        ``text`` (tokens joined by single spaces) and ``relation_list`` (a
        one-element list with the subject, object, predicate and both token
        spans). An empty frame yields an empty list.

    Raises:
        KeyError: If a required column is missing or a relation id is not
            present in ``labels``.
    """
    if labels is None:
        labels = id2label

    # Work on a new frame so the caller's DataFrame keeps its columns.
    out = df.drop(
        columns=[
            'token',
            'subj_start',
            'subj_end',
            'obj_start',
            'obj_end',
            'relation',
            'subject',
            'object',
        ],
        errors="ignore",
    )
    out["text"] = [" ".join(tokens) for tokens in df["token"]]

    relation_lists = []
    for tokens, subj_start, subj_end, obj_start, obj_end, relation in zip(
        df["token"],
        df["subj_start"],
        df["subj_end"],
        df["obj_start"],
        df["obj_end"],
        df["relation"],
    ):
        # Force plain ints: numpy integer scalars nested inside the record
        # would make json.dumps fail.
        subj_start, subj_end = int(subj_start), int(subj_end)
        obj_start, obj_end = int(obj_start), int(obj_end)
        relation_lists.append([{
            "subject": " ".join(tokens[subj_start:subj_end]),
            "object": " ".join(tokens[obj_start:obj_end]),
            "predicate": labels[relation],
            "subj_tok_span": [subj_start, subj_end],
            "obj_tok_span": [obj_start, obj_end],
        }])
    out["relation_list"] = relation_lists

    return out.to_dict(orient='records')


## Write JSON Content for Data Splits

### Validation Split

In [9]:
# Convert the validation split and store it as valid_data.json.
val_dicts = to_unirel(re_ds["validation"].to_pandas())
print(f"Examples: {len(val_dicts)}")
valid_file = data_dir / 'valid_data.json'
valid_file.write_text(json.dumps(val_dicts), encoding='utf8')
# Release the converted records once they are on disk.
del val_dicts

Examples: 19584


### Train Split

In [10]:
# Convert the training split and store it as train_split.json.
train_dicts = to_unirel(re_ds["train"].to_pandas())
print(f"Examples: {len(train_dicts)}")
train_file = data_dir / 'train_split.json'
train_file.write_text(json.dumps(train_dicts), encoding='utf8')
# Release the converted records once they are on disk.
del train_dicts

Examples: 58465


### Test Split

In [None]:
# Convert the test split and store it as test_data.json.
test_dicts = to_unirel(re_ds["test"].to_pandas())
print(f"Examples: {len(test_dicts)}")
test_file = data_dir / 'test_data.json'
test_file.write_text(json.dumps(test_dicts), encoding='utf8')
# test_dicts stays in memory here; the cleanup below is left disabled.
# del test_dicts

Examples: 13418


5136098

: 