In [13]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import joblib
import numpy as np
import scipy.special as sp
from transformers import LukeTokenizer, LukeForEntityPairClassification
import json
import torch
from tqdm import trange
import pdb
import random
from collections import Counter

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer, device = 0)
example = "Berlin"

ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-LOC', 'score': 0.99975425, 'index': 1, 'word': 'Berlin', 'start': 0, 'end': 6}]


In [22]:
# This function loads the dataset .json file and produce the list containing the relation extraction instances.
def load_examples(dataset_file):
    with open(dataset_file, "r") as f:
        data = json.load(f)

    examples = []
    for i, item in enumerate(data):
        tokens = item["token"]
        token_spans = dict(
            subj=(item["subj_start"], item["subj_end"] + 1),
            obj=(item["obj_start"], item["obj_end"] + 1)
        )

        if token_spans["subj"][0] < token_spans["obj"][0]:
            entity_order = ("subj", "obj")
        else:
            entity_order = ("obj", "subj")

        text = ""
        cur = 0
        char_spans = {}
        for target_entity in entity_order:
            token_span = token_spans[target_entity]
            text += " ".join(tokens[cur : token_span[0]])
            if text:
                text += " "
            char_start = len(text)
            text += " ".join(tokens[token_span[0] : token_span[1]])
            char_end = len(text)
            char_spans[target_entity] = (char_start, char_end)
            text += " "
            cur = token_span[1]
        text += " ".join(tokens[cur:])
        text = text.rstrip()

        examples.append(dict(
            text=text,
            entity_spans=[tuple(char_spans["subj"]), tuple(char_spans["obj"])],
            label=item["relation"],
            entity_type = (item['subj_type'], item['obj_type']),
        ))

    return examples

def overlap(range_a, range_ls):
    for range_ in range_ls:
        if range_[0] <= range_a[0] <= range_[1] or range_a[0] <= range_[0] <= range_a[1]:
            return True
    
    return False

def printREInstance(text, subj_span, obj_span, subj_type, obj_type):
    if subj_span[0] < obj_span[0]:
        new_text = text[:subj_span[0]] + '@' + text[subj_span[0] : subj_span[1]] + '@' + '(subj type:' + subj_type + ')' + \
                    text[subj_span[1] : obj_span[0]] + '#' + text[obj_span[0] : obj_span[1]] + '#' + '(obj type:' + obj_type + ')' + \
                    text[obj_span[1] : ]
    else:
        new_text = text[:obj_span[0]] + '#' + text[obj_span[0] : obj_span[1]] + '#'  + '(obj type:' + obj_type + ')' + \
                    text[obj_span[1] : subj_span[0]] + '@' + text[subj_span[0] : subj_span[1]] + '@' + '(subj type:' + subj_type + ')' + \
                    text[subj_span[1] : ]
        
    print(new_text)

In [24]:
test_examples = load_examples("test.json")
re_test_examples = load_examples("test_re.json")
rev_test_examples = load_examples("test_rev.json")
printREInstance(test_examples[0]['text'], 
                test_examples[0]['entity_spans'][0], test_examples[0]['entity_spans'][1],
                test_examples[0]['entity_type'][0], test_examples[0]['entity_type'][1],
               )

subj_per_name_ls = list(
        [
        test_examples[id_]['text'][
            test_examples[id_]['entity_spans'][0][0] 
            : 
            test_examples[id_]['entity_spans'][0][1]
            ].upper()
        for id_ in range(len(test_examples))
        if 
        (test_examples[id_]['entity_type'][0] == 'PERSON'
        )
        ]
)

subj_org_name_ls = list(
        [
        test_examples[id_]['text'][
            test_examples[id_]['entity_spans'][0][0] 
            : 
            test_examples[id_]['entity_spans'][0][1]
            ].upper()
        for id_ in range(len(test_examples))
        if 
        (test_examples[id_]['entity_type'][0] == 'ORGANIZATION'
        )
        ]
)

print('distinct number of tacred subj person names: ', len(set(subj_per_name_ls)))
print('distinct number of tacred subj organization names: ',len(set(subj_org_name_ls)))

re_subj_per_name_ls = list(
        [
        re_test_examples[id_]['text'][
            re_test_examples[id_]['entity_spans'][0][0] 
            : 
            re_test_examples[id_]['entity_spans'][0][1]
            ].upper()
        for id_ in range(len(re_test_examples))
        if 
        (re_test_examples[id_]['entity_type'][0] == 'PERSON'
        )
        ]
)

re_subj_org_name_ls = list(
        [
        re_test_examples[id_]['text'][
            re_test_examples[id_]['entity_spans'][0][0] 
            : 
            re_test_examples[id_]['entity_spans'][0][1]
            ].upper()
        for id_ in range(len(re_test_examples))
        if 
        (re_test_examples[id_]['entity_type'][0] == 'ORGANIZATION'
        )
        ]
)

print('distinct number of retacred subj person names: ', len(set(re_subj_per_name_ls)))
print('distinct number of retacred subj organization names: ',len(set(re_subj_org_name_ls)))

rev_subj_per_name_ls = list(
        [
        rev_test_examples[id_]['text'][
            rev_test_examples[id_]['entity_spans'][0][0] 
            : 
            rev_test_examples[id_]['entity_spans'][0][1]
            ].upper()
        for id_ in range(len(rev_test_examples))
        if 
        (rev_test_examples[id_]['entity_type'][0] == 'PERSON'
        )
        ]
)

rev_subj_org_name_ls = list(
        [
        rev_test_examples[id_]['text'][
            rev_test_examples[id_]['entity_spans'][0][0] 
            : 
            rev_test_examples[id_]['entity_spans'][0][1]
            ].upper()
        for id_ in range(len(rev_test_examples))
        if 
        (rev_test_examples[id_]['entity_type'][0] == 'ORGANIZATION'
        )
        ]
)

print('distinct number of tacrev subj person names: ', len(set(rev_subj_per_name_ls)))
print('distinct number of tacrev subj organization names: ',len(set(rev_subj_org_name_ls)))

He has served as a policy aide to the late U.S. Senator Alan Cranston , as National Issues Director for the 2004 presidential campaign of Congressman Dennis Kucinich , as a co-founder of @Progressive Democrats of America@(subj type:ORGANIZATION) and as a member of the #international policy department#(obj type:ORGANIZATION) at the RAND Corporation think tank before all that .
distinct number of tacred subj person names:  238
distinct number of tacred subj organization names:  204
distinct number of retacred subj person names:  217
distinct number of retacred subj organization names:  177
distinct number of tacrev subj person names:  238
distinct number of tacrev subj organization names:  204


In [25]:
subj_overlap_ls = []
batch_size = 128
for batch_start_idx in trange(0, len(test_examples), batch_size):
    batch_examples = test_examples[batch_start_idx:batch_start_idx + batch_size]
    texts = [example["text"] for example in batch_examples]
    entity_spans = [example["entity_spans"][0] for example in batch_examples]
    entity_types = [example["entity_type"][0] for example in batch_examples]
    
    ner_result_ls = nlp(texts)
    ner_per_ls = [[(ner_['start'], ner_['end']) for ner_ in entity_ls_ if 'PER' in ner_['entity']] for entity_ls_ in ner_result_ls]
    ner_org_ls = [[(ner_['start'], ner_['end']) for ner_ in entity_ls_ if 'ORG' in ner_['entity']] for entity_ls_ in ner_result_ls]

    overlap_flag = [
        False
        or (overlap(entity_spans[i_], ner_per_ls[i_]) and entity_types[i_] == 'PERSON')
        or (overlap(entity_spans[i_], ner_org_ls[i_]) and entity_types[i_] == 'ORGANIZATION')
        for i_ in range(len(batch_examples))
    ]
    
    subj_overlap_ls.extend([i_ + batch_start_idx for i_ in range(len(batch_examples)) if overlap_flag[i_]])

print(len(subj_overlap_ls), len(test_examples))

obj_overlap_ls = []
for batch_start_idx in trange(0, len(test_examples), batch_size):
    batch_examples = test_examples[batch_start_idx:batch_start_idx + batch_size]
    texts = [example["text"] for example in batch_examples]
    entity_spans = [example["entity_spans"][1] for example in batch_examples]
    entity_types = [example["entity_type"][1] for example in batch_examples]
    
    ner_result_ls = nlp(texts)
    ner_per_ls = [[(ner_['start'], ner_['end']) for ner_ in entity_ls_ if 'PER' in ner_['entity']] for entity_ls_ in ner_result_ls]
    ner_org_ls = [[(ner_['start'], ner_['end']) for ner_ in entity_ls_ if 'ORG' in ner_['entity']] for entity_ls_ in ner_result_ls]

    overlap_flag = [
        False
        or (overlap(entity_spans[i_], ner_per_ls[i_]) and entity_types[i_] == 'PERSON')
        or (overlap(entity_spans[i_], ner_org_ls[i_]) and entity_types[i_] == 'ORGANIZATION')
        for i_ in range(len(batch_examples))
    ]
    
    obj_overlap_ls.extend([i_ + batch_start_idx for i_ in range(len(batch_examples)) if overlap_flag[i_]])

print(len(obj_overlap_ls), len(test_examples))
print(len(set(obj_overlap_ls).union(set(subj_overlap_ls))), len(test_examples))

100%|██████████| 122/122 [03:21<00:00,  1.66s/it]


10145 15509


100%|██████████| 122/122 [03:21<00:00,  1.65s/it]

4912 15509
11661 15509





In [26]:
from collections import Counter

named_organization_name_ls = [
                test_examples[id_]['text'][
                    test_examples[id_]['entity_spans'][0][0] 
                    : 
                    test_examples[id_]['entity_spans'][0][1]
                    ].upper()
                for id_ in subj_overlap_ls
                if test_examples[id_]['entity_type'][0] == 'ORGANIZATION'
                ] + \
                [
                test_examples[id_]['text'][
                    test_examples[id_]['entity_spans'][1][0] 
                    : 
                    test_examples[id_]['entity_spans'][1][1]
                    ].upper()
                for id_ in obj_overlap_ls
                if test_examples[id_]['entity_type'][1] == 'ORGANIZATION'
                ]

print('number of distinct named organization: ', len(set(named_organization_name_ls)))
print('counter of named organization: ', Counter(named_organization_name_ls))
print()

named_person_name_ls = [
        test_examples[id_]['text'][
            test_examples[id_]['entity_spans'][0][0] 
            : 
            test_examples[id_]['entity_spans'][0][1]
            ].upper()
        for id_ in subj_overlap_ls
        if test_examples[id_]['entity_type'][0] == 'PERSON'
        ] + \
        [
        test_examples[id_]['text'][
            test_examples[id_]['entity_spans'][1][0] 
            : 
            test_examples[id_]['entity_spans'][1][1]
            ].upper()
        for id_ in obj_overlap_ls
        if test_examples[id_]['entity_type'][1] == 'PERSON'
        ]


print('number of distinct named person: ', len(set(named_person_name_ls)))
print('counter of named person: ', Counter(named_person_name_ls))

number of distinct named organization:  966
counter of named organization:  Counter({'CORPORATE LIBRARY': 243, 'SHOPPERTRAK': 240, 'ALICO': 236, 'ADF': 215, 'NATIONAL RESTAURANT ASSOCIATION': 212, 'PATA': 194, 'COUNTRYWIDE': 192, 'NATIONAL URBAN LEAGUE': 179, 'CHINA CHARITY FEDERATION': 167, 'NATIONAL DEVELOPMENT AND REFORM COMMISSION': 166, 'NDRC': 128, 'PACIFIC ASIA TRAVEL ASSOCIATION': 122, 'NATIONAL CONGRESS OF AMERICAN INDIANS': 118, 'PROGRESSIVE DEMOCRATS OF AMERICA': 102, 'OANA': 100, 'NATIONAL TAIWAN SYMPHONY ORCHESTRA': 99, 'NATIONAL ENERGY ADMINISTRATION': 94, 'AMERICAN FREE PRESS': 93, 'TPI': 93, 'UASR': 85, 'LIHOP': 82, 'MENIL COLLECTION': 81, 'NDA': 81, 'SYCAMORE': 77, 'NTSO': 75, 'PDA': 71, 'URBAN LEAGUE': 66, 'SEMEN GRESIK': 63, 'AFP': 56, 'PSIA': 47, 'LOOKING CLOUD': 43, 'KOCH FOODS': 43, 'COUNTRYWIDE FINANCIAL': 42, 'AAUW': 39, 'COUNTRYWIDE FINANCIAL CORP.': 38, 'NRA': 36, 'NUCLEAR DECOMMISSIONING AUTHORITY': 36, 'AIG': 36, 'TRIBECA ENTERPRISES': 34, 'AMERICAN ASSOCIAT

In [27]:
subj_no_overlap_ls = list(set(range(len(test_examples))) - set(subj_overlap_ls))
obj_no_overlap_ls = list(set(range(len(test_examples))) - set(obj_overlap_ls))

no_named_organization_name_ls = [
                test_examples[id_]['text'][
                    test_examples[id_]['entity_spans'][0][0] 
                    : 
                    test_examples[id_]['entity_spans'][0][1]
                    ].upper()
                for id_ in subj_no_overlap_ls
                if test_examples[id_]['entity_type'][0] == 'ORGANIZATION'
                ] + \
                [
                test_examples[id_]['text'][
                    test_examples[id_]['entity_spans'][1][0] 
                    : 
                    test_examples[id_]['entity_spans'][1][1]
                    ].upper()
                for id_ in obj_no_overlap_ls
                if test_examples[id_]['entity_type'][1] == 'ORGANIZATION'
                ]

no_named_person_name_ls = [
                test_examples[id_]['text'][
                    test_examples[id_]['entity_spans'][0][0] 
                    : 
                    test_examples[id_]['entity_spans'][0][1]
                    ].upper()
                for id_ in subj_no_overlap_ls
                if test_examples[id_]['entity_type'][0] == 'PERSON'
                ] + \
                [
                test_examples[id_]['text'][
                    test_examples[id_]['entity_spans'][1][0] 
                    : 
                    test_examples[id_]['entity_spans'][1][1]
                    ].upper()
                for id_ in obj_no_overlap_ls
                if test_examples[id_]['entity_type'][1] == 'PERSON'
                ]

bad_org_name = ['ASSOCIATION', 'IT', 'HER', 'GROUP', 'SHE', 'THE FACTORY', 'EMAIL', 'HIS', 'COMPANY', 'ITS', 'HER', 'HIS', 'HOUSE', 'THE', 'HIM']
good_per_name = ['SCRAPBLOG', 'PIEDRA', 'SYCAMORE', 'HELOISE', 'PIEDRA', 'ALESSI', 'AQUASH', 'CERNIGLIA', 'ANNA MAE', 'SEMEN GRESIK', 'ANDREW LANGE', 'MESSINA DENARO', 'RICHARD GAGE', 
                'NORRIS CHURCH MAILER', 'ASIA BIBI', 'FABRIS',  'NELL MINOW', 'PAUL HODGSON']
bad_per_name = list(set(no_named_person_name_ls) - set(good_per_name))
print('the bad person entities that cannot be replaced with other persons: ', bad_per_name)

the bad person entities that cannot be replaced with other persons:  ['HELPFUL HINTER', 'SHE', 'BURNS', 'BOFA', 'CASTRO', 'KALINEN', 'JUAREZ', 'ST. PAUL', 'SEARS', 'A SENIOR OFFICIAL', 'CREEK', 'LENOIR', 'PANEER BURJI', 'AWAITING THE VERDICT WITH OTHER FAMILY MEMBERS', "''", 'FRANKO PIZZA', 'TOM HAYDEN', 'PAKISTAN', 'THE BOY', 'DE MENILS', 'NIDAL', 'LEVY', 'KNOX', 'AL-QADA', 'HELL', 'ARMY OF THE MAHDI', 'VICTOR', 'MAN', 'KOBE', 'HER HUSBAND', 'TRAPANI', 'WOMAN', 'MASCARA', 'BERMAS', 'BABY', 'LOUIS GHOST', 'DEPUTY POLICE CHIEF', 'THE KILLER', 'HUSBAND', 'SON CITGO', 'PAUL', 'BELO HORIZONTE', 'REDSTONE', 'NASSER', 'PIHAKNYA SELAKU PENYELENGGARA MENGAKU BANGGA BISA MELAHIRKAN BINTANG-BINTANG MASA DEPAN DARI KALANGAN MASYARAKAT', 'DAVID', 'FINN', 'CODE PINK', 'THE AMERICAN', 'AN AMERICAN', 'SUSHI RAN', 'BROWN', 'CONTRACTOR', 'MUSLIM', 'JOHN GALT', 'MEMBER', 'DOE', 'LOOSE CHANGE', 'SHERROD', 'JACKSON', 'TAU', 'COSA NOSTRA', 'PLANTANUS', 'TOURISTS', 'BUD LITE', 'BEHROOZ', 'WORKER', 'PEOPLE',

In [29]:
bad_example_per_ls = ['BANK OF AMERICA']
bad_example_org_ls = ['HIS']
example_bad_id_ls = list(
        [
        id_
        for id_ in range(len(test_examples))
        if 
        (test_examples[id_]['entity_type'][0] == 'ORGANIZATION'
        and test_examples[id_]['text'][
            test_examples[id_]['entity_spans'][0][0] 
            : 
            test_examples[id_]['entity_spans'][0][1]
            ].upper() in bad_example_org_ls
        )
        or
        (test_examples[id_]['entity_type'][0] == 'PERSON'
        and test_examples[id_]['text'][
            test_examples[id_]['entity_spans'][0][0] 
            : 
            test_examples[id_]['entity_spans'][0][1]
            ].upper() in bad_example_per_ls
        )
        or
        (test_examples[id_]['entity_type'][1] == 'ORGANIZATION'
        and test_examples[id_]['text'][
            test_examples[id_]['entity_spans'][1][0] 
            : 
            test_examples[id_]['entity_spans'][1][1]
            ].upper() in bad_example_org_ls
        )
        or
        (test_examples[id_]['entity_type'][1] == 'PERSON'
        and test_examples[id_]['text'][
            test_examples[id_]['entity_spans'][1][0] 
            : 
            test_examples[id_]['entity_spans'][1][1]
            ].upper() in bad_example_per_ls
        )
        ]
)
print(example_bad_id_ls)

for id_ in example_bad_id_ls:
    print('instanced id: ', id_)
    printREInstance(test_examples[id_]['text'], 
                    test_examples[id_]['entity_spans'][0], test_examples[id_]['entity_spans'][1],
                    test_examples[id_]['entity_type'][0], test_examples[id_]['entity_type'][1],
                   )
    print()

[4190, 5949, 7523, 7606, 7919, 8939, 10909, 12135, 13112]
instanced id:  4190
It traveled to Rice University , where de Menil and #his#(obj type:ORGANIZATION) wife , Dominique de Menil , who later founded the @Menil Collection@(subj type:ORGANIZATION) , ran the art museum .

instanced id:  5949
AJ - complete psychopath , mentally ill DRG - weasel and snake oil salesmen of the nth degree #Fetzer#(obj type:PERSON) - Loony toons SJ - ignorant of facts , and morally compromising for the sake of @his@(subj type:ORGANIZATION) argument .

instanced id:  7523
He figured that he would sell his home before the interest rate on the loan , taken out from @Countrywide Financial@(subj type:ORGANIZATION) , now owned by #Bank of America#(obj type:PERSON) , reset at a higher level .

instanced id:  7606
To him the walls of #his#(obj type:ORGANIZATION) prison are invisible , and @he@(subj type:ORGANIZATION) believes himself to be free .

instanced id:  7919
when Avery had to calm him down by touching @h

In [30]:
subj_final_id_ls = list(
        [
        id_
        for id_ in range(len(test_examples))
        if 
        (test_examples[id_]['entity_type'][0] == 'ORGANIZATION'
        and test_examples[id_]['text'][
            test_examples[id_]['entity_spans'][0][0] 
            : 
            test_examples[id_]['entity_spans'][0][1]
            ].upper() not in bad_org_name
        )
        or
        (test_examples[id_]['entity_type'][0] == 'PERSON'
        and test_examples[id_]['text'][
            test_examples[id_]['entity_spans'][0][0] 
            : 
            test_examples[id_]['entity_spans'][0][1]
            ].upper() not in bad_per_name
        )
        ]
)

obj_final_id_ls = list(
        [
        id_
        for id_ in range(len(test_examples))
        if 
        (test_examples[id_]['entity_type'][1] == 'ORGANIZATION'
        and test_examples[id_]['text'][
            test_examples[id_]['entity_spans'][1][0] 
            : 
            test_examples[id_]['entity_spans'][1][1]
            ].upper() not in bad_org_name
        )
        or
        (test_examples[id_]['entity_type'][1] == 'PERSON'
        and test_examples[id_]['text'][
            test_examples[id_]['entity_spans'][1][0] 
            : 
            test_examples[id_]['entity_spans'][1][1]
            ].upper() not in bad_per_name
        )
        ]
)

print('number of instances having subject for replacements: ', len(subj_final_id_ls))
print('number of instances having object for replacements: ', len(obj_final_id_ls))

number of instances having subject for replacements:  10840
number of instances having object for replacements:  5368


In [31]:
from collections import Counter

final_organization_name_ls = [
                test_examples[id_]['text'][
                    test_examples[id_]['entity_spans'][0][0] 
                    : 
                    test_examples[id_]['entity_spans'][0][1]
                    ].upper()
                for id_ in subj_final_id_ls
                if test_examples[id_]['entity_type'][0] == 'ORGANIZATION'
                ] + \
                [
                test_examples[id_]['text'][
                    test_examples[id_]['entity_spans'][1][0] 
                    : 
                    test_examples[id_]['entity_spans'][1][1]
                    ].upper()
                for id_ in obj_final_id_ls
                if test_examples[id_]['entity_type'][1] == 'ORGANIZATION'
                ]

print('number of instances having organization for replacements: ', len(final_organization_name_ls))
print('number of distinct organization names for replacements: ', len(set(final_organization_name_ls)))
print('number of distinct organization names for replacements: ', Counter(final_organization_name_ls))

final_person_name_ls = [
                test_examples[id_]['text'][
                    test_examples[id_]['entity_spans'][0][0] 
                    : 
                    test_examples[id_]['entity_spans'][0][1]
                    ].upper()
                for id_ in subj_final_id_ls
                if test_examples[id_]['entity_type'][0] == 'PERSON'
                ] + \
                [
                test_examples[id_]['text'][
                    test_examples[id_]['entity_spans'][1][0] 
                    : 
                    test_examples[id_]['entity_spans'][1][1]
                    ].upper()
                for id_ in obj_final_id_ls
                if test_examples[id_]['entity_type'][1] == 'PERSON'
                ]

print(len(final_person_name_ls))
print(len(set(final_person_name_ls)))
print(Counter(final_person_name_ls))

number of instances having organization for replacements:  7681
number of distinct organization names for replacements:  1308
number of distinct organization names for replacements:  Counter({'SHOPPERTRAK': 271, 'CORPORATE LIBRARY': 247, 'ADF': 245, 'ALICO': 238, 'COUNTRYWIDE': 215, 'NATIONAL RESTAURANT ASSOCIATION': 212, 'PATA': 199, 'NATIONAL URBAN LEAGUE': 182, 'CHINA CHARITY FEDERATION': 167, 'NATIONAL DEVELOPMENT AND REFORM COMMISSION': 166, 'NDRC': 128, 'LIHOP': 124, 'PACIFIC ASIA TRAVEL ASSOCIATION': 122, 'NATIONAL CONGRESS OF AMERICAN INDIANS': 118, 'PROGRESSIVE DEMOCRATS OF AMERICA': 107, 'OANA': 106, 'TPI': 101, 'NATIONAL TAIWAN SYMPHONY ORCHESTRA': 99, 'NATIONAL ENERGY ADMINISTRATION': 94, 'AMERICAN FREE PRESS': 93, 'SYCAMORE': 87, 'UASR': 85, 'MENIL COLLECTION': 84, 'NDA': 81, 'NORRIS CHURCH': 79, 'PSIA': 76, 'NTSO': 75, 'PDA': 73, 'LOOKING CLOUD': 73, 'URBAN LEAGUE': 70, 'LOOSE CHANGE': 69, 'ARLO LOOKING CLOUD': 69, 'SEMEN GRESIK': 68, 'AFP': 56, 'KOCH FOODS': 43, 'AAUW': 

In [33]:
final_id_ls = list(set(subj_final_id_ls + obj_final_id_ls))

final_relation_ls = [
        test_examples[id_]['label']
        for id_ in list(set(subj_final_id_ls + obj_final_id_ls))
        ]

print('number of instances for replacements: ', len(final_relation_ls))
print('number of distinct relations for replacements: ', len(set(final_relation_ls)))

label_counter = Counter(final_relation_ls)
print('counter of relation labels for replacements: ', label_counter)
print()

minor_class = [
    class_
    for class_ in list(set(final_relation_ls))
    if label_counter[class_] < 20
]
print('classes of small number of instances: ', minor_class)

number of instances for replacements:  12193
number of distinct relations for replacements:  42
counter of relation labels for replacements:  Counter({'no_relation': 9247, 'per:title': 410, 'org:top_members/employees': 346, 'per:employee_of': 264, 'org:alternate_names': 213, 'per:age': 169, 'per:cities_of_residence': 129, 'per:origin': 114, 'org:country_of_headquarters': 107, 'per:countries_of_residence': 104, 'per:parents': 83, 'org:city_of_headquarters': 82, 'per:charges': 80, 'org:founded_by': 68, 'org:parents': 62, 'per:spouse': 62, 'per:stateorprovinces_of_residence': 57, 'per:siblings': 51, 'per:other_family': 51, 'org:stateorprovince_of_headquarters': 51, 'org:subsidiaries': 44, 'per:cause_of_death': 40, 'per:date_of_death': 39, 'org:founded': 36, 'per:children': 32, 'org:members': 31, 'per:religion': 31, 'per:schools_attended': 30, 'org:website': 24, 'per:city_of_death': 23, 'org:member_of': 18, 'org:number_of_employees/members': 17, 'org:shareholders': 13, 'per:alternate_names

In [34]:
final_id_resample_ls = final_id_ls + [i_ for i_ in final_id_ls if test_examples[i_]['label'] in minor_class] * 2
joblib.dump((final_id_resample_ls, subj_final_id_ls, obj_final_id_ls), 'final_id_resample_ls.output')

['final_id_resample_ls.output']