In [1]:
import pandas as pd
import random
import sys
from transformers import AutoTokenizer, pipeline
from collections import Counter

sys.path.insert(0, '../comparing_LLMs/')
from _NER import join_tokens, merge_result, get_predicted_tags, calculate_metrics

articles = pd.read_json('../../scraping/articles/all_articles.json')
text = articles['text'][45]


In [9]:
model = "dslim/distilbert-NER"
tokenizer = AutoTokenizer.from_pretrained(model)
ner_pipeline = pipeline('ner', model=model, tokenizer=tokenizer)

text = [articles['text'][45][0]]

total_result = []
for paragrpah in text:
    result = ner_pipeline(paragrpah)
    for r in result:
        total_result.append(r)

merged_total = merge_result(total_result, model)

def merge_entities(entities):
    merged_entities = []
    labels = []
    current = None

    for entity in entities:
        if current is None:
            current = entity
            labels.append(entity['entity'][2:])
        else:
            if entity['entity'].startswith('I') and current['entity'].startswith('B'):
                current['word'] += ' ' + entity['word']
                current['end'] = entity['end']
                current['score'] = min(current['score'], entity['score'])
                labels.append(entity['entity'][2:])
            else:
                dominant_label = Counter(labels).most_common(1)[0][0]
                current['entity'] = 'B-' + dominant_label
                merged_entities.append(current)
                labels = [entity['entity'][2:]]
                current = entity

    if current is not None:
        dominant_label = Counter(labels).most_common(1)[0][0]
        current['entity'] = 'B-' + dominant_label
        merged_entities.append(current)
    
    return merged_entities

print('\nJoined:')
joined_result = merge_entities(merged_total)
for r in joined_result:
    print(r['entity'],':',r['word'])


Joined:
B-ORG : Department for Energy Security and Net Zero
B-ORG : DESNZ
B-ORG : Great British Nuclear
B-ORG : GBN
B-LOC : UK


In [14]:
from transformers import pipeline
model = 'Babelscape/rebel-large'

triplet_extractor = pipeline('text2text-generation', model=model, tokenizer=model)

# CODE FROM DOCUMENTATION
def extract_triplets(text):
    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
    return triplets

extracted_text = triplet_extractor.tokenizer.batch_decode([triplet_extractor(text, return_tensors=True, return_text=False)[0]["generated_token_ids"]])

<s><triplet> Department for Energy Security and Net Zero <subj> Great British Nuclear <obj> subsidiary <triplet> Great British Nuclear <subj> Department for Energy Security and Net Zero <obj> parent organization</s>
[{'head': 'Department for Energy Security and Net Zero', 'type': 'subsidiary', 'tail': 'Great British Nuclear'}, {'head': 'Great British Nuclear', 'type': 'parent organization', 'tail': 'Department for Energy Security and Net Zero'}]


In [19]:
text = articles['text'][45][0] + articles['text'][45][1]
print(text)
print(extracted_text[0])

extracted_triplets = extract_triplets(extracted_text[0])
for e in extracted_triplets:
    print(e)

The Department for Energy Security and Net Zero (DESNZ) is today launching Great British Nuclear (GBN), the new government body that will be the delivery vehicle for all of the UK’s nuclear projects, includingAccording to DESNZ: “GBN will drive the rapid expansion of new nuclear power plants in the UK at an unprecedented scale and pace.
<s><triplet> Department for Energy Security and Net Zero <subj> Great British Nuclear <obj> subsidiary <triplet> Great British Nuclear <subj> Department for Energy Security and Net Zero <obj> parent organization</s>
{'head': 'Department for Energy Security and Net Zero', 'type': 'subsidiary', 'tail': 'Great British Nuclear'}
{'head': 'Great British Nuclear', 'type': 'parent organization', 'tail': 'Department for Energy Security and Net Zero'}


In [20]:
print(articles['text'][45])

['The Department for Energy Security and Net Zero (DESNZ) is today launching Great British Nuclear (GBN), the new government body that will be the delivery vehicle for all of the UK’s nuclear projects, including', 'According to DESNZ: “GBN will drive the rapid expansion of new nuclear power plants in the UK at an unprecedented scale and pace.', '“This will boost UK energy security, reduce dependence on volatile fossil fuel imports, create more affordable power and grow the economy, with the nuclear industry estimated to generate around £6bn for the UK economy.”', 'DESNZ added that GBN will play a key role in helping the government hit its ambition to provide up to a quarter of the UK’s electricity from homegrown nuclear energy by 2050.', 'GBN’s official launch is supported by a competition for organisations to bid for funding support for the development of their nuclear products, including small modular reactors (SMRs).', 'Current SMR projects include', 'Rolls-Royce SMR proposals looki