In [1]:
import pandas as pd
import random
import sys
from transformers import AutoTokenizer, pipeline
from collections import Counter

# Input selection for the experiments in this notebook.
ARTICLES_PATH = '../scraping/articles/all_articles.json'
ARTICLE_INDEX = 45             # article used as the running example
PARAGRAPH_SLICE = slice(1, 6)  # items 1..5 of that article's 'text' entry

articles = pd.read_json(ARTICLES_PATH)
# `text` is what every model cell iterates over (`for paragraph in text`), so the
# 'text' entry is presumably a list of paragraph strings - TODO confirm against the scraper.
text = articles['text'][ARTICLE_INDEX][PARAGRAPH_SLICE]

In [4]:
# For entity extraction
# One checkpoint id drives both the tokenizer and the model, so the two
# cannot drift apart.
model = "dslim/distilbert-NER"
tokenizer = AutoTokenizer.from_pretrained(model)
ner_pipeline = pipeline(
    task='ner',
    model=model,
    tokenizer=tokenizer,
)

# Disabled code: the function below is wrapped in a string literal, so it is never
# defined or executed. It is an in-notebook entity merger that folds I-* tokens into
# the preceding B-* entity (joining words with a space, extending 'end', keeping the
# lowest score) and relabels the merged entity with the most common label seen.
# The extraction loop further down calls `merge_result` from `_NER` instead.
# NOTE(review): if this is ever revived, the two flush paths disagree - inside the
# loop the label is written as 'B-' + dominant_label, whereas the final flush after
# the loop writes the bare dominant_label without the 'B-' prefix.
# NOTE(review): `current = entity` aliases the input dicts, so the merge mutates
# the caller's `entities` items in place.
'''def merge_entities(entities):
    merged_entities = []
    labels = []
    current = None

    for entity in entities:
        if current is None:
            current = entity
            labels.append(entity['entity'][2:])
        else:
            if entity['entity'].startswith('I') and current['entity'].startswith('B'):
                current['word'] += ' ' + entity['word']
                current['end'] = entity['end']
                current['score'] = min(current['score'], entity['score'])
                labels.append(entity['entity'][2:])
            else:
                dominant_label = Counter(labels).most_common(1)[0][0]
                current['entity'] = 'B-' + dominant_label
                merged_entities.append(current)
                labels = [entity['entity'][2:]]
                current = entity

    if current is not None:
        dominant_label = Counter(labels).most_common(1)[0][0]
        current['entity'] = dominant_label
        merged_entities.append(current)
    
    return merged_entities'''

# NOTE: with current process, entities' start and end indices are reset for each new paragraph
# (each paragraph is sent to the pipeline on its own, so offsets are paragraph-relative).
import importlib
import _NER
# Reload so that edits to _NER.py are picked up without restarting the kernel.
importlib.reload(_NER)
from _NER import merge_result

entities = []
for paragraph in text:
    print(paragraph)

    # Entities: raw pipeline predictions first, then the project's merge step.
    entity_result = ner_pipeline(paragraph)
    merged_result = merge_result(
        entity_result,
        "dslim/distilbert-NER",
    )
    # Echo each merged entity under its paragraph and collect it across paragraphs.
    for eR in merged_result:
        print(eR)
        entities.append(eR)

According to DESNZ: “GBN will drive the rapid expansion of new nuclear power plants in the UK at an unprecedented scale and pace.
{'entity': 'B-ORG', 'score': 0.9915059, 'index': 3, 'word': 'DESNZ', 'start': 13, 'end': 18}
{'entity': 'B-ORG', 'score': 0.73248434, 'index': 8, 'word': 'GBN', 'start': 21, 'end': 24}
{'entity': 'B-LOC', 'score': 0.9970687, 'index': 22, 'word': 'UK', 'start': 91, 'end': 93}
“This will boost UK energy security, reduce dependence on volatile fossil fuel imports, create more affordable power and grow the economy, with the nuclear industry estimated to generate around £6bn for the UK economy.”
{'entity': 'B-LOC', 'score': 0.99796593, 'index': 5, 'word': 'UK', 'start': 17, 'end': 19}
{'entity': 'B-LOC', 'score': 0.9962745, 'index': 40, 'word': 'UK', 'start': 207, 'end': 209}
DESNZ added that GBN will play a key role in helping the government hit its ambition to provide up to a quarter of the UK’s electricity from homegrown nuclear energy by 2050.
{'entity': 'B-O

In [5]:
# RE: REBEL-large
from transformers import pipeline

# REBEL is run as a sequence-to-sequence generator; the checkpoint id is also
# passed as the tokenizer id.
model = 'Babelscape/rebel-large'
triplet_extractor = pipeline(
    task='text2text-generation',
    model=model,
    tokenizer=model,
)

# CODE FROM DOCUMENTATION
def extract_triplets(text):
    """Parse a linearised triplet string into a list of relation dicts.

    The input is scanned token by token (whitespace-separated) after the
    ``<s>``, ``<pad>`` and ``</s>`` markers have been removed. Three marker
    tokens switch which field the following words belong to:

    * ``<triplet>`` - words that follow form the head,
    * ``<subj>``    - words that follow form the tail,
    * ``<obj>``     - words that follow form the relation type.

    A triplet is emitted whenever a ``<triplet>`` or ``<subj>`` marker is met
    while a relation is pending, and once more at the end of the input if
    head, relation and tail are all non-empty. Words seen before the first
    marker are ignored.

    Args:
        text: Decoded model output, special tokens included.

    Returns:
        A list of ``{'head': str, 'type': str, 'tail': str}`` dicts, in the
        order they were completed.
    """
    HEAD, REL, TAIL = 'head', 'type', 'tail'
    words = {HEAD: [], REL: [], TAIL: []}
    found = []

    def emit():
        # Snapshot the current buffers as one triplet (key order: head, type, tail).
        found.append({field: ' '.join(words[field]) for field in (HEAD, REL, TAIL)})

    cleaned = text.strip()
    for special in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(special, "")

    target = None  # field currently being filled; None until the first marker
    for token in cleaned.split():
        if token == "<triplet>":
            # A new head starts: flush the pending triplet, keep the old tail buffer.
            if words[REL]:
                emit()
            words[REL] = []
            words[HEAD] = []
            target = HEAD
        elif token == "<subj>":
            # Another tail for the same head: flush, but the relation is only
            # cleared when the next <obj> arrives.
            if words[REL]:
                emit()
            words[TAIL] = []
            target = TAIL
        elif token == "<obj>":
            words[REL] = []
            target = REL
        elif target is not None:
            words[target].append(token)

    # Flush the trailing triplet only when it is complete.
    if words[HEAD] and words[REL] and words[TAIL]:
        emit()
    return found

relations = []
for paragraph in text:
    print(paragraph)

    # Relations
    # Ask the pipeline for token ids rather than text, then decode them through the
    # tokenizer's batch_decode so the <triplet>/<subj>/<obj> markers that
    # extract_triplets parses are present in the string.
    generated = triplet_extractor(paragraph, return_tensors=True, return_text=False)
    generated_ids = generated[0]["generated_token_ids"]
    extracted_text = triplet_extractor.tokenizer.batch_decode([generated_ids])

    relation_result = extract_triplets(extracted_text[0])
    # Echo each triplet under its paragraph and collect it across paragraphs.
    for rR in relation_result:
        print(rR)
        relations.append(rR)

According to DESNZ: “GBN will drive the rapid expansion of new nuclear power plants in the UK at an unprecedented scale and pace.
{'head': 'DESNZ', 'type': 'country', 'tail': 'UK'}
“This will boost UK energy security, reduce dependence on volatile fossil fuel imports, create more affordable power and grow the economy, with the nuclear industry estimated to generate around £6bn for the UK economy.”
{'head': 'nuclear industry', 'type': 'country', 'tail': 'UK'}
DESNZ added that GBN will play a key role in helping the government hit its ambition to provide up to a quarter of the UK’s electricity from homegrown nuclear energy by 2050.
{'head': 'nuclear energy by 2050', 'type': 'point in time', 'tail': '2050'}
GBN’s official launch is supported by a competition for organisations to bid for funding support for the development of their nuclear products, including small modular reactors (SMRs).
{'head': 'small modular reactor', 'type': 'subclass of', 'tail': 'nuclear products'}
Current SMR proj

In [6]:
# RE: KnowGL
from transformers import AutoTokenizer, pipeline

# The checkpoint id is also passed as the tokenizer id.
model = 'ibm/knowgl-large'
classifier = pipeline(
    task='text2text-generation',
    model=model,
    tokenizer=model,
)

# Each result is a dict with a 'generated_text' string; in the recorded output it
# looks like '[(mention#label#type)|relation|(mention#label#type)]' and is stored
# unparsed.
relations = []
for paragraph in text:
    print(paragraph)

    # Relations
    relation_result = classifier(paragraph)
    for rR in relation_result:
        print(rR)
    relations.extend(relation_result)


According to DESNZ: “GBN will drive the rapid expansion of new nuclear power plants in the UK at an unprecedented scale and pace.
{'generated_text': '[(New Zealand#New Zealand#sovereign state)|member of|(Commonwealth#Commonwealth of Nations#intergovernmental organization)]'}
“This will boost UK energy security, reduce dependence on volatile fossil fuel imports, create more affordable power and grow the economy, with the nuclear industry estimated to generate around £6bn for the UK economy.”
{'generated_text': '[economy#Economy of the United Kingdom#national economy)|location|(UK#United Kingdom#sovereign state)]'}
DESNZ added that GBN will play a key role in helping the government hit its ambition to provide up to a quarter of the UK’s electricity from homegrown nuclear energy by 2050.
{'generated_text': '[(UK#United Kingdom#sovereign state)|executive body|(government#Government of the United Kingdom#government)]'}
GBN’s official launch is supported by a competition for organisations to

In [7]:
# RE: yseop/distilbert-base-financial-relation-extraction
# Run as a text classifier over each whole paragraph; in the recorded output every
# result is a {'label': ..., 'score': ...} dict.
model = 'yseop/distilbert-base-financial-relation-extraction'
classifier = pipeline(
    task='text-classification',
    model=model,
    tokenizer=model,
)

relations = []
for paragraph in text:
    print(paragraph)

    # Relations
    relation_result = classifier(paragraph)
    for rR in relation_result:
        print(rR)
    relations.extend(relation_result)



According to DESNZ: “GBN will drive the rapid expansion of new nuclear power plants in the UK at an unprecedented scale and pace.
{'label': 'are', 'score': 0.9930928945541382}
“This will boost UK energy security, reduce dependence on volatile fossil fuel imports, create more affordable power and grow the economy, with the nuclear industry estimated to generate around £6bn for the UK economy.”
{'label': 'x', 'score': 0.9958141446113586}
DESNZ added that GBN will play a key role in helping the government hit its ambition to provide up to a quarter of the UK’s electricity from homegrown nuclear energy by 2050.
{'label': 'x', 'score': 0.46124711632728577}
GBN’s official launch is supported by a competition for organisations to bid for funding support for the development of their nuclear products, including small modular reactors (SMRs).
{'label': 'is in', 'score': 0.7382863759994507}
Current SMR projects include Rolls-Royce SMR proposals looking at constructing reactors in Oldbury and Berk