In [10]:
import pandas as pd
import random
import sys
from transformers import AutoTokenizer, pipeline
from collections import Counter
import time

# Load the scraped article dump from its path relative to this notebook.
articles = pd.read_json('../scraping/articles/all_articles.json')

# Sample a slice (positions 1..7) of the 'text' entry labelled 666.
# NOTE(review): presumably each 'text' entry is a list of paragraph strings — confirm against the scraper.
text = articles['text'][666][1:8]

# Show the sampled items one per line.
print(*text, sep='\n')

The US Department of Energy, he said, is studying ways to bail out nuclear (and coal) facilities, including potentially by mandating grid operators to purchase power from them.
Supporters say it’s a sensible move because the nation’s power grid cannot rely solely on natural gas, wind, and solar. Opponents of the nuclear industry say nuclear power that was once advertised as being “too cheap to meter” has become too costly for electric utilities to buy. 
The US has the largest number of nuclear plants in the world – 99 in commercial operation providing 20% of its electricity generation – but its industry leadership is declining as efforts to build a new generation of reactors have been plagued by problems, and ageing plants have been retired or closed in the face of economic, market, and financial pressures.
The situation, exacerbated by robust competition in the new-build sector from China and Russia, has seen the nuclear industry and its supporters call on the government to enact legi

In [11]:
# Hand-written sample paragraph, wrapped in a one-element list so the
# `for paragraph in text` loops in the following cells can iterate over it.
text = [
    """
Citing high fuel prices, United Airlines said Friday it has increased fares by $6 per round trip on flights to some cities also served by lower-cost carriers. American Airlines, a unit of AMR, immediately matched the move, spokesman Tim Wagner said. United, a unit of UAL, said the increase took effect Thursday night and applies to most routes where it competes against discount carriers, such as Chicago to Dallas and Atlanta and Denver to San Francisco, Los Angeles and New York.
"""
]

In [12]:
# For entity extraction
# The timer is started before the tokenizer/model are loaded, so the time
# printed at the end covers loading as well as inference.
start = time.time()
model = "dslim/distilbert-NER"
tokenizer = AutoTokenizer.from_pretrained(model)
ner_pipeline = pipeline('ner', model=model, tokenizer=tokenizer)

# NOTE: with current process, entities' start and end indices are reset for each new paragraph
# Re-import the local _NER helper module so edits made to it on disk are
# picked up without restarting the kernel.
import importlib
import _NER
importlib.reload(_NER)
from _NER import merge_result

# Flat list of every entity dict produced across all paragraphs.
entities = []

for paragraph in text:
    print(paragraph)

    # Entities
    entity_result = ner_pipeline(paragraph)
    # Pass the same `model` id the pipeline was built with instead of repeating
    # the literal, so changing the model above cannot leave a stale id here.
    # NOTE(review): merge_result is defined in _NER (not shown); its exact
    # post-processing should be confirmed there.
    merged_result = merge_result(entity_result, model)
    for eR in merged_result:
        print(eR)
        entities.append(eR)

print(f'Time: {(time.time()-start)/60:>2f} minutes')


Citing high fuel prices, United Airlines said Friday it has increased fares by $6 per round trip on flights to some cities also served by lower-cost carriers. American Airlines, a unit of AMR, immediately matched the move, spokesman Tim Wagner said. United, a unit of UAL, said the increase took effect Thursday night and applies to most routes where it competes against discount carriers, such as Chicago to Dallas and Atlanta and Denver to San Francisco, Los Angeles and New York.

{'entity': 'B-ORG', 'score': 0.9983473, 'index': 7, 'word': 'United', 'start': 26, 'end': 32}
{'entity': 'I-ORG', 'score': 0.9981199, 'index': 8, 'word': 'Airlines', 'start': 33, 'end': 41}
{'entity': 'B-ORG', 'score': 0.9987262, 'index': 35, 'word': 'American', 'start': 160, 'end': 168}
{'entity': 'I-ORG', 'score': 0.99801755, 'index': 36, 'word': 'Airlines', 'start': 169, 'end': 177}
{'entity': 'B-ORG', 'score': 0.9986008, 'index': 41, 'word': 'AMR', 'start': 189, 'end': 192}
{'entity': 'B-PER', 'score': 0.9

In [13]:
# Relation extraction (RE) with REBEL-large
from transformers import pipeline

# Timer starts before the pipeline is built, so the elapsed time reported
# at the end of the cell includes loading the model.
start = time.time()

# The same hub id is passed as both the model and the tokenizer.
model = 'Babelscape/rebel-large'
triplet_extractor = pipeline(
    'text2text-generation',
    model=model,
    tokenizer=model,
)

# CODE FROM DOCUMENTATION
def extract_triplets(text):
    """Parse a linearized triplet string into a list of relation dicts.

    The input is scanned token by token (whitespace-separated) after the
    special tokens ``<s>``, ``<pad>`` and ``</s>`` have been removed:

    * ``<triplet>`` starts a new head entity,
    * ``<subj>`` starts a tail entity,
    * ``<obj>`` starts the relation label.

    Tokens seen before the first marker are ignored.

    Args:
        text: Decoded model output containing the markers above.

    Returns:
        A list of ``{'head': ..., 'type': ..., 'tail': ...}`` dicts, in the
        order the triplets are completed.
    """
    triplets = []
    head, rel, tail = '', '', ''
    state = 'x'  # 'x' = no marker seen yet; 't'/'s'/'o' = filling head/tail/relation

    def record():
        # Snapshot the buffers as they stand at the moment of the call.
        triplets.append({'head': head.strip(), 'type': rel.strip(),'tail': tail.strip()})

    cleaned = text.strip()
    for special in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(special, "")

    for tok in cleaned.split():
        if tok == "<triplet>":
            state = 't'
            # A pending relation means the previous triplet is complete.
            if rel != '':
                record()
                rel = ''
            head = ''
        elif tok == "<subj>":
            state = 's'
            # Same head, new tail: flush the triplet collected so far.
            if rel != '':
                record()
            tail = ''
        elif tok == "<obj>":
            state = 'o'
            rel = ''
        elif state == 't':
            head += ' ' + tok
        elif state == 's':
            tail += ' ' + tok
        elif state == 'o':
            rel += ' ' + tok

    # Flush the trailing triplet only if all three parts were filled.
    if head != '' and rel != '' and tail != '':
        record()
    return triplets

# Collect the triplets extracted from every paragraph.
relations = []
for paragraph in text:
    print(paragraph)

    # Relations
    # Ask the pipeline for raw token ids (not text) and decode them with the
    # pipeline's own tokenizer before parsing the markers.
    generated = triplet_extractor(
        paragraph,
        return_tensors=True,
        return_text=False,
    )
    extracted_text = triplet_extractor.tokenizer.batch_decode(
        [generated[0]["generated_token_ids"]]
    )
    relation_result = extract_triplets(extracted_text[0])
    for rR in relation_result:
        print(rR)
    relations.extend(relation_result)

elapsed_minutes = (time.time() - start) / 60
print(f'Time: {elapsed_minutes:>2f} minutes')


Citing high fuel prices, United Airlines said Friday it has increased fares by $6 per round trip on flights to some cities also served by lower-cost carriers. American Airlines, a unit of AMR, immediately matched the move, spokesman Tim Wagner said. United, a unit of UAL, said the increase took effect Thursday night and applies to most routes where it competes against discount carriers, such as Chicago to Dallas and Atlanta and Denver to San Francisco, Los Angeles and New York.

{'head': 'American Airlines', 'type': 'parent organization', 'tail': 'AMR'}
{'head': 'AMR', 'type': 'subsidiary', 'tail': 'American Airlines'}
Time: 0.130373 minutes


In [14]:
# Relation extraction (RE) with mREBEL Large
from transformers import pipeline

# Timer starts before the pipeline is built, so the elapsed time reported
# at the end of the cell includes loading the model.
start = time.time()

# The same hub id is passed as both the model and the tokenizer.
triplet_extractor = pipeline(
    'translation_xx_to_yy',
    model='Babelscape/mrebel-large',
    tokenizer='Babelscape/mrebel-large',
)

# Function to parse the generated text and extract the triplets
def extract_triplets_typed(text):
    """Parse a linearized, entity-typed triplet string into relation dicts.

    The special tokens ``<s>``, ``<pad>``, ``</s>``, ``tp_XX`` and ``__en__``
    are removed, then the text is scanned token by token:

    * ``<triplet>`` or ``<relation>`` starts a new head entity;
    * any other ``<...>`` token is an entity-type marker. Seen while filling
      the head or a relation, it sets the head type and starts the tail;
      otherwise it sets the tail type and starts the relation label.

    Args:
        text: Decoded model output containing the markers above.

    Returns:
        A list of dicts with keys ``head``, ``head_type``, ``type``, ``tail``
        and ``tail_type``, in the order the triplets are completed.
    """
    triplets = []
    head, rel, tail = '', '', ''
    head_type, tail_type = '', ''
    state = 'x'  # 'x' = no marker seen yet; 't'/'s'/'o' = filling head/tail/relation

    def record():
        # Snapshot the buffers as they stand at the moment of the call.
        triplets.append({'head': head.strip(), 'head_type': head_type, 'type': rel.strip(),'tail': tail.strip(), 'tail_type': tail_type})

    cleaned = text.strip()
    for special in ("<s>", "<pad>", "</s>", "tp_XX", "__en__"):
        cleaned = cleaned.replace(special, "")

    for tok in cleaned.split():
        if tok in ("<triplet>", "<relation>"):
            state = 't'
            # A pending relation means the previous triplet is complete.
            if rel != '':
                record()
                rel = ''
            head = ''
        elif tok.startswith("<") and tok.endswith(">"):
            if state in ('t', 'o'):
                # Type marker closing the head (or a relation): the tail follows.
                state = 's'
                if rel != '':
                    record()
                tail = ''
                head_type = tok[1:-1]
            else:
                # Type marker closing the tail: the relation label follows.
                state = 'o'
                tail_type = tok[1:-1]
                rel = ''
        elif state == 't':
            head += ' ' + tok
        elif state == 's':
            tail += ' ' + tok
        elif state == 'o':
            rel += ' ' + tok

    # Flush the trailing triplet only if every field was filled.
    if head != '' and rel != '' and tail != '' and tail_type != '' and head_type != '':
        record()
    return triplets

# Start from an empty list, as the other relation-extraction cells do, so the
# typed triplets produced here are not appended to results left in `relations`
# by a previously run cell, and so this cell also runs on a fresh kernel.
relations = []
for paragraph in text:
    print(paragraph)
    # Ask the pipeline for raw token ids (not text) and decode them with the
    # pipeline's own tokenizer before parsing the markers.
    # NOTE(review): 250058 is presumably the id of the model's `tp_XX` start
    # token — confirm against the mREBEL model card.
    extracted_text = triplet_extractor.tokenizer.batch_decode([triplet_extractor(
        paragraph, 
        decoder_start_token_id=250058, 
        src_lang="en_XX", tgt_lang="<triplet>",
        return_tensors=True, return_text=False)[0]["translation_token_ids"]]) # change en_XX for the language of the source.
    relation_result = extract_triplets_typed(extracted_text[0])
    for rR in relation_result:
        print(rR)
        relations.append(rR)

print(f'Time: {(time.time()-start)/60:>2f} minutes')


Citing high fuel prices, United Airlines said Friday it has increased fares by $6 per round trip on flights to some cities also served by lower-cost carriers. American Airlines, a unit of AMR, immediately matched the move, spokesman Tim Wagner said. United, a unit of UAL, said the increase took effect Thursday night and applies to most routes where it competes against discount carriers, such as Chicago to Dallas and Atlanta and Denver to San Francisco, Los Angeles and New York.

{'head': 'American Airlines', 'head_type': 'org', 'type': 'business division', 'tail': 'AMR', 'tail_type': 'org'}
Time: 0.252326 minutes


In [15]:
# Relation extraction (RE) with KnowGL
from transformers import AutoTokenizer, pipeline

# Timer starts before the pipeline is built, so the reported time includes
# loading the model.
start = time.time()

# The same hub id is passed as both the model and the tokenizer.
model = 'ibm/knowgl-large'
classifier = pipeline(
    'text2text-generation',
    model=model,
    tokenizer=model,
)

# Raw pipeline outputs are stored as returned, one entry per generated result.
relations = []
for paragraph in text:
    print(paragraph)

    # Relations
    relation_result = classifier(paragraph)
    for rR in relation_result:
        print(rR)
        relations.append(rR)

elapsed_minutes = (time.time() - start) / 60
print(f'Time: {elapsed_minutes:>2f} minutes')



Citing high fuel prices, United Airlines said Friday it has increased fares by $6 per round trip on flights to some cities also served by lower-cost carriers. American Airlines, a unit of AMR, immediately matched the move, spokesman Tim Wagner said. United, a unit of UAL, said the increase took effect Thursday night and applies to most routes where it competes against discount carriers, such as Chicago to Dallas and Atlanta and Denver to San Francisco, Los Angeles and New York.

{'generated_text': '[(American Airlines#American Airlines#business)|parent organization|(AMR#American Airlines#business)]'}
Time: 0.133522 minutes


In [16]:
# Relation extraction (RE) with yseop/distilbert-base-financial-relation-extraction
# Timer starts before the pipeline is built, so the reported time includes
# loading the model.
start = time.time()

# The same hub id is passed as both the model and the tokenizer.
model = 'yseop/distilbert-base-financial-relation-extraction'
classifier = pipeline(
    'text-classification',
    model=model,
    tokenizer=model,
)

# Raw pipeline outputs are stored as returned, one entry per result.
relations = []
for paragraph in text:
    print(paragraph)

    # Relations
    relation_result = classifier(paragraph)
    for rR in relation_result:
        print(rR)
        relations.append(rR)

elapsed_minutes = (time.time() - start) / 60
print(f'Time: {elapsed_minutes:>2f} minutes')


Citing high fuel prices, United Airlines said Friday it has increased fares by $6 per round trip on flights to some cities also served by lower-cost carriers. American Airlines, a unit of AMR, immediately matched the move, spokesman Tim Wagner said. United, a unit of UAL, said the increase took effect Thursday night and applies to most routes where it competes against discount carriers, such as Chicago to Dallas and Atlanta and Denver to San Francisco, Los Angeles and New York.

{'label': 'is in', 'score': 0.9808366298675537}
Time: 0.013802 minutes
