In [1]:
# Libraries 

import os
import pickle
import pandas as pd
from tqdm.notebook import tqdm
from pymongo import MongoClient

import spacy
from nltk.tokenize import sent_tokenize

import nlp_utils

from tner import TransformersNER

In [4]:
# Load the triples previously extracted from the Saudi news articles.
# NOTE: unpickling can execute arbitrary code, so only load files produced
# by this project's own pipeline.
with open('data/model_extracted_triples.pickle', 'rb') as triples_file:
    saudi_news_triples = pickle.load(triples_file)

In [5]:
# Information extraction functions 

def ner_extractor(sentences, ner_model):
    """Extract named entities from a batch of sentences with a T-NER model.

    Args:
        sentences: List of input strings. Falsy items (``None``, ``''``) are
            dropped before prediction because they cannot contain entities.
        ner_model: Object exposing the ``tner.TransformersNER`` interface:
            ``predict(list_of_str)`` must return a dict whose
            ``'entity_prediction'`` entry holds, for each input, a list of
            dicts with at least ``'type'`` (label) and ``'entity'`` (list of
            tokens making up the mention).

    Returns:
        dict mapping each entity's surface text to its predicted label.
        If the same text is found more than once, the last prediction wins.
        An empty dict is returned when there is nothing to tag.
    """
    if not sentences:
        return {}

    texts = [sentence for sentence in sentences if sentence]
    if not texts:
        # Only empty inputs were given; skip the model call entirely.
        return {}

    # Single batched call instead of one call per sentence.
    prediction = ner_model.predict(texts)

    result = {}
    for sentence_entities in prediction['entity_prediction']:
        for entity in sentence_entities:
            # The mention comes back as a list of tokens; re-join them to
            # recover the surface form used as the dictionary key.
            result[' '.join(entity['entity'])] = entity['type']

    return result

In [None]:
# Information extraction 

# Build the NER model used by the extraction cells below.
# The checkpoint identifier is kept in a named constant so it is easy to swap.
NER_MODEL_NAME = "tner/roberta-large-ontonotes5"
ner_model = TransformersNER(NER_MODEL_NAME)

In [None]:
# Quick sanity check: run the loaded model on a single example string.
text = 'Saudi Arabia’s Public Investment Fund—referred to in economic circles as PIF—'
# predict() is given a one-element list; as the cell's last expression, its
# return value is what the notebook displays.
ner_model.predict([text])

In [None]:
# Tag the subject and object of every extracted triple.
# article_content maps each article link to a list of records, one per
# triple: the source sentence plus the entities found in its subject/object.
article_content = {}
for link, triples in tqdm(saudi_news_triples.items()):
    tagged_triples = []
    for triple in triples:
        # Subject and object are tagged separately so their entities stay
        # attributed to the right side of the triple.
        subj_entities = ner_extractor([triple['subj']], ner_model)
        obj_entities = ner_extractor([triple['obj']], ner_model)
        tagged_triples.append({
            'sentence': triple['sentence'],
            'subj': subj_entities,
            'obj': obj_entities,
        })
    article_content[link] = tagged_triples