In [None]:
# Libraries 

import http
import json
import os
import pickle
import re
import subprocess

import numpy as np
import pandas as pd
import requests
import spacy
from nltk.tokenize import sent_tokenize
from pymongo import MongoClient
from tqdm.notebook import tqdm

# Libraries 

# Starting MongoDB
# The sudo password is read from the SUDO_PASSWORD environment variable so it is
# never saved inside the notebook; when unset it falls back to the empty string.
password = os.environ.get('SUDO_PASSWORD', '') # Your System Password
mongod_restart_command = "sudo -S systemctl restart mongod"
# `sudo -S` reads the password from stdin. Passing it as input (instead of
# interpolating it into a shell string) keeps shell metacharacters in the
# password from being interpreted. The exit status is ignored.
subprocess.run(mongod_restart_command.split(), input=password + '\n', text=True, check=False)

In [None]:
# Models 

# Load the spaCy pipeline ("en_core_web_trf") that the cells below call to parse sentences
spacy_nlp = spacy.load("en_core_web_trf")

In [None]:
# Functions 

def query_id(col, id_object):
    """
    Fetches one document by its ``_id`` and returns its fields under the key
    names used by the rest of the notebook.

    :param col: collection object exposing ``find_one(filter)``
    :param id_object: value matched against the document's ``_id`` field
    :return: dict with keys ``_id``, ``SOURCEURL``, ``category``, ``Title`` and ``Text``
    :raises IndexError: if no document has the requested ``_id``
    :raises KeyError: if the matched document lacks one of the expected fields
    """

    # find_one returns the first match (or None) without building a full result list
    raw_result = col.find_one({"_id": id_object})
    if raw_result is None:
        # Same exception type the previous `list(...)[0]` lookup raised, now with context
        raise IndexError("no document found with _id %r" % (id_object,))

    return {
        '_id': raw_result['_id'],
        'SOURCEURL': raw_result['url'],
        'category': raw_result['category'],
        'Title': raw_result['title'],
        'Text': raw_result['text'],
    }

# Splitting functions  

def paragraph_split(text):
    """
    Breaks text into paragraphs and normalises the whitespace inside each one.

    A paragraph boundary is a newline followed by a space, or two consecutive
    newlines. Pieces that are empty after stripping are dropped.

    :param text: str, raw text
    :return: list[str], stripped paragraphs with runs of spaces collapsed and
        every " \\n" replaced by a single space
    """

    paragraphs = []
    for chunk in re.split("\n |\n\n", text):

        stripped = chunk.strip()
        if stripped == '':
            continue

        # Collapse repeated spaces first, then fold "space + newline" into one space
        single_spaced = re.sub(' +', ' ', stripped)
        paragraphs.append(re.sub(' \n', ' ', single_spaced))

    return paragraphs

def sentence_split(text):
    """
    Splits text into sentence fragments.

    Non-breaking spaces are replaced by plain spaces, the text is cut into
    paragraphs with ``paragraph_split``, each paragraph is tokenized with
    ``sent_tokenize``, and every resulting sentence is finally split on "; ".
    Paragraphs and sentences with fewer than two space-separated words are
    discarded before the "; " split.

    :param text: str, raw text
    :return: list[str], sentence fragments in document order
    """

    normalized = text.replace('\xa0', ' ')

    fragments = []
    for paragraph in paragraph_split(normalized):

        # Skip single-word paragraphs
        if len(paragraph.split(' ')) < 2:
            continue

        for candidate in sent_tokenize(paragraph):

            # Skip single-word sentences, then break the rest on "; "
            if len(candidate.split(' ')) >= 2:
                fragments.extend(candidate.split('; '))

    return fragments

def get_company_contract_sentences(company_name, company_ids):
    """
    Collects the sentences of every contract that belongs to one company.

    Each contract is looked up with this notebook's ``query_id`` in the
    module-level collection ``col``; contracts whose ``Text`` is empty or
    ``None`` are skipped.

    :param company_name: key into ``company_ids``
    :param company_ids: mapping from company name to an iterable of contract ids
    :return: list[str], sentences from ``sentence_split`` over all contracts, in order
    :raises KeyError: if ``company_name`` is not in ``company_ids``
    """

    contract_ids = company_ids[company_name]
    sentences = []
    for contract_id in contract_ids:

        # `nlp_utils` is never imported in this notebook; query_id is defined here
        contract_text = query_id(col, contract_id)['Text']
        if contract_text:
            sentences.extend(sentence_split(contract_text))

    return sentences

# Triple extraction functions 

def is_valid_sentence(span):
    """
    Tells whether a span contains both a subject and a verb.

    A token counts as a subject when its dependency label contains "subj",
    and as a verb when its part-of-speech tag is "AUX" or "VERB".

    :param span: iterable of tokens exposing ``dep_`` and ``pos_``
    :return: bool, True only if at least one subject and one verb are present
    """

    if not any('subj' in token.dep_ for token in span):
        return False

    return any(token.pos_ in ("AUX", "VERB") for token in span)

def is_full_sentence(span):
    """
    Tells whether a span looks like a complete subject - root - object clause.

    The span must contain a subject (dependency label containing "subj"), a
    verb (part of speech "AUX" or "VERB"), an object (dependency label
    containing "obj") and a ROOT token. In addition, at least one subject must
    come before the root and at least one object after it, compared by token
    index ``i``. If several tokens are labelled ROOT, the last one is used.

    :param span: iterable of tokens exposing ``i``, ``dep_`` and ``pos_``
    :return: bool, True if all of the conditions above hold
    """

    subject_positions = []
    object_positions = []
    verb_seen = False
    root_position = None

    for token in span:

        label = token.dep_

        if 'subj' in label:
            subject_positions.append(token.i)

        if token.pos_ in ("AUX", "VERB"):
            verb_seen = True

        if 'obj' in label:
            object_positions.append(token.i)

        if label == 'ROOT':
            root_position = token.i

    # A root at index 0 is treated like a missing root here; the outcome is the
    # same because no non-negative subject index could precede it.
    if not (subject_positions and verb_seen and object_positions and root_position):
        return False

    subject_precedes_root = any(position < root_position for position in subject_positions)
    object_follows_root = any(position > root_position for position in object_positions)

    return subject_precedes_root and object_follows_root

def split_compound_sentences(doc): 
    
    """
    Splits the sentences of a parsed document into clause-level spans and keeps
    the ones that pass ``is_full_sentence``.

    Each sentence is cut at tokens whose dependency label is 'cc', 'mark' or
    'punct', provided the token's head is the ROOT or a verb labelled 'conj' or
    'advcl'. The slices on either side of such a token are collected when
    ``is_valid_sentence`` accepts them.

    NOTE(review): ``clean_list`` and ``clean_sentence`` are not defined in the
    visible part of this notebook; they must be in scope when this runs.

    :param doc: parsed document exposing ``.sents`` and slicing by token index
        (the caller passes the result of ``spacy_nlp(sentence)``)
    :return: list of cleaned clauses that pass ``is_full_sentence``, or
        ``[doc]`` when no clause passes
    """
    
    sentences = []
    # Iterate through tokens in the parsed sentence
    for sent in doc.sents:
        
        # Index where the next left-hand clause begins
        start = sent.start
        for token in sent:
            
            # Check for conjunctions and punctuation that typically link clauses
            if token.dep_ in ['cc', 'mark'] or (token.dep_ == 'punct'):
                
                # Check if the head of the token is the root, or a verb attached as conj/advcl
                if token.head.dep_ == "ROOT" or (token.head.pos_=='VERB' and token.head.dep_ in ['conj', 'advcl']):
                    
                    # The left slice stops before the splitting token; the right
                    # slice starts at the splitting token itself
                    left_sentence = doc[start:token.i]
                    right_sentence = doc[token.i:sent.end]

                    # Check if the clause before the conjunction/comma forms a valid sentence
                    if is_valid_sentence(left_sentence):
                        
                        if left_sentence not in sentences:
                            sentences.append(left_sentence)                        
                            # Later left-hand clauses start after this splitting token
                            start = token.i + 1
                            
                    # Check the remainder of the sentence in the same way
                    if is_valid_sentence(right_sentence):
                        
                        if right_sentence not in sentences:
                            sentences.append(right_sentence)                        
                            start = token.i + 1
                            
    deduplicated_sentences = clean_list(sentences)
    cleaned_sentences = [clean_sentence(sentence) for sentence in deduplicated_sentences]

    full_sentences = [sentence for sentence in cleaned_sentences if is_full_sentence(sentence)]
    # Fall back to the whole input when no clause qualifies
    if full_sentences == []:
        full_sentences = [doc]
    
    return full_sentences

def extract_expanded_subject(doc):
    """
    Returns every token that belongs to a subject attached directly to the ROOT.

    A subject is a token whose dependency label contains "subj" and whose head
    is labelled ROOT; each subject contributes its whole subtree.

    :param doc: iterable of tokens exposing ``dep_``, ``head``, ``subtree`` and ``i``
    :return: list of tokens sorted by index ``i``, or None if there is no such subject
    """

    root_subjects = [token for token in doc if 'subj' in token.dep_ and token.head.dep_ == 'ROOT']
    if not root_subjects:
        return None

    collected = []
    for subject in root_subjects:
        collected.extend(subject.subtree)

    collected.sort(key=lambda token: token.i)

    return collected

def extract_expanded_verb(doc):
    """
    Returns the ROOT token together with the verb-like tokens adjacent to it.

    Starting at the ROOT and walking outwards (first leftwards, then
    rightwards) through the ROOT's subtree, a token is absorbed while its head
    is already part of the group and its dependency label contains one of the
    accepted substrings. The walk in a direction stops at the first token that
    does not qualify.

    :param doc: iterable of tokens exposing ``dep_``, ``head``, ``subtree`` and ``i``
    :return: list of tokens sorted by index ``i``, or None unless ``doc`` has
        exactly one ROOT token
    """

    roots = [token for token in doc if token.dep_ == 'ROOT']
    if len(roots) != 1:
        return None

    root = roots[0]
    subtree = list(root.subtree)
    accepted_deps = ['agent','neg', 'aux', 'ccomp', 'xcomp', 'acomp', 'oprd', 'cc', 'conj', 'adv', 'prt']

    # Indices of tokens already in the group; shared by both directions
    group_ids = [root.i]

    def absorb(ordered_tokens):
        """Takes tokens in the given order until one does not attach to the group."""
        taken = []
        for token in ordered_tokens:
            attached = token.head.i in group_ids
            if not (attached and any(dep in token.dep_ for dep in accepted_deps)):
                break
            taken.append(token)
            group_ids.append(token.i)
        return taken

    before_root = sorted((token for token in subtree if token.i < root.i), key=lambda token: token.i)
    after_root = sorted((token for token in subtree if token.i > root.i), key=lambda token: token.i)

    # Left side is walked from the token nearest the root outwards
    left_part = absorb(reversed(before_root))
    right_part = absorb(after_root)

    return sorted(left_part + [root] + right_part, key=lambda token: token.i)

def extract_expanded_object(doc, root):
    """
    Collects the phrases that hang off the expanded verb, in three groups.

    For each group, every token in ``doc`` whose dependency label contains one
    of the group's substrings and whose head is one of the ``root`` tokens
    contributes its whole subtree. The collected tokens are sorted by index
    and de-duplicated with ``np.unique``.

    :param doc: iterable of tokens exposing ``dep_``, ``head``, ``subtree`` and ``i``
    :param root: iterable of tokens forming the expanded verb
    :return: tuple ``(prep_tokens, obj_tokens, advcl_tokens)``, in that order;
        each element is a list of tokens, or None when the group is empty
    """

    def gather(dep_substrings):
        """Returns the unique subtree tokens for one group of dependency labels."""
        anchors = [
            token for token in doc
            if any([dep in token.dep_ for dep in dep_substrings])
            and any(token.head.i == member.i for member in root)
        ]

        expanded = []
        for anchor in anchors:
            expanded.extend(anchor.subtree)

        if not expanded:
            return None

        ordered = sorted(expanded, key=lambda token: token.i)
        return list(np.unique(ordered))

    prepositional = gather(['prep'])
    direct = gather(['obj', 'obl', 'ccomp', 'acomp', 'attr', 'xcomp'])
    adverbial = gather(['advcl'])

    return prepositional, direct, adverbial

def format_triples(all_triples):
    """
    Flattens grouped triples into plain ``[subject, relation, object]`` strings.

    :param all_triples: iterable of groups, each an iterable of dicts whose
        'subj', 'rel' and 'obj' values are iterables of tokens exposing ``text``
    :return: list of three-element lists; each string is the token texts of one
        part joined with single spaces
    """

    rows = []
    for group in tqdm(all_triples):

        rows.extend(
            [' '.join(token.text for token in triple[part]) for part in ('subj', 'rel', 'obj')]
            for triple in group
        )

    return rows

In [None]:
# Get text from DB 
 
# DB 

# Connect to the MongoDB server on localhost, port 27017
client = MongoClient("mongodb://localhost:" + "27017" + "/")

# Access Database
db = client['NLP701 Project']

# Access Collection in Database
col = db['GDELT_ARTICLES']

# Every distinct _id in the collection; one query_id lookup is made per id below
doc_ids = col.distinct('_id')

# Load every article as a query_id dict, keyed by a running integer index
# (not by its Mongo _id)
articles = {}
article_ind = 0
for _id in tqdm(doc_ids): 
    
    datapoint = query_id(col, _id)
    articles[article_ind] = datapoint
    
    article_ind += 1

In [None]:
# Extract subject / relation / object triples for every article.
# `triples` maps each article index from `articles` to a list holding, for
# every sentence that produced at least one triple, that sentence's list of
# triple dicts. The next cell iterates over `triples.items()`.
triples = {}
for index, article in tqdm(articles.items()):

    # `articles` stores the dicts built by query_id, so the body is under 'Text'
    article_text = article['Text']
    if not article_text:
        # Nothing to tokenize; keep the key so every article index is present
        triples[index] = []
        continue

    all_triples = []
    for sentence in tqdm(sent_tokenize(article_text)):

        parsed_sentence = spacy_nlp(sentence)

        simplified_sentences = split_compound_sentences(parsed_sentence)

        # Per-sentence list; it has its own name so it cannot shadow the
        # `triples` dict that collects the results for all articles
        sentence_triples = []
        for clause in simplified_sentences:

            # Re-parse the cleaned clause on its own before extracting its parts.
            # NOTE(review): capitalize_first is not defined in the visible part
            # of this notebook; it must be in scope when this cell runs.
            clause_text = capitalize_first(clause.text.strip()) + "."

            doc = spacy_nlp(clause_text)
            subj = extract_expanded_subject(doc)
            rel = extract_expanded_verb(doc)

            if subj and rel:

                obj = None
                modifiers = []

                # extract_expanded_object returns (prep, obj, advcl) in that
                # order and uses None for an empty group, so unpack in the same
                # order and test truthiness rather than comparing with []
                prep_tok, obj_tok, advcl_tok = extract_expanded_object(doc, rel)

                if obj_tok:

                    # A real object exists: prepositional and adverbial
                    # phrases become modifiers
                    obj = obj_tok

                    if prep_tok:
                        modifiers.append(prep_tok)

                    if advcl_tok:
                        modifiers.append(advcl_tok)

                elif prep_tok:

                    # No object: the prepositional phrase stands in for it
                    obj = prep_tok

                    if advcl_tok:
                        modifiers.append(advcl_tok)

                if obj:

                    new_triple = {'sentence': doc,
                                  'subj': subj, 'rel': rel, 'obj': obj,
                                  'mods': modifiers}

                    sentence_triples.append(new_triple)

        if sentence_triples:
            all_triples.append(sentence_triples)

    # Keyed by the article index (the name `company` was never defined)
    triples[index] = all_triples

In [None]:
def _token_positions(tokens):
    """Returns the index ``i`` of every token in ``tokens``."""
    return [token.i for token in tokens]


# Replace each token list with a list of token indices; the 'sentence' value is
# passed through unchanged. Results from all articles go into one flat list.
formatted_triples = []
for index, sentences in tqdm(triples.items()):
    for sentence in sentences:
        for triple in sentence:
            formatted_triples.append({
                'sentence': triple['sentence'],
                'subj': _token_positions(triple['subj']),
                'rel': _token_positions(triple['rel']),
                'obj': _token_positions(triple['obj']),
                # Falsy modifiers (None or empty) are dropped
                'mods': [_token_positions(mod) for mod in triple['mods'] if mod],
            })

In [None]:
# Write the formatted triples to disk using the highest available pickle protocol
with open('formatted_triples.pickle', 'wb') as pickle_file:
    pickle.dump(formatted_triples, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)