# Extracting information - Subject Action Object triples from simple declarative sentences from dependency trees
The dependency trees and the dataframe they use are done in news_dependency_parser.py

In [1]:
import lucem_illud #pip install -U git+git://github.com/Computational-Content-Analysis-2018/lucem_illud.git

#All these packages need to be installed from pip
#For NLP
import nltk

import numpy as np #For arrays
import pandas #Gives us DataFrames
import matplotlib.pyplot as plt #For graphics
import seaborn #Makes the graphics look nicer

#Displays the graphs
import graphviz #You also need to install the command line graphviz

#These are from the standard library
import os.path
import zipfile
import subprocess
import io
import tempfile
import re
import time

import lucem_illud.stanford as stanford # Requires having run lucem_illud.setupStanfordNLP() once

%matplotlib inline

In [2]:
news_df = pandas.concat([pandas.read_pickle('../news_df_norm1.pkl'), pandas.read_pickle('../news_df_norm2.pkl')])

This file sampletrees3k.pkl includes all the dependency trees parsed using the parsing python script in this repo

In [3]:
sampled_df = pandas.read_pickle('sampletrees3k.pkl')#'sampletrees1k.pkl')
sampled_df.dropna(inplace=True)

<br> <br>The following methods produce the extraction: From a simple declarative sentence (tagged S), get the subject and predicate.

In [4]:
def extract_noun(tree):
    """
    Intended to be used as being passed a NP tag as part of a subject or direct object
    component of a sentence. Looks for the most directly related NN (noun) or PRP (pronoun)
    searching recursively, depth-first.
    """
    if not tree.label().startswith('N'):
        return ''
    result = ''
    for child in tree:
        if child.label() == 'NP':
            result += extract_noun(child)
        elif child.label().startswith('NN') or \
            child.label() == 'PRP':
            result += child[0] + ' '
        elif child.label() == ',':
            break
    return result

def extract_verb(tree, lem_verb=True, lemmer=None):
    """
    Intended to receive a VP predicate, looks for the Verb. If a compound verb, returns the
    concatenation of all the verbs. Able to lemmatize verbs using the passed lemmatizer.
    """
    if not tree.label().startswith('V'):
        return ''
    lemmer = nltk.stem.WordNetLemmatizer()
    result = ''
    for child in tree:
        if child.label() == 'VP':
            result += extract_verb(child, lem_verb=stem_verb, lemmer=lemmer)
        elif child.label().startswith('VB'):
            if lem_verb:
                result += lemmer.lemmatize(child[0], pos='v') + ' '
            else result += child[0] + ' '
    return result

def extract_direct_object(tree):
    """
    Extracts the direct object from a simple declarative sentence VP predicate.
    Attempts to even bypass a prepositional predicate to find the noun directly related to 
    the verb.
    """
    #pdb.set_trace()
    if not tree.label().startswith('VP'):
        return ''
    for child in tree:
        if child.label() == 'NP':
            return extract_noun(child)
        elif child.label() == 'PP':
            for subchild in child:
                if subchild.label() == 'NP':
                    return extract_noun(subchild)
    for child in tree:
        if child.label() == 'VP':
            return extract_direct_object(child)
    return ''

def extract_SVO(tree, last_noun = True):
    """
    Extracts an SVO triple from a simple declarative sentence (S) tree.
    last_noun: True if only the last word from the noun and verb is required
    """
    subject = verb = objec = ''
    if not tree.label() == 'S':
        return None
    for child in tree:
        if child.label() == 'NP':
            subject = extract_noun(child)
        elif child.label() == 'VP':
            verb = condense_verbs(extract_verb(child))
            objec = extract_direct_object(child)
    if last_noun:
        subject = subject.strip().split(' ')[-1]
        objec = objec.strip().split(' ')[-1]
    return (subject, verb, objec)


In [5]:
def get_SVOs_in_sentence_tree(tree):
    """
    Locates simple declarative sentences in a tree and returns a list of all the SVO triples
    """
    result = []
    for subt in tree.subtrees():
        if subt.label()=='S':
            result.append(extract_SVO(subt, last_noun=True))
    return result

def condense_verbs(verbs):
    """
    for complex verbs, eliminate the auxiliar verbs
    """
    vblist = verbs.split()
    if len(vblist) < 2:
        return verbs
    aux= ['have', 'has', 'had', 'been', 'are', 'is', 'be', 'am', 'does', 'did', 'was', 'being', 'having']
    return " ".join([vb for vb in vblist if vb not in aux])

In [162]:
lemming = nltk.stem.WordNetLemmatizer()
lemming.lemmatize('hidden', pos='v')

'hide'

<br> <br>
Now in action, we go through the sample and look for SVO triples

In [165]:
small_df = sampled_df
svo_big_list = []
for ix, row in small_df.iterrows():
    for sent_tree in row.parse_sents:
        for svo in get_SVOs_in_sentence_tree(sent_tree[0]):
            if svo[0] != '' : 
                svo_big_list.append({'filename': row['filename'],
                                'country': row['country'],
                                'subject': svo[0],
                                'verb': svo[1],
                                'object': svo[2]})
small_df['svos'] = small_df['parse_sents'].apply(lambda x: [get_SVOs_in_sentence_tree(i[0]) for i in x])

In [2]:
#svo_df.to_pickle('svo_df.pkl')

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordNERTagger, self).__init__(*args, **kwargs)
The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordPOSTagger, self).__init__(*args, **kwargs)


Ensuring all is working as intended

In [105]:
# Look at 14
experiment = small_df.iloc[0].parse_sents[25][0][0][1][2][1][1][2][1]
experiment.pretty_print()
#print(get_SVOs_in_sentence_tree(experiment))

                       S                            
      _________________|_____                        
     |                       VP                     
     |             __________|___                    
     |            |             ADJP                
     |            |     _________|____               
     |            |    |              S             
     |            |    |              |              
     |            |    |              VP            
     |            |    |      ________|___           
     |            |    |     |            VP        
     |            |    |     |    ________|___       
     NP           |    |     |   |            NP    
  ___|_____       |    |     |   |         ___|___   
 DT        NN    VBD   JJ    TO  VB      PRP      ''
 |         |      |    |     |   |        |       |  
the     hospital was unable  to pay      him      ''



In [70]:
experiment[0,0,1,1].pretty_print()

                                                        VP                                                
    ____________________________________________________|____                                              
   |                                                         NP                                           
   |            _____________________________________________|____________________                         
   |           |             |                    |               |              SBAR                     
   |           |             |                    |               |     __________|_____                   
   |           |             |                    |               |    |                S                 
   |           |             |                    |               |    |           _____|_________         
   |           |             |                    |               |    |          |               VP      
   |           |             |   

In [166]:
ix = 0
print(small_df.iloc[ix].text)
print(small_df.iloc[ix].svos)

 Angolans cast their ballots Wednesday in an election marking the end of President Jose Eduardo Dos Santos's 38-year reign, with his MPLA party set to retain power despite an economic crisis.  The MPLA, which has ruled since Angola's independence from Portugal in 1975, is expected to defeat opposition parties, which are stifled by Dos Santos's authoritarian regime.  Dos Santos's unexpected retirement — reportedly prompted by poor health — has triggered the biggest political transition in decades for Angola, a leading oil exporter in Africa.  However, his chosen successor is Defence Minister Joao Lourenco, a loyalist expected to avoid immediate change in a government often criticised for corruption and its failure to tackle dire poverty". My mission will be to revive the economy", Lourenco told reporters in the capital Luanda on the eve of the vote". If I succeed, I would like to be recognised in history as the man of Angola's economic miracle".  Dos Santos's long reign has seen the end

Creates a new DataFrame consisting of all the triples, to be used in the actual analysis

In [167]:
svo_df = pandas.DataFrame(svo_big_list)

In [169]:
svo_df

Unnamed: 0,country,filename,object,subject,verb
0,KE,./data/KE/KE5_39.html,ballots,Angolans,cast
1,KE,./data/KE/KE5_39.html,,MPLA,expect
2,KE,./data/KE/KE5_39.html,transition,—,trigger
3,KE,./data/KE/KE5_39.html,Lourenco,successor,be
4,KE,./data/KE/KE5_39.html,reporters,Lourenco,tell
5,KE,./data/KE/KE5_39.html,,mission,be
6,KE,./data/KE/KE5_39.html,man,I,like
7,KE,./data/KE/KE5_39.html,,I,succeed
8,KE,./data/KE/KE5_39.html,end,reign,see
9,KE,./data/KE/KE5_39.html,benefit,flood,bring


In [8]:
#svo_df.to_pickle('svo_simplified.pkl')