In [1]:
import os
import re
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import json

import spacy
from spacy.tokens import Doc
from spacy.tokens import Span
from spacy.tokens import Token
# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load("en_core_web_lg")

import pprint
pp = pprint.PrettyPrinter(indent=4)

import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme()
import collections

from itertools import chain

from IPython.core.display import display, HTML
from nltk import Tree

In [2]:
# Dependency labels grouped by the grammatical role they are treated as in the
# rules defined further down in this notebook.
SUBJ_LIKE_RELATIONS = [
    'nsubj',
    'csubj',
    'nsubjpass',
    'csubjpass',
]
OBJ_LIKE_RELATIONS = [
    'dobj',
    'pobj',
    'iobj',
]
MOD_LIKE_RELATIONS = [
    'npadvmod',
    'amod',
    'advmod',
    'nummod',
    'quantmod',
    'rcmod',
    'tmod',
    'vmod',
]

# POS tags that has_unwanted_POS() tests a subject-like child against (Rule 1).
RULE_1_POS_LIST = [
    'NOUN',
    'PROPN',
    'VERB',
    'NUM',
    'PRON',
    'X',
]

# Per-token flag, True by default; the rules set it to False on tokens that
# must be left out of the printed tree / extracted subtree.
# force=True asks spaCy to overwrite an earlier registration (see spaCy docs).
Token.set_extension("is_included", default=True, force=True)

In [3]:
# Utility functions
def tok_format(tok):
    """Return the node label used in the printed tree: ``<orth_>(<dep_>)``."""
    return '{}({})'.format(tok.orth_, tok.dep_)

def to_nltk_tree(node):
    """Convert the dependency subtree rooted at ``node`` into an nltk ``Tree``.

    A token with no left or right dependents is returned as its plain
    ``tok_format`` label (a string, not a ``Tree``). Otherwise a ``Tree`` is
    built whose children are the converted child tokens, skipping every child
    whose ``_.is_included`` flag is False.
    """
    # Leaf token: nothing hangs off it, so the label alone is the result.
    if node.n_lefts + node.n_rights <= 0:
        return tok_format(node)

    branches = []
    for child in node.children:
        if child._.is_included:
            branches.append(to_nltk_tree(child))
    return Tree(tok_format(node), branches)
    
def print_subtree(subtr, main_node):
    """Display an extracted subtree as ``[left]--(main_node)-->[right]`` in red HTML.

    Parameters
    ----------
    subtr : iterable of tokens, or None
        Tokens in display order; each must expose ``.text`` and ``.dep_``.
        ``None`` (what the rule functions return when nothing was extracted)
        is rendered as an empty ``[]`` instead of raising ``TypeError``.
    main_node : str
        Text of the pivot token; every token whose text equals it is rendered
        as the ``--(text)-->`` arrow.

    Token text is HTML-escaped before being embedded in the markup, so text
    containing ``<``, ``>`` or ``&`` is shown literally rather than being
    interpreted by the browser.
    """
    from html import escape  # stdlib; imported locally to keep the cell self-contained

    if subtr is None:
        subtr = ()

    msg = '['
    for t in subtr:
        text = escape(t.text)
        if t.text == main_node:
            # Close the left bracket tightly, then open the right one.
            msg = msg.strip()
            msg += f']--({text})-->['
        else:
            is_punct = t.dep_ == 'punct'
            if is_punct:
                # Attach punctuation to the preceding word (drop the space).
                msg = msg.strip()
            msg += f'{text} '
            if is_punct:
                # No space is kept after punctuation either.
                msg = msg.strip()

    msg = msg.strip()
    msg += ']'
    display(HTML(f'<h4 style="color: red">{msg}</h4>'))


# Rule 1
### (Growth)
If the edge to the head node is of type `relcl` or `ccomp`, and the existing subj-like child node does not have one of the POS tags NOUN, PROPN, VERB, NUM, PRON, or X, replace the subj-like child node with the immediate head node. If there is no subj-like child node, simply move the head node so that it becomes the subj-like child.

In [4]:
def is_edge_of_node_to_its_head_relcl_or_ccomp(node):
    """Return True when ``node.dep_`` is ``'relcl'`` or ``'ccomp'``."""
    return node.dep_ in ('relcl', 'ccomp')

def is_subj_like(node):
    """Return True when the node's dependency label is in SUBJ_LIKE_RELATIONS."""
    relation_to_head = node.dep_
    return relation_to_head in SUBJ_LIKE_RELATIONS
    
def get_subj_like_children(node):
    """Return a list of the direct children of ``node`` that are subject-like.

    Children are kept in the order ``node.children`` yields them.
    """
    return list(filter(is_subj_like, node.children))

def has_a_single_subj_like_child(node):
    """Return how many subject-like children ``node`` has (0 or 1).

    The result is an int used as a truth value by the callers, not a bool.

    Raises
    ------
    ValueError
        If ``node`` has more than one subject-like child.
    """
    subj_count = len(get_subj_like_children(node))
    if subj_count <= 1:
        return subj_count
    raise ValueError(f'Too many subj like children. Node: {node.text}')

def has_unwanted_POS(node):
    """Return True when ``node.pos_`` is one of the tags in RULE_1_POS_LIST.

    Note the name reads inverted: True means the tag IS in the list. The
    Rule 1 caller negates this result before excluding a subject-like child.
    """
    pos_tag = node.pos_
    return pos_tag in RULE_1_POS_LIST

# Rule 7
def remove_punct_children(node):
    """Flag every ``punct`` token yielded by ``node.subtree`` as excluded.

    Sets ``_.is_included = False`` on each such token; nothing is returned.
    """
    punct_tokens = (tok for tok in node.subtree if tok.dep_ == 'punct')
    for tok in punct_tokens:
        tok._.is_included = False
            
def Growth_replace_subj_if_dep_is_relcl_or_ccomp(sentence, target_word):
    """Rule 1 (Growth): re-hang the head of a relcl/ccomp target underneath it.

    Scans ``sentence`` for the first token whose text equals ``target_word``
    and whose dependency label is ``relcl`` or ``ccomp``. For that token:

    1. the token is attached to its former head's head;
    2. the former head is attached to the token;
    3. if the token has exactly one subject-like child and
       ``has_unwanted_POS`` is False for it, that child is flagged excluded;
    4. punctuation in the token's subtree is flagged excluded.

    The parse is modified in place (heads and ``_.is_included`` flags).

    Returns
    -------
    list or None
        Tokens of the target's subtree still flagged as included, or ``None``
        (implicitly) when no token in the sentence qualifies.

    Raises
    ------
    ValueError
        Propagated from ``has_a_single_subj_like_child`` when the target ends
        up with more than one subject-like child.
    """
    for token in sentence[0:]:
        is_target = token.text == target_word
        if not (is_target and is_edge_of_node_to_its_head_relcl_or_ccomp(token)):
            continue

        # Capture both ancestors before any head is reassigned.
        former_head = token.head
        grand_head = former_head.head

        # Lift the target so it hangs directly off its former grandparent ...
        token.head = grand_head
        # ... then hang the former head underneath the target.
        # NOTE(review): if the former head is the sentence root (its head is
        # itself), this step points the root at the target -- confirm how
        # spaCy handles that case.
        former_head.head = token
        # TODO: Edges (dependency labels) are not updated after changing the heads

        if has_a_single_subj_like_child(token):
            # At most one subject-like child can exist here, so index 0 is safe.
            existing_subj = get_subj_like_children(token)[0]
            if not has_unwanted_POS(existing_subj):
                existing_subj._.is_included = False

        remove_punct_children(token)

        return [member for member in token.subtree if member._.is_included]
    


# Rule 1 examples

In [5]:
# Rule 1 demo: every example sentence contains the verb "uses"; the dependency
# tree is printed before and after the rule is applied to it.
corpus = """
Demo; Calvin is a distributed main-memory database system that uses a deterministic execution strategy.
CVE-2020-10138; Acronis Cyber Backup and Cyber Protect contain a privileged service that uses this OpenSSL component.
CVE-2020-10683; However, there is popular external documentation from OWASP showing how to enable the safe, non-default behavior in any application that uses dom4j.
CVE-2020-13799; Several scenarios have been identified in which the RPMB state may be affected by an attacker without the knowledge of the trusted component that uses the RPMB feature.
CVE-2020-14054; Hardware version 212 allows remote attackers to bypass admin authentication via a SQL injection attack that uses the User Name or Password field on the login page.
CVE-2020-2035; This allows a compromised host in a protected network to evade any security policy that uses URL filtering on a firewall configured with SSL Decryption in the Forward Proxy mode.
"""
doc = nlp(corpus)

for i, sent in enumerate(doc.sents):
    # Skip degenerate "sentences" such as a lone newline token.
    if len(sent) < 2:
        continue

    print("~~~~~~~~ BEFORE: ")
    to_nltk_tree(sent.root).pretty_print()

    extraction_result = Growth_replace_subj_if_dep_is_relcl_or_ccomp(sent, 'uses')

    if extraction_result is None:
        # The rule returns None when 'uses' is absent from the sentence or is
        # not attached through a relcl/ccomp edge; there is nothing to show.
        print("Rule 1 did not apply to 'uses' in this sentence.")
        print('-'*100)
        continue

    # Same label as the Rule 2 demo, so the two trees can be told apart.
    print("~~~~~~~~ AFTER:")
    to_nltk_tree(sent.root).pretty_print()

    print('-'*100)
    display(HTML(f'<h4>{sent}</h4>'))
    print_subtree(extraction_result, 'uses')
    print('-'*100)


~~~~~~~~ BEFORE: 
                                                       is(ROOT)                                                                                                            
    ______________________________________________________|__________________________________________                                                                       
   |           |          |         |                                                           system(attr)                                                               
   |           |          |         |         _______________________________________________________|_______________________________                                       
   |           |          |         |        |            |                    database(compoun                                 uses(relcl)                                
   |           |          |         |        |            |                           d)                                

----------------------------------------------------------------------------------------------------
~~~~~~~~ BEFORE: 
                                                                              contain(ROOT)                                                                                                             
    ________________________________________________________________________________|______________________________________________________________                                                      
   |                                          10138(nsubj)                                                                                   service(dobj)                                              
   |          _____________________________________|________________________________                                    ___________________________|____________                                         
   |         |             |            |          |                       

----------------------------------------------------------------------------------------------------
~~~~~~~~ BEFORE: 
                                                                                                                           is(ROOT)                                                                                                                                                   
    __________________________________________________________________________________________________________________________|_________________________________________                                                                                                               
   |            |           |          |         |                       |                                                                                        documentation(                                                                                                      
   |            |           |          |   

----------------------------------------------------------------------------------------------------
~~~~~~~~ BEFORE: 
                                 identified(ROOT)                                                                                                                                                                                                                                          
    ____________________________________|_________________________________________________________________________________________                                                                                                                                                          
   |         |           |              |                          |                                                       scenarios(nsubjp                                                                                                                                                
   |         |           |  

----------------------------------------------------------------------------------------------------
~~~~~~~~ BEFORE: 
                                                                                                                              allows(ROOT)                                                                                                                                                  
    _______________________________________________________________________________________________________________________________|____________________________________________________                                                                                                     
   |                       |                                                  |                                                                                                   bypass(ccomp)                                                                                             
   |                     

----------------------------------------------------------------------------------------------------
~~~~~~~~ BEFORE: 
                                                                                                    allows(ROOT)                                                                                                                                                                           
    _____________________________________________________________________________________________________|_____________________________________________________                                                                                                                             
   |          |         |                       |                                                                                                         evade(ccomp)                                                                                                                     
   |          |         |   

----------------------------------------------------------------------------------------------------


# Rule 2
### (Growth)
If the current node is part of a conj relation through its head edge, and no subj-like child node exists, search for a subj-like child node in the parent (a sibling node). Recurse in case this is not found and the head edge is again a conj.

In [6]:
def is_edge_of_node_to_its_head_conj(node):
    """Return True when the node's dependency label is ``'conj'``."""
    relation_to_head = node.dep_
    return 'conj' == relation_to_head

def recursively_find_subj_like_node_in_sibiling(node):
    """Find the nearest subject-like token among the children of an ancestor.

    Starting at ``node``, ALL children of the current token's head are
    inspected; the first one whose ``dep_`` is in SUBJ_LIKE_RELATIONS is
    returned. If none is found the search moves one level up (to the head)
    and repeats, until the ROOT token has been examined.

    The previous version decided whether to stop or climb while still inside
    the loop over the children, so only the first child of each head was ever
    looked at and a subject in any other position was missed.

    NOTE(review): the climb continues whatever the relation of the
    intermediate heads is; the Rule 2 description only mentions recursing
    while the head edge is again ``conj`` -- confirm which one is intended.

    Returns
    -------
    token or False
        The subject-like token, or ``False`` when none exists up to the root.
    """
    current = node
    while True:
        for sibling in current.head.children:
            if sibling.dep_ in SUBJ_LIKE_RELATIONS:
                return sibling

        # Stop at the root. The self-head comparison also ends the climb if a
        # token is its own head without carrying the ROOT label.
        if current.dep_ == 'ROOT' or current.head == current:
            return False
        current = current.head

def Growth_recurse_on_dep_conj_if_no_subj(sentence, target_word):
    """Rule 2 (Growth): lend a conjunct verb the subject of an ancestor.

    Scans ``sentence`` for the first token that has text ``target_word``, is
    attached through a ``conj`` edge and has no subject-like child of its own.
    For that token, a subject-like token is looked up with
    ``recursively_find_subj_like_node_in_sibiling``; when one is found it is
    re-attached (its head is set) to the target. Punctuation in the target's
    subtree is then flagged excluded.

    The parse is modified in place.

    Returns
    -------
    list or None
        Tokens of the target's subtree still flagged as included, or ``None``
        (implicitly) when no token in the sentence qualifies.

    Raises
    ------
    ValueError
        Propagated from ``has_a_single_subj_like_child`` when a candidate has
        more than one subject-like child.
    """
    for token in sentence[0:]:
        # Evaluated left to right, so the subject count is only taken for a
        # conj-attached token with the requested text.
        qualifies = (
            token.text == target_word
            and is_edge_of_node_to_its_head_conj(token)
            and not has_a_single_subj_like_child(token)
        )
        if not qualifies:
            continue

        borrowed_subj = recursively_find_subj_like_node_in_sibiling(token)
        if borrowed_subj:
            # Move the subject found higher up so it hangs under the target.
            borrowed_subj.head = token

        remove_punct_children(token)
        return [member for member in token.subtree if member._.is_included]

In [7]:
# Rule 2 demo: "improves" is the second verb of a coordination in each
# sentence; the tree is printed before and after the rule is applied.
corpus = """Using many ASR hypotheses helps recover the ASR errors of NE words in 1-best ASR results and improves NER accuracy.
This research studies the usage of grammars and LZ77 parsing for compression of similar sequence collections and improves complexity bounds with respect to space as well as time.
"""
doc = nlp(corpus)

for i, sent in enumerate(doc.sents):
    # Skip degenerate "sentences" such as a lone newline token.
    if len(sent) < 2:
        continue

    print(i, sent)
    print("~~~~~~~~ BEFORE: ")
    to_nltk_tree(sent.root).pretty_print()

    rule_2_extraction_result = Growth_recurse_on_dep_conj_if_no_subj(sent, 'improves')

    if rule_2_extraction_result is None:
        # The rule returns None when 'improves' is absent, is not attached
        # through a conj edge, or already has a subject-like child.
        print("Rule 2 did not apply to 'improves' in this sentence.")
        print('-'*100)
        continue

    print("~~~~~~~~ AFTER:")
    to_nltk_tree(sent.root).pretty_print()

    print('-'*100)
    display(HTML(f'<h4>{sent}</h4>'))
    print_subtree(rule_2_extraction_result, 'improves')
    print('-'*100)

0 Using many ASR hypotheses helps recover the ASR errors of NE words in 1-best ASR results and improves NER accuracy.
~~~~~~~~ BEFORE: 
                                                                     helps(ROOT)                                                                               
    ______________________________________________________________________|_____________                                                                        
   |                       |                                                      recover(xcomp)                                                               
   |                       |                           _________________________________|______________________________________________________________         
   |                       |                          |              errors(dobj)                                 in(prep)                             |       
   |                       |                          |       

----------------------------------------------------------------------------------------------------
2 This research studies the usage of grammars and LZ77 parsing for compression of similar sequence collections and improves complexity bounds with respect to space as well as time.
~~~~~~~~ BEFORE: 
                                                  studies(ROOT)                                                                                                                               
    ____________________________________________________|___________________________________________________________________________________________                                           
   |       |            |                          usage(dobj)                                                                                      |                                         
   |       |            |            ___________________|______________________________                                       

----------------------------------------------------------------------------------------------------


# Rule 3
### (Growth)
If no obj-like child node exists, transform (?) an xcomp or ccomp node into a dobj. If no subj-like child node exists, transform (?) an xcomp or ccomp node into an nsubj.

In [8]:
def transform_xcomp_to_dobj_or_sub_if_doesnt_exists(sentence, target_word):
    """Rule 3 (Growth) placeholder -- not implemented yet (TBD).

    Currently ignores both arguments and returns ``None``.
    """
    return None

In [9]:
# Rule 3 demo: only the original parse is printed, since the rule itself is
# still a stub.
corpus = "Recent work has showed that structured retrieval improves answer ranking for factoid questions.\n"
doc = nlp(corpus)

for i, sent in enumerate(doc.sents):
    # Only sentences with at least two tokens are printed.
    if len(sent) >= 2:
        print("~~~~~~~~ BEFORE: ")
        to_nltk_tree(sent.root).pretty_print()

~~~~~~~~ BEFORE: 
                               showed(ROOT)                                                   
    ________________________________|______________                                            
   |        |          |                    improves(ccomp)                                   
   |        |          |             ______________|_________________________________          
   |        |          |            |              |               |           ranking(advcl) 
   |        |          |            |              |               |                 |         
   |        |          |            |              |               |             for(prep)    
   |        |          |            |              |               |                 |         
   |        |     work(nsubj)       |              |        retrieval(nsubj)  questions(pobj) 
   |        |          |            |              |               |                 |         
has(aux) .(punct) Recent(am