In [1]:
import re 
import string 
import pandas as pd 
import numpy as np 
import math 
from tqdm import tqdm
import json

from pandas.core.frame import DataFrame
from IPython.core.display import display, HTML


import spacy
from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy 


nlp = spacy.load("en_core_web_sm")

In [2]:
def getText():
    """Load the debate transcript from ``../data/texts.json``.

    Returns:
        The value stored under the top-level ``"debates"`` key of the JSON
        document. Downstream cells pass it to ``nlp(...)`` and ``str.split``,
        so it is presumably a single string -- verify against the data file.

    Raises:
        FileNotFoundError: if the JSON file does not exist.
        KeyError: if the document has no ``"debates"`` key.
    """
    # The context manager closes the handle even if json.load raises; the
    # previous version left the file open. JSON is UTF-8 by specification, so
    # the encoding is pinned instead of relying on the platform default.
    with open('../data/texts.json', encoding='utf-8') as f:
        data = json.load(f)
    return data['debates']
def sw_removal(text):
    """Return ``text`` with spaCy's default stop words removed.

    The text is split on whitespace, every chunk whose lower-cased form is in
    ``nlp.Defaults.stop_words`` is dropped, and the survivors are re-joined
    with single spaces (so original line breaks and spacing are not kept).
    """
    stop_words = nlp.Defaults.stop_words
    kept = filter(lambda chunk: chunk.lower() not in stop_words, text.split())
    return " ".join(kept)
def getSentences(text):
    """Run ``text`` through the spaCy pipeline and return its sentences as strings."""
    sentences = []
    for span in nlp(text).sents:
        sentences.append(span.text)
    return sentences

In [3]:
def appendChunk(original, chunk):
    """Return ``original`` and ``chunk`` joined by exactly one space.

    No trimming is done: an empty ``original`` yields a leading space.
    """
    return ' '.join((original, chunk))

def printToken(token):
    """Debug helper: print ``<token text> -> <dependency label>`` on one line."""
    print(f"{token.text} -> {token.dep_}")

def isRelationCandidate(token):
    """Return True if the token's dependency label contains a relation marker.

    Matching is by substring, so any label that merely contains one of
    ``ROOT``, ``adj``, ``attr``, ``agent`` or ``amod`` qualifies.
    """
    label = token.dep_
    for marker in ("ROOT", "adj", "attr", "agent", "amod"):
        if marker in label:
            return True
    return False


def isConstructionCandidate(token):
    """Return True if the token's dependency label contains a construction marker.

    Matching is by substring against ``compound``, ``prep``, ``conj`` and
    ``mod``; the last one therefore also matches labels such as ``amod``.
    """
    label = token.dep_
    for marker in ("compound", "prep", "conj", "mod"):
        if marker in label:
            return True
    return False

def processSubjectObjectPairs(tokens):
    """Build a rough ``(subject, relation, object)`` triple from parsed tokens.

    Args:
        tokens: iterable of objects exposing ``dep_``, ``text`` and ``lemma_``
            (the notebook passes a spaCy ``Doc``).

    Returns:
        A 3-tuple of stripped strings. Each part is the space-joined sequence
        of matching tokens in document order: token texts whose label contains
        ``subj`` / ``obj``, and lemmas of relation candidates. Parts with no
        match are empty strings.
    """
    subj_text = ''
    obj_text = ''
    rel_text = ''
    subj_prefix = ''
    obj_prefix = ''

    for tok in tokens:
        dep = tok.dep_
        # Punctuation never contributes to any part of the triple.
        if "punct" in dep:
            continue

        if isRelationCandidate(tok):
            rel_text = appendChunk(rel_text, tok.lemma_)

        if isConstructionCandidate(tok):
            # NOTE(review): both prefixes start empty and are only extended
            # when already non-empty, so as written they never accumulate
            # anything. Confirm whether modifiers were meant to be collected.
            subj_prefix = appendChunk(subj_prefix, tok.text) if subj_prefix else subj_prefix
            obj_prefix = appendChunk(obj_prefix, tok.text) if obj_prefix else obj_prefix

        if "subj" in dep:
            subj_text = appendChunk(subj_prefix, appendChunk(subj_text, tok.text))
            subj_prefix = ''

        if "obj" in dep:
            obj_text = appendChunk(obj_prefix, appendChunk(obj_text, tok.text))
            obj_prefix = ''

    # The padding spaces introduced by appendChunk are removed here.
    return tuple(part.strip() for part in (subj_text, rel_text, obj_text))

In [4]:
def _with_left_modifier(doc, i):
    """Return token ``i``'s text, prefixed by token ``i-1`` if that is an amod/compound."""
    text = doc[i].text
    # Guard i == 0: doc[-1] would wrap around to the *last* token and could
    # glue an unrelated trailing modifier onto a sentence-initial word.
    if i > 0 and doc[i - 1].dep_ in ('amod', 'compound'):
        return doc[i - 1].text + ' ' + text
    return text


def subtree_matcher(doc):
    """Extract a subject / object / root triple from one parsed sentence.

    Args:
        doc: an indexable sequence of tokens exposing ``dep_`` and ``text``
            (the notebook passes a spaCy ``Doc``).

    Returns:
        A 3-tuple of strings ``(x, y, z)``; missing parts are ``''``.
        * Active sentence: ``x`` is the subject, ``y`` the object.
        * Passive sentence (some label contains ``subjpass``): the roles are
          swapped, i.e. ``x`` is the object and ``y`` the subject.
        * ``z`` is always the text of the token whose label starts with
          ``ROOT``.
        When several tokens match a role, the last one wins. A subject or
        object is prefixed with the preceding token when that token is an
        ``amod`` or ``compound``.
    """
    # The original tested ``dep_.find("subjpass") == True``, which only works
    # because the match happens to sit at index 1 in ``nsubjpass`` and
    # ``csubjpass``. Substring membership states the intent directly.
    is_passive = any("subjpass" in tok.dep_ for tok in doc)

    subject = ''
    obj = ''
    root = ''

    for i, tok in enumerate(doc):
        dep = tok.dep_

        # In a passive sentence any subject label counts (including the
        # ``...subjpass`` ones); otherwise the label must end in ``subj``.
        is_subject = ("subj" in dep) if is_passive else dep.endswith("subj")
        if is_subject:
            subject = _with_left_modifier(doc, i)

        if dep.startswith('ROOT'):
            root = tok.text

        if dep.endswith("obj"):
            obj = _with_left_modifier(doc, i)

    if is_passive:
        return obj, subject, root
    return subject, obj, root


In [5]:
# Load the raw transcript and split it into sentences with spaCy.
# NOTE: `doc` holds whatever getText() returns (not a spaCy Doc), and `t` is the
# list of sentence strings; a later cell rebinds both names.
doc = getText()
t = getSentences(doc)
# One row per sentence, in document order, in a single 'text' column.
df=DataFrame(t)
df.columns = ['text']
display(df)

Unnamed: 0,text
0,Good evening from Hofstra University in Hemps...
1,"I am Lester Holt, anchor of ""NBC Nightly News.”"
2,I want to welcome you to the first presidentia...
3,\n
4,The participants tonight are Donald Trump and ...
...,...
4525,"While millions have already voted, election da..."
4526,One thing everyone here can agree on is we hop...
4527,It is one of the honors and obligations of liv...
4528,Thank you and good night.


In [6]:
def extract_re_subtree(text):
    """Apply ``subtree_matcher`` to every sentence.

    Args:
        text: an iterable of sentence strings (despite the singular name).
            Passing one plain string would iterate over its characters.

    Returns:
        A list with one ``subtree_matcher`` result per input sentence, in order.
    """
    return [subtree_matcher(nlp(sentence)) for sentence in text]


# Feed the frame's own 'text' column rather than the global `t`: another cell
# rebinds `t`, so after out-of-order execution it may no longer line up with
# the rows of `df`.
df['re_subtree'] = extract_re_subtree(df['text'])
display(df)

Unnamed: 0,text,re_subtree
0,Good evening from Hofstra University in Hemps...,"(, Hempstead, )"
1,"I am Lester Holt, anchor of ""NBC Nightly News.”","(I, Nightly News, am)"
2,I want to welcome you to the first presidentia...,"(I, presidential debate, want)"
3,\n,"(, , \n)"
4,The participants tonight are Donald Trump and ...,"(participants, , are)"
...,...,...
4525,"While millions have already voted, election da...","(millions, , is)"
4526,One thing everyone here can agree on is we hop...,"(you, , is)"
4527,It is one of the honors and obligations of liv...,"(It, great country, is)"
4528,Thank you and good night.,"(, you, Thank)"


In [7]:
def extract_re_pairs(text):
    """Apply ``processSubjectObjectPairs`` to every sentence.

    Args:
        text: an iterable of sentence strings (despite the singular name).
            Passing one plain string would iterate over its characters.

    Returns:
        A list with one ``(subject, relation, object)`` tuple per sentence.
    """
    return [processSubjectObjectPairs(nlp(sentence)) for sentence in text]


# Feed the frame's own 'text' column rather than the global `t`: another cell
# rebinds `t`, so after out-of-order execution it may no longer line up with
# the rows of `df`.
df['re_pairs'] = extract_re_pairs(df['text'])
display(df)

Unnamed: 0,text,re_subtree,re_pairs
0,Good evening from Hofstra University in Hemps...,"(, Hempstead, )","(, good, University Hempstead)"
1,"I am Lester Holt, anchor of ""NBC Nightly News.”","(I, Nightly News, am)","(I, be Holt, News)"
2,I want to welcome you to the first presidentia...,"(I, presidential debate, want)","(I, want first presidential, you debate)"
3,\n,"(, , \n)","(, , )"
4,The participants tonight are Donald Trump and ...,"(participants, , are)","(participants, be Trump, )"
...,...,...,...
4525,"While millions have already voted, election da...","(millions, , is)","(millions, be, )"
4526,One thing everyone here can agree on is we hop...,"(you, , is)","(thing everyone we you, be, )"
4527,It is one of the honors and obligations of liv...,"(It, great country, is)","(It, be one great, honors country)"
4528,Thank you and good night.,"(, you, Thank)","(, thank good, you)"


In [5]:
# Second sentence table: stop words are stripped from the raw text *before*
# spaCy segments and parses it.
# NOTE(review): this rebinds `doc` and `t` from the earlier sentence cell, so
# any cell that reads `t` depends on which of the two cells ran last.
# NOTE(review): the parser now sees text without function words; check that the
# dependency-based columns computed from these sentences are still meaningful.
doc = sw_removal(getText())
document = nlp(doc)
t = [sent.text for sent in document.sents]
# One row per stop-word-free sentence in a single 'text' column.
df1=DataFrame(t)
df1.columns = ['text']
display(df1)

Unnamed: 0,text
0,"Good evening Hofstra University Hempstead, New..."
1,"Lester Holt, anchor ""NBC Nightly News.”"
2,want welcome presidential debate.
3,participants tonight Donald Trump Hillary Clin...
4,debate sponsored Commission Presidential Debat...
...,...
3520,thing agree hope vote.
3521,honors obligations living great country.
3522,Thank good night.
3523,[


In [6]:
def extract_named_ents(text):
    """Return ``(entity text, entity label)`` pairs found by spaCy in ``text``."""
    pairs = []
    for ent in nlp(text).ents:
        pairs.append((ent.text, ent.label_))
    return pairs

def add_named_ents(df):
    """Add a ``named_ents`` column to ``df`` in place, computed from ``df['text']``.

    Returns None; the frame passed in is mutated.
    """
    entities = df['text'].apply(extract_named_ents)
    df['named_ents'] = entities


add_named_ents(df1)
display(df1)

Unnamed: 0,text,named_ents
0,"Good evening Hofstra University Hempstead, New...","[(Hofstra University Hempstead, ORG), (New Yor..."
1,"Lester Holt, anchor ""NBC Nightly News.”","[(Lester Holt, PERSON), (NBC Nightly News, ORG)]"
2,want welcome presidential debate.,[]
3,participants tonight Donald Trump Hillary Clin...,"[(Donald Trump, PERSON), (Hillary Clinton, PER..."
4,debate sponsored Commission Presidential Debat...,"[(Commission Presidential Debates, ORG)]"
...,...,...
3520,thing agree hope vote.,[]
3521,honors obligations living great country.,[]
3522,Thank good night.,"[(night, TIME)]"
3523,[,[]


In [7]:
def extract_noun(text):
    """Return the text of every noun chunk spaCy finds in ``text``."""
    chunks = []
    for chunk in nlp(text).noun_chunks:
        chunks.append(chunk.text)
    return chunks

def add_noun_chunks(df):
    """Add a ``noun`` column to ``df`` in place, computed from ``df['text']``.

    This helper used to be a second ``add_named_ents`` definition, which
    silently replaced the entity helper of the same name defined earlier in
    the notebook. It now has its own name so both helpers stay callable.

    Returns None; the frame passed in is mutated.
    """
    df['noun'] = df['text'].apply(extract_noun)


add_noun_chunks(df1)
display(df1)

Unnamed: 0,text,named_ents,noun
0,"Good evening Hofstra University Hempstead, New...","[(Hofstra University Hempstead, ORG), (New Yor...",[Good evening Hofstra University Hempstead]
1,"Lester Holt, anchor ""NBC Nightly News.”","[(Lester Holt, PERSON), (NBC Nightly News, ORG)]","[Lester Holt, anchor, NBC Nightly News]"
2,want welcome presidential debate.,[],[welcome presidential debate]
3,participants tonight Donald Trump Hillary Clin...,"[(Donald Trump, PERSON), (Hillary Clinton, PER...","[participants, Donald Trump Hillary Clinton]"
4,debate sponsored Commission Presidential Debat...,"[(Commission Presidential Debates, ORG)]",[debate sponsored Commission Presidential Deba...
...,...,...,...
3520,thing agree hope vote.,[],"[thing, hope vote]"
3521,honors obligations living great country.,[],"[honors obligations, great country]"
3522,Thank good night.,"[(night, TIME)]",[]
3523,[,[],[]


In [8]:
def extract_compounds(text):
    """Collect compound phrases (runs of ``compound`` tokens plus the next token).

    Every token whose dependency label is exactly ``compound`` is buffered.
    The token immediately after the most recent compound token closes the
    phrase: it is appended to the buffer and, if the buffer then holds more
    than one piece, the pieces are joined with spaces and recorded.

    Args:
        text: the string to parse with the spaCy pipeline.

    Returns:
        A list of compound phrase strings in document order.
    """
    # Index of the most recent compound token. Starting at 0 means the token
    # at index 1 is always checked as a potential closer; with an empty buffer
    # that check records nothing.
    comp_idx = 0
    compound = []
    compound_nps = []

    for tok in nlp(text):
        if tok.dep_ == 'compound':
            # Capture hyphenated compounds: if the children's concatenated
            # text contains a hyphen, glue it in front of the token itself.
            children = ''.join(c.text for c in tok.children)
            if '-' in children:
                compound.append(children + tok.text)
            else:
                compound.append(tok.text)
            comp_idx = tok.i

        # Append the last word of a compound phrase and flush the buffer.
        if tok.i - comp_idx == 1:
            compound.append(tok.text)
            if len(compound) > 1:
                compound_nps.append(' '.join(compound))
            compound = []

    return compound_nps

def add_compounds(df):
    """Add a ``compounds`` column to ``df`` in place, computed from ``df['text']``.

    Returns None; the frame passed in is mutated.
    """
    compounds = df['text'].apply(extract_compounds)
    df['compounds'] = compounds


add_compounds(df1)
display(df1)

Unnamed: 0,text,named_ents,noun,compounds
0,"Good evening Hofstra University Hempstead, New...","[(Hofstra University Hempstead, ORG), (New Yor...",[Good evening Hofstra University Hempstead],"[evening Hofstra University Hempstead, New York]"
1,"Lester Holt, anchor ""NBC Nightly News.”","[(Lester Holt, PERSON), (NBC Nightly News, ORG)]","[Lester Holt, anchor, NBC Nightly News]","[Lester Holt, NBC Nightly News]"
2,want welcome presidential debate.,[],[welcome presidential debate],[]
3,participants tonight Donald Trump Hillary Clin...,"[(Donald Trump, PERSON), (Hillary Clinton, PER...","[participants, Donald Trump Hillary Clinton]",[Donald Trump Hillary Clinton]
4,debate sponsored Commission Presidential Debat...,"[(Commission Presidential Debates, ORG)]",[debate sponsored Commission Presidential Deba...,[Commission Presidential Debates]
...,...,...,...,...
3520,thing agree hope vote.,[],"[thing, hope vote]",[hope vote]
3521,honors obligations living great country.,[],"[honors obligations, great country]",[honors obligations]
3522,Thank good night.,"[(night, TIME)]",[],[]
3523,[,[],[],[]


In [9]:
# Adding the two columns combines them row by row; each cell holds a list, so
# the result for a row is the compounds list followed by the noun-chunk list.
# NOTE(review): nothing de-duplicates here, so a phrase present in both source
# columns appears twice in the combined list.
df1["com_nou_phrase"] = df1["compounds"] + df1['noun']
display(df1)

Unnamed: 0,text,named_ents,noun,compounds,com_nou_phrase
0,"Good evening Hofstra University Hempstead, New...","[(Hofstra University Hempstead, ORG), (New Yor...",[Good evening Hofstra University Hempstead],"[evening Hofstra University Hempstead, New York]","[evening Hofstra University Hempstead, New Yor..."
1,"Lester Holt, anchor ""NBC Nightly News.”","[(Lester Holt, PERSON), (NBC Nightly News, ORG)]","[Lester Holt, anchor, NBC Nightly News]","[Lester Holt, NBC Nightly News]","[Lester Holt, NBC Nightly News, Lester Holt, a..."
2,want welcome presidential debate.,[],[welcome presidential debate],[],[welcome presidential debate]
3,participants tonight Donald Trump Hillary Clin...,"[(Donald Trump, PERSON), (Hillary Clinton, PER...","[participants, Donald Trump Hillary Clinton]",[Donald Trump Hillary Clinton],"[Donald Trump Hillary Clinton, participants, D..."
4,debate sponsored Commission Presidential Debat...,"[(Commission Presidential Debates, ORG)]",[debate sponsored Commission Presidential Deba...,[Commission Presidential Debates],"[Commission Presidential Debates, debate spons..."
...,...,...,...,...,...
3520,thing agree hope vote.,[],"[thing, hope vote]",[hope vote],"[hope vote, thing, hope vote]"
3521,honors obligations living great country.,[],"[honors obligations, great country]",[honors obligations],"[honors obligations, honors obligations, great..."
3522,Thank good night.,"[(night, TIME)]",[],[],[]
3523,[,[],[],[],[]


In [10]:
def extract_verb(text):
    """Return the tokens of ``text`` whose coarse part-of-speech tag is ``VERB``.

    The list holds the token objects produced by ``nlp`` themselves, not their
    strings.
    """
    verbs = []
    for token in nlp(text):
        if token.pos_ == 'VERB':
            verbs.append(token)
    return verbs
def add_verbs(df):
    """Add a ``verbs`` column to ``df`` in place, computed from ``df['text']``.

    Returns None; the frame passed in is mutated.
    """
    verbs = df['text'].apply(extract_verb)
    df['verbs'] = verbs


add_verbs(df1)
display(df1)

Unnamed: 0,text,named_ents,noun,compounds,com_nou_phrase,verbs
0,"Good evening Hofstra University Hempstead, New...","[(Hofstra University Hempstead, ORG), (New Yor...",[Good evening Hofstra University Hempstead],"[evening Hofstra University Hempstead, New York]","[evening Hofstra University Hempstead, New Yor...",[]
1,"Lester Holt, anchor ""NBC Nightly News.”","[(Lester Holt, PERSON), (NBC Nightly News, ORG)]","[Lester Holt, anchor, NBC Nightly News]","[Lester Holt, NBC Nightly News]","[Lester Holt, NBC Nightly News, Lester Holt, a...",[]
2,want welcome presidential debate.,[],[welcome presidential debate],[],[welcome presidential debate],[want]
3,participants tonight Donald Trump Hillary Clin...,"[(Donald Trump, PERSON), (Hillary Clinton, PER...","[participants, Donald Trump Hillary Clinton]",[Donald Trump Hillary Clinton],"[Donald Trump Hillary Clinton, participants, D...",[]
4,debate sponsored Commission Presidential Debat...,"[(Commission Presidential Debates, ORG)]",[debate sponsored Commission Presidential Deba...,[Commission Presidential Debates],"[Commission Presidential Debates, debate spons...",[sponsored]
...,...,...,...,...,...,...
3520,thing agree hope vote.,[],"[thing, hope vote]",[hope vote],"[hope vote, thing, hope vote]",[agree]
3521,honors obligations living great country.,[],"[honors obligations, great country]",[honors obligations],"[honors obligations, honors obligations, great...",[living]
3522,Thank good night.,"[(night, TIME)]",[],[],[],[Thank]
3523,[,[],[],[],[],[]


In [11]:
def extract_nvn(text):
    """Extract ``subject verb object`` phrases around every verb in ``text``.

    For each token tagged ``VERB``, every left child that is an ``nsubj`` /
    ``nsubjpass`` noun, proper noun or pronoun starts (or extends) a phrase of
    the form ``[adjectives] subject verb-lemma``. Every right child that is a
    ``dobj`` noun or proper noun is then appended, with its adjectives, and
    the phrase accumulated so far is recorded.

    Adjectives are read straight from the already-parsed tokens. The earlier
    version called ``rule2_mod(text, i)`` for each match, which re-ran the
    whole spaCy pipeline on ``text`` just to look up the same token again.

    Args:
        text: the string to parse with the spaCy pipeline.

    Returns:
        A list of phrase strings. When a subject has no adjectives the phrase
        starts with a single space (e.g. ``' thing agree vote'``).
    """
    def _adjectives(tok):
        # Same result as rule2_mod: ' ' + text for each ADJ child, concatenated.
        return ''.join(' ' + child.text for child in tok.children
                       if child.pos_ == 'ADJ')

    doc = nlp(text)
    sent = []

    for token in doc:
        if token.pos_ != 'VERB':
            continue

        # NOTE(review): the phrase is shared by all subjects and objects of one
        # verb, so a verb with several matching objects yields cumulative
        # phrases. Kept as-is to preserve existing output.
        phrase = ''

        # Only noun, proper-noun or pronoun subjects are extracted.
        for subj in token.lefts:
            if subj.dep_ not in ('nsubj', 'nsubjpass'):
                continue
            if subj.pos_ not in ('NOUN', 'PROPN', 'PRON'):
                continue

            phrase += _adjectives(subj) + ' ' + subj.text
            phrase += ' ' + token.lemma_

            # Noun or proper-noun direct objects close the phrase.
            for obj in token.rights:
                if obj.dep_ == 'dobj' and obj.pos_ in ('NOUN', 'PROPN'):
                    phrase += _adjectives(obj) + ' ' + obj.text
                    sent.append(phrase)

    return sent
def rule2_mod(text,index):
    """Return the adjective children of the token at position ``index``.

    ``text`` is parsed with the spaCy pipeline and the token whose ``i``
    equals ``index`` is looked up. The result is each ``ADJ`` child's text
    preceded by a space, concatenated; it is ``''`` when no token has that
    index or the token has no adjective children.
    """
    target = next((tok for tok in nlp(text) if tok.i == index), None)
    if target is None:
        return ''

    adjectives = [' ' + child.text for child in target.children
                  if child.pos_ == 'ADJ']
    return ''.join(adjectives)

In [13]:
# TODO: the noun-verb-noun extraction (extract_nvn) still needs to be optimized.
# Store the extracted phrases for every sentence in the 'rule1' column.
df1['rule1'] = df1['text'].apply(extract_nvn)
display(df1)

Unnamed: 0,text,named_ents,noun,compounds,com_nou_phrase,verbs,rule1
0,"Good evening Hofstra University Hempstead, New...","[(Hofstra University Hempstead, ORG), (New Yor...",[Good evening Hofstra University Hempstead],"[evening Hofstra University Hempstead, New York]","[evening Hofstra University Hempstead, New Yor...",[],[]
1,"Lester Holt, anchor ""NBC Nightly News.”","[(Lester Holt, PERSON), (NBC Nightly News, ORG)]","[Lester Holt, anchor, NBC Nightly News]","[Lester Holt, NBC Nightly News]","[Lester Holt, NBC Nightly News, Lester Holt, a...",[],[]
2,want welcome presidential debate.,[],[welcome presidential debate],[],[welcome presidential debate],[want],[]
3,participants tonight Donald Trump Hillary Clin...,"[(Donald Trump, PERSON), (Hillary Clinton, PER...","[participants, Donald Trump Hillary Clinton]",[Donald Trump Hillary Clinton],"[Donald Trump Hillary Clinton, participants, D...",[],[]
4,debate sponsored Commission Presidential Debat...,"[(Commission Presidential Debates, ORG)]",[debate sponsored Commission Presidential Deba...,[Commission Presidential Debates],"[Commission Presidential Debates, debate spons...",[sponsored],[]
...,...,...,...,...,...,...,...
3520,thing agree hope vote.,[],"[thing, hope vote]",[hope vote],"[hope vote, thing, hope vote]",[agree],[ thing agree vote]
3521,honors obligations living great country.,[],"[honors obligations, great country]",[honors obligations],"[honors obligations, honors obligations, great...",[living],[]
3522,Thank good night.,"[(night, TIME)]",[],[],[],[Thank],[]
3523,[,[],[],[],[],[],[]


In [14]:
def extract_adj_noun(text):
    """Extract ``modifier(s) noun`` phrases for subject and object nouns.

    A token qualifies when it is tagged ``NOUN`` and its dependency label is
    one of ``dobj``, ``pobj``, ``nsubj`` or ``nsubjpass``. Its children that
    are adjectives or have a ``compound`` dependency are placed, in child
    order, in front of the noun. Nouns without such children are skipped.

    Args:
        text: the string to parse with the spaCy pipeline.

    Returns:
        A list of space-joined phrase strings in document order.
    """
    pat = []

    for head in nlp(text):
        if head.pos_ != 'NOUN':
            continue
        if head.dep_ not in ('dobj', 'pobj', 'nsubj', 'nsubjpass'):
            continue

        modifiers = [child.text for child in head.children
                     if child.pos_ == 'ADJ' or child.dep_ == 'compound']
        if modifiers:
            pat.append(' '.join(modifiers + [head.text]))

    return pat

In [16]:
# Store the adjective/compound + noun phrases of every sentence in 'adj_noun'.
df1['adj_noun'] = df1['text'].apply(extract_adj_noun)

In [17]:
# Show the frame, which at this point includes the 'adj_noun' column.
display(df1)

Unnamed: 0,text,named_ents,noun,compounds,com_nou_phrase,verbs,rule1,adj_noun
0,"Good evening Hofstra University Hempstead, New...","[(Hofstra University Hempstead, ORG), (New Yor...",[Good evening Hofstra University Hempstead],"[evening Hofstra University Hempstead, New York]","[evening Hofstra University Hempstead, New Yor...",[],[],[]
1,"Lester Holt, anchor ""NBC Nightly News.”","[(Lester Holt, PERSON), (NBC Nightly News, ORG)]","[Lester Holt, anchor, NBC Nightly News]","[Lester Holt, NBC Nightly News]","[Lester Holt, NBC Nightly News, Lester Holt, a...",[],[],[]
2,want welcome presidential debate.,[],[welcome presidential debate],[],[welcome presidential debate],[want],[],[welcome presidential debate]
3,participants tonight Donald Trump Hillary Clin...,"[(Donald Trump, PERSON), (Hillary Clinton, PER...","[participants, Donald Trump Hillary Clinton]",[Donald Trump Hillary Clinton],"[Donald Trump Hillary Clinton, participants, D...",[],[],[]
4,debate sponsored Commission Presidential Debat...,"[(Commission Presidential Debates, ORG)]",[debate sponsored Commission Presidential Deba...,[Commission Presidential Debates],"[Commission Presidential Debates, debate spons...",[sponsored],[],[]
...,...,...,...,...,...,...,...,...
3520,thing agree hope vote.,[],"[thing, hope vote]",[hope vote],"[hope vote, thing, hope vote]",[agree],[ thing agree vote],[hope vote]
3521,honors obligations living great country.,[],"[honors obligations, great country]",[honors obligations],"[honors obligations, honors obligations, great...",[living],[],[great country]
3522,Thank good night.,"[(night, TIME)]",[],[],[],[Thank],[],[]
3523,[,[],[],[],[],[],[],[]


In [18]:
def extract_npn(text):
    """Extract ``noun preposition noun(s)`` phrases from ``text``.

    For every token tagged ``ADP`` whose head is tagged ``NOUN``, the phrase
    is the head's text, the preposition, and then every right child of the
    preposition that is a noun or proper noun, joined by single spaces. The
    phrase is recorded even when no right-hand noun is found.

    Args:
        text: the string to parse with the spaCy pipeline.

    Returns:
        A list of phrase strings in document order.
    """
    sent = []

    for prep in nlp(text):
        if prep.pos_ != 'ADP' or prep.head.pos_ != 'NOUN':
            continue

        pieces = [prep.head.text, prep.text]
        pieces.extend(right.text for right in prep.rights
                      if right.pos_ in ('NOUN', 'PROPN'))
        phrase = ' '.join(pieces)

        if len(phrase) > 2:
            sent.append(phrase)

    return sent

In [19]:
# Store the noun-preposition-noun phrases of every sentence in 'noun_pre_noun'.
df1['noun_pre_noun'] = df1['text'].apply(extract_npn)
display(df1)

Unnamed: 0,text,named_ents,noun,compounds,com_nou_phrase,verbs,rule1,adj_noun,noun_pre_noun
0,"Good evening Hofstra University Hempstead, New...","[(Hofstra University Hempstead, ORG), (New Yor...",[Good evening Hofstra University Hempstead],"[evening Hofstra University Hempstead, New York]","[evening Hofstra University Hempstead, New Yor...",[],[],[],[]
1,"Lester Holt, anchor ""NBC Nightly News.”","[(Lester Holt, PERSON), (NBC Nightly News, ORG)]","[Lester Holt, anchor, NBC Nightly News]","[Lester Holt, NBC Nightly News]","[Lester Holt, NBC Nightly News, Lester Holt, a...",[],[],[],[]
2,want welcome presidential debate.,[],[welcome presidential debate],[],[welcome presidential debate],[want],[],[welcome presidential debate],[]
3,participants tonight Donald Trump Hillary Clin...,"[(Donald Trump, PERSON), (Hillary Clinton, PER...","[participants, Donald Trump Hillary Clinton]",[Donald Trump Hillary Clinton],"[Donald Trump Hillary Clinton, participants, D...",[],[],[],[]
4,debate sponsored Commission Presidential Debat...,"[(Commission Presidential Debates, ORG)]",[debate sponsored Commission Presidential Deba...,[Commission Presidential Debates],"[Commission Presidential Debates, debate spons...",[sponsored],[],[],[]
...,...,...,...,...,...,...,...,...,...
3520,thing agree hope vote.,[],"[thing, hope vote]",[hope vote],"[hope vote, thing, hope vote]",[agree],[ thing agree vote],[hope vote],[]
3521,honors obligations living great country.,[],"[honors obligations, great country]",[honors obligations],"[honors obligations, honors obligations, great...",[living],[],[great country],[]
3522,Thank good night.,"[(night, TIME)]",[],[],[],[Thank],[],[],[]
3523,[,[],[],[],[],[],[],[],[]


In [20]:
def rule0(text, index):
    """Return the token at ``index`` prefixed by its compound/amod children.

    ``text`` is parsed with the spaCy pipeline, the token at position
    ``index`` is taken, and the texts of its children whose dependency label
    is ``compound`` or ``amod`` are placed in front of it, space-separated.
    """
    token = nlp(text)[index]
    modifiers = [child.text for child in token.children
                 if child.dep_ in ('compound', 'amod')]
    return ' '.join(modifiers + [token.text])

def rule3_mod(text):
    """Extract ``noun preposition noun(s)`` phrases, including noun modifiers.

    Same pattern as ``extract_npn``, except that the head noun and every
    right-hand noun are each prefixed with their ``compound`` / ``amod``
    children.

    The modifiers are read from the tokens of the single parse done here. The
    earlier version called ``rule0(text, i)`` for each noun, which re-ran the
    whole spaCy pipeline on ``text`` every time. Its ``else`` fallbacks and
    final length guard are dropped: the expanded entity always contains at
    least the token's own text, so they could never change the result.

    Args:
        text: the string to parse with the spaCy pipeline.

    Returns:
        A list of phrase strings in document order.
    """
    def _entity(tok):
        # Same result as rule0: modifier texts, then the token, space-joined.
        modifiers = [child.text for child in tok.children
                     if child.dep_ in ('compound', 'amod')]
        return ' '.join(modifiers + [tok.text])

    doc = nlp(text)
    sent = []

    for token in doc:
        # Only prepositions attached to a noun are of interest.
        if token.pos_ != 'ADP' or token.head.pos_ != 'NOUN':
            continue

        phrase = _entity(token.head) + ' ' + token.text

        for right_tok in token.rights:
            if right_tok.pos_ in ('NOUN', 'PROPN'):
                phrase += ' ' + _entity(right_tok)

        sent.append(phrase)

    return sent

In [21]:
# Store the modifier-aware noun-preposition-noun phrases in 'noun_pre_noun_m'.
df1['noun_pre_noun_m'] = df1['text'].apply(rule3_mod)
display(df1)

Unnamed: 0,text,named_ents,noun,compounds,com_nou_phrase,verbs,rule1,adj_noun,noun_pre_noun,noun_pre_noun_m
0,"Good evening Hofstra University Hempstead, New...","[(Hofstra University Hempstead, ORG), (New Yor...",[Good evening Hofstra University Hempstead],"[evening Hofstra University Hempstead, New York]","[evening Hofstra University Hempstead, New Yor...",[],[],[],[],[]
1,"Lester Holt, anchor ""NBC Nightly News.”","[(Lester Holt, PERSON), (NBC Nightly News, ORG)]","[Lester Holt, anchor, NBC Nightly News]","[Lester Holt, NBC Nightly News]","[Lester Holt, NBC Nightly News, Lester Holt, a...",[],[],[],[],[]
2,want welcome presidential debate.,[],[welcome presidential debate],[],[welcome presidential debate],[want],[],[welcome presidential debate],[],[]
3,participants tonight Donald Trump Hillary Clin...,"[(Donald Trump, PERSON), (Hillary Clinton, PER...","[participants, Donald Trump Hillary Clinton]",[Donald Trump Hillary Clinton],"[Donald Trump Hillary Clinton, participants, D...",[],[],[],[],[]
4,debate sponsored Commission Presidential Debat...,"[(Commission Presidential Debates, ORG)]",[debate sponsored Commission Presidential Deba...,[Commission Presidential Debates],"[Commission Presidential Debates, debate spons...",[sponsored],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...
3520,thing agree hope vote.,[],"[thing, hope vote]",[hope vote],"[hope vote, thing, hope vote]",[agree],[ thing agree vote],[hope vote],[],[]
3521,honors obligations living great country.,[],"[honors obligations, great country]",[honors obligations],"[honors obligations, honors obligations, great...",[living],[],[great country],[],[]
3522,Thank good night.,"[(night, TIME)]",[],[],[],[Thank],[],[],[],[]
3523,[,[],[],[],[],[],[],[],[],[]


In [22]:
def extract_re_pairs(text):
    """Apply ``processSubjectObjectPairs`` to every sentence.

    NOTE(review): this repeats, line for line, the ``extract_re_pairs``
    definition from the earlier cell that filled ``df['re_pairs']``; one of
    the two can be removed.

    Args:
        text: an iterable of sentence strings (despite the singular name).

    Returns:
        A list with one ``(subject, relation, object)`` tuple per sentence.
    """
    return [processSubjectObjectPairs(nlp(sentence)) for sentence in text]


# Feed the frame's own 'text' column rather than the global `t`: two different
# cells assign `t`, so it only matches the rows of `df1` if the right one ran
# last.
df1['re_pairs'] = extract_re_pairs(df1['text'])
display(df1)

Unnamed: 0,text,named_ents,noun,compounds,com_nou_phrase,verbs,rule1,adj_noun,noun_pre_noun,noun_pre_noun_m,re_pairs
0,"Good evening Hofstra University Hempstead, New...","[(Hofstra University Hempstead, ORG), (New Yor...",[Good evening Hofstra University Hempstead],"[evening Hofstra University Hempstead, New York]","[evening Hofstra University Hempstead, New Yor...",[],[],[],[],[],"(, good Hempstead, )"
1,"Lester Holt, anchor ""NBC Nightly News.”","[(Lester Holt, PERSON), (NBC Nightly News, ORG)]","[Lester Holt, anchor, NBC Nightly News]","[Lester Holt, NBC Nightly News]","[Lester Holt, NBC Nightly News, Lester Holt, a...",[],[],[],[],[],"(, Holt, )"
2,want welcome presidential debate.,[],[welcome presidential debate],[],[welcome presidential debate],[want],[],[welcome presidential debate],[],[],"(, want welcome presidential, debate)"
3,participants tonight Donald Trump Hillary Clin...,"[(Donald Trump, PERSON), (Hillary Clinton, PER...","[participants, Donald Trump Hillary Clinton]",[Donald Trump Hillary Clinton],"[Donald Trump Hillary Clinton, participants, D...",[],[],[],[],[],"(, participant, )"
4,debate sponsored Commission Presidential Debat...,"[(Commission Presidential Debates, ORG)]",[debate sponsored Commission Presidential Deba...,[Commission Presidential Debates],"[Commission Presidential Debates, debate spons...",[sponsored],[],[],[],[],"(, sponsor Debates nonpartisan nonprofit, )"
...,...,...,...,...,...,...,...,...,...,...,...
3520,thing agree hope vote.,[],"[thing, hope vote]",[hope vote],"[hope vote, thing, hope vote]",[agree],[ thing agree vote],[hope vote],[],[],"(thing, agree, vote)"
3521,honors obligations living great country.,[],"[honors obligations, great country]",[honors obligations],"[honors obligations, honors obligations, great...",[living],[],[great country],[],[],"(, obligation great, country)"
3522,Thank good night.,"[(night, TIME)]",[],[],[],[Thank],[],[],[],[],"(, thank good, )"
3523,[,[],[],[],[],[],[],[],[],[],"(, [, )"


In [23]:
def extract_re_subtree(text):
    """Apply ``subtree_matcher`` to every sentence.

    NOTE(review): this repeats, line for line, the ``extract_re_subtree``
    definition from the earlier cell that filled ``df['re_subtree']``; one of
    the two can be removed.

    Args:
        text: an iterable of sentence strings (despite the singular name).

    Returns:
        A list with one ``subtree_matcher`` result per input sentence, in order.
    """
    return [subtree_matcher(nlp(sentence)) for sentence in text]


# Feed the frame's own 'text' column rather than the global `t`: two different
# cells assign `t`, so it only matches the rows of `df1` if the right one ran
# last.
df1['re_subtree'] = extract_re_subtree(df1['text'])
display(df1)

Unnamed: 0,text,named_ents,noun,compounds,com_nou_phrase,verbs,rule1,adj_noun,noun_pre_noun,noun_pre_noun_m,re_pairs,re_subtree
0,"Good evening Hofstra University Hempstead, New...","[(Hofstra University Hempstead, ORG), (New Yor...",[Good evening Hofstra University Hempstead],"[evening Hofstra University Hempstead, New York]","[evening Hofstra University Hempstead, New Yor...",[],[],[],[],[],"(, good Hempstead, )","(, , Hempstead)"
1,"Lester Holt, anchor ""NBC Nightly News.”","[(Lester Holt, PERSON), (NBC Nightly News, ORG)]","[Lester Holt, anchor, NBC Nightly News]","[Lester Holt, NBC Nightly News]","[Lester Holt, NBC Nightly News, Lester Holt, a...",[],[],[],[],[],"(, Holt, )","(, , Holt)"
2,want welcome presidential debate.,[],[welcome presidential debate],[],[welcome presidential debate],[want],[],[welcome presidential debate],[],[],"(, want welcome presidential, debate)","(, presidential debate, want)"
3,participants tonight Donald Trump Hillary Clin...,"[(Donald Trump, PERSON), (Hillary Clinton, PER...","[participants, Donald Trump Hillary Clinton]",[Donald Trump Hillary Clinton],"[Donald Trump Hillary Clinton, participants, D...",[],[],[],[],[],"(, participant, )","(, , participants)"
4,debate sponsored Commission Presidential Debat...,"[(Commission Presidential Debates, ORG)]",[debate sponsored Commission Presidential Deba...,[Commission Presidential Debates],"[Commission Presidential Debates, debate spons...",[sponsored],[],[],[],[],"(, sponsor Debates nonpartisan nonprofit, )","(, , Debates)"
...,...,...,...,...,...,...,...,...,...,...,...,...
3520,thing agree hope vote.,[],"[thing, hope vote]",[hope vote],"[hope vote, thing, hope vote]",[agree],[ thing agree vote],[hope vote],[],[],"(thing, agree, vote)","(thing, hope vote, agree)"
3521,honors obligations living great country.,[],"[honors obligations, great country]",[honors obligations],"[honors obligations, honors obligations, great...",[living],[],[great country],[],[],"(, obligation great, country)","(, great country, obligations)"
3522,Thank good night.,"[(night, TIME)]",[],[],[],[Thank],[],[],[],[],"(, thank good, )","(, , Thank)"
3523,[,[],[],[],[],[],[],[],[],[],"(, [, )","(, , [)"


In [25]:
# NOTE(review): to_csv writes comma-separated text, so 'info.xls' is a CSV file
# with an Excel extension, not a real workbook. It duplicates the 'info.csv'
# export in the next cell; consider dropping it or using DataFrame.to_excel.
df1.to_csv('info.xls')

In [26]:
# Persist the full feature table as CSV in the current working directory.
# The frame's index is written as the first, unnamed column (to_csv default).
df1.to_csv('info.csv')