In [1]:
import spacy
from pandas.core.frame import DataFrame
import pandas as pd
from IPython.core.display import display, HTML
import json
nlp = spacy.load('en_core_web_sm')

In [2]:
def getText(path='../data/texts.json'):
    """Load the debate transcript and flatten it onto a single line.

    Parameters
    ----------
    path : str, optional
        Location of the JSON file to read. Defaults to the notebook's
        original hard-coded ``'../data/texts.json'``.

    Returns
    -------
    str
        The value stored under the ``'debates'`` key, with every newline
        replaced by a single space.

    Raises
    ------
    OSError
        If the file cannot be opened.
    KeyError
        If the JSON document has no ``'debates'`` key.
    """
    # `with` closes the handle even if json.load raises; JSON is UTF-8 by
    # specification, so do not depend on the platform default encoding.
    with open(path, encoding='utf-8') as f:
        data = json.load(f)
    content_debates = data['debates']
    return content_debates.replace("\n", " ")
def sw_removal(text):
    """Return ``text`` with spaCy stop words removed.

    The text is split on whitespace; a word is dropped when its lower-cased
    form is in ``nlp.Defaults.stop_words``. Surviving words are re-joined
    with single spaces.
    """
    stop_words = nlp.Defaults.stop_words
    kept = []
    for word in text.split():
        if word.lower() in stop_words:
            continue
        kept.append(word)
    return " ".join(kept)



In [5]:
# Run the spaCy pipeline once over the whole transcript returned by getText();
# the resulting Doc is what the next cell splits into sentences.
document = nlp(getText())



In [6]:
# One string per sentence found by spaCy's sentence segmentation of `document`.
t = [sent.text for sent in document.sents]
# Name the column in the constructor: building an unnamed frame and then
# assigning `df.columns` raises a length-mismatch ValueError when `t` is empty.
df = DataFrame(t, columns=['text'])
display(df)

Unnamed: 0,text
0,Good evening from Hofstra University in Hemps...
1,"I am Lester Holt, anchor of ""NBC Nightly News.”"
2,I want to welcome you to the first presidentia...
3,The participants tonight are Donald Trump and ...
4,This debate is sponsored by the Commission on ...
...,...
4173,One thing everyone here can agree on is we hop...
4174,It is one of the honors and obligations of liv...
4175,Thank you and good night.
4176,[ Applause ]


In [7]:
# named entities
def extract_named_ents(text):
    """Return ``(entity text, entity label)`` tuples for the entities in ``text``.

    ``text`` is parsed with the module-level ``nlp`` pipeline; the tuples
    follow the order of ``doc.ents``.
    """
    doc = nlp(text)
    pairs = []
    for ent in doc.ents:
        pairs.append((ent.text, ent.label_))
    return pairs

def add_named_ents(df):
    """Add a ``named_ents`` column to ``df`` in place.

    Each cell holds ``extract_named_ents`` applied to that row's ``text``
    value. Nothing is returned.
    """
    entities = df['text'].apply(extract_named_ents)
    df['named_ents'] = entities
add_named_ents(df)
display(df)

Unnamed: 0,text,named_ents
0,Good evening from Hofstra University in Hemps...,"[(evening, TIME), (Hofstra University, ORG), (..."
1,"I am Lester Holt, anchor of ""NBC Nightly News.”","[(Lester Holt, PERSON), (NBC, ORG), (Nightly N..."
2,I want to welcome you to the first presidentia...,"[(first, ORDINAL)]"
3,The participants tonight are Donald Trump and ...,"[(tonight, TIME), (Donald Trump, PERSON), (Hil..."
4,This debate is sponsored by the Commission on ...,"[(the Commission on Presidential Debates, ORG)]"
...,...,...
4173,One thing everyone here can agree on is we hop...,"[(One, CARDINAL)]"
4174,It is one of the honors and obligations of liv...,[]
4175,Thank you and good night.,[]
4176,[ Applause ],[]


In [10]:
# named entities without their labels
def extract_namedents_nolabels(text):
    """Return the text of every entity found in ``text``, in ``doc.ents`` order."""
    doc = nlp(text)
    names = []
    for ent in doc.ents:
        names.append(ent.text)
    return names

def add_named_ents_nolabel(df):
    """Add a ``named_ents_nolabel`` column to ``df`` in place.

    Each cell holds ``extract_namedents_nolabels`` applied to that row's
    ``text`` value. Nothing is returned.
    """
    entity_texts = df['text'].apply(extract_namedents_nolabels)
    df['named_ents_nolabel'] = entity_texts
add_named_ents_nolabel(df)
display(df)

Unnamed: 0,text,named_ents,named_ents_nolabel
0,Good evening from Hofstra University in Hemps...,"[(evening, TIME), (Hofstra University, ORG), (...","[evening, Hofstra University, Hempstead, New Y..."
1,"I am Lester Holt, anchor of ""NBC Nightly News.”","[(Lester Holt, PERSON), (NBC, ORG), (Nightly N...","[Lester Holt, NBC, Nightly News]"
2,I want to welcome you to the first presidentia...,"[(first, ORDINAL)]",[first]
3,The participants tonight are Donald Trump and ...,"[(tonight, TIME), (Donald Trump, PERSON), (Hil...","[tonight, Donald Trump, Hillary Clinton]"
4,This debate is sponsored by the Commission on ...,"[(the Commission on Presidential Debates, ORG)]",[the Commission on Presidential Debates]
...,...,...,...
4173,One thing everyone here can agree on is we hop...,"[(One, CARDINAL)]",[One]
4174,It is one of the honors and obligations of liv...,[],[]
4175,Thank you and good night.,[],[]
4176,[ Applause ],[],[]


In [11]:
# Noun-chunk extraction; a chunk may also be an entity
def extract_chunk(text):
    """Return the text of every noun chunk spaCy finds in ``text``."""
    doc = nlp(text)
    chunk_texts = []
    for chunk in doc.noun_chunks:
        chunk_texts.append(chunk.text)
    return chunk_texts

def add_chunks(df):
    """Add a ``chunk`` column to ``df`` in place.

    Each cell holds ``extract_chunk`` applied to that row's ``text`` value.
    Nothing is returned.

    This helper is deliberately not called ``add_named_ents``: reusing that
    name here silently replaced the entity-column helper defined earlier in
    the notebook, so any later call to ``add_named_ents`` would have written
    noun chunks instead of entities.
    """
    df['chunk'] = df['text'].apply(extract_chunk)
add_chunks(df)
display(df)

Unnamed: 0,text,named_ents,named_ents_nolabel,chunk
0,Good evening from Hofstra University in Hemps...,"[(evening, TIME), (Hofstra University, ORG), (...","[evening, Hofstra University, Hempstead, New Y...","[Hofstra University, Hempstead]"
1,"I am Lester Holt, anchor of ""NBC Nightly News.”","[(Lester Holt, PERSON), (NBC, ORG), (Nightly N...","[Lester Holt, NBC, Nightly News]","[I, Lester Holt, anchor, ""NBC Nightly News]"
2,I want to welcome you to the first presidentia...,"[(first, ORDINAL)]",[first],"[I, you, the first presidential debate]"
3,The participants tonight are Donald Trump and ...,"[(tonight, TIME), (Donald Trump, PERSON), (Hil...","[tonight, Donald Trump, Hillary Clinton]","[The participants, Donald Trump, Hillary Clinton]"
4,This debate is sponsored by the Commission on ...,"[(the Commission on Presidential Debates, ORG)]",[the Commission on Presidential Debates],"[This debate, the Commission, Presidential Deb..."
...,...,...,...,...
4173,One thing everyone here can agree on is we hop...,"[(One, CARDINAL)]",[One],"[One thing, everyone, we, you]"
4174,It is one of the honors and obligations of liv...,[],[],"[It, the honors, obligations, this great country]"
4175,Thank you and good night.,[],[],"[you, good night]"
4176,[ Applause ],[],[],[]


In [12]:
# compound
def extract_compounds(text):
    """Return compound phrases found in ``text`` as space-joined strings.

    Tokens whose dependency label is ``'compound'`` are collected; the token
    sitting directly after the most recent compound token closes the phrase.
    A phrase is emitted only when it contains more than one piece.
    """
    phrases = []
    pieces = []
    last_compound_i = 0  # index of the most recent 'compound' token
    for tok in nlp(text):
        if tok.dep_ == 'compound':
            # Hyphenated compounds: when the children's joined text contains
            # a '-', glue that text in front of the token itself.
            child_text = ''.join(child.text for child in tok.children)
            if '-' in child_text:
                pieces.append(child_text + tok.text)
            else:
                pieces.append(tok.text)
            last_compound_i = tok.i

        # The token right after the last compound token ends the phrase.
        if tok.i - last_compound_i == 1:
            pieces.append(tok.text)
            if len(pieces) > 1:
                phrases.append(' '.join(pieces))
            # start collecting afresh
            pieces = []

    return phrases

def add_compounds(df):
    """Add a ``compounds`` column to ``df`` in place.

    Each cell holds ``extract_compounds`` applied to that row's ``text``
    value. Nothing is returned.
    """
    compound_lists = df['text'].apply(extract_compounds)
    df['compounds'] = compound_lists
add_compounds(df)
display(df)

Unnamed: 0,text,named_ents,named_ents_nolabel,chunk,compounds
0,Good evening from Hofstra University in Hemps...,"[(evening, TIME), (Hofstra University, ORG), (...","[evening, Hofstra University, Hempstead, New Y...","[Hofstra University, Hempstead]","[Hofstra University, New York]"
1,"I am Lester Holt, anchor of ""NBC Nightly News.”","[(Lester Holt, PERSON), (NBC, ORG), (Nightly N...","[Lester Holt, NBC, Nightly News]","[I, Lester Holt, anchor, ""NBC Nightly News]","[Lester Holt, NBC Nightly News]"
2,I want to welcome you to the first presidentia...,"[(first, ORDINAL)]",[first],"[I, you, the first presidential debate]",[]
3,The participants tonight are Donald Trump and ...,"[(tonight, TIME), (Donald Trump, PERSON), (Hil...","[tonight, Donald Trump, Hillary Clinton]","[The participants, Donald Trump, Hillary Clinton]","[Donald Trump, Hillary Clinton]"
4,This debate is sponsored by the Commission on ...,"[(the Commission on Presidential Debates, ORG)]",[the Commission on Presidential Debates],"[This debate, the Commission, Presidential Deb...",[Presidential Debates]
...,...,...,...,...,...
4173,One thing everyone here can agree on is we hop...,"[(One, CARDINAL)]",[One],"[One thing, everyone, we, you]",[]
4174,It is one of the honors and obligations of liv...,[],[],"[It, the honors, obligations, this great country]",[]
4175,Thank you and good night.,[],[],"[you, good night]",[]
4176,[ Applause ],[],[],[],[]


In [13]:
def extract_verb(text):
    """Return the tokens of ``text`` whose coarse part-of-speech is ``VERB``.

    The list elements are the spaCy token objects themselves, not strings.
    """
    verbs = []
    for token in nlp(text):
        if token.pos_ == 'VERB':
            verbs.append(token)
    return verbs
def add_verbs(df):
    """Add a ``verbs`` column to ``df`` in place.

    Each cell holds ``extract_verb`` applied to that row's ``text`` value.
    Nothing is returned.
    """
    verb_lists = df['text'].apply(extract_verb)
    df['verbs'] = verb_lists
add_verbs(df)
display(df)

Unnamed: 0,text,named_ents,named_ents_nolabel,chunk,compounds,verbs
0,Good evening from Hofstra University in Hemps...,"[(evening, TIME), (Hofstra University, ORG), (...","[evening, Hofstra University, Hempstead, New Y...","[Hofstra University, Hempstead]","[Hofstra University, New York]",[]
1,"I am Lester Holt, anchor of ""NBC Nightly News.”","[(Lester Holt, PERSON), (NBC, ORG), (Nightly N...","[Lester Holt, NBC, Nightly News]","[I, Lester Holt, anchor, ""NBC Nightly News]","[Lester Holt, NBC Nightly News]",[]
2,I want to welcome you to the first presidentia...,"[(first, ORDINAL)]",[first],"[I, you, the first presidential debate]",[],"[want, welcome]"
3,The participants tonight are Donald Trump and ...,"[(tonight, TIME), (Donald Trump, PERSON), (Hil...","[tonight, Donald Trump, Hillary Clinton]","[The participants, Donald Trump, Hillary Clinton]","[Donald Trump, Hillary Clinton]",[]
4,This debate is sponsored by the Commission on ...,"[(the Commission on Presidential Debates, ORG)]",[the Commission on Presidential Debates],"[This debate, the Commission, Presidential Deb...",[Presidential Debates],[sponsored]
...,...,...,...,...,...,...
4173,One thing everyone here can agree on is we hop...,"[(One, CARDINAL)]",[One],"[One thing, everyone, we, you]",[],"[agree, hope, go, vote]"
4174,It is one of the honors and obligations of liv...,[],[],"[It, the honors, obligations, this great country]",[],[living]
4175,Thank you and good night.,[],[],"[you, good night]",[],[Thank]
4176,[ Applause ],[],[],[],[],[]


In [15]:
# sub_tree matcher
def _with_left_modifier(doc, i):
    """Return ``doc[i].text``, prefixed by the previous token's text when
    that token's dependency is ``amod`` or ``compound``."""
    # The i > 0 guard matters: doc[-1] would wrap around to the LAST token,
    # so a sentence-initial word could pick up a trailing modifier.
    if i > 0 and doc[i - 1].dep_ in ('amod', 'compound'):
        return doc[i - 1].text + ' ' + doc[i].text
    return doc[i].text


def subtree_matcher(doc):
    """Extract a rough ``(x, y, z)`` triple from a parsed sentence.

    Parameters
    ----------
    doc : indexable sequence of tokens
        Each token must expose ``dep_`` and ``text`` (e.g. a spaCy ``Doc``).

    Returns
    -------
    tuple of (str, str, str)
        ``z`` is the text of the ``ROOT`` token. If no token has a dependency
        label containing ``"subjpass"``, ``x`` is the subject and ``y`` the
        object. Otherwise the sentence is treated as passive and the roles
        swap: ``y`` is the subject and ``x`` the object.
        Subject and object are prefixed with the immediately preceding token
        when that token is an ``amod``/``compound``. When several tokens
        match a role the last one wins; unmatched roles are ``''``.
    """
    # Substring test instead of `dep_.find(...) == True`, which only worked
    # because the match index happened to equal 1 for nsubjpass/csubjpass.
    is_passive = any("subjpass" in tok.dep_ for tok in doc)

    x = ''
    y = ''
    z = ''

    for i, tok in enumerate(doc):
        dep = tok.dep_

        if dep.startswith('ROOT'):
            z = tok.text

        if is_passive:
            # passive: the grammatical subject fills y, the object fills x
            if "subj" in dep:
                y = _with_left_modifier(doc, i)
            if dep.endswith("obj"):
                x = _with_left_modifier(doc, i)
        else:
            # active: the subject fills x, the object fills y
            if dep.endswith("subj"):
                x = _with_left_modifier(doc, i)
            if dep.endswith("obj"):
                y = _with_left_modifier(doc, i)

    return x, y, z
def extract_re_subtree(text):
    """Parse each sentence string in ``text`` and return one
    ``subtree_matcher`` triple per sentence, in the same order."""
    triples = []
    for sentence in text:
        parsed = nlp(sentence)
        triples.append(subtree_matcher(parsed))
    return triples
df['re_subtree'] = extract_re_subtree(t)
display(df)

Unnamed: 0,text,named_ents,named_ents_nolabel,chunk,compounds,verbs,re_subtree
0,Good evening from Hofstra University in Hemps...,"[(evening, TIME), (Hofstra University, ORG), (...","[evening, Hofstra University, Hempstead, New Y...","[Hofstra University, Hempstead]","[Hofstra University, New York]",[],"(, Hempstead, )"
1,"I am Lester Holt, anchor of ""NBC Nightly News.”","[(Lester Holt, PERSON), (NBC, ORG), (Nightly N...","[Lester Holt, NBC, Nightly News]","[I, Lester Holt, anchor, ""NBC Nightly News]","[Lester Holt, NBC Nightly News]",[],"(I, Nightly News, am)"
2,I want to welcome you to the first presidentia...,"[(first, ORDINAL)]",[first],"[I, you, the first presidential debate]",[],"[want, welcome]","(I, presidential debate, want)"
3,The participants tonight are Donald Trump and ...,"[(tonight, TIME), (Donald Trump, PERSON), (Hil...","[tonight, Donald Trump, Hillary Clinton]","[The participants, Donald Trump, Hillary Clinton]","[Donald Trump, Hillary Clinton]",[],"(participants, , are)"
4,This debate is sponsored by the Commission on ...,"[(the Commission on Presidential Debates, ORG)]",[the Commission on Presidential Debates],"[This debate, the Commission, Presidential Deb...",[Presidential Debates],[sponsored],"(Presidential Debates, debate, sponsored)"
...,...,...,...,...,...,...,...
4173,One thing everyone here can agree on is we hop...,"[(One, CARDINAL)]",[One],"[One thing, everyone, we, you]",[],"[agree, hope, go, vote]","(you, , is)"
4174,It is one of the honors and obligations of liv...,[],[],"[It, the honors, obligations, this great country]",[],[living],"(It, great country, is)"
4175,Thank you and good night.,[],[],"[you, good night]",[],[Thank],"(, you, Thank)"
4176,[ Applause ],[],[],[],[],[],"(, , Applause)"


In [16]:
# chunker matcher
def appendChunk(original, chunk):
    """Return ``original`` and ``chunk`` joined by a single space."""
    return ' '.join((original, chunk))

def printToken(token):
    """Print ``token``'s text and dependency label, separated by ``->``."""
    fields = (token.text, "->", token.dep_)
    print(*fields)

def isRelationCandidate(token):
    """Return True when ``token.dep_`` contains any of the substrings
    ``ROOT``, ``adj``, ``attr``, ``agent`` or ``amod``."""
    for fragment in ("ROOT", "adj", "attr", "agent", "amod"):
        if fragment in token.dep_:
            return True
    return False


def isConstructionCandidate(token):
    """Return True when ``token.dep_`` contains any of the substrings
    ``compound``, ``prep``, ``conj`` or ``mod``.

    This is a substring match, so labels such as ``amod`` also qualify
    through ``mod``.
    """
    for fragment in ("compound", "prep", "conj", "mod"):
        if fragment in token.dep_:
            return True
    return False

def processSubjectObjectPairs(tokens):
    """Build a ``(subject, relation, object)`` triple of strings from tokens.

    Tokens whose dependency label contains ``"punct"`` are skipped. For the
    remaining tokens, in order:

    * relation collects ``token.lemma_`` when ``isRelationCandidate(token)``;
    * subject collects ``token.text`` when the label contains ``"subj"``;
    * object collects ``token.text`` when the label contains ``"obj"``.

    Each part is space-joined and stripped; a part with no tokens is ``''``.

    NOTE(review): the earlier version also kept "construction" accumulators
    meant to prepend compound/prep/conj/mod tokens to the subject and object.
    They were only ever extended when already non-empty, so they stayed empty
    and contributed nothing. That inert code is removed here and the returned
    values are unchanged. If modifier prefixes are wanted they need a real
    design, not a revival of the old guard.
    """
    subject_parts = []
    relation_parts = []
    object_parts = []  # not named `object`, which would shadow the builtin

    for token in tokens:
        if "punct" in token.dep_:
            continue
        if isRelationCandidate(token):
            relation_parts.append(token.lemma_)
        if "subj" in token.dep_:
            subject_parts.append(token.text)
        if "obj" in token.dep_:
            object_parts.append(token.text)

    return (
        " ".join(subject_parts).strip(),
        " ".join(relation_parts).strip(),
        " ".join(object_parts).strip(),
    )
def extract_re_pairs(text):
    """Parse each sentence string in ``text`` and return one
    ``processSubjectObjectPairs`` triple per sentence, in the same order."""
    triples = []
    for sentence in text:
        parsed = nlp(sentence)
        triples.append(processSubjectObjectPairs(parsed))
    return triples
df['re_chunk_v_chunk'] = extract_re_pairs(t)
display(df)

Unnamed: 0,text,named_ents,named_ents_nolabel,chunk,compounds,verbs,re_subtree,re_chunk_v_chunk
0,Good evening from Hofstra University in Hemps...,"[(evening, TIME), (Hofstra University, ORG), (...","[evening, Hofstra University, Hempstead, New Y...","[Hofstra University, Hempstead]","[Hofstra University, New York]",[],"(, Hempstead, )","(, good, University Hempstead)"
1,"I am Lester Holt, anchor of ""NBC Nightly News.”","[(Lester Holt, PERSON), (NBC, ORG), (Nightly N...","[Lester Holt, NBC, Nightly News]","[I, Lester Holt, anchor, ""NBC Nightly News]","[Lester Holt, NBC Nightly News]",[],"(I, Nightly News, am)","(I, be Holt, News)"
2,I want to welcome you to the first presidentia...,"[(first, ORDINAL)]",[first],"[I, you, the first presidential debate]",[],"[want, welcome]","(I, presidential debate, want)","(I, want first presidential, you debate)"
3,The participants tonight are Donald Trump and ...,"[(tonight, TIME), (Donald Trump, PERSON), (Hil...","[tonight, Donald Trump, Hillary Clinton]","[The participants, Donald Trump, Hillary Clinton]","[Donald Trump, Hillary Clinton]",[],"(participants, , are)","(participants, be Trump, )"
4,This debate is sponsored by the Commission on ...,"[(the Commission on Presidential Debates, ORG)]",[the Commission on Presidential Debates],"[This debate, the Commission, Presidential Deb...",[Presidential Debates],[sponsored],"(Presidential Debates, debate, sponsored)","(debate, sponsor by nonpartisan nonprofit, Com..."
...,...,...,...,...,...,...,...,...
4173,One thing everyone here can agree on is we hop...,"[(One, CARDINAL)]",[One],"[One thing, everyone, we, you]",[],"[agree, hope, go, vote]","(you, , is)","(thing everyone we you, be, )"
4174,It is one of the honors and obligations of liv...,[],[],"[It, the honors, obligations, this great country]",[],[living],"(It, great country, is)","(It, be one great, honors country)"
4175,Thank you and good night.,[],[],"[you, good night]",[],[Thank],"(, you, Thank)","(, thank good, you)"
4176,[ Applause ],[],[],[],[],[],"(, , Applause)","(, applause, )"


In [18]:
def extract_adj(text):
    """Return the tokens of ``text`` whose dependency label is ``amod``.

    The list elements are the spaCy token objects themselves, not strings.
    """
    modifiers = []
    for token in nlp(text):
        if token.dep_ == 'amod':
            modifiers.append(token)
    return modifiers
def add_adj(df):
    """Add an ``adj`` column to ``df`` in place.

    Each cell holds ``extract_adj`` applied to that row's ``text`` value.
    Nothing is returned.
    """
    adjective_lists = df['text'].apply(extract_adj)
    df['adj'] = adjective_lists
add_adj(df)
display(df)

Unnamed: 0,text,named_ents,named_ents_nolabel,chunk,compounds,verbs,re_subtree,re_chunk_v_chunk,adj
0,Good evening from Hofstra University in Hemps...,"[(evening, TIME), (Hofstra University, ORG), (...","[evening, Hofstra University, Hempstead, New Y...","[Hofstra University, Hempstead]","[Hofstra University, New York]",[],"(, Hempstead, )","(, good, University Hempstead)",[Good]
1,"I am Lester Holt, anchor of ""NBC Nightly News.”","[(Lester Holt, PERSON), (NBC, ORG), (Nightly N...","[Lester Holt, NBC, Nightly News]","[I, Lester Holt, anchor, ""NBC Nightly News]","[Lester Holt, NBC Nightly News]",[],"(I, Nightly News, am)","(I, be Holt, News)",[]
2,I want to welcome you to the first presidentia...,"[(first, ORDINAL)]",[first],"[I, you, the first presidential debate]",[],"[want, welcome]","(I, presidential debate, want)","(I, want first presidential, you debate)","[first, presidential]"
3,The participants tonight are Donald Trump and ...,"[(tonight, TIME), (Donald Trump, PERSON), (Hil...","[tonight, Donald Trump, Hillary Clinton]","[The participants, Donald Trump, Hillary Clinton]","[Donald Trump, Hillary Clinton]",[],"(participants, , are)","(participants, be Trump, )",[]
4,This debate is sponsored by the Commission on ...,"[(the Commission on Presidential Debates, ORG)]",[the Commission on Presidential Debates],"[This debate, the Commission, Presidential Deb...",[Presidential Debates],[sponsored],"(Presidential Debates, debate, sponsored)","(debate, sponsor by nonpartisan nonprofit, Com...","[nonpartisan, nonprofit]"
...,...,...,...,...,...,...,...,...,...
4173,One thing everyone here can agree on is we hop...,"[(One, CARDINAL)]",[One],"[One thing, everyone, we, you]",[],"[agree, hope, go, vote]","(you, , is)","(thing everyone we you, be, )",[]
4174,It is one of the honors and obligations of liv...,[],[],"[It, the honors, obligations, this great country]",[],[living],"(It, great country, is)","(It, be one great, honors country)",[great]
4175,Thank you and good night.,[],[],"[you, good night]",[],[Thank],"(, you, Thank)","(, thank good, you)",[good]
4176,[ Applause ],[],[],[],[],[],"(, , Applause)","(, applause, )",[]


In [19]:
def extract_adj_noun(text):
    """Return modifier+noun phrases for the subject/object nouns in ``text``.

    A token qualifies when its part-of-speech is ``NOUN`` and its dependency
    label is one of ``dobj``, ``pobj``, ``nsubj`` or ``nsubjpass``. Its
    children that are adjectives (``pos_ == 'ADJ'``) or have a ``compound``
    dependency are placed, space-separated, in front of the noun. Nouns with
    no such children produce nothing.
    """
    noun_roles = ('dobj', 'pobj', 'nsubj', 'nsubjpass')
    phrases = []

    for token in nlp(text):
        # only subject / object nouns are of interest
        if token.pos_ != 'NOUN' or token.dep_ not in noun_roles:
            continue

        # each qualifying child contributes "<text> "
        prefix = ''.join(
            child.text + ' '
            for child in token.children
            if child.pos_ == 'ADJ' or child.dep_ == 'compound'
        )
        if prefix:
            phrases.append(prefix + token.text)

    return phrases
# Store, per sentence, the list returned by extract_adj_noun in a new
# 'adj_noun' column (mutates the shared `df`), then show the frame.
df['adj_noun'] = df['text'].apply(extract_adj_noun)
display(df)

Unnamed: 0,text,named_ents,named_ents_nolabel,chunk,compounds,verbs,re_subtree,re_chunk_v_chunk,adj,adj_noun
0,Good evening from Hofstra University in Hemps...,"[(evening, TIME), (Hofstra University, ORG), (...","[evening, Hofstra University, Hempstead, New Y...","[Hofstra University, Hempstead]","[Hofstra University, New York]",[],"(, Hempstead, )","(, good, University Hempstead)",[Good],[]
1,"I am Lester Holt, anchor of ""NBC Nightly News.”","[(Lester Holt, PERSON), (NBC, ORG), (Nightly N...","[Lester Holt, NBC, Nightly News]","[I, Lester Holt, anchor, ""NBC Nightly News]","[Lester Holt, NBC Nightly News]",[],"(I, Nightly News, am)","(I, be Holt, News)",[],[]
2,I want to welcome you to the first presidentia...,"[(first, ORDINAL)]",[first],"[I, you, the first presidential debate]",[],"[want, welcome]","(I, presidential debate, want)","(I, want first presidential, you debate)","[first, presidential]",[first presidential debate]
3,The participants tonight are Donald Trump and ...,"[(tonight, TIME), (Donald Trump, PERSON), (Hil...","[tonight, Donald Trump, Hillary Clinton]","[The participants, Donald Trump, Hillary Clinton]","[Donald Trump, Hillary Clinton]",[],"(participants, , are)","(participants, be Trump, )",[],[]
4,This debate is sponsored by the Commission on ...,"[(the Commission on Presidential Debates, ORG)]",[the Commission on Presidential Debates],"[This debate, the Commission, Presidential Deb...",[Presidential Debates],[sponsored],"(Presidential Debates, debate, sponsored)","(debate, sponsor by nonpartisan nonprofit, Com...","[nonpartisan, nonprofit]",[]
...,...,...,...,...,...,...,...,...,...,...
4173,One thing everyone here can agree on is we hop...,"[(One, CARDINAL)]",[One],"[One thing, everyone, we, you]",[],"[agree, hope, go, vote]","(you, , is)","(thing everyone we you, be, )",[],[]
4174,It is one of the honors and obligations of liv...,[],[],"[It, the honors, obligations, this great country]",[],[living],"(It, great country, is)","(It, be one great, honors country)",[great],[great country]
4175,Thank you and good night.,[],[],"[you, good night]",[],[Thank],"(, you, Thank)","(, thank good, you)",[good],[]
4176,[ Applause ],[],[],[],[],[],"(, , Applause)","(, applause, )",[],[]


In [20]:
def extract_prep(text):
    """Return the tokens of ``text`` whose dependency label is ``prep``.

    The list elements are the spaCy token objects themselves, not strings.
    """
    prepositions = []
    for token in nlp(text):
        if token.dep_ == 'prep':
            prepositions.append(token)
    return prepositions
def add_prep(df):
    """Add a ``preps`` column to ``df`` in place.

    Each cell holds ``extract_prep`` applied to that row's ``text`` value.
    Nothing is returned.
    """
    preposition_lists = df['text'].apply(extract_prep)
    df['preps'] = preposition_lists
add_prep(df)
display(df)

Unnamed: 0,text,named_ents,named_ents_nolabel,chunk,compounds,verbs,re_subtree,re_chunk_v_chunk,adj,adj_noun,preps
0,Good evening from Hofstra University in Hemps...,"[(evening, TIME), (Hofstra University, ORG), (...","[evening, Hofstra University, Hempstead, New Y...","[Hofstra University, Hempstead]","[Hofstra University, New York]",[],"(, Hempstead, )","(, good, University Hempstead)",[Good],[],"[from, in]"
1,"I am Lester Holt, anchor of ""NBC Nightly News.”","[(Lester Holt, PERSON), (NBC, ORG), (Nightly N...","[Lester Holt, NBC, Nightly News]","[I, Lester Holt, anchor, ""NBC Nightly News]","[Lester Holt, NBC Nightly News]",[],"(I, Nightly News, am)","(I, be Holt, News)",[],[],[of]
2,I want to welcome you to the first presidentia...,"[(first, ORDINAL)]",[first],"[I, you, the first presidential debate]",[],"[want, welcome]","(I, presidential debate, want)","(I, want first presidential, you debate)","[first, presidential]",[first presidential debate],[to]
3,The participants tonight are Donald Trump and ...,"[(tonight, TIME), (Donald Trump, PERSON), (Hil...","[tonight, Donald Trump, Hillary Clinton]","[The participants, Donald Trump, Hillary Clinton]","[Donald Trump, Hillary Clinton]",[],"(participants, , are)","(participants, be Trump, )",[],[],[]
4,This debate is sponsored by the Commission on ...,"[(the Commission on Presidential Debates, ORG)]",[the Commission on Presidential Debates],"[This debate, the Commission, Presidential Deb...",[Presidential Debates],[sponsored],"(Presidential Debates, debate, sponsored)","(debate, sponsor by nonpartisan nonprofit, Com...","[nonpartisan, nonprofit]",[],[on]
...,...,...,...,...,...,...,...,...,...,...,...
4173,One thing everyone here can agree on is we hop...,"[(One, CARDINAL)]",[One],"[One thing, everyone, we, you]",[],"[agree, hope, go, vote]","(you, , is)","(thing everyone we you, be, )",[],[],[on]
4174,It is one of the honors and obligations of liv...,[],[],"[It, the honors, obligations, this great country]",[],[living],"(It, great country, is)","(It, be one great, honors country)",[great],[great country],"[of, of, in]"
4175,Thank you and good night.,[],[],"[you, good night]",[],[Thank],"(, you, Thank)","(, thank good, you)",[good],[],[]
4176,[ Applause ],[],[],[],[],[],"(, , Applause)","(, applause, )",[],[],[]


In [21]:
def rule0(text, index):
    """Return the token at position ``index`` of ``text`` with its modifiers.

    ``text`` is parsed with ``nlp``; the children of the selected token whose
    dependency label is ``compound`` or ``amod`` are placed, space-separated,
    in front of the token's own text.
    """
    token = nlp(text)[index]
    words = [
        child.text
        for child in token.children
        if child.dep_ in ('compound', 'amod')
    ]
    words.append(token.text)
    return ' '.join(words)

def _with_modifiers(token):
    """Return ``token.text`` preceded by the text of its ``compound``/``amod``
    children, space-separated."""
    words = [
        child.text
        for child in token.children
        if child.dep_ in ('compound', 'amod')
    ]
    words.append(token.text)
    return ' '.join(words)


def rule3_mod(text):
    """Return ``noun preposition noun`` phrases found in ``text``.

    For every token tagged ``ADP`` whose head is a ``NOUN``, the phrase is
    the head noun (with its compound/amod children), then the preposition,
    then every ``NOUN``/``PROPN`` token to the preposition's right (each with
    its own compound/amod children). A preposition with no such right-hand
    noun still yields ``"<head> <prep>"``.

    Parameters
    ----------
    text : str
        A sentence to parse with the module-level ``nlp`` pipeline.

    Returns
    -------
    list of str
        One phrase per qualifying preposition, in token order.
    """
    doc = nlp(text)
    sent = []

    for token in doc:
        if token.pos_ != 'ADP' or token.head.pos_ != 'NOUN':
            continue

        # Expand modifiers on the tokens we already parsed. The previous
        # version called rule0(text, i), which re-ran the whole pipeline on
        # the same sentence for every preposition and every right-hand noun.
        phrase = _with_modifiers(token.head) + ' ' + token.text

        for right_tok in token.rights:
            if right_tok.pos_ in ('NOUN', 'PROPN'):
                phrase += ' ' + _with_modifiers(right_tok)

        if len(phrase) > 2:
            sent.append(phrase)

    return sent
# Store, per sentence, the list returned by rule3_mod in a new
# 'noun_pre_noun' column (mutates the shared `df`), then show the frame.
df['noun_pre_noun'] = df['text'].apply(rule3_mod)
display(df)

Unnamed: 0,text,named_ents,named_ents_nolabel,chunk,compounds,verbs,re_subtree,re_chunk_v_chunk,adj,adj_noun,preps,noun_pre_noun
0,Good evening from Hofstra University in Hemps...,"[(evening, TIME), (Hofstra University, ORG), (...","[evening, Hofstra University, Hempstead, New Y...","[Hofstra University, Hempstead]","[Hofstra University, New York]",[],"(, Hempstead, )","(, good, University Hempstead)",[Good],[],"[from, in]",[Good evening from Hofstra University]
1,"I am Lester Holt, anchor of ""NBC Nightly News.”","[(Lester Holt, PERSON), (NBC, ORG), (Nightly N...","[Lester Holt, NBC, Nightly News]","[I, Lester Holt, anchor, ""NBC Nightly News]","[Lester Holt, NBC Nightly News]",[],"(I, Nightly News, am)","(I, be Holt, News)",[],[],[of],[anchor of NBC Nightly News]
2,I want to welcome you to the first presidentia...,"[(first, ORDINAL)]",[first],"[I, you, the first presidential debate]",[],"[want, welcome]","(I, presidential debate, want)","(I, want first presidential, you debate)","[first, presidential]",[first presidential debate],[to],[]
3,The participants tonight are Donald Trump and ...,"[(tonight, TIME), (Donald Trump, PERSON), (Hil...","[tonight, Donald Trump, Hillary Clinton]","[The participants, Donald Trump, Hillary Clinton]","[Donald Trump, Hillary Clinton]",[],"(participants, , are)","(participants, be Trump, )",[],[],[],[]
4,This debate is sponsored by the Commission on ...,"[(the Commission on Presidential Debates, ORG)]",[the Commission on Presidential Debates],"[This debate, the Commission, Presidential Deb...",[Presidential Debates],[sponsored],"(Presidential Debates, debate, sponsored)","(debate, sponsor by nonpartisan nonprofit, Com...","[nonpartisan, nonprofit]",[],[on],[]
...,...,...,...,...,...,...,...,...,...,...,...,...
4173,One thing everyone here can agree on is we hop...,"[(One, CARDINAL)]",[One],"[One thing, everyone, we, you]",[],"[agree, hope, go, vote]","(you, , is)","(thing everyone we you, be, )",[],[],[on],[]
4174,It is one of the honors and obligations of liv...,[],[],"[It, the honors, obligations, this great country]",[],[living],"(It, great country, is)","(It, be one great, honors country)",[great],[great country],"[of, of, in]",[honors of]
4175,Thank you and good night.,[],[],"[you, good night]",[],[Thank],"(, you, Thank)","(, thank good, you)",[good],[],[],[]
4176,[ Applause ],[],[],[],[],[],"(, , Applause)","(, applause, )",[],[],[],[]
