<a href="https://colab.research.google.com/github/tushar910802/NLP-Functions/blob/main/SpacyFunctions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import spacy
from spacy import displacy
from spacy.symbols import (
    AUX, VERB, NOUN,
    agent, attr, aux, auxpass, csubj, csubjpass, dobj, neg, nsubj, nsubjpass, obj, pobj, xcomp,
)
from spacy.tokens import Doc, Span, Token

from typing import Iterable, List, Optional, Pattern, Tuple

import pandas as pd

from nltk import Tree
from operator import attrgetter

In [None]:
# Load the 'en_core_web_sm' spaCy pipeline once; `nlp` is reused by the cells below.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Groups of dependency labels, built from the symbols imported from spacy.symbols.

# Nominal subjects, active and passive (e.g. "Anita was driven").
# Original note: used for SVO triples -- the consumer is not visible in this section.
_NOMINAL_SUBJ_DEPS = {nsubj, nsubjpass}
# Clausal subjects, active and passive.
# Original note: used for SVO triples -- the consumer is not visible in this section.
_CLAUSAL_SUBJ_DEPS = {csubj, csubjpass}
# Active-voice subjects only.
# Original note: used in extract_quotation to find the speaker (not visible here).
_ACTIVE_SUBJ_DEPS = {csubj, nsubj}
# Auxiliary, passive-auxiliary and negation labels (e.g. "shall not be entitled");
# read by expand_verb.
_VERB_MODIFIER_DEPS = {aux, auxpass, neg}

In [None]:
def display_nlp(doc, include_punct=False):
    """Generate a data frame for visualization of spaCy tokens.

    Args:
        doc: Iterable of tokens (typically a spaCy ``Doc``). Each token must
            expose ``text``, ``lemma_``, ``tag_``, ``pos_``, ``dep_``,
            ``ent_type_``, ``ent_iob_`` and ``is_title``.
        include_punct: Accepted for backward compatibility. It is currently
            NOT applied: every token, punctuation included, gets a row
            regardless of this flag.

    Returns:
        A ``pandas.DataFrame`` with one row per token, indexed by the token's
        position in ``doc`` (the index is unnamed). An empty ``doc`` yields an
        empty frame that still carries all attribute columns.
    """
    columns = ['token', 'text', 'lemma_', 'tag_', 'pos_', 'dep_',
               'ent_type_', 'ent_iob_', 'is_title']
    rows = [
        {'token': i, 'text': t.text, 'lemma_': t.lemma_,
         'tag_': t.tag_,
         'pos_': t.pos_, 'dep_': t.dep_,
         'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_,
         'is_title': t.is_title}
        for i, t in enumerate(doc)
    ]
    # Passing the columns explicitly keeps 'token' present even when there are
    # no rows; without it set_index('token') raises KeyError for an empty doc.
    df = pd.DataFrame(rows, columns=columns).set_index('token')
    df.index.name = None
    # NOTE: this changes a process-wide pandas display option so that long
    # token tables are shown in full. It is kept because callers may rely on it.
    pd.set_option('display.max_rows', None)  # or 1000
    return df

In [None]:
def to_nltk_tree(node):
    """Recursively convert a dependency-parse token into an nltk ``Tree``.

    A token that reports no left or right children is returned as its plain
    text (``orth_``). Any other token becomes a ``Tree`` labelled with its
    text, with its converted children as subtrees.
    """
    child_count = node.n_lefts + node.n_rights
    if child_count <= 0:
        # Leaf: nltk represents leaves as bare values, not Tree objects.
        return node.orth_

    label = node.orth_
    subtrees = []
    for dependent in node.children:
        subtrees.append(to_nltk_tree(dependent))
    return Tree(label, subtrees)

In [None]:
def expand_noun(tok: Token) -> List[Token]:
    """Gather a noun with its conjunct nouns and their compound modifiers.

    Returns a new list holding ``tok``, then the tokens in ``tok.conjuncts``,
    then every child of any of those tokens whose dependency label is
    ``"compound"``. The list is not sorted by token position.
    """
    heads = [tok]
    heads.extend(tok.conjuncts)

    modifiers = []
    for head in heads:
        for dependent in head.children:
            # TODO: why doesn't compound import from spacy.symbols?
            if dependent.dep_ == "compound":
                modifiers.append(dependent)
    return heads + modifiers


def expand_verb(tok: Token) -> List[Token]:
    """Gather a verb with its auxiliary and negation children.

    Returns a new list that starts with ``tok`` and is followed by each child
    of ``tok`` whose ``dep`` is in ``_VERB_MODIFIER_DEPS``, in child order.
    """
    expanded = [tok]
    for dependent in tok.children:
        if dependent.dep in _VERB_MODIFIER_DEPS:
            expanded.append(dependent)
    return expanded


In [None]:
# Sample sentence (passive construction with a modal and a negation) used by the cells below.
text = 'The building landlord shall not be permitted to do a construction in the rented premises.'
# Run the loaded pipeline over the sample to get the parsed document.
doc = nlp(text)

In [None]:
# Print an ASCII dependency tree for each sentence of the parsed sample.
# A plain loop is used because pretty_print() is called only for its printed
# output; collecting its None results in a list just adds a `[None]` cell output.
for sent in doc.sents:
    sentence_tree = to_nltk_tree(sent.root)
    if isinstance(sentence_tree, Tree):
        sentence_tree.pretty_print()
    else:
        # A root with no children comes back as a plain string, which has no
        # pretty_print(); print the token text instead.
        print(sentence_tree)

                  permitted                                                           
   ___________________|________________________________                                
  |    |   |   |               |                       do                             
  |    |   |   |               |                _______|_______                        
  |    |   |   |               |               |          construction                
  |    |   |   |               |               |    ___________|__________             
  |    |   |   |               |               |   |                      in          
  |    |   |   |               |               |   |                      |            
  |    |   |   |            landlord           |   |                   premises       
  |    |   |   |       ________|________       |   |            __________|_______     
shall not  be  .     The             building  to  a          the               rented



[None]

In [None]:
# Draw the dependency parse inline. Without jupyter=True this cell returned the
# raw SVG markup as a string instead of rendering it.
displacy.render(doc, style='dep', jupyter=True)

'<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="en" id="b3b920cc3f214b41b46831e3b905b263-0" class="displacy" width="2675" height="487.0" direction="ltr" style="max-width: none; height: 487.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr">\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="397.0">\n    <tspan class="displacy-word" fill="currentColor" x="50">The</tspan>\n    <tspan class="displacy-tag" dy="2em" fill="currentColor" x="50">DET</tspan>\n</text>\n\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="397.0">\n    <tspan class="displacy-word" fill="currentColor" x="225">building</tspan>\n    <tspan class="displacy-tag" dy="2em" fill="currentColor" x="225">NOUN</tspan>\n</text>\n\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="397.0">\n    <tspan class="displacy-word" fill="currentColor" x="400">landlord</tspan>\n    <tspan class="disp

In [None]:
# For every noun or verb in the sample, print the token, then its expanded
# phrase ordered by token position, then a separator line.
for token in doc:
    if token.pos == NOUN:
        print(token)
        noun = expand_noun(token)
        noun.sort(key=attrgetter('i'))
        print(noun)
        print('---------------------------------------------')
    elif token.pos == VERB:
        print(token)
        verb = expand_verb(token)
        verb.sort(key=attrgetter('i'))
        print(verb)
        print('=============================================')

building
[building]
---------------------------------------------
landlord
[building, landlord]
---------------------------------------------
shall
[shall]
permitted
[shall, not, be, permitted]
construction
[construction]
---------------------------------------------
rented
[rented]
premises
[premises]
---------------------------------------------


In [None]:
# Show the per-token attribute table for the sample sentence.
display_nlp(doc)

Unnamed: 0,text,lemma_,tag_,pos_,dep_,ent_type_,ent_iob_,is_title
0,The,the,DT,DET,det,,O,True
1,building,building,NN,NOUN,compound,,O,False
2,landlord,landlord,NN,NOUN,nsubjpass,,O,False
3,shall,shall,MD,VERB,aux,,O,False
4,not,not,RB,PART,neg,,O,False
5,be,be,VB,AUX,auxpass,,O,False
6,permitted,permit,VBN,VERB,ROOT,,O,False
7,to,to,TO,PART,aux,,O,False
8,do,do,VB,AUX,xcomp,,O,False
9,a,a,DT,DET,det,,O,False
