In [1]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy


from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline



In [3]:
!python -m spacy download en_core_web_lg

2023-01-07 16:05:00.041374: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [4]:
nlp = spacy.load('en_core_web_lg')

In [5]:
doc = nlp("the drawdown process is governed by astm standard d823")

for tok in doc:
  print(tok.text, "...", tok.dep_)

the ... det
drawdown ... compound
process ... nsubjpass
is ... auxpass
governed ... ROOT
by ... agent
astm ... compound
standard ... compound
d823 ... pobj


In [6]:
def get_entities(sent):
  ## chunk 1
  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""    # dependency tag of previous token in the sentence
  prv_tok_text = ""   # previous token in the sentence

  prefix = ""
  modifier = ""

  #############################################################
  
  for tok in nlp(sent):
    ## chunk 2
    # if token is a punctuation mark then move on to the next token
    if tok.dep_ != "punct":
      # check: token is a compound word or not
      if tok.dep_ == "compound":
        prefix = tok.text
        # if the previous word was also a 'compound' then add the current word to it
        if prv_tok_dep == "compound":
          prefix = prv_tok_text + " "+ tok.text
      
      # check: token is a modifier or not
      if tok.dep_.endswith("mod") == True:
        modifier = tok.text
        # if the previous word was also a 'compound' then add the current word to it
        if prv_tok_dep == "compound":
          modifier = prv_tok_text + " "+ tok.text
      
      ## chunk 3
      if tok.dep_.find("subj") == True:
        ent1 = modifier +" "+ prefix + " "+ tok.text
        prefix = ""
        modifier = ""
        prv_tok_dep = ""
        prv_tok_text = ""      

      ## chunk 4
      if tok.dep_.find("obj") == True:
        ent2 = modifier +" "+ prefix +" "+ tok.text
        
      ## chunk 5  
      # update variables
      prv_tok_dep = tok.dep_
      prv_tok_text = tok.text
  #############################################################

  return [ent1.strip(), ent2.strip()]

In [17]:
def get_relation(sent):

  doc = nlp(sent)

  # Matcher class object 
  matcher = Matcher(nlp.vocab)

  #define the pattern 
  pattern = [{'DEP':'ROOT'}, 
            {'DEP':'prep','OP':"?"},
            {'DEP':'agent','OP':"?"},  
            {'POS':'ADJ','OP':"?"}] 

  matcher.add("matching_1",  [pattern]) 

  matches = matcher(doc)
  k = len(matches) - 1

  span = doc[matches[k][1]:matches[k][2]] 

  return(span.text)

In [7]:
get_entities("the film had 200 patents")

['film', '200  patents']

In [9]:
sent1 = "suppressor phosphatase  and  tensin homolog  (PTEN)  function  increases  B7-H1  expression  and immune resistance in glioma"

In [21]:
doc = nlp(sent1)

for tok in doc:
  print(tok.text, "...", tok.dep_)

suppressor ... compound
phosphatase ... nsubj
  ... dep
and ... cc
  ... dep
tensin ... conj
homolog ... nsubj
  ... dep
( ... punct
PTEN ... appos
) ... punct
  ... dep
function ... compound
  ... dep
increases ... ROOT
  ... dep
B7 ... compound
- ... punct
H1 ... compound
  ... dep
expression ... dobj
  ... dep
and ... cc
immune ... amod
resistance ... conj
in ... prep
glioma ... pobj


In [22]:
print(get_entities(sent1))
print(get_relation(sent1))

['homolog', 'immune B7 H1 glioma']
increases


In [23]:
sent2 = "Phase II trial Evaluating Predicting of Response to Adjuvant Treatment with Immune \nCheckpoint Inhibition (ICI) in Patients with Newly Diagnosed Glioblastoma or Gliosarcoma"

In [24]:
doc = nlp(sent2)

for tok in doc:
  print(tok.text, "...", tok.dep_)

Phase ... compound
II ... compound
trial ... ROOT
Evaluating ... acl
Predicting ... dobj
of ... prep
Response ... pobj
to ... prep
Adjuvant ... compound
Treatment ... pobj
with ... prep
Immune ... compound

 ... dep
Checkpoint ... compound
Inhibition ... pobj
( ... punct
ICI ... appos
) ... punct
in ... prep
Patients ... pobj
with ... prep
Newly ... advmod
Diagnosed ... amod
Glioblastoma ... pobj
or ... cc
Gliosarcoma ... conj


In [25]:
print(get_entities(sent2))
print(get_relation(sent2))

['', 'Diagnosed Checkpoint Glioblastoma']
trial


In [26]:
sent3 = "The immunosuppressive nature of GBM may be reversible with immune checkpoint \ninhibitor (ICI) however initial studies have yet to demonstrate this"

In [27]:
print(get_entities(sent3))
print(get_relation(sent3))

['initial checkpoint studies', 'yet  this']
be reversible


In [37]:
doc = nlp(sent3)

for tok in doc:
  print(tok.text, "...", tok.dep_)

The ... det
immunosuppressive ... amod
nature ... nsubj
of ... prep
GBM ... pobj
may ... aux
be ... ROOT
reversible ... acomp
with ... prep
immune ... amod
checkpoint ... compound

 ... dep
inhibitor ... pobj
( ... punct
ICI ... appos
) ... punct
however ... advmod
initial ... amod
studies ... nsubj
have ... advcl
yet ... advmod
to ... aux
demonstrate ... xcomp
this ... dobj


In [38]:
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
            chunk.root.head.text)

Autonomous cars cars nsubj shift
insurance liability liability dobj shift
manufacturers manufacturers pobj toward


In [46]:
print(sent3)
doc = nlp(sent3)
for chunk in doc.noun_chunks:
    print(chunk.text, " - ", chunk.root.text, " - ", chunk.root.dep_, " - ",
            chunk.root.head.text)

print('=========================================')
for tok in doc:
  print(tok.text, "...", tok.dep_)

The immunosuppressive nature of GBM may be reversible with immune checkpoint 
inhibitor (ICI) however initial studies have yet to demonstrate this
The immunosuppressive nature  -  nature  -  nsubj  -  be
GBM  -  GBM  -  pobj  -  of
immune checkpoint 
inhibitor  -  inhibitor  -  pobj  -  with
ICI  -  ICI  -  appos  -  inhibitor
initial studies  -  studies  -  nsubj  -  have
this  -  this  -  dobj  -  demonstrate
The ... det
immunosuppressive ... amod
nature ... nsubj
of ... prep
GBM ... pobj
may ... aux
be ... ROOT
reversible ... acomp
with ... prep
immune ... amod
checkpoint ... compound

 ... dep
inhibitor ... pobj
( ... punct
ICI ... appos
) ... punct
however ... advmod
initial ... amod
studies ... nsubj
have ... advcl
yet ... advmod
to ... aux
demonstrate ... xcomp
this ... dobj


In [45]:
print(sent2)
doc = nlp(sent2)
for chunk in doc.noun_chunks:
    print(chunk.text, " - ", chunk.root.text, " - ", chunk.root.dep_, " - ",
            chunk.root.head.text)

print('=========================================')
for tok in doc:
  print(tok.text, "...", tok.dep_)

Phase II trial Evaluating Predicting of Response to Adjuvant Treatment with Immune 
Checkpoint Inhibition (ICI) in Patients with Newly Diagnosed Glioblastoma or Gliosarcoma
Phase II trial  -  trial  -  ROOT  -  trial
Predicting  -  Predicting  -  dobj  -  Evaluating
Response  -  Response  -  pobj  -  of
Adjuvant Treatment  -  Treatment  -  pobj  -  to
Immune 
Checkpoint Inhibition  -  Inhibition  -  pobj  -  with
ICI  -  ICI  -  appos  -  Inhibition
Patients  -  Patients  -  pobj  -  in
Newly Diagnosed Glioblastoma  -  Glioblastoma  -  pobj  -  with
Gliosarcoma  -  Gliosarcoma  -  conj  -  Glioblastoma
Phase ... compound
II ... compound
trial ... ROOT
Evaluating ... acl
Predicting ... dobj
of ... prep
Response ... pobj
to ... prep
Adjuvant ... compound
Treatment ... pobj
with ... prep
Immune ... compound

 ... dep
Checkpoint ... compound
Inhibition ... pobj
( ... punct
ICI ... appos
) ... punct
in ... prep
Patients ... pobj
with ... prep
Newly ... advmod
Diagnosed ... amod
Glioblastoma

In [44]:
print(sent1)
doc = nlp(sent1)
for chunk in doc.noun_chunks:
    print(chunk.text, " - ", chunk.root.text, " - ", chunk.root.dep_, " - ",
            chunk.root.head.text)

print('=========================================')
for tok in doc:
  print(tok.text, "...", tok.dep_)

suppressor phosphatase  and  tensin homolog  (PTEN)  function  increases  B7-H1  expression  and immune resistance in glioma
suppressor phosphatase  -  phosphatase  -  nsubj  -  increases
tensin  -  tensin  -  conj  -  phosphatase
homolog  -  homolog  -  nsubj  -  function
(PTEN  -  PTEN  -  appos  -  homolog
B7-H1  expression  -  expression  -  dobj  -  increases
immune resistance  -  resistance  -  conj  -  expression
glioma  -  glioma  -  pobj  -  in
suppressor ... compound
phosphatase ... nsubj
  ... dep
and ... cc
  ... dep
tensin ... conj
homolog ... nsubj
  ... dep
( ... punct
PTEN ... appos
) ... punct
  ... dep
function ... compound
  ... dep
increases ... ROOT
  ... dep
B7 ... compound
- ... punct
H1 ... compound
  ... dep
expression ... dobj
  ... dep
and ... cc
immune ... amod
resistance ... conj
in ... prep
glioma ... pobj
