<a href="https://colab.research.google.com/github/zhaw-iwi/qa_tutorials-work-in-progress-/blob/main/information_extraction_example_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U spacy[cuda92]



In [2]:
!python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 648 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [3]:
import spacy
from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy
from spacy.pipeline import merge_entities, merge_noun_chunks

import re

In [4]:
nlp = spacy.load('en_core_web_sm')

In [5]:
nlp.add_pipe(nlp.create_pipe('merge_entities'))
nlp.add_pipe(nlp.create_pipe('merge_noun_chunks'))
# TODO: merge enumerations!!!

In [6]:
class InformationExtractor:

  def __init__(self, text):
    self.text = text
    self.doc = nlp(text)

    # X such as Y
    self.hearst1 = [{'DEP':'amod', 'OP':"?"},
                    {'POS':'NOUN'}, 
                    {'LOWER': 'such'}, 
                    {'LOWER': 'as'}, 
                    {'POS': 'PROPN'}]
    # X and/or Y
    self.hearst2 = [{'DEP':'amod', 'OP':"?"}, 
                    {'POS':'NOUN'}, 
                    {'LOWER': 'and', 'OP':"?"}, 
                    {'LOWER': 'or', 'OP':"?"}, 
                    {'LOWER': 'other'}, 
                    {'POS': 'NOUN'}]
    # X, including Y
    self.hearst3 = [{'DEP':'nummod','OP':"?"},
                    {'DEP':'amod','OP':"?"},
                    {'POS':'NOUN'}, 
                    {'IS_PUNCT': True}, 
                    {'LOWER': 'including'}, 
                    {'DEP':'nummod','OP':"?"}, 
                    {'DEP':'amod','OP':"?"}, 
                    {'POS':'NOUN'}]
    # X, especially Y
    self.hearst4 = [{'DEP':'nummod','OP':"?"}, 
                    {'DEP':'amod','OP':"?"}, 
                    {'POS':'NOUN'}, 
                    {'IS_PUNCT': True}, 
                    {'LOWER': 'especially'}, 
                    {'DEP':'nummod','OP':"?"}, 
                    {'DEP':'amod','OP':"?"}, 
                    {'POS':'NOUN'}] 

    self.matcher = Matcher(nlp.vocab)
    self.matcher.add('such as ', [self.hearst1])
    self.matcher.add('and/or ', [self.hearst2])
    self.matcher.add(', including ', [self.hearst3])
    self.matcher.add(', especially ', [self.hearst4])



  def get_gramm_info(self):
    for token in self.doc:
      print(token.text, token.dep_, token.pos_)

  def get_hearst(self):
    parent = ''
    children = []
    matches = self.matcher(self.doc)
  
    for match_id, start, end in matches:
      span = self.doc[start:end]
      result = span.text
      match_text = nlp.vocab.strings[match_id]
      if match_text == 'and/or ':
        result = result.replace('and', 'or', 1)
        match_text = 'or '


      for i, t in enumerate(result.split(match_text, 1)):
        if i == 0:
          parent = t
        else:
          for c in t.split(', '):
            children.append(c)
      return parent, children

  def get_triple(self):
    # TODO: get pred subtree!
    passive = False
    subj = ''
    pred = ''
    obj = ''
    for token in self.doc:
      if token.dep_.find("subjpass") == True:
        passive = True

    # passive sentences
    if passive == True:
      for token in self.doc:
        if token.dep_.find('subjpass') == True:
          obj = token.text
          pred = token.head.text

        if token.dep_.endswith('obj') == True:
          subj = token.text
    # active sentences
    else:
      for token in self.doc:
        if token.dep_.endswith("subj") == True:
          subj = token.text
          pred = token.head.text

        if token.dep_.endswith("obj") == True:
          obj = token.text
    print(passive)

    return subj, pred, obj

  def visualize(self):
    displacy.render(self.doc, jupyter='true')



In [15]:
text = "Salesforce recently acquired Tableau." 
text2 = "Tableau was recently acquired by Salesforce." 
text3 = "GDP in developing countries such as Vietnam, China, or Laos will continue growing at a high rate." 
text4 = "Here is how you can keep your car or other vehicles clean."

In [8]:
InformationExtractor(text3).get_hearst()

('developing countries ', ['Vietnam'])

In [16]:
InformationExtractor(text).get_triple()

False


('Salesforce', 'acquired', 'Tableau')

In [13]:
InformationExtractor(text3).visualize()