In [None]:
!pip install -U pip setuptools wheel
!pip install -U spacy

In [None]:
# Run spaCy's `validate` command against the fresh install.
!python -m spacy validate

In [None]:
# Download the small English pipeline; InformationExtractor and the driver cell
# load it by name via spacy.load('en_core_web_sm').
!python -m spacy download en_core_web_sm

In [None]:
! pip install feedparser

In [63]:
import spacy
from spacy.matcher import Matcher, DependencyMatcher
from spacy.tokens import Span 
from spacy import displacy
from spacy.pipeline import merge_entities, merge_noun_chunks

import inflect

import sqlite3

import feedparser

In [None]:
class InformationExtractor:
  '''Build question/answer pairs from a text.

  Two strategies are combined:

  * Hearst patterns ("X such as Y", "X and/or other Y", "X, including Y",
    "X, especially Y"), found with a token `Matcher`.
  * One (subject, predicate, object) triple, found with a `DependencyMatcher`.

  The text is parsed once in `__init__`; call `extract()` to fill `qa_pairs`
  with `(question, answer)` tuples.
  '''

  # spaCy pipeline shared by all instances. Loading the model is by far the most
  # expensive step, and callers create one extractor per sentence, so it is
  # loaded lazily once and then reused.
  _shared_nlp = None

  # Entity label of the triple's subject -> opening words of the question.
  # possible ent types: https://towardsdatascience.com/explorations-in-named-entity-recognition-and-was-eleanor-roosevelt-right-671271117218
  _QUESTION_WORDS = {
      'PERSON': 'Who', 'GPE': 'Who', 'ORG': 'Who', 'NORP': 'Who',
      'DATE': 'When', 'TIME': 'When',
      'LOC': 'Where', 'FAC': 'Where',
      'MONEY': 'How much', 'QUANTITY': 'How much', 'PERCENT': 'How much',
      'PRODUCT': 'What', 'EVENT': 'What', 'WORK_OF_ART': 'What',
      'CARDINAL': 'How many',
      'LANGUAGE': 'What language',
      'LAW': 'What law',
  }
  # Used when the subject carries no entity label or one not listed above.
  _DEFAULT_QUESTION_WORD = 'What'

  @classmethod
  def _load_pipeline(cls):
    '''Return the shared spaCy pipeline, loading and configuring it on first use.'''
    if cls._shared_nlp is None:
      # more: https://spacy.io/usage/models
      nlp = spacy.load('en_core_web_sm')
      nlp.add_pipe('merge_entities')
      nlp.add_pipe('merge_noun_chunks')
      cls._shared_nlp = nlp
    return cls._shared_nlp

  def __init__(self, text):
    '''Extract Triples, Hearst-Patterns, and Enumerations out of any text

    Args:
      text: the raw text to analyse. It is parsed immediately and the Hearst
        pattern matches are computed; nothing is extracted until `extract()`.
    '''

    self.text = text
    self.qa_pairs = []
    # initialize inflecter (used to turn plural nouns into singular ones)
    self.p = inflect.engine()
    # spaCy language model with merged entities and noun chunks
    self.nlp = self._load_pipeline()

    # initialize the dependency matcher
    self.dep_matcher = DependencyMatcher(self.nlp.vocab)

    # X such as Y
    self.hearst1 = [{'DEP':'amod', 'OP':"?"},
                    {'POS':'NOUN'},
                    {'LOWER': 'such'},
                    {'LOWER': 'as'},
                    {'POS': {'IN': ['PROPN', 'NOUN']}}]
    # X and/or Y
    self.hearst2 = [{'DEP':'amod', 'OP':"?"},
                    {'POS':'NOUN'},
                    {'LOWER': 'and', 'OP':"?"},
                    {'LOWER': 'or', 'OP':"?"},
                    {'LOWER': 'other'},
                    {'POS': 'NOUN'}]
    # X, including Y
    self.hearst3 = [{'DEP':'nummod','OP':"?"},
                    {'DEP':'amod','OP':"?"},
                    {'POS':'NOUN'},
                    {'IS_PUNCT': True},
                    {'LOWER': 'including'},
                    {'DEP':'nummod','OP':"?"},
                    {'DEP':'amod','OP':"?"},
                    {'POS':'NOUN'}]
    # X, especially Y
    self.hearst4 = [{'DEP':'nummod','OP':"?"},
                    {'DEP':'amod','OP':"?"},
                    {'POS':'NOUN'},
                    {'IS_PUNCT': True},
                    {'LOWER': 'especially'},
                    {'DEP':'nummod','OP':"?"},
                    {'DEP':'amod','OP':"?"},
                    {'POS':'NOUN'}]

    # The match keys double as the cue text that `_get_hearst` splits on.
    self.matcher = Matcher(self.nlp.vocab)
    self.matcher.add('such as ', [self.hearst1])
    self.matcher.add('and/or ', [self.hearst2])
    self.matcher.add(', including ', [self.hearst3])
    self.matcher.add(', especially ', [self.hearst4])

    self.doc = self.nlp(text)
    self.matches = self.matcher(self.doc)

  @property
  def pairs(self):
    '''The `(question, answer)` tuples collected so far.'''
    return self.qa_pairs

  def extract(self):
    '''Run both extraction strategies and append their results to `qa_pairs`.

    Does nothing when the parsed text contains no sentence.
    '''
    if next(iter(self.doc.sents), None) is None:
      return
    # Both helpers already walk the whole document, so they run exactly once;
    # calling them once per sentence would append every pair several times.
    self._get_hearst()
    self._get_triple()

  def _get_hearst(self):
    '''Turn every Hearst-pattern match into "What is <child>?" pairs.

    For each match the span's subtree is flattened to text (plural tokens are
    singularised), split at the pattern's cue text into a parent and its
    children, and one pair per child is appended to `qa_pairs` and printed.
    '''
    for match_id, start, end in self.matches:
      span = self.doc[start:end]

      # Collect the words of THIS match only; a list shared across matches
      # would leak the tokens of earlier matches into later ones.
      words = []
      for token in span.subtree:
        if str(token.morph) == 'Number=Plur':
          # singular_noun() returns False when it does not recognise the word
          # as a plural; fall back to the original text so join() gets a str.
          words.append(self.p.singular_noun(token.text) or token.text)
        else:
          words.append(token.text)
      result = ' '.join(words)

      match_text = self.nlp.vocab.strings[match_id]
      if match_text == 'and/or ':
        # NOTE(review): after this split the text before "or" becomes the
        # parent and "other ..." the children, which looks reversed for
        # "X and other Y" -- confirm the intended direction.
        result = result.replace('and', 'or', 1)
        match_text = 'or '
      if match_text == ', including ':
        result = result.replace('_', ',', 1)

      parent, cue, child = result.partition(match_text)
      if not cue:
        # The cue text is not present verbatim (e.g. other punctuation than
        # a comma matched); show the text and skip this match.
        print(result)
        continue
      children = child.replace(',', '').replace('and ', '').replace('or ', '').split()

      # form qa-pairs
      for c in children:
        a = c + ' is a ' + parent.rstrip() + '.'
        q = 'What is ' + c + '?'
        self.qa_pairs.append((q,a))
        print(q,a)

  def _get_triple(self):
    '''Extract one (subj, pred, obj) triple and append a question about its subject.

    The dependency parse decides between a passive and an active pattern. Only
    the first match is used. The question word depends on the entity type of
    the subject token. When nothing matches, the parse is rendered via
    `visualize()` instead.
    '''
    # use dependency parse
    # Passive clauses carry an 'nsubjpass' / 'csubjpass' dependency label.
    passive = any('subjpass' in token.dep_ for token in self.doc)

    if passive:
      key = 'PASSIVE'
      # In a passive clause the grammatical subject is the logical object and
      # the agent appears as a prepositional object below the root.
      pattern = [
          {'RIGHT_ID': 'pred', 'RIGHT_ATTRS': {'DEP': 'ROOT'}},
          {'LEFT_ID': 'pred', 'REL_OP': '>', 'RIGHT_ID': 'obj',
           'RIGHT_ATTRS': {'DEP': 'nsubjpass'}},
          {'LEFT_ID': 'pred', 'REL_OP': '>>', 'RIGHT_ID': 'subj',
           'RIGHT_ATTRS': {"DEP": {'IN': ['pobj']}}},
      ]
    else:
      key = 'ACTIVE'
      pattern = [
          {'RIGHT_ID': 'pred', 'RIGHT_ATTRS': {'DEP': 'ROOT'}},
          {'LEFT_ID': 'pred', 'REL_OP': '>>', 'RIGHT_ID': 'obj',
           'RIGHT_ATTRS': {'DEP': {'IN': ['pobj', 'dobj']}}},
          {'LEFT_ID': 'pred', 'REL_OP': '>', 'RIGHT_ID': 'subj',
           'RIGHT_ATTRS': {'DEP': {'IN': ['pobj', 'nsubj']}}},
      ]

    # Register the pattern only once; adding it again on a repeated call
    # would extend the rule with duplicate patterns.
    if key not in self.dep_matcher:
      self.dep_matcher.add(key, [pattern])

    matches = self.dep_matcher(self.doc)

    #TODO: extend matches to catch enumerations
    if not matches:
      self.visualize()
      return

    # The matched token ids are ordered like the pattern's nodes.
    match_id, token_ids = matches[0]
    triple = {'subj': '', 'pred': '', 'obj': ''}
    for node, token_id in zip(pattern, token_ids):
      triple[node['RIGHT_ID']] = self.doc[token_id]
    print(triple)

    # ask for subject
    question_word = self._QUESTION_WORDS.get(triple['subj'].ent_type_,
                                             self._DEFAULT_QUESTION_WORD)
    q = question_word + ' ' + triple['pred'].text + ' ' + triple['obj'].text + '?'
    a = triple['subj'].text
    self.qa_pairs.append((q,a))

  def get_gramm_info(self):
    '''Print text, dependency label and part of speech of every token.'''
    for token in self.doc:
      print(token.text, token.dep_, token.pos_)

  def visualize(self):
    '''Render the dependency parse of the document inline in the notebook.'''
    displacy.render(self.doc, jupyter=True)
    # # in case of vscode
    # svg = displacy.render(self.doc, style='dep')

    # with open('sent.svg', 'w', encoding='utf-8') as outfile:
    #   outfile.write(svg)

In [None]:
# Preview the input: fetch the RSS feed and print the summary text of every entry.
feed = feedparser.parse('https://www.democracynow.org/democracynow.rss')
for entry in feed.entries:
  print(entry.summary)

In [None]:
# Collect question/answer pairs from every sentence of every feed entry.
# This pipeline is only used to split the summaries into sentences; each
# sentence is then handed to its own InformationExtractor.
nlp = spacy.load('en_core_web_sm')
feed = feedparser.parse('https://www.democracynow.org/democracynow.rss')
extracts = []
entry_counter = 0  # stays 0 when the feed has no entries

for entry_counter, entry in enumerate(feed.entries, start=1):
  text = entry.summary
  doc = nlp(text)
  for sent in doc.sents:
    ext = InformationExtractor(sent.text)
    ext.extract()
    extracts.extend(ext.qa_pairs)

# number of collected pairs, then number of processed feed entries
print(len(extracts))
print(entry_counter)

In [None]:
# Google Colab only: mount Google Drive at /content/gdrive.
# NOTE(review): later cells use the relative path 'gdrive/MyDrive/...', which
# presumably relies on the working directory being /content -- confirm.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
!mkdir gdrive/MyDrive/qa_pairs

In [None]:
# connect to database, if not existing, a new one will be created
conn = sqlite3.connect('gdrive/MyDrive/qa_pairs/qa_pairs.db')

cursor = conn.cursor()

# Create the table only when it is missing: the database file persists between
# runs, and a plain CREATE TABLE raises "table QA already exists" on the
# second run of this cell.
conn.execute('''CREATE TABLE IF NOT EXISTS QA
         (ID INT PRIMARY KEY NOT NULL,
         QUESTION TEXT NOT NULL,
         ANSWER TEXT NOT NULL);''')

In [None]:
# write contents of extracts into database
query = '''INSERT INTO QA
  (ID, QUESTION, ANSWER)
  VALUES
  (?, ?, ?)'''
# IDs are the 1-based positions of the pairs in `extracts`.
data = [(index, q, a) for index, (q, a) in enumerate(extracts, start=1)]

# Insert all rows in one transaction: `with conn` commits once at the end and
# rolls back on error, instead of committing after every single row and
# leaving a partially filled table behind when an insert fails.
with conn:
  cursor.executemany(query, data)

In [None]:
# Inspect the stored rows: each one is printed as an (ID, QUESTION, ANSWER) tuple.
records = cursor.execute('''SELECT * from QA''').fetchall()
for row in records:
  print(row)

In [None]:
# create .csv file to use for fine tuning
import csv

# Write UTF-8 explicitly so non-ASCII characters in the extracted text do not
# depend on the platform's default encoding.
with open('gdrive/MyDrive/qa_pairs/qa_pairs.csv', 'w', newline='', encoding='utf-8') as csvfile:
  fieldnames = ['prompt', 'completion']
  writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
  writer.writeheader()
  # Select the two needed columns by name (instead of relying on the column
  # order of SELECT *) and fix the row order.
  cursor.execute('''SELECT QUESTION, ANSWER FROM QA ORDER BY ID''')
  for prompt, completion in cursor.fetchall():
    writer.writerow({'prompt': prompt, 'completion': completion})

In [None]:
!pip install wandb

In [None]:
!pip install --upgrade openai

In [None]:
# Run the OpenAI CLI data-preparation tool on the exported CSV.
# Presumably this writes qa_pairs_prepared.jsonl next to the CSV, which is the
# file the fine-tune cell passes to the CLI -- verify after running.
!openai tools fine_tunes.prepare_data -f gdrive/MyDrive/qa_pairs/qa_pairs.csv

In [71]:
# `!export ...` runs in a throw-away subshell, so the variable was gone as soon
# as the cell finished and the `openai` commands below never saw it.
# Store the key in the kernel's own environment instead; processes started
# with `!` inherit it. getpass keeps the key out of the saved notebook.
import os
from getpass import getpass

os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [None]:
# Start a fine-tune via the OpenAI CLI, using the prepared JSONL file as training data (-t).
# The CLI presumably reads the API key from the OPENAI_API_KEY environment variable -- confirm.
!openai api fine_tunes.create -t "gdrive/MyDrive/qa_pairs/qa_pairs_prepared.jsonl"