In [42]:
!pip install -U pip setuptools wheel
!pip install -U spacy

[0m

In [43]:
!python -m spacy validate

⠙ Loading compatibility table...[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation: /usr/local/lib/python3.7/dist-packages/spacy[0m

NAME             SPACY                 VERSION                            
en_core_web_sm   >=3.3.0.dev0,<3.4.0   [38;5;2m3.3.0[0m   [38;5;2m✔[0m



In [None]:
!python -m spacy download en_core_web_sm

In [46]:
! pip install feedparser

Collecting feedparser
  Downloading feedparser-6.0.8-py3-none-any.whl (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.0 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.0/81.0 KB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sgmllib3k
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6066 sha256=c304cef87889414c921c5a2cbc91c5b38d483e9d0fdd0f648409c781a242ce2d
  Stored in directory: /root/.cache/pip/wheels/73/ad/a4/0dff4a6ef231fc0dfa12ffbac2a36cebfdddfe059f50e019aa
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.8 sgmllib3k-1.0.0
[0m

In [47]:
import spacy
from spacy.matcher import Matcher, DependencyMatcher
from spacy.tokens import Span 
from spacy import displacy
from spacy.pipeline import merge_entities, merge_noun_chunks

import inflect

import sqlite3

import feedparser

In [48]:
class InformationExtractor:

  def __init__(self, text):
    '''Extract Triples, Hearst-Patterns, and Enumerations out of any text'''

    self.text = text
    self.qa_pairs = []
    # initialize inflecter
    self.p = inflect.engine()
    # load the spacy language model
    # more: https://spacy.io/usage/models
    self.nlp = spacy.load('en_core_web_sm')
    self.nlp.add_pipe('merge_entities')
    self.nlp.add_pipe('merge_noun_chunks')

    # initialize the depency matcher
    self.dep_matcher = DependencyMatcher(self.nlp.vocab)

    # X such as Y
    self.hearst1 = [{'DEP':'amod', 'OP':"?"},
                    {'POS':'NOUN'}, 
                    {'LOWER': 'such'}, 
                    {'LOWER': 'as'}, 
                    {'POS': {'IN': ['PROPN', 'NOUN']}}]
    # X and/or Y
    self.hearst2 = [{'DEP':'amod', 'OP':"?"}, 
                    {'POS':'NOUN'}, 
                    {'LOWER': 'and', 'OP':"?"}, 
                    {'LOWER': 'or', 'OP':"?"}, 
                    {'LOWER': 'other'}, 
                    {'POS': 'NOUN'}]
    # X, including Y
    self.hearst3 = [{'DEP':'nummod','OP':"?"},
                    {'DEP':'amod','OP':"?"},
                    {'POS':'NOUN'}, 
                    {'IS_PUNCT': True}, 
                    {'LOWER': 'including'}, 
                    {'DEP':'nummod','OP':"?"}, 
                    {'DEP':'amod','OP':"?"}, 
                    {'POS':'NOUN'}]
    # X, especially Y
    self.hearst4 = [{'DEP':'nummod','OP':"?"}, 
                    {'DEP':'amod','OP':"?"}, 
                    {'POS':'NOUN'}, 
                    {'IS_PUNCT': True}, 
                    {'LOWER': 'especially'}, 
                    {'DEP':'nummod','OP':"?"}, 
                    {'DEP':'amod','OP':"?"}, 
                    {'POS':'NOUN'}] 

    self.matcher = Matcher(self.nlp.vocab)
    self.matcher.add('such as ', [self.hearst1])
    self.matcher.add('and/or ', [self.hearst2])
    self.matcher.add(', including ', [self.hearst3])
    self.matcher.add(', especially ', [self.hearst4])

    self.doc = self.nlp(text)
    self.matches = self.matcher(self.doc)

  @property
  def pairs(self):
    return self.qa_pairs

  def extract(self):
    for sent in self.doc.sents:
      self._get_hearst()
      self._get_triple()

  def _get_hearst(self):
    sing_result = []
    
    for match_id, start, end in self.matches:
      span = self.doc[start:end]

      for token in span.subtree:
        if str(token.morph) == 'Number=Plur':
          sing_result.append(self.p.singular_noun(token.text))
        else:
          sing_result.append(token.text)

      result = ' '.join(sing_result)

      match_text = self.nlp.vocab.strings[match_id]
      if match_text == 'and/or ':
        result = result.replace('and', 'or', 1)
        match_text = 'or '
      if match_text == ', including ':
        result = result.replace('_', ',', 1)
      try:
        pc = result.split(match_text, 1)
        parent = pc[0]
        child = pc[1]
        children = child.replace(',', '').replace('and ', '').replace('or ', '').split()
      except IndexError:
        print(result)
        children = None


      # form qa-pairs
      if children is not None:
        for c in children:
          a = c + ' is a ' + parent.rstrip() + '.'
          q = 'What is ' + c + '?'
          self.qa_pairs.append((q,a))
          print(q,a)

  def _get_triple(self):
    # use dependency parse
    passive = False
    triple = {'subj': '', 'pred': '', 'obj': ''}
    for token in self.doc:
      if token.dep_.find('subjpass') == True:
        passive = True

    if passive == True:
      pattern = [
      {
          'RIGHT_ID': 'pred',
          'RIGHT_ATTRS': {'DEP': 'ROOT'}
      },
      {
          'LEFT_ID': 'pred',
          'REL_OP': '>',
          'RIGHT_ID': 'obj',
          'RIGHT_ATTRS': {'DEP': 'nsubjpass'}
      },
      {
          'LEFT_ID': 'pred',
          'REL_OP': '>>',
          'RIGHT_ID': 'subj',
          'RIGHT_ATTRS': {"DEP": {'IN': ['pobj']}},
      }
  ]
      self.dep_matcher.add('PASSIVE', [pattern])
    else:
      pattern = [
      {
          'RIGHT_ID': 'pred',
          'RIGHT_ATTRS': {'DEP': 'ROOT'}
      },
      {
          'LEFT_ID': 'pred',
          'REL_OP': '>>',
          'RIGHT_ID': 'obj',
          'RIGHT_ATTRS': {'DEP': {'IN': ['pobj', 'dobj']}}

      },
      {
        'LEFT_ID': 'pred',
        'REL_OP': '>',
        'RIGHT_ID': 'subj',
        'RIGHT_ATTRS': {'DEP': {'IN': ['pobj', 'nsubj']}}
      }
  ]
      self.dep_matcher.add('ACTIVE', [pattern])

    matches = self.dep_matcher(self.doc)
    
    #TODO: extend matches to catch enumerations
    try:
      match_id, token_ids = matches[0]
      for i in range(len(token_ids)):
        triple[pattern[i]["RIGHT_ID"]] = self.doc[token_ids[i]]
      print(triple)
      
      # ask for subject
      # possible ent types: https://towardsdatascience.com/explorations-in-named-entity-recognition-and-was-eleanor-roosevelt-right-671271117218
      if triple['subj'].ent_type_ in ['PERSON', 'GPE', 'ORG', 'NORP']:
        q = 'Who ' + triple['pred'].text + ' ' + triple['obj'].text + '?'
        a = triple['subj'].text
        self.qa_pairs.append((q,a))
      elif triple['subj'].ent_type_ in ['DATE', 'TIME']:
        q = 'When ' + triple['pred'].text + ' ' + triple['obj'].text + '?'
        a = triple['subj'].text
        self.qa_pairs.append((q,a))
      elif triple['subj'].ent_type_ in ['LOC', 'FAC']:
        q = 'Where ' + triple['pred'].text + ' ' + triple['obj'].text + '?'
        a = triple['subj'].text
        self.qa_pairs.append((q,a))
      elif triple['subj'].ent_type_ in ['MONEY', 'QUANTITY', 'PERCENT']:
        q = 'How much ' + triple['pred'].text + ' ' + triple['obj'].text + '?'
        a = triple['subj'].text
        self.qa_pairs.append((q,a))
      elif triple['subj'].ent_type_ in ['PRODUCT', 'EVENT', 'WORK_OF_ART']:
        q = 'What ' + triple['pred'].text + ' ' + triple['obj'].text + '?'
        a = triple['subj'].text
        self.qa_pairs.append((q,a))
      elif triple['subj'].ent_type_ in ['CARDINAL']:
        q = 'How many ' + triple['pred'].text + ' ' + triple['obj'].text + '?'
        a = triple['subj'].text
        self.qa_pairs.append((q,a))
      elif triple['subj'].ent_type_ in ['LANGUAGE']:
        q = 'What language ' + triple['pred'].text + ' ' + triple['obj'].text + '?'
        a = triple['subj'].text
        self.qa_pairs.append((q,a))
      elif triple['subj'].ent_type_ in ['LAW']:
        q = 'What law ' + triple['pred'].text + ' ' + triple['obj'].text + '?'
        a = triple['subj'].text
        self.qa_pairs.append((q,a))
      else:
        q = 'What ' + triple['pred'].text + ' ' + triple['obj'].text + '?'
        a = triple['subj'].text
        self.qa_pairs.append((q,a))

    except IndexError:
      self.visualize()

  def get_gramm_info(self):
    for token in self.doc:
      print(token.text, token.dep_, token.pos_)

  def visualize(self):
    displacy.render(self.doc, jupyter='true')
    # # in case of vscode
    # svg = displacy.render(self.doc, style='dep')
    
    # with open('sent.svg', 'w', encoding='utf-8') as outfile:
    #   outfile.write(svg)

In [49]:
feed = feedparser.parse('https://www.democracynow.org/democracynow.rss')
for entry in feed.entries:
  print(entry.summary)

Governments around the world are eagerly returning back to pre-pandemic conditions by relaxing preventative restrictions, lifting mask mandates and pulling back public funding. Dr. Abraar Karan, infectious disease fellow at Stanford University School of Medicine, says these moves are overly optimistic and that the U.S. is not prepared for new variants spreading around the country. &#8220;We&#8217;re trying to say it&#8217;s over. It&#8217;s not true,&#8221; he says. &#8220;As time goes on, immunity wanes, and we will begin to see more severe cases.&#8221;
The World Health Organization says the coronavirus pandemic has now caused an excess of 15 million deaths globally. We look at how staggering death counts reveal broader political failures to protect public health and close the international vaccine gap. &#8220;Western governments and rich corporations who are based primarily in the West have done very little to advance vaccine inequity or to help the entire world end this pandemic fa

In [None]:
nlp = spacy.load('en_core_web_sm')
feed = feedparser.parse('https://www.democracynow.org/democracynow.rss')
extracts = []
entry_counter = 0

for entry in feed.entries:
  entry_counter += 1
  text = entry.summary
  doc = nlp(text)
  for sent in doc.sents:
    ext = InformationExtractor(sent.text)
    # ext.visualize()
      
    ext.extract()
    for qa in ext.qa_pairs:
      extracts.append(qa)

print(len(extracts))
print(entry_counter)

In [51]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
!mkdir gdrive/MyDrive/qa_pairs

mkdir: cannot create directory ‘gdrive/MyDrive/qa_pairs’: File exists


In [None]:
# connect to database, if not existing, a new one will be created
conn = sqlite3.connect('gdrive/MyDrive/qa_pairs/qa_pairs.db')

cursor = conn.cursor()

# create a table
conn.execute('''CREATE TABLE QA
         (ID INT PRIMARY KEY NOT NULL,
         QUESTION TEXT NOT NULL,
         ANSWER TEXT NOT NULL);''')

ERROR:root:An unexpected error occurred while tokenizing input
The following traceback may be corrupted or invalid
The error message is: ('EOF in multi-line string', (1, 31))



OperationalError: ignored

In [53]:
# write contents of extracts into database
index = 0
for q, a in extracts:
  index += 1
  query = '''INSERT INTO QA
  (ID, QUESTION, ANSWER)
  VALUES
  (?, ?, ?)'''
  data = (index, q, a)
  cursor.execute(query, data)
  conn.commit()

In [None]:
# take a look at the results
cursor.execute('''SELECT * from QA''')
records = cursor.fetchall()
for row in records:
  print(row)

In [55]:
# create .csv file to use for fine tuning
import csv
with open('gdrive/MyDrive/qa_pairs/qa_pairs.csv', 'w', newline='') as csvfile:
  fieldnames = ['prompt', 'completion']
  writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
  writer.writeheader()
  cursor.execute('''SELECT * from QA''')
  records = cursor.fetchall()
  for row in records:
    ID, prompt, completion = row
    writer.writerow({'prompt': prompt, 'completion': completion})

In [56]:
!pip install --upgrade openai

Collecting openai
  Downloading openai-0.18.1.tar.gz (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.4 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 KB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pandas-stubs>=1.1.0.11
  Downloading pandas_stubs-1.2.0.58-py3-none-any.whl (162 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.9/162.9 KB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: openai
  Building wheel for openai (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai: filename=openai-0.18.1-py3-none-any.whl size=53167 sha256=a450ad50b12b71ddbdfd4a4a2b4faf68d846aa372ce7b08ec34db1e6de9e6bc5
  Stored in directory: /root/.cache/pip/wheels/

In [59]:
!openai tools fine_tunes.prepare_data -f gdrive/MyDrive/qa_pairs/qa_pairs.csv

Logging requires wandb to be installed. Run `pip install wandb`.
Analyzing...

- Based on your file extension, your file is formatted as a CSV file
- Your file contains 128 prompt-completion pairs
- All prompts end with suffix `?`
- Your data does not contain a common ending at the end of your completions. Having a common ending string appended to the end of the completion makes it clearer to the fine-tuned model where the completion should end. See https://beta.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples.
- The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See https://beta.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more details

Based on the analysis we will perform the following actions:
- [Necessary] Your format `CSV` will be converted to `JSONL`
- [Recommended] Add a suffix ending `\n` to all completions [Y/n]: y
- [Recommended] Add a white

In [61]:
!export OPENAI_API_KEY="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"
!openai api fine_tunes.create -t "gdrive/MyDrive/qa_pairs/qa_pairs_prepared.jsonl"

Logging requires wandb to be installed. Run `pip install wandb`.
[91mError:[0m No API key provided. You can set your API key in code using 'openai.api_key = <API-KEY>', or you can set the environment variable OPENAI_API_KEY=<API-KEY>). If your API key is stored in a file, you can point the openai module at it with 'openai.api_key_path = <PATH>'. You can generate API keys in the OpenAI web interface. See https://onboard.openai.com for details, or email support@openai.com if you have any questions.
