In [1]:
"""
The JSON output files generated by CoreNLP are saved in the same directory as the KB files.
Command used to generate an output file: java -Xmx3g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLP \
-annotators tokenize,pos,lemma,ner,parse,coref,kbp -coref.md.type RULE -outputFormat json -file "Your input file here"
"""
import os
import json
import wikipedia

# Show which CoreNLP JSON outputs are present in the "corenlp" directory.
print(
    "CoreNLP output files: ",
    [name for name in os.listdir("corenlp") if name.rsplit('.', 1)[-1] == "json"],
)

CoreNLP output files:  ['Awards and Honors.txt.json', 'Books and Movies.txt.json', 'Early Life.txt.json', 'Introduction.txt.json', 'Later Life.txt.json', 'Mid Life.txt.json', 'Personal life and Creative Approach.txt.json', 'Prediction.txt.json', 'Reception.txt.json', 'Views.txt.json']


In [2]:
# Directory that holds the CoreNLP JSON outputs; the KB pickles built from them are saved here too.
save_kb_folder = r"corenlp"

# One CoreNLP output per biography section, processed in this order.
op_files = ['Awards and Honors.txt.json', 'Books and Movies.txt.json', 'Early Life.txt.json', 'Introduction.txt.json', 'Later Life.txt.json', 'Mid Life.txt.json', 'Personal life and Creative Approach.txt.json', 'Prediction.txt.json', 'Reception.txt.json', 'Views.txt.json']

# Raw JSON text of each file (a list, despite the name); parsing happens later.
output_dict = []
for json_ in op_files:
    # Read as UTF-8 explicitly instead of relying on the platform default encoding,
    # which can fail on non-ASCII text.
    with open(save_kb_folder + rf"/{json_}", "r", encoding="utf-8") as f:
        output_dict.append(f.read())
len(output_dict)

10

In [3]:
class KB():
    """In-memory knowledge base of entities, relations and source articles.

    Attributes:
        entities: ``{entity_title: {...}}`` - entity data keyed by title.
        relations: list of ``{"head", "type", "tail", "meta"}`` dicts, where
            ``meta`` maps ``article_url -> {"spans": [...]}``.
        sources: ``{article_url: {"article_title", "article_publish_date"}}``.
    """

    def __init__(self):
        self.entities = {} # { entity_title: {...} }
        self.relations = [] # [ head: entity_title, type: ..., tail: entity_title,
          # meta: { article_url: { spans: [...] } } ]
        self.sources = {} # { article_url: {...} }

    def merge_with_kb(self, kb2):
        """Add every relation of ``kb2`` (and its sources) to this KB.

        ``kb2`` is left untouched: each relation is deep-copied before being
        handed to ``add_relation``, which renames and stores the dict it gets.
        """
        from copy import deepcopy

        for r in kb2.relations:
            article_urls = list(r["meta"].keys())
            source_data = kb2.sources[article_urls[0]]
            # NOTE(review): add_relation looks the head/tail up again; a title that
            # already carries the "*" fallback marker presumably fails the lookup
            # again and gains a second "*" - confirm whether that is acceptable.
            relation = deepcopy(r)
            self.add_relation(relation, source_data["article_title"],
                              source_data["article_publish_date"])

            # add_relation only registers the first article; carry over the
            # remaining sources of a relation that is backed by several articles.
            if not self.exists_relation(relation):
                continue
            for article_url in article_urls[1:]:
                if article_url in kb2.sources and article_url not in self.sources:
                    self.sources[article_url] = dict(kb2.sources[article_url])

    def are_relations_equal(self, r1, r2):
        """Return True when both relations share head, type and tail."""
        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])

    def exists_relation(self, r1):
        """Return True when an equal relation is already stored."""
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def merge_relations(self, r2):
        """Fold the metadata of ``r2`` into the equal relation already stored.

        Every article URL of ``r2["meta"]`` is merged (not only the first one):
        unknown articles are attached as-is, known ones receive only the spans
        they do not contain yet.

        Raises:
            IndexError: if no equal relation is stored.
        """
        r1 = [r for r in self.relations
              if self.are_relations_equal(r2, r)][0]

        for article_url, article_meta in r2["meta"].items():
            # if different article
            if article_url not in r1["meta"]:
                r1["meta"][article_url] = article_meta
            # if existing article
            else:
                known_spans = r1["meta"][article_url]["spans"]
                spans_to_add = [span for span in article_meta["spans"]
                                if span not in known_spans]
                known_spans += spans_to_add

    def get_wikipedia_data(self, candidate_entity):
        """Look ``candidate_entity`` up on Wikipedia.

        Returns:
            ``{"title", "url", "summary"}`` of the page. If the lookup fails
            for any reason, a placeholder whose title is the candidate followed
            by ``"*"`` with empty url and summary (best-effort fallback).
        """
        try:
            page = wikipedia.page(candidate_entity, auto_suggest=False)
            return {
                "title": page.title,
                "url": page.url,
                "summary": page.summary
            }
        except Exception:
            # `Exception` rather than a bare `except:` so that Ctrl-C
            # (KeyboardInterrupt) and SystemExit still stop a long run.
            return {
                "title": candidate_entity+"*",
                "url": "",
                "summary": ""
            }

    def add_entity(self, e):
        """Store entity ``e`` under its title (an existing entry is overwritten)."""
        self.entities[e["title"]] = {k:v for k,v in e.items() if k != "title"}

    def add_relation(self, r, article_title, article_publish_date):
        """Resolve the entities of ``r`` and add or merge the relation.

        ``r`` is modified in place (head and tail are replaced by the resolved
        titles) and, when new, the very same dict is stored in ``relations``.
        Only the first article URL of ``r["meta"]`` is registered as a source,
        using ``article_title`` and ``article_publish_date``.
        """
        # check on wikipedia
        candidate_entities = [r["head"], r["tail"]]
        entities = [self.get_wikipedia_data(ent) for ent in candidate_entities]

        # if one entity does not exist, stop
        # (get_wikipedia_data above never returns None; this guard only matters
        # if the lookup is replaced by one that does)
        if any(ent is None for ent in entities):
            return

        # manage new entities
        for e in entities:
            self.add_entity(e)

        # rename relation entities with their wikipedia titles
        r["head"] = entities[0]["title"]
        r["tail"] = entities[1]["title"]

        # add source if not in kb
        article_url = list(r["meta"].keys())[0]
        if article_url not in self.sources:
            self.sources[article_url] = {
                "article_title": article_title,
                "article_publish_date": article_publish_date
            }

        # manage new relation
        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            self.merge_relations(r)

    def print(self):
        """Print all entities, relations and sources, one per line."""
        print("Entities:")
        for e in self.entities.items():
            print(f"  {e}")
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")
        print("Sources:")
        for s in self.sources.items():
            print(f"  {s}")

In [4]:
def extract_relations_from_corenlp_output(corenlp_output, article_url):
    """Turn the KBP triples of a parsed CoreNLP JSON document into relation dicts.

    Args:
        corenlp_output: parsed CoreNLP JSON (a dict with a ``"sentences"`` list).
        article_url: key under which the token spans are stored in ``meta``.

    Returns:
        A list of ``{"head", "type", "tail", "meta"}`` dicts, one per KBP
        triple, where ``meta`` is ``{article_url: {"spans": [subjectSpan,
        relationSpan, objectSpan]}}``. Sentences without a ``"kbp"`` entry
        (missing, empty or falsy) contribute nothing.
    """
    relations = []
    for sentence in corenlp_output['sentences']:
        # `.get(...) or []` tolerates sentences that carry no "kbp" key at all,
        # which the former `sentence["kbp"] != 0` test turned into a KeyError.
        for kbp in sentence.get("kbp") or []:
            relations.append({
                'head': kbp["subject"],
                'type': kbp["relation"],
                'tail': kbp["object"],
                'meta': {
                    article_url: {
                        "spans": [kbp["subjectSpan"], kbp["relationSpan"], kbp["objectSpan"]]
                    }
                }
            })
    return relations

def from_text_to_kb_using_corenlp(corenlp_output, article_url, article_title=None, article_publish_date=None, verbose=False, extend_kb=None):
    """Build a KB from one CoreNLP output, or extend an existing one with it.

    Args:
        corenlp_output: CoreNLP JSON, either already parsed (dict) or as a
            JSON string.
        article_url: source URL recorded with every extracted relation.
        article_title: title stored for the source article.
        article_publish_date: publish date stored for the source article.
        verbose: accepted for compatibility; not used by this function.
        extend_kb: KB to add the relations to; a new ``KB`` is created when None.

    Returns:
        The KB that received the relations (``extend_kb`` itself when given).

    Raises:
        AssertionError: if the input is neither a dict nor a JSON string.
    """
    # isinstance (not exact type equality) so str/dict subclasses are accepted too.
    if isinstance(corenlp_output, str):
        corenlp_output = json.loads(corenlp_output)

    # Raised explicitly so the check also runs under `python -O`; the exception
    # type stays AssertionError to match what the former `assert` produced.
    if not isinstance(corenlp_output, dict):
        raise AssertionError(f"Failed to parse the input. The input type is expected to be dict or json format str only, but found {type(corenlp_output)}")

    relations = extract_relations_from_corenlp_output(corenlp_output, article_url)

    kb = KB() if extend_kb is None else extend_kb

    for relation in relations:
        kb.add_relation(relation, article_title, article_publish_date)

    return kb

In [5]:
import warnings
# Suppress every warning from here on, for all libraries and categories.
# NOTE(review): this also hides deprecation warnings that may matter - confirm it is intended.
warnings.filterwarnings('ignore')

# helper functions
import pickle
def save_kb(kb, path):
    """Pickle ``kb`` to the file at ``path`` (the file is created or overwritten)."""
    # The context manager closes the file on exit; no explicit close() is needed.
    with open(path, "wb") as kb_file:
        pickle.dump(kb, kb_file)

def load_kb(path):
    """Return the object unpickled from the file at ``path``.

    Unpickling can execute arbitrary code, so only load files from a trusted
    origin.
    """
    # The context manager closes the file on exit, including on the return path.
    with open(path, "rb") as kb_file:
        return pickle.load(kb_file)

# Provenance recorded with every relation extracted from the biography sections.
url = "https://en.wikipedia.org/wiki/Ray_Kurzweil"
last_update = "23 October 2023"
article_title = "Raymond Kurzweil biography"
kb_individuals = []
kb_all = None

for i, corenlp_output in enumerate(output_dict):
    print(f"Showing kb {i+1}")
    # Stand-alone KB holding only this section's relations.
    kb = from_text_to_kb_using_corenlp(corenlp_output=corenlp_output, article_url=url, article_title=article_title, article_publish_date=last_update, verbose=True)
    save_kb(kb, save_kb_folder + rf"/kb_CoreNLP_part{i+1}.kb")
    kb_individuals.append(kb)
    print("-" * 80)
    kb.print()

    # Fold the same section into the combined KB. kb_all is None on the first
    # pass, which makes from_text_to_kb_using_corenlp create the KB; afterwards
    # the existing one is extended.
    kb_all = from_text_to_kb_using_corenlp(corenlp_output=corenlp_output, article_url=url, article_title=article_title, article_publish_date=last_update, verbose=True, extend_kb=kb_all)
save_kb(kb_all, save_kb_folder + r"/kb_CoreNLP_full.kb")

Showing kb 1
--------------------------------------------------------------------------------
Entities:
  ('Ray Kurzweil', {'url': 'https://en.wikipedia.org/wiki/Ray_Kurzweil', 'summary': 'Raymond Kurzweil ( KURZ-wyle; born February 12, 1948) is an American computer scientist, author, inventor, and futurist. He is involved in fields such as optical character recognition (OCR), text-to-speech synthesis, speech recognition technology, and electronic keyboard instruments. He has written books on health, artificial intelligence (AI), transhumanism, the technological singularity, and futurism. Kurzweil is a public advocate for the futurist and transhumanist movements and gives public talks to share his optimistic outlook on life extension technologies and the future of nanotechnology, robotics, and biotechnology.\nKurzweil received the 1999 National Medal of Technology and Innovation, the United States\' highest honor in technology, from then President Bill Clinton in a White House ceremony

In [6]:
# Dump the combined KB (entities, relations, sources) built from all sections.
kb_all.print()

Entities:
  ('Ray Kurzweil', {'url': 'https://en.wikipedia.org/wiki/Ray_Kurzweil', 'summary': 'Raymond Kurzweil ( KURZ-wyle; born February 12, 1948) is an American computer scientist, author, inventor, and futurist. He is involved in fields such as optical character recognition (OCR), text-to-speech synthesis, speech recognition technology, and electronic keyboard instruments. He has written books on health, artificial intelligence (AI), transhumanism, the technological singularity, and futurism. Kurzweil is a public advocate for the futurist and transhumanist movements and gives public talks to share his optimistic outlook on life extension technologies and the future of nanotechnology, robotics, and biotechnology.\nKurzweil received the 1999 National Medal of Technology and Innovation, the United States\' highest honor in technology, from then President Bill Clinton in a White House ceremony. He was the recipient of the $500\u2009000 Lemelson–MIT Prize for 2001. He was elected a memb

### Extend CoreNLP output to REBEL KB file

In [19]:
# Folder (with trailing slash) for the KB pickles written by the extension steps below.
save_kb_folder = 'kb_folder/'

In [20]:
save_kb_folder = 'kb_folder/'
# Start from the previously stored KB (pickle: only load files you trust).
kb = load_kb("kb_folder/Raymond Kurzweil Complete.kb")
output_dict = []
# Read as UTF-8 explicitly rather than relying on the platform default encoding.
with open("corenlp/Introduction.txt.json", "r", encoding="utf-8") as f:
    output_dict.append(f.read())
# Extend the KB inside the loop so that every loaded output is applied,
# not only the last one parsed.
for i, json_ in enumerate(output_dict):
    corenlp_output = json.loads(json_)
    kb = from_text_to_kb_using_corenlp(corenlp_output=corenlp_output, article_url=url, article_title=article_title, article_publish_date=last_update, verbose=True, extend_kb=kb)
filename = "Raymond Kurzweil 1 Corenlp"
save_kb(kb, save_kb_folder + filename + ".kb")
kb.print()

Entities:
  ('Ray Kurzweil', {'url': 'https://en.wikipedia.org/wiki/Ray_Kurzweil', 'summary': 'Raymond Kurzweil ( KURZ-wyle; born February 12, 1948) is an American computer scientist, author, inventor, and futurist. He is involved in fields such as optical character recognition (OCR), text-to-speech synthesis, speech recognition technology, and electronic keyboard instruments. He has written books on health, artificial intelligence (AI), transhumanism, the technological singularity, and futurism. Kurzweil is a public advocate for the futurist and transhumanist movements and gives public talks to share his optimistic outlook on life extension technologies and the future of nanotechnology, robotics, and biotechnology.\nKurzweil received the 1999 National Medal of Technology and Innovation, the United States\' highest honor in technology, from then President Bill Clinton in a White House ceremony. He was the recipient of the $500\u2009000 Lemelson–MIT Prize for 2001. He was elected a memb

In [21]:
save_kb_folder = 'kb_folder/'
# Start from the KB stored by the previous step (pickle: only load files you trust).
kb = load_kb("kb_folder/Raymond Kurzweil 1 Corenlp.kb")
output_dict = []
# NOTE(review): this step reads Introduction.txt.json again, the same file the
# "Raymond Kurzweil 1 Corenlp" step merged. The KB de-duplicates relations and
# spans, so this step presumably adds nothing new - confirm which section file
# was intended here.
# Read as UTF-8 explicitly rather than relying on the platform default encoding.
with open("corenlp/Introduction.txt.json", "r", encoding="utf-8") as f:
    output_dict.append(f.read())
# Extend the KB inside the loop so that every loaded output is applied,
# not only the last one parsed.
for i, json_ in enumerate(output_dict):
    corenlp_output = json.loads(json_)
    kb = from_text_to_kb_using_corenlp(corenlp_output=corenlp_output, article_url=url, article_title=article_title, article_publish_date=last_update, verbose=True, extend_kb=kb)
filename = "Raymond Kurzweil 2 Corenlp"
save_kb(kb, save_kb_folder + filename + ".kb")
kb.print()

Entities:
  ('Ray Kurzweil', {'url': 'https://en.wikipedia.org/wiki/Ray_Kurzweil', 'summary': 'Raymond Kurzweil ( KURZ-wyle; born February 12, 1948) is an American computer scientist, author, inventor, and futurist. He is involved in fields such as optical character recognition (OCR), text-to-speech synthesis, speech recognition technology, and electronic keyboard instruments. He has written books on health, artificial intelligence (AI), transhumanism, the technological singularity, and futurism. Kurzweil is a public advocate for the futurist and transhumanist movements and gives public talks to share his optimistic outlook on life extension technologies and the future of nanotechnology, robotics, and biotechnology.\nKurzweil received the 1999 National Medal of Technology and Innovation, the United States\' highest honor in technology, from then President Bill Clinton in a White House ceremony. He was the recipient of the $500\u2009000 Lemelson–MIT Prize for 2001. He was elected a memb

In [22]:
save_kb_folder = 'kb_folder/'
# Start from the KB stored by the previous step (pickle: only load files you trust).
kb = load_kb("kb_folder/Raymond Kurzweil 2 Corenlp.kb")
output_dict = []
# NOTE(review): this step reads Introduction.txt.json again, the same file the
# "Raymond Kurzweil 1 Corenlp" step merged. The KB de-duplicates relations and
# spans, so this step presumably adds nothing new - confirm which section file
# was intended here.
# Read as UTF-8 explicitly rather than relying on the platform default encoding.
with open("corenlp/Introduction.txt.json", "r", encoding="utf-8") as f:
    output_dict.append(f.read())
# Extend the KB inside the loop so that every loaded output is applied,
# not only the last one parsed.
for i, json_ in enumerate(output_dict):
    corenlp_output = json.loads(json_)
    kb = from_text_to_kb_using_corenlp(corenlp_output=corenlp_output, article_url=url, article_title=article_title, article_publish_date=last_update, verbose=True, extend_kb=kb)
filename = "Raymond Kurzweil 3 Corenlp"
save_kb(kb, save_kb_folder + filename + ".kb")
kb.print()

Entities:
  ('Ray Kurzweil', {'url': 'https://en.wikipedia.org/wiki/Ray_Kurzweil', 'summary': 'Raymond Kurzweil ( KURZ-wyle; born February 12, 1948) is an American computer scientist, author, inventor, and futurist. He is involved in fields such as optical character recognition (OCR), text-to-speech synthesis, speech recognition technology, and electronic keyboard instruments. He has written books on health, artificial intelligence (AI), transhumanism, the technological singularity, and futurism. Kurzweil is a public advocate for the futurist and transhumanist movements and gives public talks to share his optimistic outlook on life extension technologies and the future of nanotechnology, robotics, and biotechnology.\nKurzweil received the 1999 National Medal of Technology and Innovation, the United States\' highest honor in technology, from then President Bill Clinton in a White House ceremony. He was the recipient of the $500\u2009000 Lemelson–MIT Prize for 2001. He was elected a memb

In [24]:
save_kb_folder = 'kb_folder/'
# Start from the KB stored by the previous step (pickle: only load files you trust).
kb = load_kb("kb_folder/Raymond Kurzweil 3 Corenlp.kb")
output_dict = []
# NOTE(review): this step reads Introduction.txt.json again, the same file the
# "Raymond Kurzweil 1 Corenlp" step merged. The KB de-duplicates relations and
# spans, so this step presumably adds nothing new - confirm which section file
# was intended here.
# Read as UTF-8 explicitly rather than relying on the platform default encoding.
with open("corenlp/Introduction.txt.json", "r", encoding="utf-8") as f:
    output_dict.append(f.read())
# Extend the KB inside the loop so that every loaded output is applied,
# not only the last one parsed.
for i, json_ in enumerate(output_dict):
    corenlp_output = json.loads(json_)
    kb = from_text_to_kb_using_corenlp(corenlp_output=corenlp_output, article_url=url, article_title=article_title, article_publish_date=last_update, verbose=True, extend_kb=kb)
filename = "Raymond Kurzweil 4 Corenlp"
save_kb(kb, save_kb_folder + filename + ".kb")
kb.print()

Entities:
  ('Ray Kurzweil', {'url': 'https://en.wikipedia.org/wiki/Ray_Kurzweil', 'summary': 'Raymond Kurzweil ( KURZ-wyle; born February 12, 1948) is an American computer scientist, author, inventor, and futurist. He is involved in fields such as optical character recognition (OCR), text-to-speech synthesis, speech recognition technology, and electronic keyboard instruments. He has written books on health, artificial intelligence (AI), transhumanism, the technological singularity, and futurism. Kurzweil is a public advocate for the futurist and transhumanist movements and gives public talks to share his optimistic outlook on life extension technologies and the future of nanotechnology, robotics, and biotechnology.\nKurzweil received the 1999 National Medal of Technology and Innovation, the United States\' highest honor in technology, from then President Bill Clinton in a White House ceremony. He was the recipient of the $500\u2009000 Lemelson–MIT Prize for 2001. He was elected a memb

In [25]:
save_kb_folder = 'kb_folder/'
# Start from the KB stored by the previous step (pickle: only load files you trust).
kb = load_kb("kb_folder/Raymond Kurzweil 4 Corenlp.kb")
output_dict = []
# NOTE(review): this step reads Introduction.txt.json again, the same file the
# "Raymond Kurzweil 1 Corenlp" step merged. The KB de-duplicates relations and
# spans, so this step presumably adds nothing new - confirm which section file
# was intended here.
# Read as UTF-8 explicitly rather than relying on the platform default encoding.
with open("corenlp/Introduction.txt.json", "r", encoding="utf-8") as f:
    output_dict.append(f.read())
# Extend the KB inside the loop so that every loaded output is applied,
# not only the last one parsed.
for i, json_ in enumerate(output_dict):
    corenlp_output = json.loads(json_)
    kb = from_text_to_kb_using_corenlp(corenlp_output=corenlp_output, article_url=url, article_title=article_title, article_publish_date=last_update, verbose=True, extend_kb=kb)
filename = "Raymond Kurzweil 5 Corenlp"
save_kb(kb, save_kb_folder + filename + ".kb")
kb.print()

Entities:
  ('Ray Kurzweil', {'url': 'https://en.wikipedia.org/wiki/Ray_Kurzweil', 'summary': 'Raymond Kurzweil ( KURZ-wyle; born February 12, 1948) is an American computer scientist, author, inventor, and futurist. He is involved in fields such as optical character recognition (OCR), text-to-speech synthesis, speech recognition technology, and electronic keyboard instruments. He has written books on health, artificial intelligence (AI), transhumanism, the technological singularity, and futurism. Kurzweil is a public advocate for the futurist and transhumanist movements and gives public talks to share his optimistic outlook on life extension technologies and the future of nanotechnology, robotics, and biotechnology.\nKurzweil received the 1999 National Medal of Technology and Innovation, the United States\' highest honor in technology, from then President Bill Clinton in a White House ceremony. He was the recipient of the $500\u2009000 Lemelson–MIT Prize for 2001. He was elected a memb

In [26]:
save_kb_folder = 'kb_folder/'
# Start from the KB stored by the previous step (pickle: only load files you trust).
kb = load_kb("kb_folder/Raymond Kurzweil 5 Corenlp.kb")
output_dict = []
# NOTE(review): this step reads Introduction.txt.json again, the same file the
# "Raymond Kurzweil 1 Corenlp" step merged. The KB de-duplicates relations and
# spans, so this step presumably adds nothing new - confirm which section file
# was intended here.
# Read as UTF-8 explicitly rather than relying on the platform default encoding.
with open("corenlp/Introduction.txt.json", "r", encoding="utf-8") as f:
    output_dict.append(f.read())
# Extend the KB inside the loop so that every loaded output is applied,
# not only the last one parsed.
for i, json_ in enumerate(output_dict):
    corenlp_output = json.loads(json_)
    kb = from_text_to_kb_using_corenlp(corenlp_output=corenlp_output, article_url=url, article_title=article_title, article_publish_date=last_update, verbose=True, extend_kb=kb)
filename = "Raymond Kurzweil 6 Corenlp"
save_kb(kb, save_kb_folder + filename + ".kb")
kb.print()

Entities:
  ('Ray Kurzweil', {'url': 'https://en.wikipedia.org/wiki/Ray_Kurzweil', 'summary': 'Raymond Kurzweil ( KURZ-wyle; born February 12, 1948) is an American computer scientist, author, inventor, and futurist. He is involved in fields such as optical character recognition (OCR), text-to-speech synthesis, speech recognition technology, and electronic keyboard instruments. He has written books on health, artificial intelligence (AI), transhumanism, the technological singularity, and futurism. Kurzweil is a public advocate for the futurist and transhumanist movements and gives public talks to share his optimistic outlook on life extension technologies and the future of nanotechnology, robotics, and biotechnology.\nKurzweil received the 1999 National Medal of Technology and Innovation, the United States\' highest honor in technology, from then President Bill Clinton in a White House ceremony. He was the recipient of the $500\u2009000 Lemelson–MIT Prize for 2001. He was elected a memb

In [27]:
save_kb_folder = 'kb_folder/'
# Start from the KB stored by the previous step (pickle: only load files you trust).
kb = load_kb("kb_folder/Raymond Kurzweil 6 Corenlp.kb")
output_dict = []
# NOTE(review): this step reads Introduction.txt.json again, the same file the
# "Raymond Kurzweil 1 Corenlp" step merged. The KB de-duplicates relations and
# spans, so this step presumably adds nothing new - confirm which section file
# was intended here.
# Read as UTF-8 explicitly rather than relying on the platform default encoding.
with open("corenlp/Introduction.txt.json", "r", encoding="utf-8") as f:
    output_dict.append(f.read())
# Extend the KB inside the loop so that every loaded output is applied,
# not only the last one parsed.
for i, json_ in enumerate(output_dict):
    corenlp_output = json.loads(json_)
    kb = from_text_to_kb_using_corenlp(corenlp_output=corenlp_output, article_url=url, article_title=article_title, article_publish_date=last_update, verbose=True, extend_kb=kb)
filename = "Raymond Kurzweil 7 Corenlp"
save_kb(kb, save_kb_folder + filename + ".kb")
kb.print()

Entities:
  ('Ray Kurzweil', {'url': 'https://en.wikipedia.org/wiki/Ray_Kurzweil', 'summary': 'Raymond Kurzweil ( KURZ-wyle; born February 12, 1948) is an American computer scientist, author, inventor, and futurist. He is involved in fields such as optical character recognition (OCR), text-to-speech synthesis, speech recognition technology, and electronic keyboard instruments. He has written books on health, artificial intelligence (AI), transhumanism, the technological singularity, and futurism. Kurzweil is a public advocate for the futurist and transhumanist movements and gives public talks to share his optimistic outlook on life extension technologies and the future of nanotechnology, robotics, and biotechnology.\nKurzweil received the 1999 National Medal of Technology and Innovation, the United States\' highest honor in technology, from then President Bill Clinton in a White House ceremony. He was the recipient of the $500\u2009000 Lemelson–MIT Prize for 2001. He was elected a memb

In [28]:
save_kb_folder = 'kb_folder/'
# Start from the KB stored by the previous step (pickle: only load files you trust).
kb = load_kb("kb_folder/Raymond Kurzweil 7 Corenlp.kb")
output_dict = []
# NOTE(review): this step reads Introduction.txt.json again, the same file the
# "Raymond Kurzweil 1 Corenlp" step merged. The KB de-duplicates relations and
# spans, so this step presumably adds nothing new - confirm which section file
# was intended here.
# Read as UTF-8 explicitly rather than relying on the platform default encoding.
with open("corenlp/Introduction.txt.json", "r", encoding="utf-8") as f:
    output_dict.append(f.read())
# Extend the KB inside the loop so that every loaded output is applied,
# not only the last one parsed.
for i, json_ in enumerate(output_dict):
    corenlp_output = json.loads(json_)
    kb = from_text_to_kb_using_corenlp(corenlp_output=corenlp_output, article_url=url, article_title=article_title, article_publish_date=last_update, verbose=True, extend_kb=kb)
filename = "Raymond Kurzweil 8 Corenlp"
save_kb(kb, save_kb_folder + filename + ".kb")
kb.print()

Entities:
  ('Ray Kurzweil', {'url': 'https://en.wikipedia.org/wiki/Ray_Kurzweil', 'summary': 'Raymond Kurzweil ( KURZ-wyle; born February 12, 1948) is an American computer scientist, author, inventor, and futurist. He is involved in fields such as optical character recognition (OCR), text-to-speech synthesis, speech recognition technology, and electronic keyboard instruments. He has written books on health, artificial intelligence (AI), transhumanism, the technological singularity, and futurism. Kurzweil is a public advocate for the futurist and transhumanist movements and gives public talks to share his optimistic outlook on life extension technologies and the future of nanotechnology, robotics, and biotechnology.\nKurzweil received the 1999 National Medal of Technology and Innovation, the United States\' highest honor in technology, from then President Bill Clinton in a White House ceremony. He was the recipient of the $500\u2009000 Lemelson–MIT Prize for 2001. He was elected a memb

In [29]:
save_kb_folder = 'kb_folder/'
# Start from the KB stored by the previous step (pickle: only load files you trust).
kb = load_kb("kb_folder/Raymond Kurzweil 8 Corenlp.kb")
output_dict = []
# NOTE(review): this step reads Introduction.txt.json again, the same file the
# "Raymond Kurzweil 1 Corenlp" step merged. The KB de-duplicates relations and
# spans, so this step presumably adds nothing new - confirm which section file
# was intended here.
# Read as UTF-8 explicitly rather than relying on the platform default encoding.
with open("corenlp/Introduction.txt.json", "r", encoding="utf-8") as f:
    output_dict.append(f.read())
# Extend the KB inside the loop so that every loaded output is applied,
# not only the last one parsed.
for i, json_ in enumerate(output_dict):
    corenlp_output = json.loads(json_)
    kb = from_text_to_kb_using_corenlp(corenlp_output=corenlp_output, article_url=url, article_title=article_title, article_publish_date=last_update, verbose=True, extend_kb=kb)
filename = "Raymond Kurzweil 9 Corenlp"
save_kb(kb, save_kb_folder + filename + ".kb")
kb.print()

Entities:
  ('Ray Kurzweil', {'url': 'https://en.wikipedia.org/wiki/Ray_Kurzweil', 'summary': 'Raymond Kurzweil ( KURZ-wyle; born February 12, 1948) is an American computer scientist, author, inventor, and futurist. He is involved in fields such as optical character recognition (OCR), text-to-speech synthesis, speech recognition technology, and electronic keyboard instruments. He has written books on health, artificial intelligence (AI), transhumanism, the technological singularity, and futurism. Kurzweil is a public advocate for the futurist and transhumanist movements and gives public talks to share his optimistic outlook on life extension technologies and the future of nanotechnology, robotics, and biotechnology.\nKurzweil received the 1999 National Medal of Technology and Innovation, the United States\' highest honor in technology, from then President Bill Clinton in a White House ceremony. He was the recipient of the $500\u2009000 Lemelson–MIT Prize for 2001. He was elected a memb

In [30]:
save_kb_folder = 'kb_folder/'
# Start from the KB stored by the previous step (pickle: only load files you trust).
kb = load_kb("kb_folder/Raymond Kurzweil 9 Corenlp.kb")
output_dict = []
# NOTE(review): this step reads Introduction.txt.json again, the same file the
# "Raymond Kurzweil 1 Corenlp" step merged. The KB de-duplicates relations and
# spans, so this step presumably adds nothing new - confirm which section file
# was intended here.
# Read as UTF-8 explicitly rather than relying on the platform default encoding.
with open("corenlp/Introduction.txt.json", "r", encoding="utf-8") as f:
    output_dict.append(f.read())
# Extend the KB inside the loop so that every loaded output is applied,
# not only the last one parsed.
for i, json_ in enumerate(output_dict):
    corenlp_output = json.loads(json_)
    kb = from_text_to_kb_using_corenlp(corenlp_output=corenlp_output, article_url=url, article_title=article_title, article_publish_date=last_update, verbose=True, extend_kb=kb)
filename = "Raymond Kurzweil 10 Corenlp"
save_kb(kb, save_kb_folder + filename + ".kb")
kb.print()

Entities:
  ('Ray Kurzweil', {'url': 'https://en.wikipedia.org/wiki/Ray_Kurzweil', 'summary': 'Raymond Kurzweil ( KURZ-wyle; born February 12, 1948) is an American computer scientist, author, inventor, and futurist. He is involved in fields such as optical character recognition (OCR), text-to-speech synthesis, speech recognition technology, and electronic keyboard instruments. He has written books on health, artificial intelligence (AI), transhumanism, the technological singularity, and futurism. Kurzweil is a public advocate for the futurist and transhumanist movements and gives public talks to share his optimistic outlook on life extension technologies and the future of nanotechnology, robotics, and biotechnology.\nKurzweil received the 1999 National Medal of Technology and Innovation, the United States\' highest honor in technology, from then President Bill Clinton in a White House ceremony. He was the recipient of the $500\u2009000 Lemelson–MIT Prize for 2001. He was elected a memb