**This notebook extracts information using REBEL. Each document is processed independently: the knowledge base extracted from one document does not extend the knowledge base extracted from previous documents.**

**The output generated by this notebook is only used for visualization in the report.**

In [1]:
import warnings
warnings.filterwarnings("ignore")
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import math
import torch
import wikipedia
from newspaper import Article, ArticleException
import IPython
from pyvis.network import Network
import docx

### Load the REBEL model

In [2]:
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Extract Relations from Model

In [3]:
def extract_relations_from_model_output(text):
    """Parse a decoded REBEL prediction into a list of relation dicts.

    The text is split on whitespace after the ``<s>``, ``<pad>`` and ``</s>``
    markers are removed. Three marker tokens switch what the following
    words are appended to:

    * ``<triplet>`` -> words build the head entity
    * ``<subj>``    -> words build the tail entity
    * ``<obj>``     -> words build the relation type

    Returns:
        A list of ``{'head': ..., 'type': ..., 'tail': ...}`` dicts with
        surrounding whitespace stripped from every value.
    """
    def as_relation(head_text, type_text, tail_text):
        return {
            'head': head_text.strip(),
            'type': type_text.strip(),
            'tail': tail_text.strip()
        }

    found = []
    head, rel_type, tail = '', '', ''
    mode = 'x'  # words seen before the first marker are ignored

    cleaned = text.strip()
    for marker in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(marker, "")

    for word in cleaned.split():
        if word == "<triplet>":
            mode = 't'
            # a new head starts: flush the relation collected so far
            if rel_type:
                found.append(as_relation(head, rel_type, tail))
                rel_type = ''
            head = ''
        elif word == "<subj>":
            mode = 's'
            # same head, new tail: flush the relation collected so far
            if rel_type:
                found.append(as_relation(head, rel_type, tail))
            tail = ''
        elif word == "<obj>":
            mode = 'o'
            rel_type = ''
        elif mode == 't':
            head += ' ' + word
        elif mode == 's':
            tail += ' ' + word
        elif mode == 'o':
            rel_type += ' ' + word

    # flush the trailing relation only when all three parts are present
    if head and rel_type and tail:
        found.append(as_relation(head, rel_type, tail))
    return found

### Extract KB from web article

In [4]:
def from_text_to_kb(text, article_url, span_length=128, article_title=None,
                    article_last_edited_date=None, verbose=False, extend_kb=None):
    """Extract relations from ``text`` with the REBEL model and store them in a KB.

    The tokenized text is cut into ``ceil(num_tokens / span_length)`` spans of
    ``span_length`` tokens. Consecutive spans overlap by the same number of
    tokens so that they cover the input; each span is fed to the model as one
    batch row.

    Args:
        text: Raw text to process.
        article_url: Key under which span boundaries are stored in each
            relation's ``meta``.
        span_length: Number of tokens per span.
        article_title: Forwarded to ``KB.add_relation``.
        article_last_edited_date: Forwarded to ``KB.add_relation``.
        verbose: When true, print token count, span count and boundaries.
        extend_kb: Existing KB to add to; a new ``KB()`` is created when None.

    Returns:
        The KB the relations were added to.
    """
    # tokenize whole text
    encoded = tokenizer([text], return_tensors="pt")
    token_ids = encoded["input_ids"][0]
    token_mask = encoded["attention_mask"][0]

    # compute span boundaries
    num_tokens = len(token_ids)
    if verbose:
        print(f"Input has {num_tokens} tokens")
    num_spans = math.ceil(num_tokens / span_length)
    if verbose:
        print(f"Input has {num_spans} spans")
    overlap = math.ceil((num_spans * span_length - num_tokens) /
                        max(num_spans - 1, 1))
    # Each span starts `span_length - overlap` tokens after the previous one.
    # NOTE(review): overlap is rounded up, so the last span can end a few
    # tokens before num_tokens (e.g. 439 tokens -> last span ends at 437).
    stride = span_length - overlap
    spans_boundaries = [[k * stride, k * stride + span_length]
                        for k in range(num_spans)]
    if verbose:
        print(f"Span boundaries are {spans_boundaries}")

    # transform input with spans: one batch row per span
    span_inputs = {
        "input_ids": torch.stack(
            [token_ids[lo:hi] for lo, hi in spans_boundaries]),
        "attention_mask": torch.stack(
            [token_mask[lo:hi] for lo, hi in spans_boundaries])
    }

    # generate relations
    num_return_sequences = 3
    generated_tokens = model.generate(
        **span_inputs,
        max_length=256,
        length_penalty=0,
        num_beams=3,
        num_return_sequences=num_return_sequences,
    )

    # decode relations (special tokens are kept: the parser needs the markers)
    decoded_preds = tokenizer.batch_decode(generated_tokens,
                                           skip_special_tokens=False)

    # create kb
    kb = KB() if extend_kb is None else extend_kb

    # Predictions are grouped per span: `num_return_sequences` consecutive
    # entries belong to the same span.
    for pred_index, sentence_pred in enumerate(decoded_preds):
        span = spans_boundaries[pred_index // num_return_sequences]
        for relation in extract_relations_from_model_output(sentence_pred):
            relation["meta"] = {
                article_url: {
                    "spans": [span]
                }
            }
            kb.add_relation(relation, article_title, article_last_edited_date)

    return kb

### Filter and normalize entities with Wikipedia

- mark every entity that doesn't have a page on Wikipedia with a trailing `*`
- merge entities if they have the same Wikipedia page

In [5]:
class KB():
    """Knowledge base of Wikipedia-normalized entities, relations and sources.

    Attributes:
        entities:  { entity_title: { "url": ..., "summary": ... } }
        relations: [ { "head": entity_title, "type": ..., "tail": entity_title,
                       "meta": { article_url: { "spans": [...] } } } ]
        sources:   { article_url: { "article_title": ...,
                                    "article_last_edited_date": ... } }
    """

    def __init__(self):
        self.entities = {} # { entity_title: {...} }
        self.relations = [] # [ head: entity_title, type: ..., tail: entity_title,
          # meta: { article_url: { spans: [...] } } ]
        self.sources = {} # { article_url: {...} }

    def merge_with_kb(self, kb2):
        """Add every relation of ``kb2``, with its source data, to this KB.

        Only the first article URL of each relation's ``meta`` is used to look
        up the source data. The relation dicts are handed to ``add_relation``
        unchanged, so new relations are shared with ``kb2`` (not copied).
        """
        for r in kb2.relations:
            article_url = list(r["meta"].keys())[0]
            source_data = kb2.sources[article_url]
            self.add_relation(r, source_data["article_title"],
                              source_data["article_last_edited_date"])

    def are_relations_equal(self, r1, r2):
        """Return True when both relations have the same head, type and tail."""
        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])

    def exists_relation(self, r1):
        """Return True when a relation equal to ``r1`` is already stored."""
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def merge_relations(self, r2):
        """Merge the ``meta`` of ``r2`` into the stored relation equal to it.

        Must only be called when ``exists_relation(r2)`` is true. Only the
        first article URL of ``r2["meta"]`` is merged.
        """
        r1 = [r for r in self.relations
              if self.are_relations_equal(r2, r)][0]

        article_url = list(r2["meta"].keys())[0]
        if article_url not in r1["meta"]:
            # relation seen in a different article: take its meta as-is
            r1["meta"][article_url] = r2["meta"][article_url]
        else:
            # same article: add only the spans that are not recorded yet
            known_spans = r1["meta"][article_url]["spans"]
            known_spans += [span for span in r2["meta"][article_url]["spans"]
                            if span not in known_spans]

    def get_wikipedia_data(self, candidate_entity):
        """Resolve an entity name to its Wikipedia title, url and summary.

        When the lookup fails, the entity is kept but flagged as unresolved:
        its title is the candidate name followed by "*", with empty url and
        summary. A name that already ends with "*" is treated as already
        flagged and is returned unchanged without another lookup, so
        re-adding a relation (e.g. through ``merge_with_kb``) does not turn
        "Name*" into "Name**".
        """
        if candidate_entity.endswith("*"):
            return {"title": candidate_entity, "url": "", "summary": ""}

        try:
            page = wikipedia.page(candidate_entity, auto_suggest=False)
            return {
                "title": page.title,
                "url": page.url,
                "summary": page.summary
            }
        except Exception:
            # Best effort: any lookup failure marks the entity as unresolved.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate so a long run can still be interrupted.
            return {
                "title": candidate_entity + "*",
                "url": "",
                "summary": ""
            }

    def add_entity(self, e):
        """Store entity ``e`` under its title, replacing any previous entry."""
        self.entities[e["title"]] = {k:v for k,v in e.items() if k != "title"}

    def add_relation(self, r, article_title, article_last_edited_date):
        """Normalize the entities of ``r`` and add or merge it into the KB.

        ``r`` is modified in place: its head and tail are replaced by the
        titles returned by ``get_wikipedia_data``. The source of the first
        article URL in ``r["meta"]`` is registered if it is new.
        """
        # check on wikipedia
        candidate_entities = [r["head"], r["tail"]]
        entities = [self.get_wikipedia_data(ent) for ent in candidate_entities]

        # get_wikipedia_data above never returns None; this guard only matters
        # if that method is changed or overridden to drop unknown entities.
        if any(ent is None for ent in entities):
            return

        # manage new entities
        for e in entities:
            self.add_entity(e)

        # rename relation entities with their wikipedia titles
        r["head"] = entities[0]["title"]
        r["tail"] = entities[1]["title"]

        # add source if not in kb
        article_url = list(r["meta"].keys())[0]
        if article_url not in self.sources:
            self.sources[article_url] = {
                "article_title": article_title,
                "article_last_edited_date": article_last_edited_date
            }

        # manage new relation
        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            self.merge_relations(r)

    def print(self):
        """Print all entities, relations and sources, one per line."""
        print("Entities:")
        for e in self.entities.items():
            print(f"  {e}")
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")
        print("Sources:")
        for s in self.sources.items():
            print(f"  {s}")

In [6]:
# Read a single Word (.docx) file and return its text as one string.
def get_document_content(doc_path):
    """Return the text of every paragraph in ``doc_path``, joined by newlines."""
    return "\n".join(
        paragraph.text for paragraph in docx.Document(doc_path).paragraphs
    )

In [7]:
# functions to save a KB to disk and load it back (pickle)
import pickle

def save_kb(kb, filename):
    """Pickle ``kb`` to the path ``save_kb_folder + filename``.

    The target directory is created first when it does not exist, so the
    notebook also runs on a fresh checkout where the output folder has not
    been created yet (previously this raised FileNotFoundError).
    """
    import os  # local import keeps this cell self-contained

    path = f"{save_kb_folder}{filename}"
    # dirname is '' when the path has no directory part; fall back to cwd
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(kb, f)

def load_kb(filename):
    """Unpickle and return the object stored at ``save_kb_folder + filename``.

    NOTE: unpickling can execute arbitrary code; only load files that were
    written by ``save_kb`` in a trusted environment.
    """
    with open(f"{save_kb_folder}{filename}", "rb") as kb_file:
        return pickle.load(kb_file)

I have split the Wikipedia article into 10 separate Word files, where each Word file covers one topic of the article. The list of topics is shown below:

1. Introduction
2. Early Life
3. Mid Life
4. Later Life
5. Personal life and Creative Approach
6. Books and Movies
7. Views
8. Prediction
9. Reception
10. Awards and Honors

All 10 separate KB files will be saved to 'separate_kb_folder'.

In [8]:
# define parameters
# Folder holding the per-topic Word files; prepended directly to each file name.
folder_path = 'Raymond Kurzweil/'
# Passed as `article_url`: the key under which span boundaries are stored in each relation's meta.
url = "https://en.wikipedia.org/wiki/Ray_Kurzweil"
# Stored with the article title in the KB's source data.
article_last_edited_date = "29 October 2023"
# Output folder for the pickled KBs; concatenated directly with the file name, so keep the trailing slash.
save_kb_folder = 'separate_kb_folder/'

In [9]:
# Document 1 ("Introduction"): build a fresh KB, pickle it, then print it.
doc = get_document_content(folder_path + "Introduction.docx")
article_title = "Introduction"
kb = from_text_to_kb(
    doc,
    url,
    article_title=article_title,
    article_last_edited_date=article_last_edited_date,
    verbose=True,
)
filename = "Raymond Kurzweil 1"
save_kb(kb, filename + ".kb")
kb.print()

Input has 439 tokens
Input has 4 spans
Span boundaries are [[0, 128], [103, 231], [206, 334], [309, 437]]
Entities:
  ('Ray Kurzweil', {'url': 'https://en.wikipedia.org/wiki/Ray_Kurzweil', 'summary': 'Raymond Kurzweil ( KURZ-wyle; born February 12, 1948) is an American computer scientist, author, inventor, and futurist. He is involved in fields such as optical character recognition (OCR), text-to-speech synthesis, speech recognition technology, and electronic keyboard instruments. He has written books on health, artificial intelligence (AI), transhumanism, the technological singularity, and futurism. Kurzweil is a public advocate for the futurist and transhumanist movements and gives public talks to share his optimistic outlook on life extension technologies and the future of nanotechnology, robotics, and biotechnology.\nKurzweil received the 1999 National Medal of Technology and Innovation, the United States\' highest honor in technology, from then President Bill Clinton in a White Ho

In [10]:
# Document 2 ("Early Life"): build a fresh KB, pickle it, then print it.
doc = get_document_content(folder_path + "Early Life.docx")
article_title = "Early Life"
kb = from_text_to_kb(
    doc,
    url,
    article_title=article_title,
    article_last_edited_date=article_last_edited_date,
    verbose=True,
)
filename = "Raymond Kurzweil 2"
save_kb(kb, filename + ".kb")
kb.print()

Input has 785 tokens
Input has 7 spans
Span boundaries are [[0, 128], [109, 237], [218, 346], [327, 455], [436, 564], [545, 673], [654, 782]]
Entities:
  ('Queens', {'url': 'https://en.wikipedia.org/wiki/Queens', 'summary': "Queens is a borough of New York City, coextensive with Queens County, in the U.S. state of New York. Located on Long Island, it is the largest of the five New York City boroughs by area. It is bordered by the borough of Brooklyn at the western tip of Long Island and by Nassau County to its east. Queens shares water borders with the boroughs of Manhattan, the Bronx, and Staten Island and with New Jersey.With a population of 2,405,464 as of the 2020 census, Queens is the second-most populous county in New York state, behind Kings County (Brooklyn), and is therefore also the second-most populous of the five New York City boroughs. If Queens were its own city, it would be the fourth most-populous in the U.S. after New York City itself, Los Angeles, and Chicago. Queens 

In [11]:
# Document 3 ("Mid Life"): build a fresh KB, pickle it, then print it.
doc = get_document_content(folder_path + "Mid Life.docx")
article_title = "Mid Life"
kb = from_text_to_kb(
    doc,
    url,
    article_title=article_title,
    article_last_edited_date=article_last_edited_date,
    verbose=True,
)
filename = "Raymond Kurzweil 3"
save_kb(kb, filename + ".kb")
kb.print()

Input has 1000 tokens
Input has 8 spans
Span boundaries are [[0, 128], [124, 252], [248, 376], [372, 500], [496, 624], [620, 748], [744, 872], [868, 996]]
Entities:
  ('Frank Rosenblatt', {'url': 'https://en.wikipedia.org/wiki/Frank_Rosenblatt', 'summary': 'Frank Rosenblatt (July 11, 1928 – July 11, 1971) was an American psychologist notable in the field of artificial intelligence. He is sometimes called the father of deep learning for his pioneering work on neural networks.'})
  ('Cornell University', {'url': 'https://en.wikipedia.org/wiki/Cornell_University', 'summary': "Cornell University is a private Ivy League land-grant research university based in Ithaca, New York. The university was founded in 1865 by Ezra Cornell and Andrew Dickson White. Since its founding, Cornell has been a co-educational, non-sectarian institution where admission has not been restricted by religion or race. The student body for the fall 2022 semester consisted of more than 15,000 undergraduate and 7,000 gr

In [12]:
# Document 4 ("Later Life"): build a fresh KB, pickle it, then print it.
doc = get_document_content(folder_path + "Later Life.docx")
article_title = "Later Life"
kb = from_text_to_kb(
    doc,
    url,
    article_title=article_title,
    article_last_edited_date=article_last_edited_date,
    verbose=True,
)
filename = "Raymond Kurzweil 4"
save_kb(kb, filename + ".kb")
kb.print()

Input has 538 tokens
Input has 5 spans
Span boundaries are [[0, 128], [102, 230], [204, 332], [306, 434], [408, 536]]
Entities:
  ('Kurzweil Educational Systems', {'url': 'https://en.wikipedia.org/wiki/Kurzweil_Educational_Systems', 'summary': 'Kurzweil Education (formerly Kurzweil Educational Systems) is an American-based company that provides educational technology.\nKurzweil Education provides literacy solutions, tools and training for those with learning differences and challenges, or people with  blindness or partially sighted.\nFounded in 1996, the company has pioneered the development of computerized assistive technology.  Its headquarters are in Dallas, Texas.\nThe company supplies two principal software products to its customers—Kurzweil 1000 and Kurzweil 3000.  Kurzweil 1000 is a software which enables a visually impaired user to gain access to both web-based, digital or scanned print materials through its OCR and text to speech features; Kurzweil 1000 software provides easy 

In [13]:
# Document 5 ("Personal life and Creative Approach"): build a fresh KB, pickle it, then print it.
doc = get_document_content(folder_path + "Personal life and Creative Approach.docx")
article_title = "Personal life and Creative Approach"
kb = from_text_to_kb(
    doc,
    url,
    article_title=article_title,
    article_last_edited_date=article_last_edited_date,
    verbose=True,
)
filename = "Raymond Kurzweil 5"
save_kb(kb, filename + ".kb")
kb.print()

Input has 401 tokens
Input has 4 spans
Span boundaries are [[0, 128], [91, 219], [182, 310], [273, 401]]
Entities:
  ('Sonya Rosenwald Kurzweil*', {'url': '', 'summary': ''})
  ('Harvard Medical School', {'url': 'https://en.wikipedia.org/wiki/Harvard_Medical_School', 'summary': "Harvard Medical School (HMS) is the graduate medical school of Harvard University and is located in the Longwood Medical Area in Boston, Massachusetts. Founded in 1782, HMS is one of the oldest medical schools in the United States. Unlike most other leading medical schools, HMS does not operate in conjunction with a single hospital but is directly affiliated with several teaching hospitals in the Boston area. Affiliated teaching hospitals and research institutes include Dana–Farber Cancer Institute, Massachusetts General Hospital, Brigham and Women's Hospital, Beth Israel Deaconess Medical Center, Boston Children's Hospital, McLean Hospital, Cambridge Health Alliance, The Baker Center for Children and Families,

In [14]:
# Document 6 ("Books and Movies"): build a fresh KB, pickle it, then print it.
doc = get_document_content(folder_path + "Books and Movies.docx")
article_title = "Books and Movies"
kb = from_text_to_kb(
    doc,
    url,
    article_title=article_title,
    article_last_edited_date=article_last_edited_date,
    verbose=True,
)
filename = "Raymond Kurzweil 6"
save_kb(kb, filename + ".kb")
kb.print()

Input has 1003 tokens
Input has 8 spans
Span boundaries are [[0, 128], [125, 253], [250, 378], [375, 503], [500, 628], [625, 753], [750, 878], [875, 1003]]
Entities:
  ('The Age of Intelligent Machines', {'url': 'https://en.wikipedia.org/wiki/The_Age_of_Intelligent_Machines', 'summary': "The Age of Intelligent Machines is a non-fiction book about artificial intelligence by inventor and futurist Ray Kurzweil. This was his first book and the Association of American Publishers named it the Most Outstanding Computer Science Book of 1990. It was reviewed in The New York Times and The Christian Science Monitor. The format is a combination of monograph and anthology with contributed essays by artificial intelligence experts such as Daniel Dennett, Douglas Hofstadter, and Marvin Minsky.Kurzweil surveys the philosophical, mathematical and technological roots of artificial intelligence, starting with the assumption that a sufficiently advanced computer program could exhibit human-level intellige

In [15]:
# Document 7 ("Views"): build a fresh KB, pickle it, then print it.
doc = get_document_content(folder_path + "Views.docx")
article_title = "Views"
kb = from_text_to_kb(
    doc,
    url,
    article_title=article_title,
    article_last_edited_date=article_last_edited_date,
    verbose=True,
)
filename = "Raymond Kurzweil 7"
save_kb(kb, filename + ".kb")
kb.print()

Token indices sequence length is longer than the specified maximum sequence length for this model (1136 > 1024). Running this sequence through the model will result in indexing errors


Input has 1136 tokens
Input has 9 spans
Span boundaries are [[0, 128], [126, 254], [252, 380], [378, 506], [504, 632], [630, 758], [756, 884], [882, 1010], [1008, 1136]]
Entities:
  ('The Age of Spiritual Machines', {'url': 'https://en.wikipedia.org/wiki/The_Age_of_Spiritual_Machines', 'summary': 'The Age of Spiritual Machines: When Computers Exceed Human Intelligence is a non-fiction book by inventor and futurist Ray Kurzweil about artificial intelligence and the future course of humanity. First published in hardcover on January 1, 1999, by Viking, it has received attention from The New York Times, The New York Review of Books and The Atlantic. In the book Kurzweil outlines his vision for how technology will progress during the 21st century.\nKurzweil believes evolution provides evidence that humans will one day create machines more  intelligent than they are. He presents his law of accelerating returns to explain why "key events" happen more frequently as time marches on. It also exp

In [16]:
# Document 8 ("Prediction"): build a fresh KB, pickle it, then print it.
doc = get_document_content(folder_path + "Prediction.docx")
article_title = "Prediction"
kb = from_text_to_kb(
    doc,
    url,
    article_title=article_title,
    article_last_edited_date=article_last_edited_date,
    verbose=True,
)
filename = "Raymond Kurzweil 8"
save_kb(kb, filename + ".kb")
kb.print()

Input has 1304 tokens
Input has 11 spans
Span boundaries are [[0, 128], [117, 245], [234, 362], [351, 479], [468, 596], [585, 713], [702, 830], [819, 947], [936, 1064], [1053, 1181], [1170, 1298]]
Entities:
  ('The Age of Intelligent Machines', {'url': 'https://en.wikipedia.org/wiki/The_Age_of_Intelligent_Machines', 'summary': "The Age of Intelligent Machines is a non-fiction book about artificial intelligence by inventor and futurist Ray Kurzweil. This was his first book and the Association of American Publishers named it the Most Outstanding Computer Science Book of 1990. It was reviewed in The New York Times and The Christian Science Monitor. The format is a combination of monograph and anthology with contributed essays by artificial intelligence experts such as Daniel Dennett, Douglas Hofstadter, and Marvin Minsky.Kurzweil surveys the philosophical, mathematical and technological roots of artificial intelligence, starting with the assumption that a sufficiently advanced computer pr

In [17]:
# Document 9 ("Reception"): build a fresh KB, pickle it, then print it.
doc = get_document_content(folder_path + "Reception.docx")
article_title = "Reception"
kb = from_text_to_kb(
    doc,
    url,
    article_title=article_title,
    article_last_edited_date=article_last_edited_date,
    verbose=True,
)
filename = "Raymond Kurzweil 9"
save_kb(kb, filename + ".kb")
kb.print()

Input has 989 tokens
Input has 8 spans
Span boundaries are [[0, 128], [123, 251], [246, 374], [369, 497], [492, 620], [615, 743], [738, 866], [861, 989]]
Entities:
  ('Bill Gates', {'url': 'https://en.wikipedia.org/wiki/Bill_Gates', 'summary': "William Henry Gates III  (born October 28, 1955) is an American businessman, investor, philanthropist, and writer best known for co-founding the software giant Microsoft, along with his childhood friend Paul Allen. During his career at Microsoft, Gates held the positions of chairman, chief executive officer (CEO), president, and chief software architect, while also being its largest individual shareholder until May 2014. He was a major entrepreneur of the microcomputer revolution of the 1970s and 1980s.\nGates was born and raised in Seattle, Washington. In 1975, he and Allen founded Microsoft in Albuquerque, New Mexico. It later became the world's largest personal computer software company. Gates led the company as its chairman and chief executi

In [9]:
# Document 10 ("Awards and Honors"): build a fresh KB, pickle it, then print it.
doc = get_document_content(folder_path + "Awards and Honors.docx")
article_title = "Awards and Honors"
kb = from_text_to_kb(
    doc,
    url,
    article_title=article_title,
    article_last_edited_date=article_last_edited_date,
    verbose=True,
)
filename = "Raymond Kurzweil 10"
save_kb(kb, filename + ".kb")
kb.print()

Input has 899 tokens
Input has 8 spans
Span boundaries are [[0, 128], [110, 238], [220, 348], [330, 458], [440, 568], [550, 678], [660, 788], [770, 898]]
Entities:
  ('Kurzweil Reading Machine*', {'url': '', 'summary': ''})
  ('Ray Kurzweil', {'url': 'https://en.wikipedia.org/wiki/Ray_Kurzweil', 'summary': 'Raymond Kurzweil ( KURZ-wyle; born February 12, 1948) is an American computer scientist, author, inventor, and futurist. He is involved in fields such as optical character recognition (OCR), text-to-speech synthesis, speech recognition technology, and electronic keyboard instruments. He has written books on health, artificial intelligence (AI), transhumanism, the technological singularity, and futurism. Kurzweil is a public advocate for the futurist and transhumanist movements and gives public talks to share his optimistic outlook on life extension technologies and the future of nanotechnology, robotics, and biotechnology.\nKurzweil received the 1999 National Medal of Technology and