### Importing libraries

In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import math
import IPython
from pyvis.network import Network

### Loading the relation extraction model

In [3]:
# Laoding model and Tokenizers
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

tokenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/344 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

### Offering 2 types of text to Knowledge Base
1. Short text to Knowledge Base (Feeding summarised text to build KB)
2. Long Text to Knowledge Base (Feeding non-summarised text to build KB)

In [9]:
def extract_relations_from_model_output(text):
    relations = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    text_replaced = text.replace("<s>", "").replace("<pad>", "").replace("</s>", "")
    for token in text_replaced.split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                relations.append({
                    "head": subject.strip(),
                    "type": relation.strip(),
                    "tail": object_.strip()
                })
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                relations.append({
                    "head": subject.strip(),
                    "type": relation.strip(),
                    "tail": object_.strip()
                })
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    
    if subject != '' and relation != '' and object_ != '':
        relations.append({
            "head": subject.strip(),
            "type": relation.strip(),
            "tail": object_.strip()
        })
    
    return relations

### Implementing a KB class to deal with adding new relations to the Knowledge base

In [5]:
class KB():
    def __init__(self):
        self.relations = []

    def are_relations_equal(self, r1, r2):
        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])
    
    def exists_relation(self, r1):
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)
    
    def add_relations(self, r):
        if not self.exists_relation(r):
            self.relations.append(r)

    def print(self):
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")

### Defining a function that returns KB object with relations extracted from a short text

In [10]:
def from_small_text_to_kb(text, verbose=False):
    kb = KB()

    # Tokenizer text
    model_inputs = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt')
    if verbose:
        print(f"Num tokens: {len(model_inputs)}")

    # Generate
    gen_kwargs = {
        "max_length": 216,
        "length_penalty": 0,
        "num_beams": 3,
        "num_return_sequences": 3
    }
    generated_tokens = model.generate(
        **model_inputs,
        **gen_kwargs,
    )
    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

    # Creating the KB
    for sentence_pred in decoded_preds:
        relations = extract_relations_from_model_output(sentence_pred)
        for r in relations:
            kb.add_relations(r)
    
    return kb


### Loading text from the CMU intro lecture transcript and build a knowledge graph on it

In [11]:
cmu_lecture_transcript_path = "cmu_computer_graphics_intro_voice_transcribed.txt"
with open(cmu_lecture_transcript_path) as src:
    cmu_lecture_text = src.read()

print(cmu_lecture_text)

 Welcome to Computer Graphics 15462-662 at Carnegie Mellon University. I'm Kenan Crane. I'm a professor of computer science and robotics. And I also do research in computer graphics, so specifically in the area of geometric algorithms. The purpose of this video is to give you all the information that you'll need to succeed this semester. So periodically we'll upload little videos to cover administrative things, to talk about what's been going on this week, and to answer any significant questions that have come up. I should also say that all the information today is available on the course webpage at 15462.courses.cs.cmu.edu. So please go ahead, check out that link, read through especially the course info page in detail because there's a lot of things that I won't say here in this video but that are important for you to know as you go through the course. we have a great set of TAs this semester. So if you have any questions, please at any time, feel free to email them, email me, post a 

The KB build is below

In [12]:
kb = from_small_text_to_kb(cmu_lecture_text, verbose=True)
kb.print()

Num tokens: 2
Relations:
  {'head': 'Kenan Crane', 'type': 'field of work', 'tail': 'computer graphics'}
  {'head': 'computer graphics', 'type': 'part of', 'tail': 'computer science'}
  {'head': 'Computer Graphics 15462-662', 'type': 'main subject', 'tail': 'computer science'}
