**This notebook constructs one knowledge graph per KB file, so each graph contains only the information from that particular document. All graphs are saved as HTML files.**

**The output from this notebook is for visualization in the report.**

In [1]:
from pyvis.network import Network
import networkx as nx
import os
import pickle

### Load KB file

In [2]:
def load_kb(filename):
    """Read a pickled knowledge base from ``filename`` and return it.

    Parameters:
        filename (str): Path of the pickle file to read.

    Returns:
        The object stored in the file.

    NOTE: unpickling can execute arbitrary code, so only load KB files that
    come from a trusted source.
    """
    with open(filename, "rb") as kb_file:
        return pickle.load(kb_file)

In [3]:
#declare the class KB
import wikipedia

class KB():
    """Knowledge base of entities, the relations between them, and sources.

    Attributes:
        entities:  ``{entity_title: {...}}`` -- entity data without the title.
        relations: list of dicts with keys ``head``, ``type``, ``tail`` and
            ``meta`` (``{article_url: {"spans": [...]}}``).
        sources:   ``{article_url: {"article_title": ...,
            "article_publish_date": ...}}``.
    """

    def __init__(self):
        self.entities = {}   # { entity_title: {...} }
        self.relations = []  # [ head: entity_title, type: ..., tail: entity_title,
                             #   meta: { article_url: { spans: [...] } } ]
        self.sources = {}    # { article_url: {...} }

    def merge_with_kb(self, kb2):
        """Add every relation of ``kb2``, with its source data, to this KB.

        Each relation is passed to ``add_relation``, which renames its
        ``head``/``tail`` in place, so the relation dicts held by ``kb2`` are
        modified as a side effect.
        """
        for r in kb2.relations:
            # Only the first article URL of the relation is used as its source.
            article_url = list(r["meta"])[0]
            source_data = kb2.sources[article_url]
            self.add_relation(r, source_data["article_title"],
                              source_data["article_publish_date"])

    def are_relations_equal(self, r1, r2):
        """Return True when both relations share head, type and tail."""
        return all(r1[attr] == r2[attr] for attr in ("head", "type", "tail"))

    def exists_relation(self, r1):
        """Return True when an equal relation is already stored."""
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def merge_relations(self, r2):
        """Fold the ``meta`` of ``r2`` into the equal relation already stored.

        Raises:
            IndexError: if no stored relation equals ``r2``.
        """
        r1 = [r for r in self.relations
              if self.are_relations_equal(r2, r)][0]

        article_url = list(r2["meta"])[0]
        if article_url not in r1["meta"]:
            # Relation seen in a new article: adopt that article's meta entry.
            r1["meta"][article_url] = r2["meta"][article_url]
        else:
            # Same article: append only the spans that are not recorded yet.
            known_spans = r1["meta"][article_url]["spans"]
            spans_to_add = [span for span in r2["meta"][article_url]["spans"]
                            if span not in known_spans]
            r1["meta"][article_url]["spans"] += spans_to_add

    def get_wikipedia_data(self, candidate_entity):
        """Look up ``candidate_entity`` on Wikipedia.

        Returns:
            dict: ``{"title", "url", "summary"}`` taken from the Wikipedia
            page. If the lookup raises, a placeholder is returned instead
            whose title is ``candidate_entity + "*"`` and whose url and
            summary are empty strings; the trailing ``*`` distinguishes it
            from a title returned by Wikipedia.
        """
        try:
            page = wikipedia.page(candidate_entity, auto_suggest=False)
            # Attribute access stays inside the ``try`` so that a failure
            # while reading the page also falls back to the placeholder.
            return {
                "title": page.title,
                "url": page.url,
                "summary": page.summary,
            }
        except Exception:
            # Best-effort lookup: any library/network error yields the
            # placeholder. ``Exception`` (not a bare ``except``) lets
            # KeyboardInterrupt/SystemExit still stop a long batch run.
            return {
                "title": candidate_entity + "*",
                "url": "",
                "summary": "",
            }

    def add_entity(self, e):
        """Store entity ``e`` under its title, keeping all other keys as data."""
        self.entities[e["title"]] = {k: v for k, v in e.items() if k != "title"}

    def add_relation(self, r, article_title, article_publish_date):
        """Register relation ``r`` together with its entities and source.

        ``r`` is modified in place: its ``head`` and ``tail`` are replaced by
        the titles returned by ``get_wikipedia_data``.

        Parameters:
            r (dict): relation with ``head``, ``type``, ``tail`` and ``meta``.
            article_title: title stored for the relation's source article.
            article_publish_date: publish date stored for that article.
        """
        # check on wikipedia
        candidate_entities = [r["head"], r["tail"]]
        entities = [self.get_wikipedia_data(ent) for ent in candidate_entities]

        # get_wikipedia_data always returns a dict here, so this guard only
        # triggers if that method is overridden to return None for unknown
        # entities.
        if any(ent is None for ent in entities):
            return

        # manage new entities
        for e in entities:
            self.add_entity(e)

        # rename relation entities with their wikipedia titles
        r["head"] = entities[0]["title"]
        r["tail"] = entities[1]["title"]

        # add source if not in kb
        article_url = list(r["meta"])[0]
        if article_url not in self.sources:
            self.sources[article_url] = {
                "article_title": article_title,
                "article_publish_date": article_publish_date
            }

        # manage new relation
        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            self.merge_relations(r)

    def print(self):
        """Print all entities, relations and sources, one per line."""
        print("Entities:")
        for e in self.entities.items():
            print(f"  {e}")
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")
        print("Sources:")
        for s in self.sources.items():
            print(f"  {s}")

### Construct knowledge graph to HTML format

In [4]:
def contruct_graph(kb, output_file):
    """
    Render a knowledge base as a directed pyvis graph and save it as HTML.

    Parameters:
        kb (KB): Knowledge base; its ``entities`` keys become nodes and each
            item of ``relations`` becomes an edge from ``head`` to ``tail``.
        output_file (str): The name of the HTML file to save the visualization.

    Returns:
        None
    """
    graph = Network(
        directed=True,
        width="1200px",
        height="1000px",
        bgcolor="#FFFFFF",
        notebook=True,
    )

    # One node per entity title (iterating the dict yields its keys).
    for entity_title in kb.entities:
        graph.add_node(entity_title)
        print("add note", entity_title)

    # One edge per relation; the relation type is used as both the edge's
    # title and its label.
    for relation in kb.relations:
        head, tail, rel_type = relation["head"], relation["tail"], relation["type"]
        graph.add_edge(head, tail, title=rel_type, label=rel_type)
        print("add relation", head, " ", tail)

    # Layout / rendering options
    graph.repulsion(node_distance=230, damping=0.01)
    graph.set_edge_smooth('dynamic')

    # Write the HTML file
    graph.show(output_file)

### Run the function on all KB files

In [5]:
def process_all_kb_files(kb_folder, graph_folder):
    """
    Process all KB files in the specified folder and generate knowledge graphs.

    Every entry of ``kb_folder`` whose name ends with ``.kb`` is loaded with
    ``load_kb`` and rendered with ``contruct_graph`` into
    ``<graph_folder>/<base name>.html``. Entries are handled in sorted name
    order so that repeated runs produce the same log.

    Parameters:
        kb_folder (str): The folder containing KB files.
        graph_folder (str): The folder to save the generated knowledge graphs.
    """
    # Ensure the output graph folder exists
    os.makedirs(graph_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for kb_file in sorted(os.listdir(kb_folder)):
        if not kb_file.endswith(".kb"):
            continue

        kb_path = os.path.join(kb_folder, kb_file)
        graph_file = os.path.splitext(kb_file)[0] + ".html"
        graph_path = os.path.join(graph_folder, graph_file)

        # Load KB
        print(f"Processing KB file: {kb_path}")
        kb = load_kb(kb_path)

        # Generate and save knowledge graph
        print(f"Generating knowledge graph for: {graph_file}")
        contruct_graph(kb, output_file=graph_path)

        print(f"Knowledge graph saved at: {graph_path}")
        print("=" * 50)

In [6]:
# Folder scanned for ``.kb`` files, and folder that receives one HTML graph
# per KB file (both are passed to process_all_kb_files).
kb_folder = 'separate_kb_folder/'
graph_folder = 'separate_graph_folder/'

In [7]:
# Build and save an HTML knowledge graph for every KB file in kb_folder.
process_all_kb_files(kb_folder, graph_folder)

Processing KB file: separate_kb_folder/Raymond Kurzweil 1 CoreNLP.kb
Generating knowledge graph for: Raymond Kurzweil 1 CoreNLP.html
Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 
add note Ray Kurzweil
add note February 12, 1948*
add note Computer scientist
add note National Medal of Technology and Innovation
add note United States
add note Lemelson–MIT Prize
add note National Inventors Hall of Fame
add note Thomas Edison
add note rightful heir*
add relation Ray Kurzweil   February 12, 1948*
add relation Ray Kurzweil   Computer scientist
add relation Ray Kurzweil   National Medal of Technology and Innovation
add relation National Medal of Technology and Innovation   United States
add relation Ray Kurzweil   Lemelson–MIT Prize
add relation Ray Kurzweil   National Inventors Hall of Fame
add relation National Inventors Hall of Fame   Ray Kurzweil
add relation National Inventors Hall of Fame   United States
add relation Thomas Edison   rightful heir*
Kno