## Practice


Note: this notebook must be run from the root directory of the LinkPredict repository.

In [11]:
from src.data import extract_vault, holdout, delete_nodes_without_links

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import markdown as md
from bs4 import BeautifulSoup
import PyPDF2
from PIL import Image

import IPython
from pyvis.network import Network

import math
import wikipedia

import os, shutil
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from tqdm import tqdm
from copy import deepcopy
import pandas as pd
from pathlib import Path

In [12]:
DIR = os.getcwd()
DIR

'/home/demin/prac/LinkPredict'

1. Extract data from the vault

In [13]:
vault = extract_vault('./vaultes/obsidian-hub/')

Front matter not populated for radekkozak.md: ScannerError('while scanning for the next token', <yaml._yaml.Mark object at 0x7fcf8288dcb0>, 'found character that cannot start any token', <yaml._yaml.Mark object at 0x7fcf8288dfd0>)
Front matter not populated for radekkozak.md: ScannerError('while scanning for the next token', <yaml._yaml.Mark object at 0x7fcf8288fb50>, 'found character that cannot start any token', <yaml._yaml.Mark object at 0x7fcf8288e5c0>)
Front matter not populated for jaynguyens.md: ScannerError('while scanning for the next token', <yaml._yaml.Mark object at 0x7fcf827a0f40>, 'found character that cannot start any token', <yaml._yaml.Mark object at 0x7fcf827a15d0>)
Front matter not populated for jaynguyens.md: ScannerError('while scanning for the next token', <yaml._yaml.Mark object at 0x7fcf827bff10>, 'found character that cannot start any token', <yaml._yaml.Mark object at 0x7fcf827a2610>)
Front matter not populated for beaussan.md: ScannerError('while scanning for

In [15]:
df = vault.get_all_file_metadata()
df

Unnamed: 0_level_0,rel_filepath,abs_filepath,file_exists,n_backlinks,n_wikilinks,n_tags,n_embedded_files,modified_time,graph_category
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
,,,False,18,,,,NaT,nonexistent
00 - Contribute to the Obsidian Hub/03 Contributor Notes/03.02 Design Decisions/Content People,,,False,1,,,,NaT,nonexistent
02 - Community Expansions/02.05 All Community Expansions/Plugins/obsidian-pangu,,,False,1,,,,NaT,nonexistent
02 - Community Expansions/02.05 All Community Expansions/Plugins/obsidian-sidebar-expand-on-hover,,,False,1,,,,NaT,nonexistent
vertis,01 - Community/People/vertis.md,vaultes/obsidian-hub/01 - Community/People/ver...,True,2,1.0,0.0,0.0,2023-08-09 06:29:53.315518618,note
...,...,...,...,...,...,...,...,...,...
css-obsidian-layout.png,,,False,1,,,,NaT,nonexistent
Everblush,,,False,2,,,,NaT,nonexistent
01 - Community/People/hipstersmoothie,,,False,4,,,,NaT,nonexistent
signynt,,,False,1,,,,NaT,nonexistent


In [13]:
delete_nodes_without_links(vault.graph)
df.index.value_counts()

deleted 72 nodes from graph


file
Everblush                                                                                     2
g-bauer                                                                                       2
01 - Community/People/hipstersmoothie                                                         2
signynt                                                                                       2
css-obsidian-layout.png                                                                       2
                                                                                             ..
02 - Community Expansions/02.05 All Community Expansions/Plugins/obsidian-file-link           1
debanjandhar12                                                                                1
02 - Community Expansions/02.05 All Community Expansions/Plugins/find-unlinked-files          1
02 - Community Expansions/02.05 All Community Expansions/Plugins/obsidian-sequence-hotkeys    1
omnisearch                         

In [14]:
# quite sparse!
A = nx.adjacency_matrix(vault.graph)
A.nnz / A.shape[0] ** 2

0.00037938032721357667

2. Construct new relations between nodes

In [15]:
# Parse text in the files from the vault.
# Adapted from bert_lightfm.ipynb

def get_text_from_html(html):
    soup = BeautifulSoup(html, features="html.parser")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    # get text
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text

def get_text_from_markdown(filepath):
    with open(filepath, 'r') as f:
        text = f.read()
        html = md.markdown(text)
        text = get_text_from_html(html)
    return text

def get_text_from_pdf(filepath):
    with open(filepath, 'rb') as f:
        pdfReader = PyPDF2.PdfReader(f)
        text = ""
        
        for page in pdfReader.pages:
            text += page.extract_text()
            all_images.extend(page.images)
    return text

def get_images_from_pdf(filepath):
    with open(filepath, 'rb') as f:
        pdfReader = PyPDF2.PdfReader(f)
        images = []
        
        for page in pdfReader.pages:
            for image_file in page.images:
                images.append(Image.open(io.BytesIO(image_file.data)))
    return images

In [17]:
# Parse output of the llm.
# Adapted from https://huggingface.co/Babelscape/rebel-large
def extract_relations_from_model_output(text):
    relations = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    text_replaced = text.replace("<s>", "").replace("<pad>", "").replace("</s>", "")
    for token in text_replaced.split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                relations.append({
                    'head': subject.strip(),
                    'type': relation.strip(),
                    'tail': object_.strip()
                })
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                relations.append({
                    'head': subject.strip(),
                    'type': relation.strip(),
                    'tail': object_.strip()
                })
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        relations.append({
            'head': subject.strip(),
            'type': relation.strip(),
            'tail': object_.strip()
        })
    return relations

# Returns the wiki-page of the given request or None
def get_wikipedia_data(self, candidate_entity):
    try:
        page = wikipedia.page(candidate_entity, auto_suggest=False)
        entity_data = {
            "title": page.title,
            "url": page.url,
            "summary": page.summary
        }
        return entity_data
    except:
        return None

In [18]:
class KnowledgePiece:
    def __init__(self):
        self.entities = {}
        self.relations = []

    def get_unique_handle(self, candidate_entity, use_wiki=False):
        if use_wiki:
            handle = get_wikipedia_data(self, candidate_entity)
        else:
            handle = {
                "title": candidate_entity
            }
        return handle

    def are_relations_equal(self, r1, r2):
        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])

    def exists_relation(self, r1):
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def add_entity(self, e):
        self.entities[e["title"]] = {k:v for k,v in e.items() if k != "title"}

    def merge_relations(self, r1):
        r2 = [r for r in self.relations
              if self.are_relations_equal(r1, r)][0]
        spans_to_add = [span for span in r1["meta"]["spans"]
                        if span not in r2["meta"]["spans"]]
        r2["meta"]["spans"] += spans_to_add
            
    def add_relation(self, r):
        candidate_entities = [r["head"], r["tail"]]
        entities = [self.get_unique_handle(ent) for ent in candidate_entities]

        # if one entity does not exist, stop
        if any(ent is None for ent in entities):
            return

        # manage new entities
        for e in entities:
            self.add_entity(e)

        # rename relation entities with their wikipedia titles
        r["head"] = entities[0]["title"]
        r["tail"] = entities[1]["title"]

        # manage new relation
        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            self.merge_relations(r)

    def print(self):
        print("Entities:")
        for e in self.entities.items():
            print(f"  {e}")
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")

In [19]:
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

In [20]:
# Tokenize text, extract entities, obtain relations.
# If the text is too long, it is split into spans.
# Returns a KnowledgePiece object.
#
# Adapted from https://medium.com/nlplanet/building-a-knowledge-base-from-texts-a-full-practical-example-8dbbffb912fa
def from_text_to_kg(text, span_length=128, verbose=False):
    # tokenize whole text
    inputs = tokenizer([text], return_tensors="pt")

    # compute span boundaries
    num_tokens = len(inputs["input_ids"][0])
    if verbose:
        print(f"Input has {num_tokens} tokens")
    num_spans = math.ceil(num_tokens / span_length)
    if verbose:
        print(f"Input has {num_spans} spans")
    overlap = math.ceil((num_spans * span_length - num_tokens) / 
                        max(num_spans - 1, 1))
    spans_boundaries = []
    start = 0
    for i in range(num_spans):
        spans_boundaries.append([start + span_length * i,
                                 start + span_length * (i + 1)])
        start -= overlap
    if verbose:
        print(f"Span boundaries are {spans_boundaries}")

    # transform input with spans
    tensor_ids = [inputs["input_ids"][0][boundary[0]:boundary[1]]
                  for boundary in spans_boundaries]
    tensor_masks = [inputs["attention_mask"][0][boundary[0]:boundary[1]]
                    for boundary in spans_boundaries]
    inputs = {
        "input_ids": torch.stack(tensor_ids),
        "attention_mask": torch.stack(tensor_masks)
    }

    # generate relations
    num_return_sequences = 3
    gen_kwargs = {
        "max_length": 256,
        "length_penalty": 0,
        "num_beams": 3,
        "num_return_sequences": num_return_sequences
    }
    generated_tokens = model.generate(
        **inputs,
        **gen_kwargs,
    )

    # decode relations
    decoded_preds = tokenizer.batch_decode(generated_tokens,
                                           skip_special_tokens=False)

    # create kg
    kg = KnowledgePiece()
    i = 0
    for sentence_pred in decoded_preds:
        current_span_index = i // num_return_sequences
        relations = extract_relations_from_model_output(sentence_pred)
        for relation in relations:
            relation["meta"] = {
                "spans": [spans_boundaries[current_span_index]]
            }
            kg.add_relation(relation)
        i += 1

    return kg

In [21]:
# Sanity test

text = "Grobner bases were introduced by Bruno Buchberger in his 1965 Ph.D. thesis, \
    which also included an algorithm to compute them (Buchberger's algorithm).     \
    He named them after his advisor Wolfgang Gröbner. In 2007, Buchberger received \
    the Association for Computing Machinery's Paris Kanellakis Theory and Practice \
    Award for this work. However, the Russian mathematician Nikolai Günther had introduced \
    a similar notion in 1913, published in various Russian mathematical journals. These \
    papers were largely ignored by the mathematical community until their rediscovery in \
    1987 by Bodo Renschuch et al.[2] An analogous concept for multivariate power series was \
    developed independently by Heisuke Hironaka in 1964, who named them standard bases. \
    This term has been used by some authors to also denote Gröbner bases."

kg = from_text_to_kg(text, verbose=True)
kg.print()

Input has 210 tokens
Input has 2 spans
Span boundaries are [[0, 128], [82, 210]]
Entities:
  ("Buchberger's algorithm", {})
  ('Bruno Buchberger', {})
  ('Grobner base', {})
  ('standard bases', {})
  ('Heisuke Hironaka', {})
  ('standard base', {})
Relations:
  {'head': "Buchberger's algorithm", 'type': 'named after', 'tail': 'Bruno Buchberger', 'meta': {'spans': [[0, 128]]}}
  {'head': 'Grobner base', 'type': 'discoverer or inventor', 'tail': 'Bruno Buchberger', 'meta': {'spans': [[0, 128]]}}
  {'head': 'Grobner base', 'type': 'named after', 'tail': 'Bruno Buchberger', 'meta': {'spans': [[0, 128]]}}
  {'head': 'standard bases', 'type': 'discoverer or inventor', 'tail': 'Heisuke Hironaka', 'meta': {'spans': [[82, 210]]}}
  {'head': 'standard bases', 'type': 'named after', 'tail': 'Heisuke Hironaka', 'meta': {'spans': [[82, 210]]}}
  {'head': 'standard base', 'type': 'discoverer or inventor', 'tail': 'Heisuke Hironaka', 'meta': {'spans': [[82, 210]]}}


In [22]:
vault.graph.edges

OutMultiEdgeView([('manogna4', 'koncham-workspace', 0), ('Plugins for Writers', 'dangerzone-writing-plugin', 0), ('Plugins for Writers', 'obsidian-fountain', 0), ('Plugins for Writers', 'longform', 0), ('Plugins for Writers', 'obsidian-paper-cut', 0), ('Plugins for Writers', 'cm-typewriter-scroll-obsidian', 0), ('Plugins for Writers', 'obsidian-vale', 0), ('Plugins for Writers', 'Dictionary and Spellchecking Plugins', 0), ('decatetsu', 'local-quotes', 0), ('2021-09-11 Edit-Preview Autoswitcher & a Collaboration Guide', 'dataview', 0), ('2021-09-11 Edit-Preview Autoswitcher & a Collaboration Guide', 'iOS Shortcuts', 0), ('2021-09-11 Edit-Preview Autoswitcher & a Collaboration Guide', 'obsidian-reset-font-size', 0), ('2021-09-11 Edit-Preview Autoswitcher & a Collaboration Guide', 'podcast-note', 0), ('2021-09-11 Edit-Preview Autoswitcher & a Collaboration Guide', 'obsidian-stille', 0), ('2021-09-11 Edit-Preview Autoswitcher & a Collaboration Guide', 'obsidian-carry-forward', 0), ('2021-0

In [102]:
def clean_label(label):
    label = label.strip().replace(' ', '_')
    return label

def train_test_val_split(df, **kwargs):
    test_size = kwargs.get('test_size', 0.2)
    val_size = kwargs.get('val_size', 0.1)
    shuffle = kwargs.get('shuffle', False)
    df_train, df_test = train_test_split(df, test_size=test_size, shuffle=shuffle)
    df_train, df_val = train_test_split(df_train, test_size=val_size * 1 / (1 - test_size), shuffle=shuffle)
    return df_train, df_test, df_val

In [111]:
class KnowledgeGraph:
    def __init__(self, nodes):
        self.nodes = nodes
        self.relations = {}
    
    def has_node(self, node):
        return node in self.nodes
    
    def add_relation(self, rel):
        if not (self.has_node(rel['source']) and self.has_node(rel['target'])):
            return
        s, t, r = rel['source'], rel['target'], rel['relation']
        if not (s, t) in self.relations:
            self.relations[(s, t)] = ""
        self.relations[(s, t)] = self.relations[(s, t)] + "/" + r
        
    def print(self, debug=False):
        print(f"There are {len(self.nodes)} nodes")
        if debug:
            for k, v in self.nodes.items():
                print("------------------")
                print(f"{k}")
                v.print()
        print(f"There are {len(self.relations)} relations")
        if debug:
            print(self.relations)
            
    def to_df(self):
        rows = []
        for k, v in self.relations.items():
            rows.append([k[0], v, k[1]])
        df = pd.DataFrame(rows, columns=['source', 'target', 'relation'])
        return df
    
    def dump_to_dir(self, dir):
        df = self.to_df()
        train, test, valid = train_test_val_split(df)
        Path(DIR + f"/{dir}/").mkdir(parents=True, exist_ok=True)

        np.savetxt(DIR + f"/{dir}/test.txt", test.values, fmt='%s', delimiter='\t')
        np.savetxt(DIR + f"/{dir}/valid.txt", valid.values, fmt='%s', delimiter='\t')
        np.savetxt(DIR + f"/{dir}/train.txt", train.values, fmt='%s', delimiter='\t')
        

In [112]:
def from_documents_to_kg(df, vault, verbose=False):
    nodes = {}
    for index, node in tqdm(df.iterrows()):
        node_title = str(node.rel_filepath)
        node_path = node.abs_filepath
        extension = str(node_path).split('.')[-1]
        node_title = "".join(node_title.split('.')[:-1])
        if node.file_exists and extension == 'md':
            text = get_text_from_markdown(node_path)
            nodes[node_title] = from_text_to_kg(text)
        else:
            nodes[node_title] = KnowledgePiece()
    kg = KnowledgeGraph(nodes)
    return kg

In [113]:
# Sanity check
df_small = df.sample(3)

kg = from_documents_to_kg(df_small, vault)
kg.print(debug=True)

3it [00:02,  1.23it/s]

There are 3 nodes
------------------

Entities:
Relations:
------------------
02 - Community Expansions/0205 All Community Expansions/Plugins/obsidian-emoji-toolbar
Entities:
  ('Emoji Toolbar', {})
  ('Emoji', {})
  ('emojis', {})
  ('repository', {})
  ('GitHub', {})
  ('github', {})
Relations:
  {'head': 'Emoji Toolbar', 'type': 'use', 'tail': 'Emoji', 'meta': {'spans': [[0, 128]]}}
  {'head': 'Emoji Toolbar', 'type': 'use', 'tail': 'emojis', 'meta': {'spans': [[0, 128]]}}
  {'head': 'repository', 'type': 'part of', 'tail': 'GitHub', 'meta': {'spans': [[95, 223]]}}
  {'head': 'github', 'type': 'instance of', 'tail': 'repository', 'meta': {'spans': [[95, 223]]}}
  {'head': 'repository', 'type': 'uses', 'tail': 'GitHub', 'meta': {'spans': [[95, 223]]}}
------------------
02 - Community Expansions/0205 All Community Expansions/Plugins/note-synchronizer
Entities:
  ('Anki', {})
  ('softwares', {})
  ('Obsidian notes', {})
  ('Obsidian', {})
  ('Zettelkasten', {})
Relations:
  {'head': '




In [106]:
def add_connectivity_relations(kg, vault):
    rows_list = []
    for node in vault.graph.nodes:
        if node == 'nan':
            continue
        for nbr, eattr in vault.graph.adj[node].items():
            rel1 = {'source': node, 'relation': f'reachable', 'target': nbr}
            rel2 = {'source': nbr, 'relation': 'reachable_reversed', 'target': node}
            kg.add_relation(rel1)
            kg.add_relation(rel2)
            
    return kg

In [107]:
def add_semantic_relations(kg, vault):
    rows_list = []
    for node1 in vault.graph.nodes:
        for node2 in vault.graph.nodes:
            if not (kg.has_node(node1) and kg.has_node(node2)):
                continue
            if 'nan' in (node1, node2):
                continue
            if node1 == node2:
                continue
            entities1 = list(kg.nodes[node1].entities.keys())
            entities2 = list(kg.nodes[node2].entities.keys())
            shared_entities = np.intersect1d(entities1, entities2)
            for entity in shared_entities:
                rel = {'source': node1, 'relation': f'{entity}', 'target': node2}
                kg.add_relation(rel)
            
    return kg

In [108]:
from sklearn.feature_extraction.text import TfidfVectorizer

def add_similarity_relations(kg, vault):
    pass

In [109]:
add_connectivity_relations(kg, vault)
add_semantic_relations(kg, vault)
kg.print(debug=True)

There are 9 nodes
------------------

Entities:
Relations:
------------------
01 - Community/People/kartik-karz
Entities:
  ('GitHub', {})
  ('repository', {})
  ('repositories', {})
  ('Themes', {})
  ('plugins', {})
  ('kartik-karz', {})
Relations:
  {'head': 'GitHub', 'type': 'instance of', 'tail': 'repository', 'meta': {'spans': [[0, 128]]}}
  {'head': 'GitHub', 'type': 'instance of', 'tail': 'repositories', 'meta': {'spans': [[0, 128]]}}
  {'head': 'GitHub', 'type': 'use', 'tail': 'repositories', 'meta': {'spans': [[0, 128]]}}
  {'head': 'Themes', 'type': 'subclass of', 'tail': 'plugins', 'meta': {'spans': [[19, 147]]}}
  {'head': 'kartik-karz', 'type': 'instance of', 'tail': 'repositories', 'meta': {'spans': [[19, 147]]}}
------------------
01 - Community/People/tomzorz
Entities:
  ('GitHub', {})
  ('website', {})
  ('repository', {})
  ('repo', {})
Relations:
  {'head': 'GitHub', 'type': 'instance of', 'tail': 'website', 'meta': {'spans': [[0, 128]]}}
  {'head': 'GitHub', 'type'

In [114]:
# adapted from https://medium.com/nlplanet/building-a-knowledge-base-from-texts-a-full-practical-example-8dbbffb912fa
def save_network_html(kg, filename="network.html"):
    # create network
    net = Network(directed=True, width="700px", height="700px", bgcolor="#eeeeee")

    truncate_label = lambda label: label[:10] + "-" + label[-10:]
    
    # nodes
    color_entity = "#00FF00"
    for e in list(kg.nodes.keys()):
        net.add_node(truncate_label(e), shape="circle", color=color_entity)

    # edges
    for k, v in kg.relations.items():
        net.add_edge(truncate_label(k[0]), truncate_label(k[1]),
                    title=v, label=v)
        
    # save network
    net.repulsion(
        node_distance=200,
        central_gravity=0.2,
        spring_length=200,
        spring_strength=0.05,
        damping=0.09
    )
    net.set_edge_smooth('dynamic')
    net.save_graph(filename,) # notebook=False)

In [115]:
df_small = df.sample(100)

kg = from_documents_to_kg(df_small, vault)
add_connectivity_relations(kg, vault)
add_semantic_relations(kg, vault)
kg.print()

100it [01:19,  1.26it/s]


There are 49 nodes
There are 332 relations


In [117]:
filename = "network.html"
save_network_html(kg, filename=filename)

In [118]:
kg.dump_to_dir('dummy-dummy')