In [36]:
import os
import shlex
import subprocess
import tempfile

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from kgtk.configure_kgtk_notebooks import ConfigureKGTK
from kgtk.functions import kgtk, kypher

In [2]:
# Parameters

# Folder on the local machine that holds the input KGTK graph files
input_path = "/Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled"
# input_path = "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold-profiled"

# Folder on the local machine where the project output and temporary folders are created
output_path = "/Volumes/saggu-ssd/projects"
# output_path = "/Users/pedroszekely/Downloads/kypher/projects"

# Location of the local KGTK checkout, handed to ConfigureKGTK in the next cell.
# It was previously read without ever being assigned, so a fresh kernel hit a NameError.
kgtk_path = "/Users/amandeep/GitHub/kgtk"

project_name = "tutorial-graph-embeddings"

In [3]:
# Names of the graph files this notebook uses. The environment listing printed further
# down shows one variable per name (e.g. `label` -> labels.en.tsv.gz under `input_path`).
files = [
    "all",
    "label",
    "alias",
    "description",
    "item",
    "qualifiers",
    "p31",
    "p279star"
]
# NOTE(review): `kgtk_path` has to exist in the kernel namespace before this cell runs
# (Parameters cell or papermill injection); otherwise this line raises NameError.
ck = ConfigureKGTK(files, kgtk_path=kgtk_path)
# Point the configuration at the input graph folder and the per-project output folder.
ck.configure_kgtk(input_graph_path=input_path,
                  output_path=output_path,
                  project_name=project_name)

User home: /Users/amandeep
Current dir: /Users/amandeep/Github/kgtk-notebooks/tutorial
KGTK dir: /Users/amandeep/GitHub/kgtk
Use-cases dir: /Users/amandeep/GitHub/kgtk/use-cases


In [4]:
# Show the environment variables (OUT, TEMP, GRAPH, kypher, one per graph file, ...)
# that the shell cells below reference as $OUT, $TEMP, $GRAPH, $item, etc.
ck.print_env_variables()

OUT: /Volumes/saggu-ssd/projects/tutorial-graph-embeddings
EXAMPLES_DIR: /Users/amandeep/GitHub/kgtk/examples
kypher: kgtk query --graph-cache /Volumes/saggu-ssd/projects/tutorial-graph-embeddings/temp.tutorial-graph-embeddings/wikidata.sqlite3.db
GRAPH: /Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled
TEMP: /Volumes/saggu-ssd/projects/tutorial-graph-embeddings/temp.tutorial-graph-embeddings
STORE: /Volumes/saggu-ssd/projects/tutorial-graph-embeddings/temp.tutorial-graph-embeddings/wikidata.sqlite3.db
kgtk: kgtk
USE_CASES_DIR: /Users/amandeep/GitHub/kgtk/use-cases
all: /Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled/all.tsv.gz
label: /Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled/labels.en.tsv.gz
alias: /Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled/aliases.en.tsv.gz
description: /Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled/descriptions.en.tsv.gz
item: /Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arn

In [49]:
# Load the configured graph files into the Kypher graph cache. The echoed command below
# is a single `kgtk query` with one `-i <file> --as <alias>` pair per graph file.
ck.load_files_into_cache()

kgtk query --graph-cache /Volumes/saggu-ssd/projects/tutorial-graph-embeddings/temp.tutorial-graph-embeddings/wikidata.sqlite3.db -i "/Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled/all.tsv.gz" --as all  -i "/Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled/labels.en.tsv.gz" --as label  -i "/Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled/aliases.en.tsv.gz" --as alias  -i "/Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled/descriptions.en.tsv.gz" --as description  -i "/Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled/claims.wikibase-item.tsv.gz" --as item  -i "/Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled/qualifiers.tsv.gz" --as qualifiers  -i "/Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled/derived.P31.tsv.gz" --as p31  -i "/Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled/derived.P279star.tsv.gz" --as p279star  --limit 3
node1	label	node2	id	node2;wikidatatype
P10	a

## Compute ComplEx Graph Embeddings

In this notebook we will compute graph embeddings for the `arnold` subgraph using the `kgtk graph-embeddings` command, and demonstrate a few applications.

The first step is to augment the graph with the `derived.P31x.tsv` file, which treats the occupations of humans as `instance of (P31)` values.

In [27]:
# Concatenate the `item` claims file with the derived P31x edges into one augmented edge file.
# NOTE(review): the result is written into the input dataset folder ($GRAPH), not $OUT/$TEMP.
!kgtk cat -i $item \
-i $GRAPH/derived.P31x.tsv \
-o $GRAPH/claims.wikibase-item.augmented.tsv.gz

#### Compute the `ComplEx` graph embeddings with vectors of dimension `30`

In [91]:
# Train ComplEx embeddings of dimension 30 on the augmented edge file.
# `-ot kgtk` writes the vectors as a KGTK edge file; progress details go to $TEMP/ge.log.txt.
!kgtk --debug graph-embeddings \
-op ComplEx \
--dimension 30 \
-ot kgtk \
-i $GRAPH/claims.wikibase-item.augmented.tsv.gz \
-o $OUT/arnold.embeddings.augmented.30.tsv \
--log $TEMP/ge.log.txt

In Processing, Please go to /Volumes/saggu-ssd/projects/tutorial-graph-embeddings/temp.tutorial-graph-embeddings/ge.log.txt to check details
Processed Finished.


#### The output is in `kgtk` format. Convert it to `word2vec` format for `gensim` similarity computation

In [None]:
def _iter_kgtk_embedding_rows(path):
    """Yield ``(node1, node2)`` for each data row of a KGTK embeddings file, skipping the header."""
    with open(path, "r", encoding="utf-8") as kgtk_file:
        next(kgtk_file, None)  # header row; the default keeps an empty file from raising
        for line in kgtk_file:
            line = line.rstrip("\r\n")
            if not line:
                continue  # tolerate blank lines (e.g. a trailing one)
            items = line.split("\t")
            yield items[0], items[2]


def convert_kgtk_to_w2v(input_path, output_path):
    """
    Convert a KGTK file (node1/label/node2) that contains embeddings to the w2v text format.

    Each input row carries the node id in ``node1`` and a comma-separated vector in
    ``node2``. The output starts with a ``"<vector count> <vector length>"`` line,
    followed by one ``"<node1> <v1> <v2> ..."`` line per vector.

    The input is read twice (once to count, once to copy) so the whole embedding
    table never has to be held in memory.

    Parameters
    ----------
    input_path : str
        Path of the tab-separated KGTK embeddings file; the first line is a header.
    output_path : str
        Path of the w2v text file to write (overwritten if it exists).
    """
    vector_count = 0
    vector_length = 0

    # First pass: the w2v header needs the number of vectors and their dimension.
    # The dimension is taken from the first row only.
    for _, vector in _iter_kgtk_embedding_rows(input_path):
        if vector_count == 0:
            vector_length = len(vector.split(","))
        vector_count += 1

    # Second pass: copy the vectors. The newline is written explicitly so the output
    # stays well-formed when the input has columns after node2 or lacks a final newline.
    with open(output_path, "w", encoding="utf-8") as w2v_file:
        w2v_file.write("{} {}\n".format(vector_count, vector_length))
        for qnode, vector in _iter_kgtk_embedding_rows(input_path):
            w2v_file.write("{} {}\n".format(qnode, vector.replace(",", " ")))

In [5]:
# Write the w2v-format copy of the embeddings next to the KGTK-format file in $OUT.
convert_kgtk_to_w2v(f"{os.environ['OUT']}/arnold.embeddings.augmented.30.tsv", f"{os.environ['OUT']}/arnold.embeddings.augmented.30.w2v.tsv")

#### Load the vectors into `gensim`

In [9]:
# binary=False because the converted file written above is plain text, not the binary w2v format.
ge_vectors = KeyedVectors.load_word2vec_format(f"{os.environ['OUT']}/arnold.embeddings.augmented.30.w2v.tsv", binary=False)

Define a function to compute the `topn` similar vectors, and get the labels and descriptions of the matching Qnodes.

In [10]:
def kgtk_most_similar(
    vectors,
    positive,
    relation_label="similarity_score",
    kg_path=None,
    add_label_description=True,
    output_path=None,
    topn=25,
):
    """
    Find the ``topn`` entries most similar to ``positive`` and return them as KGTK-style edges.

    Parameters
    ----------
    vectors
        Object exposing ``most_similar(positive=..., topn=...)`` that returns
        ``(key, similarity)`` pairs (here: the gensim ``KeyedVectors``).
    positive
        Passed through unchanged to ``vectors.most_similar``.
    relation_label : str
        Value of the ``label`` column of the similarity edges. When labels and
        descriptions are added, the query returns the literal ``"similarity"`` instead.
    kg_path
        Only checked for truthiness: together with ``add_label_description`` it
        switches on the label/description lookup. The lookup itself runs the command
        stored in the ``kypher`` environment variable.
    add_label_description : bool
        When true (and ``kg_path`` is set), join the results with the ``label`` and
        ``description`` graphs and sort by descending similarity.
    output_path : str, optional
        If given, the tab-separated lines are written to this file and ``None`` is
        returned; otherwise a ``pandas.DataFrame`` is returned.
    topn : int
        Number of neighbours to request.

    Returns
    -------
    pandas.DataFrame or None
        All values are strings, split on tabs from the text lines.

    Raises
    ------
    RuntimeError
        If the lookup is requested but the ``kypher`` environment variable is not set.
    subprocess.CalledProcessError
        If the Kypher query exits with a non-zero status.
    """
    # Lines are kept WITHOUT trailing newlines in both branches, so the file writer and
    # the DataFrame builder below see the same shape of data either way.
    lines = ["node1\tlabel\tnode2"]
    for (qnode, similarity) in vectors.most_similar(positive=positive, topn=topn):
        lines.append("{}\t{}\t{}".format(qnode, relation_label, similarity))

    if add_label_description and kg_path:
        kypher_command = os.environ.get("kypher")
        if not kypher_command:
            raise RuntimeError(
                "The 'kypher' environment variable is not set; run the KGTK configuration cell first."
            )

        fp = tempfile.NamedTemporaryFile(
            mode="w", suffix=".tsv", delete=False, encoding="utf-8"
        )
        try:
            with fp:
                fp.write("\n".join(lines) + "\n")

            # `kypher` holds a full command line ("kgtk query --graph-cache <db>"),
            # so it is split into arguments the same way the shell would split it.
            command = shlex.split(kypher_command) + [
                "-i", "label",
                "-i", "description",
                "-i", fp.name, "--as", "sim",
                "--match",
                "sim: (n1)-[]->(similarity), label: (n1)-[]->(lab), description: (n1)-[]->(des)",
                "--return",
                'distinct n1 as node1, similarity as node2, "similarity" as label, '
                "lab as `node1;label`, des as `node1;description`",
                "--order-by",
                "cast(similarity, float) desc",
            ]
            completed = subprocess.run(
                command,
                check=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                encoding="utf-8",
            )
            lines = [line.rstrip("\r") for line in completed.stdout.split("\n") if line]
        finally:
            # Remove the temp file even when the query fails.
            os.remove(fp.name)

    if output_path:
        with open(output_path, "w", encoding="utf-8") as handle:
            for line in lines:
                handle.write(line)
                handle.write("\n")
        return None

    if not lines:
        return pd.DataFrame()

    columns = lines[0].split("\t")
    data = [line.split("\t") for line in lines[1:]]
    return pd.DataFrame(data, columns=columns)

Find the most similar 10 Qnodes to `Q2685` (Arnold Schwarzenegger)

In [11]:
# kg_path only needs to be non-empty here: kgtk_most_similar uses it as the switch that
# turns on the label/description lookup.
kgtk_most_similar(ge_vectors, positive=['Q2685'], kg_path = os.environ['TEMP'], topn=10)

Unnamed: 0,node1,node2,label,node1;label,node1;description
0,Q106997,0.8682151436805725,similarity,'Jamie Lee Curtis'@en,"'actress, author'@en"
1,Q232052,0.8669180870056152,similarity,'Asia Argento'@en,"'Italian actress, film director and model y ma..."
2,Q76478,0.8607354760169983,similarity,'Kirsten Dunst'@en,'American actress'@en
3,Q483907,0.8600436449050903,similarity,'Jack Black'@en,"'American actor, comedian, musician, music pro..."
4,Q187033,0.8560298681259155,similarity,'Sally Field'@en,'American actress'@en
5,Q178166,0.8542536497116089,similarity,'Vin Diesel'@en,"'American actor, producer, director, and scree..."
6,Q62975,0.8509424924850464,similarity,'Sharon Stone'@en,'American actor and fashion model'@en
7,Q56011,0.849589467048645,similarity,'Anna Magnani'@en,'Italian actress'@en
8,Q133050,0.8495450019836426,similarity,'Susan Sarandon'@en,'American actress and activist'@en
9,Q287793,0.8493508100509644,similarity,'Carl Reiner'@en,"'American actor, film director, producer, writ..."


Find the most similar 10 Qnodes to `Q162255` (Terminator, the movie)

In [13]:
# Same lookup as above, this time starting from a film Qnode instead of a person.
kgtk_most_similar(ge_vectors, positive=['Q162255'], kg_path = os.environ['TEMP'], topn=10)

Unnamed: 0,node1,node2,label,node1;label,node1;description
0,Q4142083,0.9051259756088256,similarity,'The Hunger Games: Mockingjay – Part 1'@en,'2014 film by Francis Lawrence'@en
1,Q1551573,0.897913932800293,similarity,'The Player'@en,'1992 film by Robert Altman'@en
2,Q15732802,0.8950880169868469,similarity,'John Wick'@en,'2014 American action thriller film directed b...
3,Q464042,0.8893483877182007,similarity,'Anaconda'@en,'1997 adventure-horror film directed by Luis L...
4,Q21451640,0.8874788880348206,similarity,'John Wick: Chapter 2'@en,'2017 American action thriller film directed b...
5,Q42198,0.886405348777771,similarity,'Heat'@en,'1995 film directed by Michael Mann'@en
6,Q728267,0.8812209963798523,similarity,'T2-3D: Battle Across Time'@en,"'1996 short film directed by Stan Winston, J. ..."
7,Q146673,0.8801364898681641,similarity,'Basic Instinct'@en,'1992 erotic thriller film by Paul Verhoeven'@en
8,Q574583,0.8758235573768616,similarity,'The Hunger Games: Catching Fire'@en,'2013 film by Francis Lawrence'@en
9,Q217182,0.8751983642578125,similarity,'Edward Scissorhands'@en,'1990 American fantasy romance film by Tim Bur...


In [56]:
# Embedding vectors of four Qnodes (two people, a river, a film), reused by the
# averaging cells that follow.
v1 = ge_vectors['Q873'] # Meryl Streep
v2 = ge_vectors['Q2685'] # Arnold Schwarzenegger
v3 = ge_vectors['Q103148'] # Lahn River
v4 = ge_vectors['Q104123'] # Pulp Fiction

Compute average of `Meryl Streep` and `Arnold Schwarzenegger`, find most similar Qnodes

In [54]:
# axis=0 averages the two vectors element-wise; the averaged vector itself is the query.
aver = np.mean([v1, v2], axis=0 )
kgtk_most_similar(ge_vectors, positive=aver, kg_path = os.environ['TEMP'], topn=10)

Unnamed: 0,node1,node2,label,node1;label,node1;description
0,Q2685,0.9249167442321776,similarity,'Arnold Schwarzenegger'@en,"'Austrian-American actor, businessman, bodybui..."
1,Q232052,0.8726891279220581,similarity,'Asia Argento'@en,"'Italian actress, film director and model y ma..."
2,Q110154,0.8639518618583679,similarity,'George Takei'@en,'American actor and author'@en
3,Q483907,0.8623449206352234,similarity,'Jack Black'@en,"'American actor, comedian, musician, music pro..."
4,Q133050,0.8619319796562195,similarity,'Susan Sarandon'@en,'American actress and activist'@en
5,Q106997,0.8609126806259155,similarity,'Jamie Lee Curtis'@en,"'actress, author'@en"
6,Q437182,0.8573539853096008,similarity,'Assumpta Serna'@en,'Spanish actress'@en
7,Q287793,0.8552923202514648,similarity,'Carl Reiner'@en,"'American actor, film director, producer, writ..."
8,Q3769056,0.8539261817932129,similarity,'Giselda Volodi'@en,'Italian actress'@en
9,Q60863,0.8512262105941772,similarity,'Nadja Uhl'@en,'German actress'@en


Compute average of `Meryl Streep` and `Lahn River`, find most similar Qnodes

In [55]:
# Element-wise mean of a river vector and a person vector, used as the query.
aver = np.mean([v3, v1], axis=0 )
kgtk_most_similar(ge_vectors, positive=aver, kg_path = os.environ['TEMP'], topn=10)

Unnamed: 0,node1,node2,label,node1;label,node1;description
0,Q103148,0.915361225605011,similarity,'Lahn'@en,'right tributary of the Rhine River in Germany...
1,Q464556,0.8873997926712036,similarity,'Wisper'@en,'right tributary of Rhine river'@en
2,Q554419,0.8847276568412781,similarity,'Eckbach'@en,'river'@en
3,Q153945,0.8845643997192383,similarity,'Lippe'@en,'river in Germany'@en
4,Q663762,0.8832949995994568,similarity,'Modau'@en,'river in Germany'@en
5,Q319649,0.8826938271522522,similarity,'Möhlin'@en,'river in Germany'@en
6,Q153521,0.8820112347602844,similarity,'Pfrimm'@en,'river in Germany'@en
7,Q18287117,0.8815670609474182,similarity,'Ginsheimer Altrhein'@en,'river in Germany'@en
8,Q168696,0.8802140951156616,similarity,'Nahe'@en,'tributary of Rhine river'@en
9,Q26727445,0.8788406848907471,similarity,'Moersbach'@en,'river in Germany'@en


Compute average of `Lahn River` and `Pulp Fiction`, find most similar Qnodes

In [57]:
# Element-wise mean of a river vector and a film vector, used as the query.
aver = np.mean([v3, v4], axis=0)
kgtk_most_similar(ge_vectors, positive=aver, kg_path = os.environ['TEMP'], topn=10)

Unnamed: 0,node1,node2,label,node1;label,node1;description
0,Q104123,0.8470815420150757,similarity,'Pulp Fiction'@en,'1994 American crime film directed by Quentin ...
1,Q329131,0.8154242038726807,similarity,'Jerry Maguire'@en,'1996 American romantic comedy-drama sports fi...
2,Q153723,0.8135038018226624,similarity,'Inglourious Basterds'@en,'2009 film by Quentin Tarantino'@en
3,Q1673,0.8078702688217163,similarity,'Neckar'@en,'right tributary of Rhine river in Germany'@en
4,Q504922,0.8060047626495361,similarity,'Emscher'@en,'river'@en
5,Q26727445,0.8021400570869446,similarity,'Moersbach'@en,'river in Germany'@en
6,Q19850715,0.8018151521682739,similarity,'The Big Short'@en,'2015 American biographical comedy-drama film ...
7,Q192115,0.7993929982185364,similarity,'Sin City'@en,"'2005 American crime thriller film written, pr..."
8,Q153945,0.7981359958648682,similarity,'Lippe'@en,'river in Germany'@en
9,Q191040,0.7966043949127197,similarity,'Mr. & Mrs. Smith'@en,'2005 film by Doug Liman'@en


Compute average of `Meryl Streep` and `Pulp Fiction`, find most similar Qnodes

In [61]:
# Element-wise mean of a film vector and a person vector, used as the query.
aver = np.mean([v4, v1], axis=0)
kgtk_most_similar(ge_vectors, positive=aver, kg_path = os.environ['TEMP'], topn=10)

Unnamed: 0,node1,node2,label,node1;label,node1;description
0,Q104123,0.9320459961891174,similarity,'Pulp Fiction'@en,'1994 American crime film directed by Quentin ...
1,Q106428,0.888177216053009,similarity,'Apollo 13'@en,'1995 film by Ron Howard'@en
2,Q27894574,0.8753330707550049,similarity,'Bohemian Rhapsody'@en,'2018 film by Bryan Singer'@en
3,Q172975,0.8733213543891907,similarity,'The Departed'@en,'2006 American crime thriller film directed by...
4,Q188000,0.8715724349021912,similarity,'Crash'@en,'2004 American film by Paul Haggis'@en
5,Q329131,0.8686830997467041,similarity,'Jerry Maguire'@en,'1996 American romantic comedy-drama sports fi...
6,Q5477105,0.8618838787078857,similarity,'Foxcatcher'@en,'2014 film by Bennett Miller'@en
7,Q116845,0.8610105514526367,similarity,'Silver Linings Playbook'@en,'2012 film by David O. Russell'@en
8,Q196665,0.8607305288314819,similarity,'8 Mile'@en,'2002 film starring Eminem directed by Curtis ...
9,Q108006,0.8576011061668396,similarity,'Speed'@en,'1994 American action-thriller film directed b...


## Prepare files for Google Projector

In this section, we will prepare the `vectors` and `metadata` files for the Google Embedding Projector.

We are focusing on the following types:

- `Q11424` (film)
- `Q33999` (actor)
- `Q4022` (river)
- `Q82955` (politician)

The first step is to create a file with the following columns:

1. node1 :- Qnode
2. node1;label :- label for node1
3. node2 :- `instance of` for node1
4. node2;label :- label for node2
5. embedding :- ComplEx graph embedding vector for node1

In [15]:
%%time
# Build the Embedding Projector input table: one row per node whose P31x class has one of
# the four target classes (film, actor, river, politician) in p279star.
# The graph handles in --match (item:, P31x:, embeddings:) are not declared with --as;
# presumably Kypher resolves them against the input file names — TODO confirm.
# Output columns: node1, node1;label, node2 (class ids), node2;label (class labels), embedding.
kgtk(""" query -i $GRAPH/claims.wikibase-item.augmented.tsv.gz 
         -i p279star 
         -i label 
         -i $OUT/arnold.embeddings.augmented.30.tsv 
         -i $GRAPH/derived.P31x.tsv 
         --match 'item: (n1)-[]->(), 
             P31x: (n1)-[]->(c), 
             p279star: (c)-[]->(class), 
             label: (n1)-[]->(n1_label), 
             label: (class)-[]->(class_label), embeddings: (n1)-[]->(embedding)'
        --where 'class in ["Q11424", "Q33999", "Q4022", "Q82955"]' 
        --return 'distinct n1, kgtk_lqstring_text(n1_label) as `node1;label`, group_concat(distinct class) as node2, group_concat(distinct kgtk_lqstring_text(class_label)) as `node2;label`, embedding as embedding' \
        -o $TEMP/arnold.embeddings.google.projector.tsv
""")

CPU times: user 4.07 ms, sys: 11.8 ms, total: 15.9 ms
Wall time: 3.73 s


#### Take a peek at the file

In [18]:
# Header plus the first data row; the embedding column is one comma-separated string.
!head -2 $TEMP/arnold.embeddings.google.projector.tsv

node1	node1;label	node2	node2;label	embedding
Q1000881	Erlau	Q4022	river	-0.278222889,0.298910052,-0.058227286,-0.707951486,-0.300852597,-0.292677164,0.347793698,0.180076435,0.370242327,0.807168007,-0.382684618,0.173900068,0.034428731,-1.144109964,-0.129670262,-0.191929370,-0.716327786,0.031291030,0.180736080,-0.297246993,-0.714032531,-0.364734739,0.823571563,0.272169203,0.657331109,0.312936962,-0.594842017,-0.129835084,0.980592489,-0.074916743


#### Define a function to build the required files for google projector

In [None]:
def build_embedding_projector_metadata(gp_embeddings_path, metadata_path, vectors_path):
    """
    Split the projector input table into the two files the Embedding Projector loads.

    The input is a tab-separated file with a header row and the columns
    node1, node1;label, node2, node2;label, embedding (comma-separated floats).
    Only rows whose node1 starts with "Q" are exported, and the two output files are
    written in lockstep so that row *i* of the vectors file belongs to row *i* of the
    metadata file.

    Parameters
    ----------
    gp_embeddings_path : str
        Path of the input table.
    metadata_path : str
        Output path for the metadata TSV (header ``tag``, ``type``, ``type_label``).
    vectors_path : str
        Output path for the vectors TSV (one tab-separated vector per line, no header).
    """
    # A single `with` guarantees all three files are closed even if a row fails to parse.
    # UTF-8 is set explicitly because labels contain non-ASCII characters.
    with open(gp_embeddings_path, "r", encoding="utf-8") as qnodes_file, \
            open(metadata_path, "w", encoding="utf-8") as metadata_file, \
            open(vectors_path, "w", encoding="utf-8") as vectors_file:
        metadata_file.write("tag\ttype\ttype_label\n")

        next(qnodes_file, None)  # header row; the default keeps an empty file from raising
        for line in qnodes_file:
            if not line.strip():
                continue  # tolerate blank lines
            vals = line.split('\t')
            qnode = vals[0]
            if not qnode.startswith("Q"):
                continue

            qnode_label = vals[1]
            ftype_label = vals[3]
            embeddings = "\t".join(vals[4].strip().split(","))
            tag = "{} ({})".format(qnode, qnode_label)

            # NOTE(review): the `type` column receives the node's own Qnode, not the class
            # id in vals[2]. Kept as-is to preserve the existing metadata file; confirm
            # whether the class id was intended.
            metadata_file.write("{}\t{}\t{}\n".format(tag, qnode, ftype_label))
            vectors_file.write(embeddings)
            vectors_file.write('\n')

In [5]:
# Read the query result from $TEMP and write the two projector files into $OUT.
build_embedding_projector_metadata(f"{os.environ['TEMP']}/arnold.embeddings.google.projector.tsv",
                                  f"{os.environ['OUT']}/arnold.metadata.30.tsv",
                                  f"{os.environ['OUT']}/arnold.vectors.30.tsv")

Peek at the metadata file

In [6]:
# First ten lines of the metadata file (header + nine rows).
!head $OUT/arnold.metadata.30.tsv

tag	type	type_label
Q1000881 (Erlau)	Q1000881	river
Q1001872 (Buersbach)	Q1001872	river
Q1004531 (Bullets Over Broadway)	Q1004531	film
Q1009788 (The Conversation)	Q1009788	film
Q1010099 (Get Carter)	Q1010099	film
Q1012216 (Gorillas in the Mist)	Q1012216	film
Q101410 (François Fillon)	Q101410	politician
Q101797 (Winona Ryder)	Q101797	actor
Q1018487 (Bye Bye Birdie)	Q1018487	film


Peek at the vectors file

In [28]:
# The vectors file has no header: each line is one tab-separated embedding.
!head -2 $OUT/arnold.vectors.30.tsv

-0.278222889	0.298910052	-0.058227286	-0.707951486	-0.300852597	-0.292677164	0.347793698	0.180076435	0.370242327	0.807168007	-0.382684618	0.173900068	0.034428731	-1.144109964	-0.129670262	-0.191929370	-0.716327786	0.031291030	0.180736080	-0.297246993	-0.714032531	-0.364734739	0.823571563	0.272169203	0.657331109	0.312936962	-0.594842017	-0.129835084	0.980592489	-0.074916743
-0.292729884	0.365311205	0.049547877	-0.266633153	-0.517612636	-0.337442279	0.316574246	-0.161643475	0.366951913	1.027803302	-0.281238228	-0.533051729	0.022873841	-1.064334035	0.336856425	0.353341699	-0.658462703	0.076028503	0.257462531	-0.099187657	-0.353778273	-0.526077926	0.809671104	-0.297059596	0.488877356	-0.055506136	-0.431686580	-0.327586085	0.996009290	0.028423335


## Google embedding projector
- open https://projector.tensorflow.org
- Load the vectors and metadata files using the Load button
- configure the visualization

Here we searched on the right for arnold, and we see the closest vectors as well as the cluster it belongs to:
![Google embedding projector](assets/gp-arnold.png "Google embedding projector")

#### UMAP visualization of the embeddings, colored by `instance of`

![UMAP Color by Type](assets/gp-color-map-types.png "UMAP Color by Type")