# Embeddings for the DDIS Movie Graph

## Setup

In [4]:
# imports
import csv
import numpy as np
import os
import rdflib
import pandas as pd
from sklearn.metrics import pairwise_distances
import pickle

In [2]:
# Namespace shortcuts: attribute access builds a full URI, e.g. WD.Q3110682.

# Wikidata entities and direct properties
WD = rdflib.Namespace('http://www.wikidata.org/entity/')
WDT = rdflib.Namespace('http://www.wikidata.org/prop/direct/')

# General-purpose vocabularies
RDFS = rdflib.namespace.RDFS
SCHEMA = rdflib.Namespace('http://schema.org/')

# DDIS namespace
DDIS = rdflib.Namespace('http://ddis.ch/atai/')

## Load the data

In [3]:
# Load the pickled knowledge graph from the dataset folder.
# The path is relative to the notebook, the same way cache.pickle is opened
# later in this notebook, so it no longer depends on one user's home directory.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("../dataset/updated_dataset.pickle", 'rb') as f:
    graph = pickle.load(f)

In [5]:
# load the embeddings
# Rows are addressed by the integer ids from entity_ids.del / relation_ids.del.
# Paths are relative to the notebook (like ../dataset/cache.pickle below)
# instead of pointing into one user's home directory.
entity_emb = np.load('../dataset/ddis-graph-embeddings/entity_embeds.npy')
relation_emb = np.load('../dataset/ddis-graph-embeddings/relation_embeds.npy')

In [6]:
# load the dictionaries
# Each .del file is tab-separated with one "<integer id>\t<URI>" row per item.
# Paths are relative to the notebook (like ../dataset/cache.pickle below).
# newline='' is what the csv module expects for files passed to csv.reader.
with open('../dataset/ddis-graph-embeddings/entity_ids.del', 'r', newline='', encoding='utf-8') as ifile:
    ent2id = {rdflib.term.URIRef(ent): int(idx) for idx, ent in csv.reader(ifile, delimiter='\t')}
# inverse map: embedding row id -> entity URI
id2ent = {v: k for k, v in ent2id.items()}

with open('../dataset/ddis-graph-embeddings/relation_ids.del', 'r', newline='', encoding='utf-8') as ifile:
    rel2id = {rdflib.term.URIRef(rel): int(idx) for idx, rel in csv.reader(ifile, delimiter='\t')}
# inverse map: embedding row id -> relation URI
id2rel = {v: k for k, v in rel2id.items()}

In [7]:
# Entity -> label, built from every rdfs:label triple in the graph.
ent2lbl = dict((entity, str(label)) for entity, label in graph.subject_objects(RDFS.label))
# Label -> entity; when several entities share a label, the one iterated last wins.
lbl2ent = {label: entity for entity, label in ent2lbl.items()}

## Finding errors

In [27]:
# let's see which genres (wdt:P136) our graph stores for wd:Q3110682
# (the query asks for genre values, not occupations, so the result is
# named accordingly)
genres = set(graph.query('''
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            PREFIX wd: <http://www.wikidata.org/entity/>
            PREFIX wdt: <http://www.wikidata.org/prop/direct/>
            PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            PREFIX ddis: <http://ddis.ch/atai/>
            SELECT ?value
WHERE {
    wd:Q3110682 wdt:P136 ?value .
}
    '''))
print(genres)

{(rdflib.term.URIRef('http://www.wikidata.org/entity/Q859369'),), (rdflib.term.URIRef('http://www.wikidata.org/entity/Q1135802'),)}


In [28]:
# Embeddings of the subject entity and of the relation we want to check.
head = entity_emb[ent2id[WD.Q3110682]]
pred = relation_emb[rel2id[WDT.P136]]
# add vectors according to TransE scoring function: the sum is the
# predicted position of the object entity.
lhs = head + pred
# compute distance to *any* entity
dist = pairwise_distances(lhs.reshape(1, -1), entity_emb).reshape(-1)
# find most plausible entities: entity ids ordered by increasing distance
most_likely = dist.argsort()
# compute ranks of entities (0 = closest); inverting the ordering above
# gives the same result as sorting `dist` a second time
ranks = most_likely.argsort()

In [29]:
# what would be more plausible genres according to the embeddings?
# 'Score' is the distance to head + relation, so smaller means more plausible.
# len(WD) strips the Wikidata entity prefix so only the Q-id is shown;
# .get() leaves the label empty (None) for entities without an rdfs:label
# instead of aborting the whole table with a KeyError.
pd.DataFrame([
    (id2ent[idx][len(WD):], ent2lbl.get(id2ent[idx]), dist[idx], rank)
    for rank, idx in enumerate(most_likely[:10], start=1)],
    columns=('Entity', 'Label', 'Score', 'Rank'))

Unnamed: 0,Entity,Label,Score,Rank
0,Q130232,drama,3599.871338,1
1,Q859369,comedy-drama,3604.7146,2
2,Q157443,comedy film,3616.061279,3
3,Q2421031,neo-noir,3673.553467,4
4,Q157394,fantasy film,3731.656982,5
5,Q200092,horror film,3770.716309,6
6,Q102706436,magic realist film,3771.341309,7
7,Q1200678,mystery film,3805.263428,8
8,Q188473,action film,3839.708008,9
9,Q20442589,LGBT-related film,3842.258545,10


In [35]:
# Load the pickled question -> answer cache.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("../dataset/cache.pickle", 'rb') as f:
    cache = pickle.load(f)
# NOTE(review): this prints every cached entry; len(cache) or a few items
# would keep the output readable.
print(cache)
# Exact-key lookup: raises KeyError unless the key matches character for
# character, including the trailing space.
print(cache['When was "The Godfather" released? '])

OrderedDict([('Who is the director of Good Will Hunting? ', ' The director of "Good Will Hunting" is Gus Van Sant.'), ('Who directed The Bridge on the River Kwai? ', ' David Lean directed The Bridge on the River Kwai.\n'), ('Who is the director of Star Wars: Episode VI - Return of the Jedi? ', ' The director of Star Wars: Episode VI - Return of the Jedi is Richard Marquand.'), ('Who is the screenwriter of The Masked Gang: Cyprus? ', ' The screenwriter of The Masked Gang: Cyprus is Murat Aslan. '), ('What is the MPAA film rating of Weathering with You? ', ' The MPAA film rating for "Weathering with You" is NC-17.'), ('What is the genre of Good Neighbors? ', ' The genre of Good Neighbors is a comedy/thriller.'), ('Show me a picture of Halle Berry. ', 'Here is a Picture of halle berry image:0353/rm3257480192'), ('What does Julia Roberts look like? ', 'Here is a Picture of julia roberts image:3739/rm3651656960'), ('Let me know what Sandra Bullock looks like. ', 'Here is a Picture of sandra

In [38]:
def find_in_ordered_dict(odict, query):
    """Return the value of the first key that contains ``query``.

    The comparison is a case-insensitive substring test, and keys are
    scanned in the mapping's iteration order.

    Args:
        odict: Mapping with string keys.
        query: Text to look for inside the keys.

    Returns:
        The value stored under the first matching key, or the string
        "Not found." when no key contains ``query``.
    """
    matches = (value for key, value in odict.items() if query.lower() in key.lower())
    return next(matches, "Not found.")

# Case-insensitive substring lookup: the query only has to occur inside a cached key.
find_in_ordered_dict(cache,'When was "The Godfather" released? ')

' The Godfather was released on 1972-03-15. '

In [54]:
# Print each cached question followed by its answer and a separator line.
for question, answer in cache.items():
    print(question)
    print(answer)
    print("_________")

Who is the director of Good Will Hunting? 
 The director of "Good Will Hunting" is Gus Van Sant.
_________
Who directed The Bridge on the River Kwai? 
 David Lean directed The Bridge on the River Kwai.

_________
Who is the director of Star Wars: Episode VI - Return of the Jedi? 
 The director of Star Wars: Episode VI - Return of the Jedi is Richard Marquand.
_________
Who is the screenwriter of The Masked Gang: Cyprus? 
 The screenwriter of The Masked Gang: Cyprus is Murat Aslan. 
_________
What is the MPAA film rating of Weathering with You? 
'The MPAA film rating for Weathering with You is PG-13.'
_________
What is the genre of Good Neighbors? 
The genre of Good Neighbors is drama, comedy-drama, and comedy film. Based on embeddings
_________
Show me a picture of Halle Berry. 
Here is a Picture of halle berry image:0353/rm3257480192
_________
What does Julia Roberts look like? 
Here is a Picture of julia roberts image:3739/rm3651656960
_________
Let me know what Sandra Bullock looks 

In [53]:
# Store a complete answer for the recommendation question (key without a trailing space).
cache["Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend some movies?"] ="""Sure! If you enjoyed animated Disney classics like The Lion King, Pocahontas, and Beauty and the Beast, consider watching these related films:
1. Aladdin (a romantic adventure with memorable music)
2. Mulan (an action-packed tale of honor and bravery)
3. The Little Mermaid (a charming underwater musical fantasy)
4. Cinderella (a timeless classic about dreams coming true)"""

# Remove the old entry stored under the key with a trailing space.
# The None default keeps this cell re-runnable: once the cleaned cache has been
# saved and reloaded the old key no longer exists, and a bare pop() would raise KeyError.
cache.pop("Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend some movies? ", None)



' Sure! If you enjoyed animated Disney classics like The Lion King, Pocahontas, and Beauty and the Beast, consider watching these related films:\n1. Aladdin (a romantic adventure with memorable music)\n2. Mulan (an action-packed tale of honor and bravery)\n3. The Little Mermaid (a charming underwater musical fantasy)\n4. Cinderella (a timeless classic about dreams coming true)\n5.'

In [55]:
# Write the edited cache back to disk, overwriting the previous pickle.
with open('../dataset/cache.pickle', 'wb') as cache_file:
    pickle.dump(cache, cache_file, protocol=pickle.HIGHEST_PROTOCOL)

## Entity Similarity

## Recovering categories