In [1]:
import numpy as np
import pandas as pd
from rdflib import Namespace, Graph, RDFS
from rdflib.term import URIRef, Literal

In [2]:
# Namespace helpers for the URI prefixes used in the graph
WD = Namespace('http://www.wikidata.org/entity/')
WDT = Namespace('http://www.wikidata.org/prop/direct/')
SCHEMA = Namespace('http://schema.org/')
DDIS = Namespace('http://ddis.ch/atai/')

# SPARQL prologue prepended to every query in this notebook.
# `rdfs:` is declared explicitly because the queries use `rdfs:label`;
# without the declaration they only resolve through namespace bindings that
# rdflib supplies implicitly, which makes the queries non-portable.
QUERY_PREFIX = \
"""PREFIX ddis: <http://ddis.ch/atai/> 
PREFIX wd: <http://www.wikidata.org/entity/> 
PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
PREFIX schema: <http://schema.org/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
"""

In [3]:
# Load the triples file into a fresh in-memory rdflib Graph.
# NOTE(review): the file has an .nt extension but is read with the Turtle
# parser — confirm this is intentional.
graph = Graph()
graph.parse(source='../data/14_graph.nt', format='turtle')

<Graph identifier=Nf99cf939f7594f8491a04c6b62c036d5 (<class 'rdflib.graph.Graph'>)>

In [4]:
# Build a URI -> label lookup from every rdfs:label triple in the graph.
# A dict holds one value per key, so a URI with several labels keeps the last one seen.
uri2lbl = dict(
    (subject, str(label))
    for subject, label in graph.subject_objects(RDFS.label)
)
print(f"Number of uris: {len(uri2lbl)}")
print(f"Example: {dict(list(uri2lbl.items())[:5])}")

Number of uris: 158059
Example: {rdflib.term.URIRef('http://schema.org/description'): 'node description', rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#label'): 'node label', rdflib.term.URIRef('http://www.wikidata.org/entity/P1258'): 'Rotten Tomatoes ID', rdflib.term.URIRef('http://www.wikidata.org/entity/P161'): 'cast member', rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P161'): 'cast member'}


In [5]:
# Tabulate the URI -> label lookup: one row per labelled URI.
df_uri = pd.DataFrame(data=[*uri2lbl.items()], columns=['uri', 'label'])
df_uri

Unnamed: 0,uri,label
0,http://schema.org/description,node description
1,http://www.w3.org/2000/01/rdf-schema#label,node label
2,http://www.wikidata.org/entity/P1258,Rotten Tomatoes ID
3,http://www.wikidata.org/entity/P161,cast member
4,http://www.wikidata.org/prop/direct/P161,cast member
...,...,...
158054,http://www.wikidata.org/entity/Q723685,video on demand
158055,http://www.wikidata.org/entity/Q11424,film
158056,http://www.wikidata.org/entity/Q1860,English
158057,http://www.wikidata.org/entity/Q30,United States of America


Prepare predicates

In [6]:
# Every distinct term that occurs in predicate position of some triple.
predicates = {*graph.predicates()}
# Relations are the labelled URIs that are actually used as predicates.
df_rel = df_uri.loc[df_uri['uri'].isin(predicates)].reset_index(drop=True)
df_rel

Unnamed: 0,uri,label
0,http://schema.org/description,node description
1,http://www.w3.org/2000/01/rdf-schema#label,node label
2,http://www.wikidata.org/prop/direct/P161,cast member
3,http://www.wikidata.org/prop/direct/P1657,MPAA film rating
4,http://www.wikidata.org/prop/direct/P1981,FSK film rating
...,...,...
248,http://www.wikidata.org/prop/direct/P931,place served by transport hub
249,http://www.wikidata.org/prop/direct/P937,work location
250,http://www.wikidata.org/prop/direct/P941,inspired by
251,http://www.wikidata.org/prop/direct/P945,allegiance


In [7]:
# Count relation labels that repeat an earlier row's label (0 means every label is unique).
df_rel['label'].duplicated(keep='first').sum()

0

In [8]:
# Show the relations whose last URI path segment does not begin with 'P'.
df_rel.loc[df_rel['uri'].apply(lambda uri: uri.rsplit('/', 1)[-1][0] != 'P')]

Unnamed: 0,uri,label
0,http://schema.org/description,node description
1,http://www.w3.org/2000/01/rdf-schema#label,node label


Prepare entities

In [9]:
# An entity is any term used as a subject, or used as an object when that
# object is a URI (non-URI objects such as literals are left out).
entities = set(graph.subjects()).union(
    obj for obj in graph.objects() if isinstance(obj, URIRef)
)

In [10]:
# Keep the labelled URIs that appear in the entity set.
df_ent = df_uri.loc[df_uri['uri'].isin(entities)].reset_index(drop=True)
df_ent

Unnamed: 0,uri,label
0,http://schema.org/description,node description
1,http://www.w3.org/2000/01/rdf-schema#label,node label
2,http://www.wikidata.org/entity/P1258,Rotten Tomatoes ID
3,http://www.wikidata.org/entity/P161,cast member
4,http://www.wikidata.org/prop/direct/P161,cast member
...,...,...
158054,http://www.wikidata.org/entity/Q723685,video on demand
158055,http://www.wikidata.org/entity/Q11424,film
158056,http://www.wikidata.org/entity/Q1860,English
158057,http://www.wikidata.org/entity/Q30,United States of America


In [11]:
# Show the entity rows whose last URI path segment does not begin with 'Q'.
df_ent.loc[df_ent['uri'].apply(lambda uri: uri.rsplit('/', 1)[-1][0] != 'Q')]

Unnamed: 0,uri,label
0,http://schema.org/description,node description
1,http://www.w3.org/2000/01/rdf-schema#label,node label
2,http://www.wikidata.org/entity/P1258,Rotten Tomatoes ID
3,http://www.wikidata.org/entity/P161,cast member
4,http://www.wikidata.org/prop/direct/P161,cast member
...,...,...
24867,http://www.wikidata.org/prop/direct/P931,place served by transport hub
24868,http://www.wikidata.org/prop/direct/P937,work location
24869,http://www.wikidata.org/prop/direct/P941,inspired by
24870,http://www.wikidata.org/prop/direct/P945,allegiance


In [25]:
# Keep only the rows whose last URI path segment starts with 'Q'.
# str.startswith is used instead of indexing the first character so that a URI
# ending in '/' (empty last segment) is dropped rather than raising IndexError.
df_ent = df_ent[df_ent['uri'].apply(lambda x: x.rsplit('/', 1)[-1].startswith('Q'))].reset_index(drop=True)

In [26]:
# Persist the three lookup tables as pickles next to the source data.
df_uri.to_pickle(path='../data/df_uri.pkl')
df_rel.to_pickle(path='../data/df_rel.pkl')
df_ent.to_pickle(path='../data/df_ent.pkl')
# Also write the relation labels alone, one per line, without index or header.
df_rel['label'].to_csv(path_or_buf='../data/relations.txt', index=False, header=False)

People, movies and genres

In [27]:
# Select every ?person typed (wdt:P31) as wd:Q5, together with its rdfs:label.
# NOTE(review): wd:Q5 is presumably Wikidata's "human" item — confirm.
PERSON_QUERY = (
    QUERY_PREFIX
    + " \n"
    + "SELECT ?person ?lbl WHERE {\n"
    + "?person wdt:P31 wd:Q5 .\n"
    + "?person rdfs:label ?lbl .\n"
    + "}\n"
)

person_res = graph.query(PERSON_QUERY)

In [28]:
# One row per (person URI, label) binding returned by the person query.
df_person = pd.DataFrame(data=person_res, columns=['uri', 'label'])
df_person

Unnamed: 0,uri,label
0,http://www.wikidata.org/entity/Q100423423,Viktor Krištof
1,http://www.wikidata.org/entity/Q1012658,Yuji Nomi
2,http://www.wikidata.org/entity/Q1019375,Béatrice Thiriet
3,http://www.wikidata.org/entity/Q102290694,Oleg Kapanets
4,http://www.wikidata.org/entity/Q102443065,Ram Lee
...,...,...
100152,http://www.wikidata.org/entity/Q44442,Dan Castellaneta
100153,http://www.wikidata.org/entity/Q39829,Stephen King
100154,http://www.wikidata.org/entity/Q296577,Frank Welker
100155,http://www.wikidata.org/entity/Q25089,Woody Allen


In [29]:
# Select every ?movie typed (wdt:P31) as wd:Q11424 — the URI labelled "film"
# in df_uri — together with its rdfs:label.
MOVIE_QUERY = (
    QUERY_PREFIX
    + "\n"
    + "SELECT ?movie ?lbl WHERE {\n"
    + "?movie wdt:P31 wd:Q11424 .\n"
    + "?movie rdfs:label ?lbl .\n"
    + "}\n"
)

movie_res = graph.query(MOVIE_QUERY)

In [30]:
# One row per (movie URI, label) binding returned by the movie query.
df_movie = pd.DataFrame(data=movie_res, columns=['uri', 'label'])
df_movie

Unnamed: 0,uri,label
0,http://www.wikidata.org/entity/Q1000825,Jan Dara
1,http://www.wikidata.org/entity/Q1001777,Moondram Pirai
2,http://www.wikidata.org/entity/Q1001943,"Buffalo Bill and the Indians, or Sitting Bull'..."
3,http://www.wikidata.org/entity/Q100232971,What We Wanted
4,http://www.wikidata.org/entity/Q1002480,Wanted: Dead or Alive
...,...,...
24379,http://www.wikidata.org/entity/Q23010088,Spider-Man: Homecoming
24380,http://www.wikidata.org/entity/Q17738,Star Wars: Episode IV – A New Hope
24381,http://www.wikidata.org/entity/Q23781155,Avengers: Endgame
24382,http://www.wikidata.org/entity/Q19590955,Rogue One


In [18]:
# Select every ?genre typed (wdt:P31) as wd:Q201658, together with its rdfs:label.
# NOTE(review): wd:Q201658 is presumably Wikidata's "film genre" item — confirm.
GENRE_QUERY = (
    QUERY_PREFIX
    + "\n"
    + "SELECT ?genre ?lbl WHERE {\n"
    + "?genre wdt:P31 wd:Q201658 .\n"
    + "?genre rdfs:label ?lbl .\n"
    + "}\n"
)
genre_res = graph.query(GENRE_QUERY)

In [19]:
# One row per (genre URI, label) binding returned by the genre query.
df_genre = pd.DataFrame(data=genre_res, columns=['uri', 'label'])
df_genre

Unnamed: 0,uri,label
0,http://www.wikidata.org/entity/Q1108032,Mumblecore
1,http://www.wikidata.org/entity/Q11298267,gambling film
2,http://www.wikidata.org/entity/Q1190502,surrealist cinema
3,http://www.wikidata.org/entity/Q1740789,poliziotteschi
4,http://www.wikidata.org/entity/Q21858363,psychedelic film
...,...,...
208,http://www.wikidata.org/entity/Q2484376,thriller film
209,http://www.wikidata.org/entity/Q200092,horror film
210,http://www.wikidata.org/entity/Q188473,action film
211,http://www.wikidata.org/entity/Q157443,comedy film


In [20]:
# Persist the per-type lookup tables as pickles next to the source data.
df_person.to_pickle(path='../data/df_person.pkl')
df_movie.to_pickle(path='../data/df_movie.pkl')
df_genre.to_pickle(path='../data/df_genre.pkl')

In [21]:
# Preview only the first 50 entity labels: listing every label floods the
# notebook output and bloats the saved file without adding information.
df_ent['label'].head(50).tolist()

['Jan Dara',
 'Elektra – Original Motion Picture Score',
 'Moondram Pirai',
 "Buffalo Bill and the Indians, or Sitting Bull's History Lesson",
 'Buffalo Girls',
 'Buffalo Girls',
 "Buffalo '66",
 'What We Wanted',
 'Anti-Technodrome gear',
 'Wanted: Dead or Alive',
 'Portable Portal Generator',
 'Linger',
 'Eastern Condors',
 "Jay's Pizza",
 'Amerika',
 'Amerika',
 "Baxter Stockman's teleporter",
 'Sergeant Garcia',
 'Trans Latin American',
 'Bukowski: Born into This',
 'Furiosa',
 'Fatal Move',
 'On the Mountain of Tai Hang',
 'Viktor Krištof',
 'Forever Enthralled',
 'Bulletproof',
 'Bulletproof',
 'Kill the Irishman',
 'Bullseye!',
 'Bully',
 'Bully',
 'Dry Wind',
 'The Big Hit',
 'The Big Hit',
 'Angels Fallen',
 'Gagarine',
 'Golimaar',
 'O Maidens in Your Savage Season',
 'Adhurs',
 'Rose Island',
 'Bunny',
 'Simha',
 'Rakta Charitra',
 'Khaleja',
 'Igor Hnízdo',
 'Jimmy Malone',
 'John Patrick Mason',
 'Ginko',
 'Pilar Estravados',
 'Princess Natalia Dragomiroff',
 'Hildegarde S

In [22]:
# Probe: entity rows whose label is exactly the string "a".
df = df_ent.loc[df_ent['label'] == "a"]

In [23]:
# Number of rows matched by the probe above.
df.shape[0]

0

In [31]:
# Display the entity table as it currently stands in the kernel.
df_ent

Unnamed: 0,uri,label
0,http://www.wikidata.org/entity/Q1000825,Jan Dara
1,http://www.wikidata.org/entity/Q100151796,Elektra – Original Motion Picture Score
2,http://www.wikidata.org/entity/Q1001777,Moondram Pirai
3,http://www.wikidata.org/entity/Q1001943,"Buffalo Bill and the Indians, or Sitting Bull'..."
4,http://www.wikidata.org/entity/Q1001994,Buffalo Girls
...,...,...
157754,http://www.wikidata.org/entity/Q723685,video on demand
157755,http://www.wikidata.org/entity/Q11424,film
157756,http://www.wikidata.org/entity/Q1860,English
157757,http://www.wikidata.org/entity/Q30,United States of America
