In [28]:
import requests
import json
import pickle

# Base URL of the Wikidata SPARQL service. It already ends in "?query=", so
# callers in this file build the request URL by appending the query text.
WIKIDATA_SPARQL_ENDPOINT = """https://query.wikidata.org/bigdata/namespace/wdq/sparql?query="""

def get_simple_map(json, key, value):
    """Pair two columns of a list of binding rows.

    Each row of ``json`` must be a dictionary in which both ``row[key]`` and
    ``row[value]`` are themselves dictionaries holding a ``'value'`` entry.

    Returns a list of ``(row[key]['value'], row[value]['value'])`` tuples, in
    the same order as the input rows.
    """
    pairs = []
    for row in json:
        left = row[key]['value']
        right = row[value]['value']
        pairs.append((left, right))
    return pairs

def get_query_results(endpoint, query, mime_format):
    """Send a GET request for ``query`` and return the raw response.

    The request URL is ``endpoint`` with ``query`` appended as-is, and
    ``mime_format`` is sent as the ``Accept`` header. The return value is
    whatever ``requests.get`` returns; no status check or decoding is done.
    """
    url = endpoint + query
    return requests.get(url, headers={"Accept": mime_format})


# SPARQL SELECT that returns up to 30 (?bne, ?wikidata) pairs linked through
# property P950. Only referenced by the commented-out code below.
# NOTE(review): the query uses the `wdt:` prefix without declaring it, and the
# declared `wikibase:` prefix is never used — presumably the endpoint
# predefines `wdt:`; confirm.
query_bne_entities = """PREFIX wikibase: <http://wikiba.se/ontology>
SELECT ?bne ?wikidata
WHERE { 
    ?wikidata wdt:P950 ?bne .
}
LIMIT 30
"""

#res2 = get_query_results(WIKIDATA_SPARQL_ENDPOINT, query_bne_entities, "application/json")

#map2 = get_simple_map(res2.json()['results']['bindings'],"bne","wikidata")
#for key,value in map2:
#    print(key,"\t",value)

# Run `query2` against the endpoint and keep the list of result bindings.
# NOTE(review): `query2` is only assigned further down in this cell, so on a
# fresh interpreter this line raises NameError unless the name is already
# defined from an earlier run — TODO confirm / reorder.
res3 = get_query_results(WIKIDATA_SPARQL_ENDPOINT, query2, "application/json")
jsonlist = res3.json()["results"]["bindings"]


# Module-level accumulators; in the visible code they are only referenced by
# the commented-out loop after extract_entity.
entities = []
relations = []
subs = []

def add_element(element, complete_list):
    """Return the index of ``element`` in ``complete_list``, adding it if absent.

    When the element is already present its existing position is returned and
    the list is left untouched; otherwise it is appended and the index of the
    new last slot is returned.
    """
    try:
        position = complete_list.index(element)
    except ValueError:
        # Not seen before: it will land at the current end of the list.
        position = len(complete_list)
        complete_list.append(element)
    return position

def extract_entity(entity):
    """Pick the representation to store for one result binding.

    ``entity`` is a dictionary with a ``"type"`` key. For ``"uri"`` bindings
    the ``"value"`` string is returned; for ``"literal"`` and ``"bnode"``
    bindings the whole dictionary is returned unchanged. Any other type
    yields ``None``.
    """
    kind = entity["type"]
    if kind == "uri":
        return entity["value"]
    if kind in ("literal", "bnode"):
        return entity
    return None

# for triplet in jsonlist:
#     id_obj = add_element(extract_entity(triplet["object"]), entities)
#     id_subj = add_element(extract_entity(triplet["subject"]), entities)
#     id_pred = add_element(extract_entity(triplet["predicate"]), relations)
    
#     subs.append([id_obj, id_subj, id_pred])
    
    
class Dataset():
    """Index of entities, relations and triples built from SPARQL JSON bindings.

    Every distinct entity and relation is stored once in a list, and its
    position in that list is its id. Triples are stored in ``subs`` as
    ``[object_id, subject_id, predicate_id]`` lists.

    ``False`` is used throughout as the "skip this element" sentinel. Callers
    must compare with ``is False``, because ``0`` is a valid id.
    """
    WIKIDATA_ENDPOINT = """https://query.wikidata.org/bigdata/namespace/wdq/sparql?query="""

    def __init__(self):
        # These lists are created per instance. As class attributes they were
        # shared, so every Dataset object appended into the same three lists.
        self.entities = []    # distinct subjects/objects; list index == id
        self.relations = []   # distinct predicates; list index == id
        self.subs = []        # triples as [object_id, subject_id, predicate_id]

    def show(self, verbose=False):
        """Print the element counts, and every element when ``verbose`` is set."""
        print("%d entities, %d relations, %d tripletas" %
              (len(self.entities), len(self.relations), len(self.subs)))
        if verbose:
            print("\nEntities (%d):" % len(self.entities))
            for entity in self.entities:
                print(entity)
            print("\nRelations (%d):" % len(self.relations))
            for relation in self.relations:
                print(relation)
            print("\nTripletas (%d):" % len(self.subs))
            for sub in self.subs:
                print(sub)

    def add_element(self, element, complete_list, only_uri=False):
        """Return the id of ``element`` in ``complete_list``, appending if new.

        Args:
            element: value produced by ``extract_entity``; ``False`` means the
                binding was rejected.
            complete_list: the list to look the element up in and extend.
            only_uri: when set, anything that is not a string is rejected.

        Returns:
            The integer index of the element, or ``False`` when it is rejected.
        """
        if element is False:
            return False
        if only_uri and not isinstance(element, str):
            return False

        try:
            # Item is on the list, return same id
            return complete_list.index(element)
        except ValueError:
            # Item is not on the list, append and return id
            complete_list.append(element)
            return len(complete_list) - 1

    def extract_entity(self, entity):
        """Turn one binding dictionary into the value to be indexed.

        Returns the URI string for ``"uri"`` bindings, the binding dictionary
        itself for ``"literal"`` and ``"bnode"`` bindings, and ``False`` for
        rejected URIs and for binding types this class does not recognise.
        """
        kind = entity["type"]
        if kind == "uri":
            # Not all 'uri' values are valid entities: URIs on
            # www.wikidata.org whose first path segment is "reference" or
            # whose second path segment is "statement" are rejected.
            uri = entity["value"].split('/')

            # Length checks keep short or scheme-only URIs (e.g. "urn:x")
            # from raising IndexError; such URIs are accepted as they are.
            if len(uri) > 3 and uri[2] == 'www.wikidata.org':
                if uri[3] == "reference" or (len(uri) > 4 and uri[4] == "statement"):
                    return False
            return entity["value"]

        if kind in ("literal", "bnode"):
            return entity

        # Unknown binding type: reject it explicitly rather than returning
        # None, which add_element would store as if it were an entity.
        return False

    def load_dataset_from_json(self, json, only_uri=False):
        """Index every triple in ``json`` and record the valid ones.

        Args:
            json: iterable of binding dictionaries, each with ``"object"``,
                ``"subject"`` and ``"predicate"`` entries.
            only_uri: forwarded to ``add_element``; when set, triples that
                contain a literal or blank node are skipped.
        """
        # Iterate the argument that was passed in, not a module-level global.
        for triplet in json:
            id_obj = self.add_element(self.extract_entity(triplet["object"]), self.entities, only_uri=only_uri)
            id_subj = self.add_element(self.extract_entity(triplet["subject"]), self.entities, only_uri=only_uri)
            id_pred = self.add_element(self.extract_entity(triplet["predicate"]), self.relations, only_uri=only_uri)

            # NOTE(review): elements accepted above stay registered even when
            # the triple is skipped here; kept as-is to preserve id assignment.
            if id_obj is False or id_subj is False or id_pred is False:
                continue
            self.subs.append([id_obj, id_subj, id_pred])

    def load_dataset_from_query(self, query, only_uri=False):
        """Run ``query`` against ``WIKIDATA_ENDPOINT`` and index its bindings.

        The query text is appended to the endpoint URL as-is and the response
        is requested as ``application/json``.
        """
        headers = {"Accept": "application/json"}
        response = requests.get(self.WIKIDATA_ENDPOINT + query, headers=headers)
        bindings = response.json()["results"]["bindings"]

        self.load_dataset_from_json(bindings, only_uri=only_uri)

    def save_to_binary(self, filepath):
        """Pickle the three lists to ``filepath``.

        Returns ``False`` (after printing a message) when the file cannot be
        opened, and ``None`` on success.
        """
        all_dataset = {
            'entities': self.entities,
            'relations': self.relations,
            'subs': self.subs
        }
        try:
            f = open(filepath, "wb+")
        except (OSError, TypeError, ValueError):
            print("The path you provided is not valid")
            return False
        # The context manager closes the file even if pickling fails.
        with f:
            pickle.dump(all_dataset, f)

    def load_from_binary(self, filepath):
        """Replace the three lists with the ones pickled in ``filepath``.

        Returns ``False`` (after printing a message) when the file cannot be
        opened, and ``None`` on success.
        """
        try:
            f = open(filepath, "rb")
        except (OSError, TypeError, ValueError):
            print("The path you provided is not valid")
            return False
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # that this program (or a trusted source) wrote.
        with f:
            all_dataset = pickle.load(f)

        self.entities = all_dataset['entities']
        self.relations = all_dataset['relations']
        self.subs = all_dataset['subs']
        
    
# Build a dataset from a CONSTRUCT query that follows up to three hops out of
# every item carrying property P950, capped at 10000 results.
datos = Dataset()
# NOTE(review): the query uses the `wdt:` prefix without declaring it —
# presumably the endpoint predefines it; confirm.
query2 = """PREFIX wikibase: <http://wikiba.se/ontology>
construct { ?wikidata ?predicate ?object . ?object ?predicate2 ?object2 . ?object2 ?predicate3 ?object3 }
WHERE { ?wikidata wdt:P950 ?bne .
?wikidata ?predicate ?object .
?object ?predicate2 ?object2 .
?object2 ?predicate3 ?object3
} LIMIT 10000
"""
# only_uri=True is passed so that non-string (literal / blank-node) elements
# are rejected by Dataset.add_element.
datos.load_dataset_from_query(query2, only_uri=True)
# Print the entity / relation / triple counts.
datos.show()

#datos.save_to_binary("aquimismo")


        
#print(len(jsonlist))
#jsonlist

6 entities, 10 relations, 0 tripletas


In [2]:
"http://www.wikidata.org/entity/Qxxx".split('/')

['http:', '', 'www.wikidata.org', 'entity', 'Qxxx']