In [34]:
from time import time

# Number of queries make_queries() is asked to run in parallel.
num_parallel = 100
# The same DQL query repeated 100 times serves as the benchmark workload.
queries = ['allUris:"barackobama.com"'] * 100


tic = time()
make_queries(queries, parallel=num_parallel)
# Report the parallelism that was actually used. The previous code formatted
# `x`, which is not defined anywhere in this cell and only worked through
# leftover kernel state (NameError on Restart & Run All).
print("num. of parallel queries: {}, num. of seconds: {:.2f}".format(num_parallel, time()-tic))

http://kg.diffbot.com/kg/dql_endpoint
num. of parallel queries: 100, num. of seconds: 41.99
num. of parallel queries: 32, num. of seconds: 42.59
num. of parallel queries: 10, num. of seconds: 44.59
num. of parallel queries: 3, num. of seconds: 47.48
num. of parallel queries: 1, num. of seconds: 53.65


In [161]:
from wikidata.client import Client
from traceback import format_exc


WIKIDATA_PREFIX = "wikidata.org/wiki/"
DBPEDIA_PREFIX = "http://dbpedia.org/resource/"
WIKIPEDIA_PREFIX = "wikipedia.org/wiki/"


def get_postfix(string, prefix):
    """ Return the part of `string` that follows the first occurrence
    of `prefix`, or None when `prefix` does not occur in `string`. """

    if prefix not in string:
        return None

    # Skip past the first occurrence of the prefix and keep the rest.
    start = string.index(prefix) + len(prefix)
    return string[start:]


def wikipedia2dbpedia(wikipedia_url):
    """ Convert a Wikipedia article URL into a DBpedia resource URI.

    The text following WIKIPEDIA_PREFIX is appended to DBPEDIA_PREFIX.
    When the prefix is not found, a warning is printed and the input
    is returned unchanged. """

    title = get_postfix(wikipedia_url, prefix=WIKIPEDIA_PREFIX)
    if title is not None:
        return DBPEDIA_PREFIX + title

    print("Warning: cannot convert to DBpedia URI '{}'".format(wikipedia_url))
    return wikipedia_url


def wikidataid2wikipedia(wikidata_q_id="Q42"):
    """ Return the English Wikipedia URL of a Wikidata entity.

    The entity is loaded through `wikidata.client.Client` and the value at
    attributes["sitelinks"]["enwiki"]["url"] is returned.

    Args:
        wikidata_q_id: Wikidata entity identifier, e.g. "Q42".

    Returns:
        The URL string, or "" when the entity has no such sitelink or when
        loading/processing the entity fails (a warning is printed in the
        latter case). """

    try:
        client = Client()
        entity = client.get(wikidata_q_id, load=True)
        attributes = entity.attributes
        # Membership tests (rather than chained .get()) keep this working
        # whatever container type each level turns out to be.
        has_url = ("sitelinks" in attributes and
                   "enwiki" in attributes["sitelinks"] and
                   "url" in attributes["sitelinks"]["enwiki"])
        if has_url:
            return attributes["sitelinks"]["enwiki"]["url"]
        return ""
    except Exception:
        # Best-effort lookup: report and fall back to "". Catching Exception
        # (not a bare except) lets KeyboardInterrupt and SystemExit propagate
        # with their original traceback instead of being swallowed/re-created.
        print("Warning: cannot process '{}'".format(wikidata_q_id))
        #print(format_exc())
        return ""
    
def get_wikidata_id(wikidata_url):
    """ Extract the entity ID that follows WIKIDATA_PREFIX in a Wikidata URL.

    Prints a warning and returns "" when the prefix is not present. """

    extracted = get_postfix(wikidata_url, prefix=WIKIDATA_PREFIX)
    if extracted is not None:
        return extracted

    print("Warning: cannot extract WikiData ID '{}'".format(wikidata_url))
    return ""

def wikidata2dbpedia(wikidata_uri):
    """ Map a Wikidata URI to a DBpedia URI via the English Wikipedia URL.

    If no Wikidata ID can be extracted from the URI, a warning is printed
    and the input URI is returned unchanged. """

    entity_id = get_wikidata_id(wikidata_uri)

    if entity_id == "":
        print("Warning: cannot extract DBpedia URI from a Wikidata URI")
        return wikidata_uri

    # Wikidata ID -> Wikipedia URL -> DBpedia URI
    return wikipedia2dbpedia(wikidataid2wikipedia(entity_id))
   
from time import time

# Time 100 sequential Wikidata -> DBpedia conversions (Q0 .. Q99).
tic = time()

for i in range(100):
    wikidata_uri = "http://wikidata.org/wiki/Q{}".format(i)
    dbpedia_uri = wikidata2dbpedia(wikidata_uri)
    print(wikidata_uri, dbpedia_uri)

# Elapsed wall-clock seconds for the whole loop.
print(time() - tic)

http://wikidata.org/wiki/Q0

http://wikidata.org/wiki/Q1
http://dbpedia.org/resource/Universe
http://wikidata.org/wiki/Q2
http://dbpedia.org/resource/Earth
http://wikidata.org/wiki/Q3
http://dbpedia.org/resource/Life
http://wikidata.org/wiki/Q4
http://dbpedia.org/resource/Death
http://wikidata.org/wiki/Q5
http://dbpedia.org/resource/Human
http://wikidata.org/wiki/Q6

http://wikidata.org/wiki/Q7

http://wikidata.org/wiki/Q8
http://dbpedia.org/resource/Happiness
http://wikidata.org/wiki/Q9

http://wikidata.org/wiki/Q10

http://wikidata.org/wiki/Q11

http://wikidata.org/wiki/Q12

http://wikidata.org/wiki/Q13
http://dbpedia.org/resource/Triskaidekaphobia
http://wikidata.org/wiki/Q14

http://wikidata.org/wiki/Q15
http://dbpedia.org/resource/Africa
http://wikidata.org/wiki/Q16
http://dbpedia.org/resource/Canada
http://wikidata.org/wiki/Q17
http://dbpedia.org/resource/Japan
http://wikidata.org/wiki/Q18
http://dbpedia.org/resource/South_America
http://wikidata.org/wiki/Q19
http://dbpedia.org/r

KeyboardInterrupt: 

In [119]:
from diffbot_api import *
from ttl import *

import json
from collections import namedtuple


# One entity-linking candidate, as built by link_by_importance() from a hit.
Candidate = namedtuple(
    "Candidate",
    [
        "score",  # the hit's "importance" value
        "name",   # the hit's "name"
        "wiki",   # result of find_wiki_uri() over the collected URIs (may be None)
        "types",  # the hit's "types"
        "names",  # the hit's "allNames"
        "uris",   # set of URIs collected for the hit
    ],
)


def find_wiki_uri(uris):
    """ Return the first URI (in iteration order) that contains
    "wikipedia.org", or None when there is no such URI. """

    wiki_uris = (candidate for candidate in uris if "wikipedia.org" in candidate)
    return next(wiki_uris, None)


def link_by_importance(diffbot_query_response):
    """ Rank the hits of a Diffbot query response by their importance.

    Args:
        diffbot_query_response: parsed response; its "data" entry is
            iterated as a sequence of hit dicts. Each hit must provide
            "allUris", and, when it has "importance", also "name", "types"
            and "allNames". "origin", "origins" and "wikipediaUri" are
            optional extra URI sources.

    Returns:
        A list of Candidate tuples, highest importance first. Hits without
        an "importance" value are skipped with a printed warning. """

    candidates = []
    for hit in diffbot_query_response["data"]:
        uris = set(hit["allUris"])
        if "origin" in hit:
            uris.add(hit["origin"])
        if "origins" in hit:
            # update() merges in place; the previous union() call built a
            # new set and discarded it, so the origins were silently lost.
            uris.update(hit["origins"])
        if "wikipediaUri" in hit:
            uris.add(hit["wikipediaUri"])

        if "importance" not in hit:
            print("Warning: Skipping a hit without importance value.")
            continue

        candidates.append(Candidate(hit["importance"],
                                    hit["name"],
                                    find_wiki_uri(uris),
                                    hit["types"],
                                    hit["allNames"],
                                    uris))

    # Sort on the score only. Comparing whole tuples falls through to the
    # remaining fields on ties, where None-vs-str in `wiki` raises TypeError
    # and sets in `uris` have no total order.
    return sorted(candidates, key=lambda candidate: candidate.score, reverse=True)
    

# Comma-separated surface forms to link; turned into Phrase objects below.
phrases = "New York, windows, catwalk, teardrops, commuters, curved, floor, shape, video, walls, Grand Central Terminal"
# Each Phrase receives the stripped text, 1, the text length and
# "http://" + text (presumably text, begin, end and subject URI -- TODO
# confirm against the Phrase definition in ttl).
phrases = [Phrase(text, 1, len(text), "http://" + text)
           for text in (raw.strip() for raw in phrases.split(","))]
# Sentence the phrases were taken from.
context = "Inside, it’s even wackier: curved walls, windows in the shape of teardrops, and a catwalk with a tiny video screen embedded in the floor that shows an endless loop of antlike commuters rushing through Grand Central Terminal in New York."
phrases


cq = CachedQuery()

# def mfs(input_ttl):
# graph, context, phrases = parse_d2kb_ttl(input_ttl)

# print("# triples input:", len(graph))
try:
    for phrase in phrases:
        # get candidate wikipedia links from the Diffbot KB
        r = cq.make_query('name:"{}"'.format(phrase.text))
        db_response = json.loads(r.content)
        candidates = link_by_importance(db_response) # context here does not matter

        # A query may yield no ranked candidates; indexing [0] would then
        # raise IndexError and abort the remaining phrases.
        if not candidates:
            print("Warning: no candidates found for '{}'".format(phrase.text), end="\n\n")
            continue

        best = candidates[0]
        print("{}: {}: {}: {}".format(phrase.text, best.name, ", ".join(best.types), best.uris), end="\n\n")

    #     # save the results
    #     graph.add( (phrase.subj, CLASS_URI, NONE_URI) )
    #     graph.add( (phrase.subj, LINK_URI, NONE_URI) )
    # print("# triples output:", len(graph))

    # output_ttl = str(graph.serialize(format='n3', encoding="utf-8"), "utf-8")
    #return output_ttl
finally:
    # Always release the query object, even if a request or the JSON
    # parsing fails part-way through the loop.
    cq.close()

New York: New York University: Organization, EducationalInstitution: {'stern.nyu.edu\nhttp://bakos.us', 'yellowpages.com/new-york-ny/mip/new-york-university-868814', 'scps.nyu.edu', 'wikidata.org/entity/Q49210', 'yelp.com/biz/new-york-university-new-york-18', 'facebook.com/103256838688', 'crunchbase.com/organization/new-york-university', 'yellowpages.com/new-york-ny/mip/new-york-university-462705593', 'twitter.com/nyuniversity', 'instagram.com/nyuniversity', 'nyu.edu/employees/resources-and-services/dining-catering/torch-club.html\nhttp://www.stern.nyu.edu', 'foursquare.com/v/new-york-university/5058de7be4b007870dd7ad7a', 'angel.co/new-york-university-1', 'yellowpages.com/new-york-ny/mip/new-york-university-523861823', 'linkedin.com/company/3159', 'nyu.edu', 'illusionstoinference.org\nhttp://unionsquarelaser.com\nhttp://pesaranlab.org\nhttp://stern.nyu.edu', 'linkedin.com/company/new-york-university', 'yellowpages.com/new-york-ny/mip/new-york-university-533090312', '/new-york-universit

In [None]:
# NOTE(review): scratch cell holding two bare names; running it raises
# NameError unless `gct` and `russians` are defined in the kernel --
# presumably leftover notes, confirm and remove or convert to markdown.
gct
russians

In [19]:
# Enable IPython's autoreload extension in mode 2.
# NOTE(review): this configuration cell sits below the cells that import the
# local modules; it presumably belongs at the top of the notebook.
%load_ext autoreload
%autoreload 2

# DQL

In [None]:
# Compare how the same entity is found through different URI fields.
query_and_save('allUris:"barackobama.com"', "data/all-uris.json")
query_and_save('wikipediaUri:"en.wikipedia.org/wiki/Barack_Obama"', "data/wiki-uri.json")
# The backslash is now written as an explicit `\\`: a bare `\_` is an invalid
# escape sequence (a warning today, slated to become an error). The string sent
# to the endpoint is unchanged.
# NOTE(review): confirm the backslash before `_` is intended for DQL and is not
# an export artifact; the sibling queries use a plain underscore.
query_and_save('allUris:"en.wikipedia.org/wiki/Barack\\_Obama"', "data/all-uris-wiki.json")
query_and_save('origins:"en.wikipedia.org/wiki/Barack_Obama"', "data/origins.json")

In [None]:
# One result file per entity type.
for entity_type in entity_types:
    type_query = 'type:{}'.format(entity_type)
    type_fpath = "data/{}.json".format(entity_type)
    query_and_save(query=type_query, output_fpath=type_fpath)

# Person queries: by name, by employer, by nested employment constraints.
person_queries = [
    ('type:Person name:"Alexander Panchenko"',
     "data/ap.json"),
    ('type:Person employments.employer.name:"Diffbot"',
     "data/diffbot-employees.json"),
    ('type:Person employments.{title:"CEO" employer.name:"Diffbot"}',
     "data/diffbot-ceo.json"),
    ('type:Person employments.{employer.name:"Diffbot" isCurrent:true}',
     "data/diffbot-current-employees.json"),
]

for person_query, person_fpath in person_queries:
    query_and_save(query=person_query, output_fpath=person_fpath)


# Testing type of links

In [None]:
# Save the results for a few well-known people, one file per person.
link_test_queries = [
    ('type:Person name:"Angela Merkel"', "data/am.json"),
    ('type:Person name:"Barack Obama"', "data/bo.json"),
    ('type:Person name:"Nicolas Sarkozy"', "data/ns.json"),
    ('type:Person name:"Diego Maradona"', "data/dm.json"),
]

for link_query, link_fpath in link_test_queries:
    query_and_save(query=link_query, output_fpath=link_fpath)