In [1]:
!pip install networkx
!pip install rdflib
!pip install numpy
!pip install sparqlwrapper

Collecting networkx
[?25l  Downloading https://files.pythonhosted.org/packages/f3/f4/7e20ef40b118478191cec0b58c3192f822cace858c19505c7670961b76b2/networkx-2.2.zip (1.7MB)
[K    100% |████████████████████████████████| 1.7MB 10.1MB/s ta 0:00:01
Building wheels for collected packages: networkx
  Running setup.py bdist_wheel for networkx ... [?25ldone
[?25h  Stored in directory: /Users/alexyoo/Library/Caches/pip/wheels/68/f8/29/b53346a112a07d30a5a84d53f19aeadaa1a474897c0423af91
Successfully built networkx
Installing collected packages: networkx
Successfully installed networkx-2.2
[33mYou are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting rdflib
[?25l  Downloading https://files.pythonhosted.org/packages/3c/fe/630bacb652680f6d481b9febbb3e2c3869194a1a5fc3401a4a41195a2f8f/rdflib-4.2.2-py3-none-any.whl (344kB)
[K    100% |████████████████████████████████| 348kB 9.6MB/s eta 0:00:01
[?2

In [2]:
import rdflib
import numpy as np
from collections import Counter
from SPARQLWrapper import SPARQLWrapper, JSON
import networkx as nx
import requests

In [3]:
def query_wiki_article_title(query):
    params = {
    'action':"query",
    'list':"search",
    'srsearch': query,
    'format':"json"
    }
    resp = requests.get("https://en.wikipedia.org/w/api.php", params)
    if resp.status_code != 200:
        return None
    results = resp.json()
    if len(results):
        return results['query']['search'][0]['title'].replace(" ", "_")

In [4]:
def get_link_set(article_link):
    q ="""PREFIX p: <http://www.wikidata.org/prop/>
    SELECT DISTINCT ?thing ?relation
    WHERE { ?thing ?relation <"""+ article_link +"""> . }
    LIMIT 10"""
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setReturnFormat(JSON)
    sparql.setQuery(q)  # the previous query as a literal string
    json = sparql.query().convert()
    links = json["results"]["bindings"]
    return set([obj["thing"]["value"] for obj in links]) 

In [5]:
def get_link_set_outlinks(article_link):
    q ="""PREFIX dbr: <http://dbpedia.org/resource/>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    SELECT DISTINCT ?p ?o
    WHERE {
    <""" + article_link + """> ?p ?o . 
    ?o a owl:Thing . 
    }
    LIMIT 10"""
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setReturnFormat(JSON)
    sparql.setQuery(q)  # the previous query as a literal string
    json = sparql.query().convert()
    links = json["results"]["bindings"]
    return set([obj["o"]["value"] for obj in links]) 

In [6]:
def create_graph(raw_titles, get_link_set_fn):
    G = nx.Graph()
    for cand in raw_titles:
        formatted_cand = query_wiki_article_title(cand)
        candlink = "http://dbpedia.org/resource/" + formatted_cand
        G.add_node(candlink)
        linkSet = get_link_set_fn(candlink)
        for link in linkSet:
            G.add_node(link)
            # Explore 1x time
            deg2linkSet = get_link_set_fn(link)
            for link2 in deg2linkSet:
                G.add_node(link2)
                G.add_edge(link, link2)
            # Create edges to links
            G.add_edge(candlink, link)
    return G

In [16]:
def generate_graph_pickles(output_candidates_fname="output_candidates"):
    with open(output_candidates_fname) as f:
        contents = f.readlines()
        contents = [x.strip() for x in contents] 
        for i, titles in enumerate(contents):
            print ("creating topic " + str(i) + " pickle file")
            G = create_graph(titles.split(), get_link_set)
            fname = "Topics/topic" +str(i)+ "G"
            nx.write_gml(G, fname)
        f.close()

In [48]:
def extract_ranked_cands(seed_cands, top_links):
    """
    Extract the generated candidates' rank out from the raw order after
    DBpedia exploration.

    :param seed_cands: list of generated candidates
    :param top_links: list of (link, score) tuples 
    """
    cand_pool = set(seed_cands)
    print (cand_pool)
    top_cands = []
    for raw_link, _ in top_links:
        last_sep = raw_link.rfind("/")
        raw_title = raw_link[last_sep + 1:]
        title = raw_title.lower()        
        if title in cand_pool:
            top_cands.append(title)
    return top_cands

In [49]:
def get_best_label(label_list,num):
    fname = "Topics/topic" +str(num)+ "G"
    G = nx.read_gml(fname)
    Gc = max(nx.connected_component_subgraphs(G), key=len)
    
    centrality_measure = nx.betweenness_centrality(Gc)
    top_links = sorted(centrality_measure.items(), key=lambda x: x[1], reverse=True)
    cands_ranks = extract_ranked_cands(label_list, top_links)
    return cands_ranks

In [50]:
get_best_label(d[0],0)

{'oracle_database', 'microsoft_exchange_server', 'sun_microsystems', 'web_application', 'postgresql', 'virtualization', 'operating_system', 'hypervisor', 'sharepoint', 'windows_server_2003', 'windows_2000', 'cloud_computing', 'hyper-v', 'windows_server_2008', 'vmware', 'application_server', 'netware', 'desktop_virtualization', 'microsoft_sql_server'}


['vmware',
 'sharepoint',
 'windows_2000',
 'oracle_database',
 'hyper-v',
 'virtualization',
 'postgresql',
 'cloud_computing',
 'hypervisor',
 'sun_microsystems',
 'microsoft_exchange_server',
 'microsoft_sql_server',
 'windows_server_2008',
 'netware',
 'windows_server_2003',
 'application_server',
 'web_application',
 'operating_system',
 'desktop_virtualization']

In [10]:
mygraph = nx.read_gml("Topics/topic0G")

In [12]:
Gc = max(nx.connected_component_subgraphs(mygraph), key=len)

In [13]:
clo_gen = nx.betweenness_centrality(Gc)

In [14]:
sorted(clo_gen.items(), key=lambda x: x[1], reverse=True)[:20]

[('http://dbpedia.org/resource/Quest_Software', 0.16633517915829701),
 ('http://dbpedia.org/resource/History_of_operating_systems',
  0.1103421187817401),
 ('http://dbpedia.org/resource/VMware', 0.10076071149512589),
 ('http://dbpedia.org/resource/SharePoint', 0.09884272261551647),
 ('http://dbpedia.org/resource/Windows_2000', 0.0936466166606129),
 ('http://dbpedia.org/resource/Oracle_Database', 0.09075914437434665),
 ('http://dbpedia.org/resource/Batch_processing', 0.08844876038028432),
 ('http://dbpedia.org/resource/Hyper-V', 0.08765223583792187),
 ('http://dbpedia.org/resource/Virtualization', 0.08568079725357391),
 ('http://dbpedia.org/resource/PostgreSQL', 0.08538075306424328),
 ('http://dbpedia.org/resource/Cloud_computing', 0.08347109933432417),
 ('http://dbpedia.org/resource/Hypervisor', 0.08019741316299675),
 ('http://dbpedia.org/resource/List_of_computing_and_IT_abbreviations',
  0.07717333723838925),
 ('http://dbpedia.org/resource/Sun_Microsystems', 0.07377822512217176),
 ('