In [None]:
import requests
import string
import re
import urllib.parse
import csv
import time
from py2neo import Graph
from queue import Queue
import progressbar
import numpy as np
import os
import pyparsing as pp

def runWithRetry(query, max_retries=None):
    """Run a Cypher query on the module-level ``graph``, retrying on failure.

    Args:
        query: Query string passed unchanged to ``graph.run``.
        max_retries: Maximum number of retries after the first failed
            attempt. ``None`` (the default) retries indefinitely, which is
            the historical behaviour of this function.

    Returns:
        Whatever ``graph.run(query)`` returns.

    Raises:
        NameError: If no module-level ``graph`` has been defined.
        Exception: The last error from ``graph.run`` once ``max_retries``
            is exhausted.
    """
    # Resolve the connection outside the retry loop so that a missing
    # ``graph`` fails immediately instead of being retried forever.
    connection = graph
    retries = 0
    # Iterate rather than recurse: a long outage must not end in a
    # RecursionError.
    while True:
        try:
            return connection.run(query)
        except Exception as e:
            # e.args can be empty, in which case e.args[0] would itself raise.
            print(e.args[0] if e.args else e)
            if max_retries is not None and retries >= max_retries:
                raise
            retries += 1
            time.sleep(1)

def getWithRetry(url, timeout=None):
    """Issue an HTTP GET for ``url``, retrying once per second until it succeeds.

    Args:
        url: Fully formed URL to request.
        timeout: Optional timeout forwarded to ``requests.get``. The default
            ``None`` keeps the previous behaviour (no timeout), which means a
            stalled connection can block indefinitely; pass a number of
            seconds to make stalled requests fail and be retried.

    Returns:
        The ``requests`` response object of the first attempt that does not
        raise. The HTTP status code is not inspected here.
    """
    headers = {"User-Agent":"Wiki_NLP/0.0 (https://github.com/greenguy33/wikidata-subgraph-builder; hfreedma@uci.edu)"}
    # Iterate rather than recurse so an extended outage cannot exhaust the
    # interpreter's recursion limit.
    while True:
        try:
            return requests.get(url, headers=headers, timeout=timeout)
        except Exception:
            # Deliberately broad: this is a best-effort fetch that keeps
            # retrying whatever the failure was.
            print("Connection timed out and retrying URL: ", url)
            time.sleep(1)
    
#graph = Graph("http://localhost:7474/db/data/")
    
def main():
    """Build the link dataset for one domain and split it into training/test files.

    Steps:
      1. Unless ``data/output/<name>_all_results.txt`` already exists, collect
         the page network, parse the pages and cache the extracted link rows
         in that file.
      2. Reload the cached rows and look up Wikidata connections for them.
      3. Write the labelled rows to ``data/output/training/`` and the
         unlabelled rows to ``data/output/test/``.
    """
    output_filename = "roman_history"
    primaryDomain = "History of Rome"
    repo_name = "na"

    # Create the output directories up front; otherwise the first run fails
    # with FileNotFoundError when opening the result files.
    for directory in ("data/output/", "data/output/training/", "data/output/test/"):
        os.makedirs(directory, exist_ok=True)

    all_results_path = "data/output/"+output_filename+"_all_results.txt"

    if not os.path.exists(all_results_path):
        # To seed the crawl with strongly linked pages instead, use
        # getStrongLinks(primaryDomain) here.
        strongLinks = [primaryDomain]
        networkHops = 1
        networkList, redirectMap = getWikipediaNetworkList(strongLinks, networkHops)
        linkData = parseWikipediaPagesInNetwork(networkList, redirectMap)

        with open(all_results_path, 'w', newline='', encoding = "utf-8") as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            writer.writerows(linkData)

    # Always reload from disk so cached and freshly built runs take the same path.
    # newline='' lets the csv module handle line endings itself, as its
    # documentation requires for files passed to csv.reader.
    with open(all_results_path, 'r', newline='', encoding = "utf-8") as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        linkData = list(reader)

    training_data, test_data = checkWikidataForConnections(linkData, repo_name)

    print("final training size: ", len(training_data))
    headers = ['Origin Page','Destination Page','Link Text','Sentence Text','Wikidata Property Label','Direction']
    with open("data/output/training/"+output_filename+"_training.txt", 'w', newline='', encoding = "utf-8") as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(headers)
        writer.writerows(training_data)

    print("final test size: ", len(test_data))
    headers = ['Origin Page','Destination Page','Link Text','Sentence Text']
    with open("data/output/test/"+output_filename+"_test.txt", 'w', newline='', encoding = "utf-8") as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(headers)
        writer.writerows(test_data)

# Neo4j Cypher lookup
# Handles page redirects cleanly
def getWikipediaNetworkList(domains, networkHops):
    """Collect the pages linked (in either direction) to each seed domain.

    A breadth-first walk is run from every entry in ``domains``; pages whose
    layer is below ``networkHops`` are expanded through ``runWithRetry``.

    Args:
        domains: Iterable of seed page titles.
        networkHops: Number of link hops to follow from each seed.

    Returns:
        A tuple ``(networkList, redirectMap)``: the set of collected page
        titles, and a dict mapping each title judged to be a redirect to the
        title it points at.
    """
    param_query = "MATCH (source:Page {title: \"subj\"}) {direction} (target:Page) RETURN target.title"

    def buildQuery(title, direction):
        # Escape backslashes before quotes so a backslash in a title cannot
        # swallow the closing quote or form an unintended escape sequence.
        escaped = title.replace("\\", "\\\\").replace("\"", "\\\"")
        # Substitute the direction first so the later title substitution can
        # never be rewritten by it.
        return param_query.replace("{direction}", direction).replace("subj", escaped)

    networkList = set()
    redirectMap = {}
    for domain in domains:
        queue = Queue()
        queue.put((domain, 0))
        networkList.add(domain)
        while not queue.empty():
            element, layer = queue.get()
            # Layers never decrease along the queue, so once one entry is too
            # deep every remaining entry is as well.
            if layer >= networkHops:
                break
            neighbours = runWithRetry(buildQuery(element, "- [:Link] -")).data()
            for res in neighbours:
                page = res["target.title"]
                # Get only forward links to see if page is a redirect:
                # a page with exactly one outgoing link is treated as one.
                forward_links = runWithRetry(buildQuery(page, "- [:Link] -> ")).data()
                if len(forward_links) == 1:
                    redirected_page = forward_links[0]["target.title"]
                    redirectMap[page] = redirected_page
                    page = redirected_page
                networkList.add(page)
                queue.put((page, layer + 1))
    print("Total network size: ", str(len(networkList)))
    return networkList, redirectMap

# Request and parse Wikipedia HTML
# Only parses introduction/abstract section of pages
# A shortcoming is that this code will fail to parse link text with a . in it
def parseWikipediaPagesInNetwork(networkList, redirectMap):
    """Fetch each page in the network and extract its in-network links.

    Only the text between the first ``<p>`` and the first ``<h2`` of the
    rendered page is examined. That text is split into rough sentences, and
    every ``/wiki/`` link whose target is in ``networkList`` yields one row.

    Args:
        networkList: Collection of page titles (with spaces, not underscores).
        redirectMap: Dict mapping redirecting titles to their targets; a
            title found here is replaced by its target before fetching.

    Returns:
        A list of rows ``[origin page, destination page, link text,
        sentence text]``. The sentence text has markup and punctuation
        stripped.
    """
    punctuationTable = str.maketrans('', '', string.punctuation)
    linkData = []
    for line in progressbar.progressbar(networkList, redirect_stdout=True):
        if line in redirectMap:
            line = redirectMap[line]
            print("Redirected to: ", line)
        pageTitle = urllib.parse.quote(line)
        getUrl = f"https://en.wikipedia.org/w/api.php?action=parse&page={pageTitle}&format=json"
        jsonRes = getWithRetry(getUrl).json()
        if 'parse' not in jsonRes:
            print("Could not retrieve data for page: ", line)
            continue
        pageHtml = jsonRes['parse']['text']['*']
        absStart = pageHtml.split("<p>", 1)
        if len(absStart) == 1:
            print("Unable to parse page: ", line)
            continue
        absText = absStart[1].split("<h2")[0]
        for sentence in re.split(r'\n|\. |\.<sup id=', absText):
            linkSplit = sentence.split("<a href=")
            if len(linkSplit) <= 1:
                continue
            for link in linkSplit:
                if not link.startswith("\"/wiki/"):
                    continue
                destinationPage = link.split("\"/wiki/")[1].split("\"")[0]
                # hrefs are percent-encoded while networkList holds raw
                # titles; decode so titles containing non-ASCII characters
                # or apostrophes can match.
                destinationPage = urllib.parse.unquote(destinationPage)
                if destinationPage.replace("_"," ") not in networkList:
                    continue
                if ">" not in link or "<" not in link:
                    print("failed to parse link text: ", link)
                    continue
                linkTitle = link.split(">")[1].split("<")[0]
                sentString = re.sub('<[^>]+>', '', sentence)
                sentString = sentString.translate(punctuationTable)
                # NOTE(review): presumably removes citation markers such as
                # "&#91;12&#93;" once their punctuation is stripped - confirm.
                sentString = re.sub('91[0-9]+93', '', sentString)
                # '|' is the CSV quote character used downstream.
                destinationPage = destinationPage.replace("|","")
                linkTitle = linkTitle.replace("|","")
                if sentString.startswith("citeref"):
                    parts = sentString.split(" ", 2)
                    if len(parts) < 3:
                        # Nothing follows the citation prefix; previously this
                        # raised IndexError and aborted the whole crawl.
                        continue
                    sentString = parts[2]
                if "redirect" not in sentString:
                    newRow = [line.replace("_"," "), destinationPage.replace("_"," "), linkTitle, sentString]
                    linkData.append(newRow)
    return linkData

# Query Wikidata SPARQL Endpoint
def checkWikidataForConnections(linkData, repo_name):
    """Label link rows with the Wikidata properties connecting their two pages.

    Page titles are mapped to Wikidata IDs through ``data/wp2wd.txt``. Rows
    are sent to the Wikidata SPARQL endpoint in batches, asking for any
    property between the two entities in either direction.

    Args:
        linkData: List of rows ``[origin, destination, link text, sentence]``.
            The first three fields of each row are stripped of punctuation
            in place.
        repo_name: Unused; kept for callers. It belonged to a commented-out
            query URL for a local repository endpoint.

    Returns:
        A tuple ``(training_data, test_data)``. Training rows are the input
        row plus ``[property, "Forwards" | "Backwards"]``, one per property
        found. Test rows are the input rows for which no property was found,
        including rows whose pages have no known Wikidata ID.

    Raises:
        Exception: If the endpoint response contains a 429 or 414 error, or
            if a result cannot be matched to a row of the current batch.
    """
    print("Querying Wikidata")
    queryHead = "https://query.wikidata.org/bigdata/namespace/wdq/sparql?query="
    querySelect = "select ?s ?p ?o where {"
    batchSize = 20
    punctuationTable = str.maketrans('', '', string.punctuation)
    training_data = []
    test_data = []

    wp2wd = {}
    # load wp2wd map: the last field is the Wikidata ID, the preceding
    # fields are the page title, which may itself have contained commas.
    with open("data/wp2wd.txt", 'r', encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        for row in reader:
            if not row:
                # Blank line: row[-1] below would raise IndexError.
                continue
            wikiString = ""
            for idx in range(len(row) - 1):
                if idx != 0 and row[idx] != row[-1]:
                    wikiString += ","
                wikiString += row[idx]
            wp2wd[wikiString] = row[-1]

    unionClauses = []
    # Maps "wd:<subject>wd:<object>" to every row of the batch with that
    # pair, so rows sharing a pair no longer overwrite each other.
    itemsInQuery = {}

    for i in progressbar.progressbar(range(len(linkData)), redirect_stdout=True):
        row = linkData[i]
        subjTitle = row[0]
        objTitle = row[1]
        subject_present = subjTitle in wp2wd
        object_present = objTitle in wp2wd
        if not subject_present:
            print("Failed to find Wikidata ID for ", subjTitle)
        if not object_present:
            print("Failed to find Wikidata ID for ", objTitle)

        row[0] = row[0].translate(punctuationTable).replace("httpsenwikipediaorgwiki","")
        row[1] = row[1].translate(punctuationTable).replace("httpsenwikipediaorgwiki","")
        row[2] = row[2].translate(punctuationTable)

        if subject_present and object_present:
            subj = "wd:" + wp2wd[subjTitle]
            obj = "wd:" + wp2wd[objTitle]
            key = subj + obj
            if key not in itemsInQuery:
                unionClauses.append("{VALUES ?s {"+subj+"} VALUES ?o {"+obj+"} ?s ?p ?o } UNION { VALUES ?s {"+obj+"} VALUES ?o {"+subj+"} ?s ?p ?o }")
            itemsInQuery.setdefault(key, []).append(row)
        else:
            # Without both IDs the row cannot be queried. Previously it was
            # stored under the previous row's key (or raised NameError) and
            # left an unclosed "{" in the query. It is now kept as unlabelled
            # data and left out of the query.
            # NOTE(review): confirm such rows belong in the test set.
            test_data.append(row)

        endOfBatch = (i % batchSize == 0 and i != 0) or i == len(linkData) - 1
        if not endOfBatch or not itemsInQuery:
            continue

        queryTail = urllib.parse.quote(querySelect + " UNION ".join(unionClauses) + "}")
        # str() of the raw bytes is kept because the marker checks and the
        # splitting below were written against that representation.
        res = str(getWithRetry(queryHead + queryTail).content)
        # This is probably the worst error handling possible
        if "Error: 429" in res or "414 Request-URI Too Large" in res:
            print(res)
            raise Exception("Server returned error")

        # WIKIDATA QUERY SERVICE PARSER: collect every <uri> value in order.
        elements = []
        if "<result>" in res:
            for resultChunk in res.split("<result>")[1:]:
                for uriChunk in resultChunk.split("<uri>")[1:]:
                    elements.append(uriChunk.split("</uri>")[0].replace("http://www.wikidata.org/entity/","").replace("http://www.wikidata.org/prop/direct/",""))
        # NOTE(review): assumes each result lists its three URIs in the order
        # subject, object, property - confirm against the endpoint's output.
        reshape = np.reshape(elements, (-1,3))

        matchedKeys = set()
        for datarow in reshape:
            resSubj = "wd:"+datarow[0]
            resObj = "wd:"+datarow[1]
            wdp = datarow[2]
            if (resSubj+resObj in itemsInQuery):
                matchKey = resSubj+resObj
                direction = "Forwards"
            elif (resObj+resSubj in itemsInQuery):
                matchKey = resObj+resSubj
                direction = "Backwards"
            else:
                raise Exception("Did not find row from query result in map: ", resSubj, " ", resObj)
            for thisRow in itemsInQuery[matchKey]:
                training_data.append([thisRow[0], thisRow[1], thisRow[2], thisRow[3], wdp, direction])
            matchedKeys.add(matchKey)

        # Rows of the batch for which no property was returned become test data.
        for key, rows in itemsInQuery.items():
            if key not in matchedKeys:
                test_data.extend(rows)

        itemsInQuery = {}
        unionClauses = []

    return training_data, test_data

# Neo4j Cypher lookup
# DOES NOT Handle page redirects cleanly (as of now)
def getStrongLinks(domain):
    """Return ``domain`` plus every page joined to it by two distinct links.

    Args:
        domain: Title of the source page.

    Returns:
        A list starting with ``domain`` followed by the distinct
        ``target.title`` values returned by the query.
    """
    print("Finding strong links for: ", domain)
    # Escape backslashes and quotes so the title cannot terminate or alter
    # the string literal it is embedded in.
    escaped_domain = domain.replace("\\", "\\\\").replace("\"", "\\\"")
    query = "MATCH (source:Page {title: \""+escaped_domain+"\"}) - [link1:Link] - (target:Page) - [link2:Link] - (source) WHERE link1 <> link2 RETURN distinct target.title"
    query_data = runWithRetry(query).data()
    strong_links = [domain] + [res["target.title"] for res in query_data]
    print("Found ", str(len(strong_links)), " strong links")
    return strong_links

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Querying Wikidata


  7% (3661 of 49696) |#                  | Elapsed Time: 0:02:44 ETA:   0:40:03

Connection timed out and retrying URL:  https://query.wikidata.org/bigdata/namespace/wdq/sparql?query=select%20%3Fs%20%3Fp%20%3Fo%20where%20%7B%7BVALUES%20%3Fs%20%7Bwd%3AQ17036918%7D%20VALUES%20%3Fo%20%7Bwd%3AQ12560%7D%20%3Fs%20%3Fp%20%3Fo%20%7D%20UNION%20%7B%20VALUES%20%3Fs%20%7Bwd%3AQ12560%7D%20VALUES%20%3Fo%20%7Bwd%3AQ17036918%7D%20%3Fs%20%3Fp%20%3Fo%20%7DUNION%20%7BVALUES%20%3Fs%20%7Bwd%3AQ17036918%7D%20VALUES%20%3Fo%20%7Bwd%3AQ822%7D%20%3Fs%20%3Fp%20%3Fo%20%7D%20UNION%20%7B%20VALUES%20%3Fs%20%7Bwd%3AQ822%7D%20VALUES%20%3Fo%20%7Bwd%3AQ17036918%7D%20%3Fs%20%3Fp%20%3Fo%20%7DUNION%20%7BVALUES%20%3Fs%20%7Bwd%3AQ17036918%7D%20VALUES%20%3Fo%20%7Bwd%3AQ71084%7D%20%3Fs%20%3Fp%20%3Fo%20%7D%20UNION%20%7B%20VALUES%20%3Fs%20%7Bwd%3AQ71084%7D%20VALUES%20%3Fo%20%7Bwd%3AQ17036918%7D%20%3Fs%20%3Fp%20%3Fo%20%7DUNION%20%7BVALUES%20%3Fs%20%7Bwd%3AQ17036918%7D%20VALUES%20%3Fo%20%7Bwd%3AQ173065%7D%20%3Fs%20%3Fp%20%3Fo%20%7D%20UNION%20%7B%20VALUES%20%3Fs%20%7Bwd%3AQ173065%7D%20VALUES%20%3Fo%20%7Bwd%3AQ1

  7% (3801 of 49696) |#                  | Elapsed Time: 0:03:11 ETA:   0:30:14

Failed to find Wikidata ID for  Metelli


 10% (5161 of 49696) |#                  | Elapsed Time: 0:04:13 ETA:   0:33:46

Failed to find Wikidata ID for  Maecenas


 14% (7181 of 49696) |##                 | Elapsed Time: 0:05:43 ETA:   0:33:40

Failed to find Wikidata ID for  Alexander VI
Failed to find Wikidata ID for  Alexander VI


 15% (7601 of 49696) |##                 | Elapsed Time: 0:06:02 ETA:   0:36:12

Failed to find Wikidata ID for  Occitan


 28% (14221 of 49696) |#####             | Elapsed Time: 0:11:05 ETA:   0:26:20

Failed to find Wikidata ID for  Alexander VI


 36% (18141 of 49696) |######            | Elapsed Time: 0:14:05 ETA:   0:19:52

Failed to find Wikidata ID for  Founding myth


 37% (18841 of 49696) |######            | Elapsed Time: 0:14:36 ETA:   0:24:57

Failed to find Wikidata ID for  Occitan


 38% (18981 of 49696) |######            | Elapsed Time: 0:14:42 ETA:   0:23:50

Failed to find Wikidata ID for  Maecenas


 39% (19561 of 49696) |#######           | Elapsed Time: 0:15:07 ETA:   0:22:25

Failed to find Wikidata ID for  Gaius Calpurnius Piso


 42% (20921 of 49696) |#######           | Elapsed Time: 0:16:10 ETA:   0:19:19

Failed to find Wikidata ID for  Palazzo Nuovo


 51% (25681 of 49696) |#########         | Elapsed Time: 0:19:46 ETA:   0:18:08

Failed to find Wikidata ID for  Occitan


 52% (26241 of 49696) |#########         | Elapsed Time: 0:20:09 ETA:   0:15:50

Failed to find Wikidata ID for  Gaius Calpurnius Piso


 55% (27521 of 49696) |#########         | Elapsed Time: 0:21:05 ETA:   0:16:24