In [1]:
import requests
import string
import re
import urllib.parse
import csv
import time
from py2neo import Graph
from queue import Queue
import progressbar
import numpy as np
import os
import pyparsing as pp

def runWithRetry(query, max_retries=None):
    """Run a Cypher query on the module-level ``graph``, retrying on failure.

    Each failed attempt prints the error, waits one second and tries again.
    The retry is a loop rather than recursion, so a long outage cannot end in
    ``RecursionError``.

    Args:
        query: Cypher query string passed to ``graph.run``.
        max_retries: Maximum number of retries after the first failure.
            ``None`` (the default) retries forever, matching the historical
            behaviour of this function.

    Returns:
        Whatever ``graph.run(query)`` returns.

    Raises:
        Exception: The last error from ``graph.run`` once ``max_retries``
            retries have been used up (never raised when ``max_retries`` is
            ``None``).
    """
    failures = 0
    while True:
        try:
            return graph.run(query)
        except Exception as e:
            # Some exceptions carry no args; fall back to the exception itself
            # instead of raising IndexError inside the handler.
            print(e.args[0] if e.args else e)
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            time.sleep(1)

def getWithRetry(url, max_retries=None, timeout=None):
    """Issue an HTTP GET with ``requests``, retrying on any failure.

    Each failed attempt prints a message, waits one second and tries again.
    The retry is a loop rather than recursion, so a long outage cannot end in
    ``RecursionError``.

    Args:
        url: URL to fetch.
        max_retries: Maximum number of retries after the first failure.
            ``None`` (the default) retries forever, as before.
        timeout: Forwarded to ``requests.get``. ``None`` (the default) means
            no timeout, which is what the original call did.

    Returns:
        The response object returned by ``requests.get``.

    Raises:
        Exception: The last error from ``requests.get`` once ``max_retries``
            retries have been used up (never raised when ``max_retries`` is
            ``None``).
    """
    failures = 0
    while True:
        try:
            return requests.get(url, timeout=timeout)
        except Exception:
            print("Connection timed out and retrying URL: ", url)
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            time.sleep(1)
    
# Module-level py2neo Graph handle, created when this file is loaded.
# runWithRetry() sends every Cypher query through this object.
graph = Graph("http://localhost:7474/db/data/")
    
def main():
    """Build (or reload) the link data set for the "History of Rome" domain.

    If the results file does not exist yet, the strong links of the primary
    domain are looked up, the Wikipedia network around them is expanded by one
    hop, every page in that network is parsed, and the resulting rows are
    written to ``data/output/<output_filename>_all_results.txt``. The file is
    then read back into ``linkData``.

    The Wikidata matching step that consumes ``linkData`` is currently
    disabled (see the commented-out code at the end).
    """
    output_filename = "roman_history_results"
    primaryDomain = "History of Rome"
    repo_name = "roman_history_wd_links"
    results_path = "data/output/" + output_filename + "_all_results.txt"

    if not os.path.exists(results_path):
        # Create the output directory up front so a missing directory fails
        # now and not after the long crawl has finished.
        os.makedirs(os.path.dirname(results_path), exist_ok=True)

        strongLinks = getStrongLinks(primaryDomain)
        #strongLinks = [primaryDomain]
        # "Rome" is excluded from the seeds; tolerate it not being present.
        if "Rome" in strongLinks:
            strongLinks.remove("Rome")
        networkHops = 1
        networkList, redirectMap = getWikipediaNetworkList(strongLinks, networkHops)
        linkData = parseWikipediaPagesInNetwork(networkList, redirectMap)

        with open(results_path, 'w', newline='', encoding="utf-8") as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            writer.writerows(linkData)

    # newline='' lets the csv module do its own line-ending handling, mirroring
    # how the file is written above.
    with open(results_path, 'r', newline='', encoding="utf-8") as csvfile:
        linkData = list(csv.reader(csvfile, delimiter=',', quotechar='|'))

    #training_data, test_data = checkWikidataForConnections(linkData, repo_name)

    #print("final training size: ", len(training_data))
    #headers = ['Origin Page','Destination Page','Link Text','Sentence Text','Wikidata Property Label','Direction']
    #with open("data/output/training/"+output_filename+"_training.txt", 'w', newline='', encoding = "utf-8") as csvfile:
    #    writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    #    writer.writerow(headers)
    #    writer.writerows(training_data)

    #print("final test size: ", len(test_data))
    #headers = ['Origin Page','Destination Page','Link Text','Sentence Text']
    #with open("data/output/test/"+output_filename+"_test.txt", 'w', newline='', encoding = "utf-8") as csvfile:
    #    writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    #    writer.writerow(headers)
    #    writer.writerows(test_data)

def _escapeCypherString(value):
    """Escape *value* for use inside a double-quoted Cypher string literal."""
    # Backslashes first, otherwise the backslashes added for quotes would be
    # doubled again.
    return value.replace("\\", "\\\\").replace("\"", "\\\"")


# Neo4j Cypher lookup
# Handles page redirects cleanly
def getWikipediaNetworkList(domains, networkHops):
    """Collect the titles reachable from each seed page within ``networkHops``.

    Performs a breadth-first walk over ``Link`` relationships (in either
    direction) starting from every title in ``domains``. A neighbouring page
    that has exactly one outgoing ``Link`` is treated as a redirect: the
    mapping is recorded and the redirect target is used in its place.

    Args:
        domains: Iterable of seed page titles.
        networkHops: Number of hops to expand from each seed. With ``0`` only
            the seeds themselves are returned.

    Returns:
        A tuple ``(networkList, redirectMap)`` where ``networkList`` is the set
        of collected page titles and ``redirectMap`` maps each redirect title
        to the title it was resolved to.
    """
    # The direction pattern and the title are filled in together, so text
    # inside a title can never be mistaken for the direction placeholder.
    queryTemplate = "MATCH (source:Page {{title: \"{title}\"}}) {direction} (target:Page) RETURN target.title"
    networkList = set()
    redirectMap = {}
    # Raw neighbour title -> resolved title, so the redirect check runs at
    # most once per title across all seeds.
    resolvedTitles = {}
    for domain in domains:
        queue = Queue()
        queue.put((domain, 0))
        networkList.add(domain)
        # Pages already expanded for this seed. The first expansion happens at
        # the page's smallest layer, so later ones cannot add anything new.
        expanded = set()
        while not queue.empty():
            element, layer = queue.get()
            if layer >= networkHops:
                # FIFO order means every remaining entry is at least this deep.
                break
            if element in expanded:
                continue
            expanded.add(element)
            neighbourQuery = queryTemplate.format(
                title=_escapeCypherString(element), direction="- [:Link] -")
            for res in runWithRetry(neighbourQuery).data():
                rawTitle = res["target.title"]
                page = resolvedTitles.get(rawTitle)
                if page is None:
                    # Get only forward links to see if page is a redirect
                    forwardQuery = queryTemplate.format(
                        title=_escapeCypherString(rawTitle), direction="- [:Link] -> ")
                    forwardLinks = runWithRetry(forwardQuery).data()
                    page = rawTitle
                    if len(forwardLinks) == 1:
                        page = forwardLinks[0]["target.title"]
                        #print("Found redirect: ", rawTitle, " to ", page)
                        redirectMap[rawTitle] = page
                    resolvedTitles[rawTitle] = page
                networkList.add(page)
                queue.put((page, layer + 1))
    print("Total network size: ", str(len(networkList)))
    return networkList, redirectMap

# Deletes every ASCII punctuation character; built once instead of per link.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _extractLinkRows(pageHtml, originTitle, networkList):
    """Return the link rows found in the introduction of one rendered page.

    Args:
        pageHtml: Rendered HTML of the page.
        originTitle: Title of the page the HTML belongs to.
        networkList: Collection of page titles; only links whose destination
            is in it are kept.

    Returns:
        A list of ``[origin, destination, link text, sentence text]`` rows, or
        ``None`` when the HTML contains no ``<p>`` to start from.
    """
    introSplit = pageHtml.split("<p>", 1)
    if len(introSplit) == 1:
        return None
    # The introduction is taken as everything up to the first section heading.
    introHtml = introSplit[1].split("<h2")[0]
    originLabel = originTitle.replace("_", " ")

    rows = []
    for sentence in re.split(r'\n|\. |\.<sup id=', introHtml):
        fragments = sentence.split("<a href=")
        if len(fragments) < 2:
            continue
        for link in fragments:
            if not link.startswith("\"/wiki/"):
                continue
            href = link.split("\"/wiki/")[1].split("\"")[0]
            # hrefs are percent-encoded and may carry a "#section" fragment,
            # while networkList holds decoded titles: drop the fragment and
            # decode before comparing, otherwise titles with non-ASCII or
            # reserved characters never match.
            destinationPage = urllib.parse.unquote(href.split("#", 1)[0]).replace("_", " ")
            if destinationPage not in networkList:
                continue
            if ">" not in link or "<" not in link:
                # Usually the sentence split cut the anchor in half.
                print("failed to parse link text: ", link)
                continue
            linkTitle = link.split(">")[1].split("<")[0].replace("|", "")
            sentString = re.sub('<[^>]+>', '', sentence)
            sentString = sentString.translate(_PUNCTUATION_TABLE)
            # Remove what is left of "&#91;N&#93;" citation markers once the
            # punctuation is gone.
            sentString = re.sub('91[0-9]+93', '', sentString)
            if sentString.startswith("citeref"):
                # Drop the two leading tokens left over from a <sup> citation
                # tag; if nothing follows them there is no sentence text.
                tokens = sentString.split(" ", 2)
                sentString = tokens[2] if len(tokens) > 2 else ""
            if "redirect" not in sentString:
                # "|" is the CSV quote character used for the output file.
                rows.append([originLabel, destinationPage.replace("|", ""), linkTitle, sentString])
    return rows


# Request and parse Wikipedia HTML
# Only parses introduction/abstract section of pages
# A shortcoming is that this code will fail to parse link text with a . in it
def parseWikipediaPagesInNetwork(networkList, redirectMap):
    """Fetch every page in the network and extract its in-network links.

    Each page is requested through the Wikipedia ``action=parse`` API. Only
    the introduction is examined, and only links whose destination title is in
    ``networkList`` produce a row.

    Args:
        networkList: Collection of page titles to fetch; also used to filter
            link destinations.
        redirectMap: Mapping from redirect title to resolved title, applied to
            each page title before it is fetched.

    Returns:
        A list of ``[origin page, destination page, link text, sentence text]``
        rows. Pages that cannot be retrieved or parsed are reported on stdout
        and skipped.
    """
    linkData = []
    for line in progressbar.progressbar(networkList, redirect_stdout=True):
        if line in redirectMap:
            line = redirectMap[line]
            print("Redirected to: ", line)
        pageTitle = urllib.parse.quote(line)
        getUrl = f"https://en.wikipedia.org/w/api.php?action=parse&page={pageTitle}&format=json"
        response = getWithRetry(getUrl).json()
        if 'parse' not in response:
            print("Could not retrieve data for page: ", line)
            continue
        rows = _extractLinkRows(response['parse']['text']['*'], line, networkList)
        if rows is None:
            print("Unable to parse page: ", line)
        else:
            linkData.extend(rows)
    return linkData

# Query Wikidata SPARQL Endpoint
def checkWikidataForConnections(linkData, repo_name):
    """Split link rows into training and test data by querying a SPARQL repository.

    For every row a pair of ``VALUES`` patterns (origin -> destination and
    destination -> origin) is added to a batched ``select ?s ?p ?o`` query
    sent to ``http://localhost:7200/repositories/<repo_name>``. Rows whose
    page pair comes back with a ``http://www.wikidata.org/prop/direct/``
    predicate become training rows; the remaining rows of the batch become
    test rows.

    Args:
        linkData: List of rows; columns 0 and 1 are used as page titles and
            column 2 as link text, and column 3 is copied into training rows.
            The rows are MODIFIED IN PLACE: punctuation is stripped from
            columns 0-2.
        repo_name: Repository name inserted into the endpoint URL.

    Returns:
        ``(training_data, test_data)``. Training rows are
        ``[col0, col1, col2, col3, property, "Forwards" | "Backwards"]``;
        test rows are the (modified) input rows themselves.

    Raises:
        Exception: If the response text contains 'Bad Request', or if a
            result row does not correspond to any pair in the current batch.
    """
    filedir = "data/output/"
    queryHead = f"http://localhost:7200/repositories/{repo_name}?name=&infer=false&sameAs=false&query="
    training_data = []
    test_data = []
    
    queryTail = "select ?s ?p ?o where {"
    # Maps "<subj><obj>" to the row that produced it, for the current batch.
    itemsInQuery = {}
    # "header too big" error when set to 50
    batchSize = 10
    batchCount = 0
    
    #for i in progressbar.progressbar(range(len(linkData)), redirect_stdout=True):
    for i in range(len(linkData)):
        #print(linkData[i])
        # The first group of a batch opens with "{", every later one is
        # chained with "UNION {".
        if queryTail == "select ?s ?p ?o where {":
            queryTail += "{"
        else:
            queryTail += "UNION {"
        subj = "<https://en.wikipedia.org/wiki/" + linkData[i][0].replace(" ","_") + ">"
        obj = "<https://en.wikipedia.org/wiki/" + linkData[i][1].replace(" ","_") + ">"
        # Ask for a triple in either direction between the two pages.
        queryTail += "VALUES ?s {"+subj+"} VALUES ?o {"+obj+"} ?s ?p ?o } UNION { VALUES ?s {"+obj+"} VALUES ?o {"+subj+"} ?s ?p ?o }"
        # In-place clean-up of the caller's row (punctuation removed).
        linkData[i][0] = linkData[i][0].translate(str.maketrans('', '', string.punctuation)).replace("httpsenwikipediaorgwiki","")
        linkData[i][1] = linkData[i][1].translate(str.maketrans('', '', string.punctuation)).replace("httpsenwikipediaorgwiki","")
        linkData[i][2] = linkData[i][2].translate(str.maketrans('', '', string.punctuation))
        
        # NOTE(review): a later row with the same page pair in this batch
        # overwrites the earlier one here, so only the last such row survives
        # -- confirm this de-duplication is intended.
        itemsInQuery[subj+obj] = linkData[i]
        
        # Flush on every non-zero multiple of batchSize and on the last row.
        # Because index 0 never flushes, the first batch holds batchSize + 1 rows.
        if (i % batchSize == 0 and i != 0) or i == len(linkData) - 1:
            keysToDelete = set()
            queryTail += "}"
            #print(queryTail)
            queryTail = urllib.parse.quote(queryTail)
            #print(queryHead + queryTail)
            # str() of the bytes payload yields its repr ("b'...'"), which is
            # why the parsing below looks for literal backslash sequences.
            res = str(getWithRetry(queryHead + queryTail).content)
            if 'Bad Request' in res:
                raise Exception("Error in query")
            if "http://www.wikidata.org/prop/direct/" in res:
                # NOTE(review): the [11:-1] slice presumably drops the "b'"
                # prefix, a CSV header line and the closing quote -- confirm
                # against an actual endpoint response.
                split_res = str(res[11:-1]).split("\\r\\n")[:-1]
                for row in split_res:
                    csv_line = pp.commaSeparatedList.copy().addParseAction(pp.tokenMap(lambda s: s.strip('"')))
                    row = csv_line.parseString(row).asList()
                    print(row)
                    x = row[1].split("http://www.wikidata.org/prop/direct/")
                    # Property identifier: the text after the prefix, up to
                    # the first backslash.
                    wdp = x[1].split("\\")[0]
                    resSubj = "<" + row[0] + ">"
                    resObj = "<" + row[2] + ">"
                    # The triple may run in the same direction as the link
                    # ("Forwards") or in the opposite one ("Backwards").
                    if (resSubj+resObj in itemsInQuery):
                        thisRow = itemsInQuery[resSubj+resObj]
                        training_row = [thisRow[0], thisRow[1], thisRow[2], thisRow[3], wdp, "Forwards"]
                        training_data.append(training_row)
                        keysToDelete.add(resSubj+resObj)
                    elif (resObj+resSubj in itemsInQuery):
                        thisRow = itemsInQuery[resObj+resSubj]
                        training_row = [thisRow[0], thisRow[1], thisRow[2], thisRow[3], wdp, "Backwards"]
                        training_data.append(training_row)
                        keysToDelete.add(resObj+resSubj)
                    else:
                        raise Exception("Did not find row from query result in map: ", resSubj, " ", resObj)
            
            # Whatever was not matched in this batch becomes test data.
            for key in keysToDelete:
                del itemsInQuery[key]
            for key in itemsInQuery:
                test_data.append(itemsInQuery[key])
                
            # Reset the per-batch state for the next batch.
            itemsInQuery = {}
            queryTail = "select ?s ?p ?o where {"
            batchCount = batchCount + 1
            print("batch ", batchCount, " complete")

    return training_data, test_data

# Neo4j Cypher lookup
# DOES NOT Handle page redirects cleanly (as of now)
def getStrongLinks(domain):
    """Return ``domain`` plus every page connected to it by two distinct links.

    A page counts as a strong link when two different ``Link`` relationships
    connect it to the ``domain`` page (the query matches ``link1`` and
    ``link2`` between the same two pages and requires ``link1 <> link2``).

    Args:
        domain: Title of the page to start from.

    Returns:
        A list whose first element is ``domain`` itself, followed by the
        titles returned by the query.
    """
    print("Finding strong links for: ", domain)
    # Escape backslashes and double quotes so a title containing them cannot
    # break out of the Cypher string literal (backslashes first, so the ones
    # added for quotes are not doubled).
    escaped_domain = domain.replace("\\", "\\\\").replace("\"", "\\\"")
    query = "MATCH (source:Page {title: \""+escaped_domain+"\"}) - [link1:Link] - (target:Page) - [link2:Link] - (source) WHERE link1 <> link2 RETURN distinct target.title"
    strong_links = [domain]
    strong_links.extend(res["target.title"] for res in runWithRetry(query).data())
    print("Found ", str(len(strong_links)), " strong links")
    return strong_links

# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()

Finding strong links for:  History of Rome
Found  8  strong links
Total network size:  12740


  3% (427 of 12740) |                    | Elapsed Time: 0:03:18 ETA:   3:07:21

failed to parse link text:  "/wiki/Lucius_Marcius_Philippus_(consul_91_BC)" title="Lucius Marcius Philippus (consul 91 BC)">L


                                                                                 3% (427 of 12740) |                    | Elapsed Time: 0:03:18 ETA:   3:32:29

failed to parse link text:  "/wiki/Marcus_Aemilius_Lepidus_(triumvir)" title="Marcus Aemilius Lepidus (triumvir)">M


  6% (884 of 12740) |#                   | Elapsed Time: 0:06:49 ETA:   1:18:52

failed to parse link text:  "/wiki/J._R._R._Tolkien" title="J


  9% (1151 of 12740) |#                  | Elapsed Time: 0:08:50 ETA:   1:51:05

failed to parse link text:  "/wiki/Robert_E._Howard" title="Robert E


  9% (1211 of 12740) |#                  | Elapsed Time: 0:09:15 ETA:   1:18:54

failed to parse link text:  "/wiki/National_Fascist_Party" title="National Fascist Party">National Fascist Party


                                                                                 9% (1211 of 12740) |#                  | Elapsed Time: 0:09:15 ETA:   1:36:26

failed to parse link text:  "/wiki/National_Fascist_Party" title="National Fascist Party">National Fascist Party


 12% (1624 of 12740) |##                 | Elapsed Time: 0:12:36 ETA:   1:19:59

Unable to parse page:  List of sovereign states in 1528


 14% (1857 of 12740) |##                 | Elapsed Time: 0:14:25 ETA:   1:11:37

failed to parse link text:  "/wiki/J._R._R._Tolkien" title="J


 16% (2114 of 12740) |###                | Elapsed Time: 0:16:22 ETA:   1:06:36

failed to parse link text:  "/wiki/J._R._R._Tolkien" title="J


 17% (2257 of 12740) |###                | Elapsed Time: 0:17:22 ETA:   1:06:51

failed to parse link text:  "/wiki/Publius_Canidius_Crassus" title="Publius Canidius Crassus">P


 25% (3274 of 12740) |####               | Elapsed Time: 0:24:40 ETA:   0:59:36

failed to parse link text:  "/wiki/Bernard_of_Clairvaux" title="Bernard of Clairvaux">St


 26% (3345 of 12740) |####               | Elapsed Time: 0:25:09 ETA:   0:50:27

failed to parse link text:  "/wiki/M._C._Escher" title="M


 40% (5207 of 12740) |#######            | Elapsed Time: 0:38:04 ETA:   0:43:39

failed to parse link text:  "/wiki/J._R._R._Tolkien" title="J


 48% (6178 of 12740) |#########          | Elapsed Time: 0:44:42 ETA:   0:46:37

failed to parse link text:  "/wiki/J._R._R._Tolkien" title="J


 53% (6841 of 12740) |##########         | Elapsed Time: 0:49:20 ETA:   0:36:35

Unable to parse page:  List of sovereign states in 1662


 54% (6890 of 12740) |##########         | Elapsed Time: 0:49:41 ETA:   0:34:40

Unable to parse page:  List of sovereign states in 1800


 54% (6963 of 12740) |##########         | Elapsed Time: 0:50:13 ETA:   0:41:56

failed to parse link text:  "/wiki/J._R._R._Tolkien" title="J


 56% (7150 of 12740) |##########         | Elapsed Time: 0:51:31 ETA:   0:37:26

failed to parse link text:  "/wiki/Lucius_Tarquinius_Superbus" title="Lucius Tarquinius Superbus">L


                                                                                56% (7150 of 12740) |##########         | Elapsed Time: 0:51:32 ETA:   0:42:20

failed to parse link text:  "/wiki/Lucius_Junius_Brutus" title="Lucius Junius Brutus">L
failed to parse link text:  "/wiki/Lucius_Tarquinius_Superbus" title="Lucius Tarquinius Superbus">L


 57% (7268 of 12740) |##########         | Elapsed Time: 0:52:19 ETA:   0:34:10

failed to parse link text:  "/wiki/Catholic_Church" title="Catholic Church">Roman 


 60% (7710 of 12740) |###########        | Elapsed Time: 0:55:22 ETA:   0:34:43

Unable to parse page:  List of sovereign states in 1660


 62% (7928 of 12740) |###########        | Elapsed Time: 0:56:51 ETA:   0:37:05

failed to parse link text:  "/wiki/Cape_St._Vincent" title="Cape St


 65% (8321 of 12740) |############       | Elapsed Time: 0:59:36 ETA:   0:32:52

failed to parse link text:  "/wiki/Augustine_of_Hippo" title="Augustine of Hippo">St


 66% (8481 of 12740) |############       | Elapsed Time: 1:00:47 ETA:   0:50:10

Unable to parse page:  List of sovereign states in 1400


                                                                                66% (8481 of 12740) |############       | Elapsed Time: 1:00:47 ETA:   0:57:49

failed to parse link text:  "/wiki/M._C._Escher" title="M


 67% (8556 of 12740) |############       | Elapsed Time: 1:01:15 ETA:   0:29:06

failed to parse link text:  "/wiki/J._R._R._Tolkien" title="J


 74% (9446 of 12740) |##############     | Elapsed Time: 1:07:33 ETA:   0:20:21

Could not retrieve data for page:  Cushitic peoples


 79% (10137 of 12740) |##############    | Elapsed Time: 1:12:20 ETA:   0:17:14

Unable to parse page:  List of battles 1301–1600


 83% (10588 of 12740) |##############    | Elapsed Time: 1:15:23 ETA:   0:12:59

failed to parse link text:  "/wiki/Pope_Pius_IX" title="Pope Pius IX">Bl


 83% (10670 of 12740) |###############   | Elapsed Time: 1:15:57 ETA:   0:12:22

failed to parse link text:  "/wiki/J._R._R._Tolkien" title="J


 85% (10883 of 12740) |###############   | Elapsed Time: 1:17:22 ETA:   0:10:23

failed to parse link text:  "/wiki/Francis_of_Assisi" title="Francis of Assisi">St


 86% (11078 of 12740) |###############   | Elapsed Time: 1:18:39 ETA:   0:10:13

failed to parse link text:  "/wiki/M._C._Escher" title="M


 89% (11343 of 12740) |################  | Elapsed Time: 1:20:24 ETA:   0:08:08

failed to parse link text:  "/wiki/Jerome" title="Jerome">St


 90% (11511 of 12740) |################  | Elapsed Time: 1:21:34 ETA:   0:06:54

failed to parse link text:  "/wiki/List_of_obelisks_in_Rome" title="List of obelisks in Rome">St


 91% (11649 of 12740) |################  | Elapsed Time: 1:22:31 ETA:   0:07:06

failed to parse link text:  "/wiki/Gnaeus_Octavius_(consul_87_BC)" title="Gnaeus Octavius (consul 87 BC)">Cn


                                                                                91% (11649 of 12740) |################  | Elapsed Time: 1:22:31 ETA:   0:07:44

failed to parse link text:  "/wiki/Augustus" title="Augustus">C
failed to parse link text:  "/wiki/Roman_emperor" title="Roman emperor">imp


 96% (12339 of 12740) |################# | Elapsed Time: 1:27:25 ETA:   0:02:26

failed to parse link text:  "/wiki/J._R._R._Tolkien" title="J


 96% (12342 of 12740) |################# | Elapsed Time: 1:27:26 ETA:   0:02:40

failed to parse link text:  "/wiki/Carlo_Emanuele_Muzzarelli" title="Carlo Emanuele Muzzarelli">C


100% (12740 of 12740) |##################| Elapsed Time: 1:30:23 Time:  1:30:23
