In [None]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import random
import spacy 

In [None]:
def getLinkData(link, timeout=30):
    """Download a web page and return its HTML as pretty-printed text.

    Parameters
    ----------
    link : str
        URL of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up; forwarded to
        ``requests.get``. Without a timeout a stalled connection would block
        the caller indefinitely. Defaults to 30.

    Returns
    -------
    str
        The page markup parsed with ``html.parser`` and re-serialised by
        ``BeautifulSoup.prettify()``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or the timeout expires.
    """
    Wikidata = requests.get(link, timeout=timeout)
    htmldata = BeautifulSoup(Wikidata.content, 'html.parser')
    return htmldata.prettify()

def checkIfVisited(curlink, storelinks, startIdx, endIdx):
    """Report whether a relative link is already stored in a given index window.

    ``curlink`` is a site-relative path (e.g. ``/wiki/Foo``); it is prefixed
    with ``https://en.wikipedia.org`` and compared against the entries
    ``storelinks[startIdx] .. storelinks[endIdx - 1]``.

    Returns ``True`` on the first match, ``False`` if the window is empty or
    holds no match.
    """
    return any(
        "https://en.wikipedia.org" + curlink == storelinks[pos]
        for pos in range(startIdx, endIdx)
    )

In [None]:
# Random walk over English Wikipedia: starting at startPage, scrape every
# article link on the current page, pick one of them at random as the next
# page, and repeat totNumSteps times. The collected data is written to text
# files at the end of the cell.
totNumSteps = 1000 # number of pages visited by the walk

storelinks, storelinks2, storeStepIndices, storeStepNames, storeStepLinks, storeDistances = ([] for i in range(6))
startPage = "https://en.wikipedia.org/wiki/Casiopea"
curPageData = getLinkData(startPage) # call function that returns web page data
numLinks = 0

storeStepLinks.append("0" + " " + startPage + '\n')
for numSteps in range(totNumSteps):
    # Index in storelinks where this step's links begin (= number of links collected before this step).
    stepStart = 0 if numSteps == 0 else storeStepIndices[numSteps - 1]

    links = [i for i in range(len(curPageData)) if curPageData.startswith("<a href=\"/wiki", i)] # positions of all anchors whose href starts with /wiki

    # Title of the current article: text after "<title" up to the first " -".
    # The +11 offset presumably skips "<title>" plus the newline/indent that prettify() inserts -- TODO confirm.
    titleIdx = curPageData.find("<title")
    titleEnd = titleIdx + curPageData[titleIdx : titleIdx + 200].find(' -')
    pageTitle = curPageData[titleIdx + 11 : titleEnd]
    storeStepNames.append(str(numSteps) + " " + pageTitle + '\n')

    for linkPos in links: # iterate through list of link positions
        anchorText = curPageData[linkPos : linkPos + 200]
        if '" title' not in anchorText:
            continue # as before, anchors without a title attribute in this window are skipped
        # 9 == len('<a href="'). Cut at the href's own closing quote so that attributes
        # sitting between href and title (e.g. class="mw-redirect") are not glued onto the link.
        curlink = anchorText[9:].split('"', 1)[0]
        # filter out: empty links, links already collected in this same step, and
        # namespaced wikipedia links such as "Help:" (anything containing ':')
        if curlink and ':' not in curlink and not checkIfVisited(curlink, storelinks, stepStart, numLinks):
            storelinks.append("https://en.wikipedia.org" + curlink) # store all links in a list, used for choosing next link
            storelinks2.append(str(numSteps) + " " + storelinks[-1] + '\n') # store all links in a "text-file-friendly" format
            storeDistances.append(str(linkPos / len(curPageData)) + '\n') # relative position of the link within the page text (0 = top, 1 = bottom)
            numLinks += 1 # keep track of number of links scraped

    # After each step, record how many links have been scraped so far; consecutive
    # entries delimit the index range of storelinks belonging to each step.
    storeStepIndices.append(numLinks)

    if numLinks > stepStart:
        # Choose uniformly among the links found on this page: indices [stepStart, numLinks).
        nextlink = storelinks[random.randrange(stepStart, numLinks)]
    elif storelinks:
        # Dead end (no usable link on this page): jump back to the link 50 entries
        # from the end, or to the oldest link if fewer than 50 have been collected.
        nextlink = storelinks[max(len(storelinks) - 50, 0)]
    else:
        # Nothing has been collected at all, so there is nowhere to go.
        print("No links collected; stopping walk at step " + str(numSteps))
        break
    storeStepLinks.append(str(numSteps + 1) + " " + nextlink + '\n') # add next step link to step links array
    curPageData = getLinkData(nextlink)
    print(str(numSteps) + ", " + str(numLinks))

# Persist the results; "with" guarantees each file is closed even if a write fails.
with open('links.txt', 'w', encoding = 'utf-8') as file1:
    file1.writelines(storelinks2)

with open('StepNames.txt', 'w', encoding = 'utf-8') as file2:
    file2.writelines(storeStepNames)

with open('StepLinks.txt', 'w', encoding = 'utf-8') as file3:
    file3.writelines(storeStepLinks)

with open('Distances.txt', 'w', encoding = 'utf-8') as file5:
    file5.writelines(storeDistances)

numUnique = len(set(storelinks))
print(numUnique)
print(str((float(numUnique) / float(6600000)) * 100) + "% of English Wikipedia scraped")

In [None]:
# Build the de-duplicated link list, pick a random destination, and score every
# link against that destination with spaCy (intended as the heuristic for A*).

# dict.fromkeys keeps the first occurrence of each link in original order; same
# result as sorting the set by storelinks.index, but without a linear scan per link.
storelinksUnique = list(dict.fromkeys(storelinks)) # ordered set of all links for graph building
print(len(storelinksUnique))
print(storelinksUnique[:5]) # show a small sample rather than dumping every link
destinationIdx = random.randrange(0, len(storelinksUnique)) # for experimentation, the destination link is chosen at random rather than by the user
destination = storelinksUnique[destinationIdx]
print(destinationIdx)
print(destination)
nlp = spacy.load('en_core_web_sm') # load the spaCy English pipeline used to generate the NLP factors (heuristic for A*)
# NOTE(review): the full URL strings are passed to spaCy as-is, and the small
# pipeline may not ship real word vectors -- confirm these scores are meaningful.
destinationNLP = nlp(destination)
startNLP = nlp(startPage)
print(startNLP.similarity(destinationNLP))

NLPsimilarities = []
for i, link in enumerate(storelinksUnique):
    if i % 1000 == 0:
        print(i) # coarse progress indicator
    NLPsimilarities.append(str(destinationNLP.similarity(nlp(link))) + '\n')

with open('NLPsimilarities.txt', 'w', encoding = 'utf-8') as file4:
    file4.writelines(NLPsimilarities)

# Append a newline to every entry so the list can be written one link per line.
# From here on the entries of storelinksUnique carry that trailing '\n'.
for i in range(len(storelinksUnique)):
    storelinksUnique[i] += '\n'

with open('OrederedSet.txt', 'w', encoding = 'utf-8') as file6:
    file6.writelines(storelinksUnique)

## Important information about final data:

### Number of iterations: 100000
### Number of (non-unique) links: 318190 
### Number of unique links: 149585
### Start page: https://en.wikipedia.org/wiki/Casiopea
### Destination page: https://en.wikipedia.org/wiki/Elena_Aiello (index 113099 in storelinksUnique)