# Wikipedia Speedrun Project: Web Scraping!

#### Import relevant libraries

In [110]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import random

#### Initialize important functions


In [3]:
def getLinkData(link, timeout=30):
    """Download a web page and return its HTML as a prettified string.

    Parameters:
        link: URL of the page to fetch.
        timeout: seconds to wait for the server before giving up. Without a
            timeout, requests waits forever on a stalled connection, which
            would freeze a long crawl; pass None to restore that behaviour.

    Returns:
        The page's HTML re-indented by BeautifulSoup.prettify(), so every tag
        starts on its own line.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    Wikidata = requests.get(link, timeout=timeout)
    htmldata = BeautifulSoup(Wikidata.content, 'html.parser')
    result = htmldata.prettify()
    return result

def checkIfVisited(curlink, storelinks, startIdx, endIdx):
    """Report whether a relative link is already stored in part of storelinks.

    Parameters:
        curlink: link path relative to the site root (e.g. "/wiki/Foo").
        storelinks: list of absolute "https://en.wikipedia.org..." links.
        startIdx: first position of storelinks to inspect (inclusive).
        endIdx: position at which to stop inspecting (exclusive).

    Returns:
        True if the absolute form of curlink equals any inspected entry,
        otherwise False.
    """
    return any(
        "https://en.wikipedia.org" + curlink == storelinks[pos]
        for pos in range(startIdx, endIdx)
    )



#### Current records (step 1)

10000 steps, 478013 unique links, 1 hr 15 min 3.4s runtime


1000 steps, 152151 unique links, 5m 37.2s runtime


100 steps, 29460 unique links, 40.7s runtime


10 steps, 3757 unique links, 4.6s runtime


1 step, 67 unique links, 0.7s runtime

### Step 1: follow hyperlinks from page to page (a random walk) until nearly all links are found

In [113]:

totNumSteps = 10000 # number of iterations... control this for testing!
wikiPrefix = "https://en.wikipedia.org" # prepended to the relative "/wiki/..." hrefs found in each page

# storelinks:       every link scraped, in discovery order (used for choosing the next page)
# storelinks2:      the same links as "<step> <link>\n" lines, written to links.txt
# storeStepIndices: value of numLinks after each step, i.e. the end index in storelinks of that step's links
# storeStepNames:   "<step> <article title>\n" lines, written to StepNames.txt
# storeStepLinks:   "<step> <visited link>\n" lines, written to StepLinks.txt
storelinks, storelinks2, storeStepIndices, storeStepNames, storeStepLinks = ([] for i in range(5))
startPage = "https://en.wikipedia.org/wiki/Casiopea"
curPageData = getLinkData(startPage) # call function that returns web page data
numLinks = 0

storeStepLinks.append("0" + " " + startPage + '\n')
for numSteps in range(totNumSteps):
    # Positions of every anchor whose href starts with "/wiki" in the html text.
    links = [i for i in range(len(curPageData)) if curPageData.startswith("<a href=\"/wiki", i)]

    # Article title: text after "<title" up to the first " -" within the next 200 characters.
    # The +11 offset presumably skips "<title>" plus the newline/indent emitted by prettify().
    titleIdx = curPageData.find("<title")
    storeStepNames.append(str(numSteps) + " " + (curPageData[titleIdx + 11 : titleIdx + (curPageData[titleIdx : titleIdx+200]).find(' -')]) + '\n')

    stepStart = numLinks  # index in storelinks of the first link stored during this step
    seenThisStep = set()  # links already stored during this step (O(1) duplicate check)
    for linkIdx in links:
        # Skip the 9 characters of '<a href="' and stop at '" title'. If '" title' is not
        # found within 200 characters the slice is empty and the link is filtered out below.
        curlink = curPageData[linkIdx + 9 : linkIdx + (curPageData[linkIdx : linkIdx+200]).find('" title')]
        fullLink = wikiPrefix + curlink
        # Filter out: empty links, links already stored in the same iteration, and
        # internal wikipedia links containing ':' such as "Directory:" and "Help:".
        if curlink and ':' not in curlink and fullLink not in seenThisStep:
            seenThisStep.add(fullLink)
            storelinks.append(fullLink) # store all links in a list, used for choosing next link
            storelinks2.append(str(numSteps) + " " + fullLink + '\n') # "text-file-friendly" format
            numLinks += 1 # keep track of number of links scraped

    # After each step, record how many links have been scraped so far. This tells us
    # which range of indices of storelinks holds the links from a given iteration.
    storeStepIndices.append(numLinks)

    randIdx = numLinks - stepStart # number of links scraped in this iteration
    if randIdx > 0:
        # Choose uniformly among the links scraped in this iteration: [stepStart, numLinks).
        curIdx = random.randrange(stepStart, numLinks)
    elif storelinks:
        # Dead end (no usable links on this page): back off to the 50th-most-recent
        # link, or to the oldest link when fewer than 50 have been stored.
        curIdx = max(len(storelinks) - 50, 0)
    else:
        # Nothing has been scraped at all, so there is no page to move to.
        break
    nextlink = storelinks[curIdx]

    storeStepLinks.append(str(numSteps + 1) + " " + nextlink + '\n') # add next step link to step links array
    curPageData = getLinkData(nextlink)
    print(str(numSteps) + ", " + str(numLinks))

# Write the results; the context managers close each file even if a write fails.
with open('links.txt', 'w', encoding = 'utf-8') as file1:
    file1.writelines(storelinks2)

with open('StepNames.txt', 'w', encoding = 'utf-8') as file2:
    file2.writelines(storeStepNames)

with open('StepLinks.txt', 'w', encoding = 'utf-8') as file3:
    file3.writelines(storeStepLinks)

# Links are de-duplicated only within a step, so count the distinct ones overall.
numUnique = len(set(storelinks))
print(numUnique)
print(str((float(numUnique) / float(6600000)) * 100) + "% of English Wikipedia scraped")

0, 67
1, 281
2, 427
3, 559
4, 591
5, 780
6, 1203
7, 1437
8, 1451
9, 1493
10, 1592
11, 2024
12, 2287
13, 2553
14, 2823
15, 3087
16, 3361
17, 3634
18, 3906
19, 4558
20, 4922
21, 5255
22, 5579
23, 5763
24, 6619
25, 6692
26, 7128
27, 7747
28, 8595
29, 8809
30, 9409
31, 9948
32, 9991
33, 10097
34, 10214
35, 10395
36, 11051
37, 12143
38, 12489
39, 13151
40, 13582
41, 14157
42, 14368
43, 14555
44, 14856
45, 15000
46, 15624
47, 15736
48, 15770
49, 15928
50, 16408
51, 16721
52, 17420
53, 18060
54, 19002
55, 20237
56, 20740
57, 20791
58, 20800
59, 20827
60, 21104
61, 21241
62, 21355
63, 21387
64, 22240
65, 22453
66, 22504
67, 23357
68, 23779
69, 24465
70, 24495
71, 24565
72, 24742
73, 24890
74, 25021
75, 25168
76, 25301
77, 25431
78, 25576
79, 25846
80, 26257
81, 26303
82, 26316
83, 26650
84, 28774
85, 32076
86, 32338
87, 32458
88, 32839
89, 33044
90, 33314
91, 33477
92, 33796
93, 34075
94, 34302
95, 34537
96, 34735
97, 34962
98, 35257
99, 35475
100, 35707
101, 35751
102, 35761
103, 35768
104, 3

### Step 2: iterate through found (but unvisited) links in chunks, thus slowly increasing the number of connections discovered between links.