# Wikipedia Speedrun Project: Web Scraping!

#### Import relevant libraries

In [110]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import random

#### Initialize important functions


In [3]:
def getLinkData(link, timeout=30):
    """Download a web page and return its HTML as pretty-printed text.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection,
            which would freeze a long crawl.

    Returns:
        The page's HTML re-serialized by BeautifulSoup's ``prettify()``.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    Wikidata = requests.get(link, timeout=timeout)
    htmldata = BeautifulSoup(Wikidata.content, 'html.parser')
    return htmldata.prettify()

def checkIfVisited(curlink, storelinks, startIdx, endIdx):
    """Report whether a relative link is already stored in a slice of links.

    The relative path ``curlink`` is prefixed with the English Wikipedia
    host and compared against ``storelinks[startIdx]`` up to, but not
    including, ``storelinks[endIdx]``.

    Returns:
        True if any entry in that index range equals the full URL,
        otherwise False (including when the range is empty).
    """
    return any(
        "https://en.wikipedia.org" + curlink == storelinks[pos]
        for pos in range(startIdx, endIdx)
    )



#### Current records (step 1)


10000 steps ????

1000 steps, 152151 unique links, 5m 37.2s runtime


100 steps, 29460 unique links, 40.7s runtime


10 steps, 3757 unique links, 4.6s runtime


1 step, 67 unique links, 0.7s runtime


### step 1: follow hyperlinks from page to page until nearly all links are found

In [112]:

totNumSteps = 10000 # number of iterations... control this for testing!

# Bookkeeping lists filled in by the crawl below:
#   storelinks       - every accepted link as a full URL, in discovery order (the next page is chosen from here)
#   storelinks2      - the same links formatted as "<step> <url>\n" lines, written to links.txt
#   storeStepIndices - storeStepIndices[k] is the total number of stored links once step k has finished,
#                      so step k owns storelinks[storeStepIndices[k - 1] : storeStepIndices[k]]
#   storeStepNames   - "<step> <page title>\n" for each visited page, written to StepNames.txt
#   storeStepLinks   - "<step> <url>\n" for each page chosen to visit, written to StepLinks.txt
storelinks, storelinks2, storeStepIndices, storeStepNames, storeStepLinks = ([] for i in range(5))
startPage = "https://en.wikipedia.org/wiki/Casiopea"
curPageData = getLinkData(startPage) # prettified HTML text of the page currently being scraped
numLinks = 0

storeStepLinks.append("0" + " " + startPage + '\n')
for numSteps in range(totNumSteps):
    # Index in storelinks where this step's links begin (the running total after the previous step).
    stepStart = 0 if numSteps == 0 else storeStepIndices[numSteps - 1]

    # Collect the position of every anchor whose href starts with /wiki.
    # str.find jumps from match to match instead of testing every character offset.
    anchorTag = "<a href=\"/wiki"
    links = []
    linkPos = curPageData.find(anchorTag)
    while linkPos != -1:
        links.append(linkPos)
        linkPos = curPageData.find(anchorTag, linkPos + 1)

    # Record the article title: the text after "<title" up to the first " -" within the next 200 characters.
    # The fixed offset of 11 presumably skips the tag plus the newline/indent added by prettify() - verify if titles look clipped.
    titleIdx = curPageData.find("<title")
    storeStepNames.append(str(numSteps) + " " + (curPageData[titleIdx + 11 : titleIdx + (curPageData[titleIdx : titleIdx+200]).find(' -')]) + '\n')

    for linkStart in links:
        # Slice out the href value: skip the 9 characters of '<a href="' and stop at the first '" title'
        # found within 200 characters. When no '" title' is found the slice comes out empty and is filtered below.
        curlink = curPageData[linkStart + 9 : linkStart + (curPageData[linkStart : linkStart+200]).find('" title')]
        # Filter out: empty links, namespaced links containing ':' (such as "Help:"), and links already stored during this step.
        if(curlink and curlink.find(':') < 0 and not checkIfVisited(curlink, storelinks, stepStart, numLinks)):
            storelinks.append("https://en.wikipedia.org" + curlink) # full URL, used for choosing the next page
            storelinks2.append(str(numSteps) + " " + storelinks[-1] + '\n') # "text-file-friendly" copy
            numLinks += 1 # keep track of number of links scraped

    storeStepIndices.append(numLinks) # running total after this step; marks where the next step's links start
    randIdx = numLinks - stepStart # number of links scraped in this step

    if randIdx > 0:
        # Pick uniformly among this step's links, storelinks[stepStart : numLinks].
        # randrange(randIdx) also covers the single-link case and keeps the index in bounds.
        curIdx = stepStart + random.randrange(randIdx)
    elif numLinks > 0:
        # Dead end (no new links on this page): fall back to a link stored 50 entries earlier,
        # clamped so the index never goes negative.
        curIdx = max(stepStart - 50, 0)
    else:
        # Nothing has been scraped at all, so there is no page to move to.
        print("No links found on " + startPage + "; stopping.")
        break

    nextlink = storelinks[curIdx] # the page to visit on the next step
    storeStepLinks.append(str(numSteps + 1) + " " + nextlink + '\n')
    curPageData = getLinkData(nextlink)
    print(str(numSteps) + ", " + str(numLinks))

# Write the results; the with-blocks close each file even if a write fails.
with open('links.txt', 'w', encoding = 'utf-8') as file1:
    file1.writelines(storelinks2)

with open('StepNames.txt', 'w', encoding = 'utf-8') as file2:
    file2.writelines(storeStepNames)

with open('StepLinks.txt', 'w', encoding = 'utf-8') as file3:
    file3.writelines(storeStepLinks)

# Report how many distinct links were found, as a share of the 6,600,000 figure used for English Wikipedia.
numUnique = len(set(storelinks))
print(numUnique)
print(str((float(numUnique) / float(6600000)) * 100) + "% of English Wikipedia scraped")

0, 67
1, 349
2, 707
3, 1351
4, 1446
5, 1492
6, 1824
7, 2010
8, 2035
9, 3034
10, 3918
11, 4310
12, 4402
13, 5230
14, 5653
15, 6078
16, 6370
17, 6389
18, 6565
19, 6591
20, 7263
21, 7442
22, 7636
23, 7795
24, 8124
25, 8290
26, 8566
27, 8705
28, 9074
29, 9283
30, 9304
31, 9571
32, 9744
33, 10140
34, 10205
35, 10257
36, 10486
37, 10586
38, 10680
39, 10770
40, 10862
41, 10953
42, 10986
43, 11205
44, 11245
45, 11658
46, 11664
47, 11687
48, 11938
49, 12034
50, 12048
51, 12465
52, 12626
53, 13460
54, 13516
55, 13627
56, 13654
57, 14104
58, 14302
59, 15314
60, 15345
61, 15392
62, 15516
63, 15648
64, 15771
65, 15914
66, 15919
67, 16052
68, 16525
69, 16656
70, 17001
71, 17331
72, 17657
73, 17803
74, 18002
75, 18814
76, 19273
77, 19377
78, 19483
79, 19595
80, 19880
81, 19945
82, 20495
83, 20507
84, 20532
85, 20576
86, 20604
87, 21082
88, 21121
89, 21250
90, 21405
91, 22359
92, 22720
93, 23191
94, 23557
95, 23989
96, 24394
97, 24744
98, 25138
99, 25243
100, 25640
101, 26053
102, 26554
103, 26572
104

### step 2: iterate through found (but unvisited) links in chunks, thus slowly increasing the number of connections between discovered links.