# Web Crawl 
Web crawlers are recursive in nature.  
They retrieve the page contents for a URL, scan that page for another URL to retrieve, and repeat ad infinitum.  
# Six Degrees of Wikipedia


In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup as bs
import re

#### Get the link of a Wikipedia page that interests you
e.g. https://en.wikipedia.org/wiki/Women_Who_Code  

In [None]:
# Path of the Wikipedia article of interest, relative to the site root.
link = '/wiki/Women_Who_Code'

In [None]:
# Print the href of every article link found in the body of the chosen page.
# The pattern keeps only hrefs that start with "/wiki/" and contain no colon.
article_href = re.compile("^(/wiki/)((?!:).)*$")
# The context manager closes the HTTP response once the page has been parsed.
with urlopen("http://en.wikipedia.org" + link) as html:
    bsObj = bs(html, "lxml")
# A distinct loop name keeps the module-level `link` string from being
# overwritten by a Tag object.
for anchor in bsObj.find("div", {"id": "bodyContent"}).findAll("a", href=article_href):
    if 'href' in anchor.attrs:
        print(anchor.attrs['href'])

## Recursively retrieve links

In [None]:
import datetime
import random

In [None]:
def getLinks(article_url):
    """Return the article anchors found in the body of a Wikipedia page.

    Args:
        article_url: Path relative to the site root,
            e.g. "/wiki/Women_Who_Code".

    Returns:
        The anchor tags inside the "bodyContent" div whose href starts with
        "/wiki/" and contains no colon. An empty list is returned when the
        page has no such div.
    """
    # The context manager closes the HTTP response once parsing is done.
    with urlopen("http://en.wikipedia.org" + article_url) as html:
        bsObj = bs(html, "lxml")
    body = bsObj.find("div", {"id": "bodyContent"})
    if body is None:
        # find() returns None when the div is missing; an empty result lets
        # callers stop cleanly instead of failing with AttributeError.
        return []
    return body.findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))

In [None]:
# Article path (relative to the Wikipedia host) that the crawl starts from.
link = '/wiki/Women_Who_Code'

In [None]:
# random.seed() only accepts None, int, float, str, bytes or bytearray; a
# datetime object raises TypeError on Python 3.11+, so seed with its timestamp.
random.seed(datetime.datetime.now().timestamp())
links = getLinks(link)
# The walk only ends when a page yields no article links, so expect it to run
# for a very, very long time; hit the stop button to step out.
while links:
    newArticle = random.choice(links).attrs["href"]
    print(newArticle)
    links = getLinks(newArticle)

## Traverse a site

In [None]:
# Stops descending at six degrees by default; per the 'Six Degrees' idea this
# may still end up visiting a very large share of the site. :p
def getLinks(pageUrl, loop_num, max_depth=6):
    """Recursively print and record article links reachable from a page.

    Every href not yet present in the module-level ``pages`` set is printed,
    added to ``pages`` and, while ``loop_num`` is below ``max_depth``, crawled
    in turn. ``pages`` must exist before the first call.

    Args:
        pageUrl: Path relative to the Wikipedia site root, e.g. "/wiki/Foo".
        loop_num: Depth counter of the current call.
        max_depth: Calls whose ``loop_num`` has reached this value still
            record links but do not descend further. Defaults to 6.
    """
    # Close the response before recursing so that a deep crawl does not keep
    # one open connection per level of the call stack.
    with urlopen("http://en.wikipedia.org" + pageUrl) as html:
        bsObj = bs(html, "lxml")
    for anchor in bsObj.findAll("a", href=re.compile("^(/wiki/)((?!:).)*$")):
        newPage = anchor.attrs.get('href')
        if newPage is None or newPage in pages:
            continue
        # we have encountered a new page
        print(newPage)
        pages.add(newPage)
        if loop_num < max_depth:
            getLinks(newPage, loop_num + 1, max_depth)

In [None]:
# Registry of hrefs already seen; getLinks consults it to skip duplicates.
pages = set()
getLinks(pageUrl=link, loop_num=5)

## Points to consider
 * What data are we gathering?
 * Drill down or follow the next outbound link?
 * Conditions where we do not want to scrape a site? e.g. non-English content?
 * How to protect against legal action?

## Retrieve external links example

In [None]:
import urllib.request
from urllib.parse import urlsplit

In [None]:
# Retrieves a list of all internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    """Return the unique internal hrefs on a parsed page, in document order.

    A link counts as internal when its href starts with "/" or contains
    ``includeUrl``.

    Args:
        bsObj: Parsed page exposing ``findAll`` (a BeautifulSoup object).
        includeUrl: URL text that marks a link as belonging to the site.

    Returns:
        A list of href strings without duplicates.
    """
    # re.escape keeps URL characters such as ".", "?" or "(" literal instead
    # of letting them act as (possibly invalid) regex syntax.
    pattern = re.compile("^(/|.*" + re.escape(includeUrl) + ")")
    internalLinks = []
    seen = set()  # O(1) duplicate check; the list preserves document order
    for link in bsObj.findAll("a", href=pattern):
        href = link.attrs['href']
        if href is not None and href not in seen:
            seen.add(href)
            internalLinks.append(href)
    return internalLinks

In [None]:
# Retrieves a list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
    """Return the unique external hrefs on a parsed page, in document order.

    A link counts as external when its href starts with "http" or "www" and
    does not contain ``excludeUrl`` anywhere after that prefix.

    Args:
        bsObj: Parsed page exposing ``findAll`` (a BeautifulSoup object).
        excludeUrl: Text (e.g. the current host name) that marks a link as
            belonging to the current site.

    Returns:
        A list of href strings without duplicates.
    """
    # re.escape makes the dots of a host name match literally; unescaped,
    # "example.com" would also exclude look-alikes such as "exampleXcom".
    pattern = re.compile("^(http|www)((?!" + re.escape(excludeUrl) + ").)*$")
    externalLinks = []
    seen = set()  # O(1) duplicate check; the list preserves document order
    for link in bsObj.findAll("a", href=pattern):
        href = link.attrs['href']
        if href is not None and href not in seen:
            seen.add(href)
            externalLinks.append(href)
    return externalLinks

In [None]:
def splitAddress(address):
    """Return the network-location part (``netloc``) of *address*."""
    parts = urlsplit(address)
    return parts.netloc

In [None]:
# Shared opener that sends "Mozilla/5.0" as the User-agent header.
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]

def getRandomExternalLink(startingPage):
    """Return a random external link reachable from ``startingPage``.

    When the page has no external links, a random internal link is followed
    and the search repeats there.

    Args:
        startingPage: Absolute URL of the page to inspect.

    Returns:
        The href of a randomly chosen external link, or None when the page
        offers neither external nor internal links.
    """
    # Local import: the file's import cell only brings in urlsplit.
    from urllib.parse import urljoin

    # The context manager closes the HTTP response once parsing is done.
    with opener.open(startingPage) as html:
        bsObj = bs(html, 'lxml')
    print('Site: ' + startingPage)
    externalLinks = getExternalLinks(bsObj, splitAddress(startingPage))
    if externalLinks:
        return random.choice(externalLinks)
    internalLinks = getInternalLinks(bsObj, startingPage)
    if not internalLinks:
        return None
    # Internal hrefs may be relative ("/about"); resolve them against the
    # current page so that the next request gets an absolute URL.
    nextPage = urljoin(startingPage, random.choice(internalLinks))
    # Recurse into this function (not the crawler) so the caller actually
    # receives the external link that was found.
    return getRandomExternalLink(nextPage)

In [None]:
def followExternalOnly(startingSite):
    """Hop from site to site along randomly chosen external links.

    Each newly found link is printed and added to the module-level ``pages``
    set. The walk ends when no link can be found or when the chosen link has
    already been visited.

    Args:
        startingSite: Absolute URL where the walk begins.
    """
    currentSite = startingSite
    # Iterating instead of recursing keeps a long crawl from exhausting the
    # interpreter's recursion limit; the stop conditions are unchanged.
    while True:
        externalLink = getRandomExternalLink(currentSite)
        if externalLink is None or externalLink in pages:
            return
        # we have encountered a new page
        print("Random external link is: " + externalLink)
        pages.add(externalLink)
        currentSite = externalLink

In [None]:
# Absolute URL of the site where the external-link crawl begins.
URL = "https://www.womenwhocode.com/"

In [None]:
# Fresh registry of the external links visited by followExternalOnly.
pages = set()
# random.seed() rejects datetime objects (TypeError on Python 3.11+); the
# float timestamp carries the same time-based intent and is accepted.
random.seed(datetime.datetime.now().timestamp())
followExternalOnly(URL)

#### Reference:
 * Web Scraping with Python, Ryan Mitchell  
See README.md  