In [None]:
from urllib.request import urlopen
from urllib.parse   import urlparse
import urllib.error as err  # WS to get error codes
from bs4 import BeautifulSoup
import datetime
import random
import re
import sys   # WS for exit()

In [None]:
# WS: random.seed is given the integer microsecond field of the current time
_now = datetime.datetime.now()
random.seed(_now.microsecond)

In [None]:
# WS added this to cleanly stop a cell
class StopEx(Exception):
    """Exception raised to stop a notebook cell.

    To stop a cell, also from inside a try/except block, use 'raise StopEx';
    it will provide no long traceback.
    """

    def _render_traceback_(self):
        # nothing is produced for the traceback of this exception
        return None

In [None]:
# Download the Kevin Bacon article and print the href of every <a> tag that has one.
html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs   = BeautifulSoup(html, 'html.parser')
hrefs = [anchor.attrs['href'] for anchor in bs.find_all('a') if 'href' in anchor.attrs]
for href in hrefs:
    print(href)

## Retrieving Articles Only

In [None]:
# Same page, restricted to article links: anchors inside the div with id
# 'bodyContent' whose href starts with /wiki/ and contains no colon.
html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs   = BeautifulSoup(html, 'html.parser')
articleHref = re.compile('^(/wiki/)((?!:).)*$')
bodyContent = bs.find('div', {'id':'bodyContent'})
for link in bodyContent.find_all('a', href=articleHref):
    # the href filter above only yields tags that carry an href attribute
    print(link.attrs['href'])

## Random Walk

In [None]:
def getLinks(articleUrl):
    """Return the article links found in the body of a Wikipedia page.

    articleUrl -- site-relative path such as '/wiki/Kevin_Bacon'; it is appended
                  to 'http://en.wikipedia.org'.

    Returns the <a> tags inside the div with id 'bodyContent' whose href starts
    with '/wiki/' and contains no colon.  When the page has no such div an empty
    list is returned, so a caller looping on len(links) simply stops.
    """
    # the response is closed as soon as the page has been parsed
    with urlopen('http://en.wikipedia.org{}'.format(articleUrl)) as html:
        bs = BeautifulSoup(html, 'html.parser')
    body = bs.find('div', {'id':'bodyContent'})
    if body is None:
        return []
    return body.find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))

In [None]:
# Random walk: starting from the Kevin Bacon article, keep jumping to a randomly
# picked article link until a page has no links or the hop budget is used up.
links = getLinks('/wiki/Kevin_Bacon')
num = 10  # WS added to cut output off
while num > 0 and links:  # WS added num
    nextLink = random.choice(links)
    newArticle = nextLink.attrs['href']
    print(newArticle)
    links = getLinks(newArticle)
    num -= 1

## Recursively crawling an entire site

In [None]:
#pages = set()
def getLinks(pageUrl, num):  # WS added num to limit output
    """Crawl Wikipedia recursively from pageUrl, recording at most num new pages.

    pageUrl -- site-relative path ('' for the home page), appended to
               'http://en.wikipedia.org'.
    num     -- no further pages are recorded once the module-level set `pages`
               holds this many entries.

    Every newly seen '/wiki/...' href is printed, added to `pages` (which must
    already exist at module level) and crawled in turn.  Returns None.
    """
    # Limit already reached: any new link found on this page would be rejected
    # below, so skip the download altogether.
    if len(pages) >= num:
        return
    with urlopen('http://en.wikipedia.org{}'.format(pageUrl)) as html:
        bs = BeautifulSoup(html, 'html.parser')
    # the href filter only yields tags that have an href attribute
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        newPage = link.attrs['href']
        if newPage in pages:
            continue
        # We have encountered a new page
        if len(pages) >= num: return  # WS limit output
        print(newPage)
        pages.add(newPage)
        getLinks(newPage, num)  # WS

In [None]:
# reset the visited set, then crawl from the home page ('' path) with a limit of 5 pages
pages = set()
getLinks('', 5)

In [None]:
# display the page paths recorded in the set
pages

## Collecting Data Across an Entire Site

In [None]:
def getLinks(pageUrl, num):  # WS added num
    """Crawl Wikipedia recursively from pageUrl, printing details of each page fetched.

    pageUrl -- site-relative path ('' for the home page), appended to
               'http://en.wikipedia.org'.
    num     -- no further pages are recorded once the module-level set `pages`
               holds this many entries.

    For each page fetched, prints the text of its h1, the first <p> inside the
    element with id 'mw-content-text', and the href of the link nested under the
    element with id 'ca-edit'; a notice is printed instead as soon as one of
    these is missing.  Each newly seen '/wiki/...' href is then printed, added
    to `pages` (which must already exist at module level) and crawled in turn.
    Returns None.
    """
    with urlopen('http://en.wikipedia.org{}'.format(pageUrl)) as html:
        bs = BeautifulSoup(html, 'html.parser')
    try:
        print(bs.h1.get_text())
        print(bs.find(id ='mw-content-text').find_all('p')[0])
        print(bs.find(id='ca-edit').find('span').find('a').attrs['href'])
    except (AttributeError, IndexError, KeyError):
        # AttributeError: an element is absent (a lookup gave None);
        # IndexError: the content holds no <p>; KeyError: the edit link has no href
        print('This page is missing something! Continuing.')

    # the href filter only yields tags that have an href attribute
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        newPage = link.attrs['href']
        if newPage in pages:
            continue
        # We have encountered a new page
        if len(pages) >= num: return  # WS limit output
        print('-'*20)
        print(newPage)
        pages.add(newPage)
        getLinks(newPage, num)  # WS

In [None]:
# reset the visited set, then crawl from the home page ('' path) with a limit of 10 pages
pages = set()
getLinks('', 10) 

In [None]:
# display the page paths recorded in the set
pages

## Crawling across the Internet

In [None]:
def getInternalLinks(bs, includeUrl):
    """Retrieve a list of all internal links found on a page.

    bs         -- parsed page; only bs.find_all('a', href=<compiled regex>) is used.
    includeUrl -- a URL on the site; it is reduced to 'scheme://netloc'.

    A link counts as internal when its href begins with '/' or contains the
    site's 'scheme://netloc'.  Returns absolute URLs, in page order and without
    duplicates.
    """
    site = urlparse(includeUrl)
    includeUrl = '{}://{}'.format(site.scheme, site.netloc)
    # re.escape keeps '.' and other regex metacharacters of the URL literal
    pattern = re.compile('^(/|.*' + re.escape(includeUrl) + ')')
    internalLinks = []
    seen = set()
    for link in bs.find_all('a', href=pattern):
        href = link.attrs['href']
        if href is None:
            continue
        if href.startswith('//'):
            # protocol-relative link: '//host/path' names a host, not a path on this site
            if urlparse(href).netloc != site.netloc:
                continue
            href = '{}:{}'.format(site.scheme, href)
        elif href.startswith('/'):
            href = includeUrl + href
        # duplicates are detected on the absolute form, which is what gets stored
        if href not in seen:
            seen.add(href)
            internalLinks.append(href)
    return internalLinks
            
def getExternalLinks(bs, excludeUrl):
    """Retrieve a list of all external links found on a page.

    bs         -- parsed page; only bs.find_all('a', href=<compiled regex>) is used.
    excludeUrl -- text identifying the current site; any href containing it is
                  left out.

    Returns the hrefs that start with 'http' or 'www' and do not contain
    excludeUrl anywhere, in page order and without duplicates.
    """
    # The prefix is tested with a lookahead so that the exclusion also applies
    # from the first character on; re.escape keeps '.' in excludeUrl literal.
    pattern = re.compile('^(?=http|www)((?!' + re.escape(excludeUrl) + ').)*$')
    externalLinks = []
    seen = set()
    for link in bs.find_all('a', href=pattern):
        href = link.attrs['href']
        if href is None or href in seen:
            continue
        seen.add(href)
        externalLinks.append(href)
    return externalLinks

def getRandomExternalLink(startingPage):
    """Return one randomly chosen external link reachable from startingPage.

    startingPage -- absolute URL of the page to open, or None.

    Returns None when startingPage is None.  Otherwise the page is fetched and
    a random entry of getExternalLinks() is returned; when the page has no
    external links, a random internal link is tried instead (recursively).

    Raises StopEx when the page cannot be opened, or when it has neither
    external nor internal links.
    """
    if startingPage is None: return  # WS
    try:  # WS added
        html = urlopen(startingPage)
    except (err.URLError, ValueError) as e:  # WS
        # URLError is the base class of HTTPError and also covers connection
        # failures; urlopen raises ValueError for a URL without a scheme
        # (e.g. a 'www...' link)
        print('{} for {}: STOPPING HERE'.format(e, startingPage))
        raise StopEx  # WS
    with html:  # close the response once the page is parsed
        bs = BeautifulSoup(html, 'html.parser')
    page = urlparse(startingPage)
    externalLinks = getExternalLinks(bs, page.netloc)
    if externalLinks:
        return random.choice(externalLinks)
    print('No external links, looking around the site for one')
    domain = '{}://{}'.format(page.scheme, page.netloc)
    internalLinks = getInternalLinks(bs, domain)
    if not internalLinks:
        print('No internal links either: STOPPING HERE')  # WS addition
        raise StopEx  # WS
    return getRandomExternalLink(random.choice(internalLinks))
    
def followExternalOnly(startingSite):
    """Hop from site to site by following one random external link at a time.

    startingSite -- URL of the first page.

    Each link chosen by getRandomExternalLink() is printed and becomes the next
    starting point.  The walk ends when getRandomExternalLink() returns None or
    raises (it raises StopEx when it cannot continue); the exception propagates.
    """
    # iterate instead of recursing so a long walk cannot hit the recursion limit
    site = startingSite
    while True:
        externalLink = getRandomExternalLink(site)
        if externalLink is None:
            return
        print('Random external link is: {}'.format(externalLink))
        site = externalLink

In [None]:
# reset the visited set and start the walk over external links at oreilly.com
pages = set()            
followExternalOnly('http://oreilly.com')  # WS this is still buggy, added some error checks

## Collect all External Links from a Site

In [None]:
# Collects a list of all external URLs found on the site
allExtLinks = set()  # WS these are global, will be visible inside functions
allIntLinks = set()
def getAllExternalLinks(siteUrl):
    """Crawl a site depth-first from siteUrl, collecting its links.

    siteUrl -- absolute URL of the page to open.

    Every external link not seen before is printed and added to the
    module-level set allExtLinks; every internal link not seen before is
    printed, added to allIntLinks and crawled in turn.  Returns None.

    Raises StopEx when a page cannot be opened, which ends the whole crawl.
    """
    # mark the page being visited, so that a link back to it is not crawled again
    allIntLinks.add(siteUrl)
    try:  # WS
        html = urlopen(siteUrl)
    except (err.URLError, ValueError) as e:  # WS
        # URLError is the base class of HTTPError and also covers connection
        # failures; urlopen raises ValueError for a URL without a scheme
        print('{} for {}: STOPPING HERE'.format(e, siteUrl))
        raise StopEx  # WS
    with html:  # close the response once the page is parsed
        bs = BeautifulSoup(html, 'html.parser')
    site = urlparse(siteUrl)
    domain = '{}://{}'.format(site.scheme, site.netloc)
    #print('domain: {}'.format(domain))
    internalLinks = getInternalLinks(bs, domain)
    # exclude by host name, as getRandomExternalLink does, so that links to this
    # site are left out whatever their scheme
    externalLinks = getExternalLinks(bs, site.netloc)

    print('EXTERNAL links')  # WS
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)

    print('\nINTERNAL links')  # WS
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            print(link)  # WS
            getAllExternalLinks(link)

In [None]:
url = 'http://oreilly.com'  # WS from book
#url = 'https://github.com/wesmith'  # WS experiment: works, up to Forbidden error
#url = 'https://www.researchgate.net/profile/Warren-Smith-13/stats'  # WS, forbidden right away
#url = 'https://www.researchgate.net'  # forbidden right away
#url = 'https://www.thehartford.com/aarp/umbrella-insurance'  # huge number of internal links
#url = 'https://profoundphysics.com/'  # stops at first internal link
allIntLinks.add(url)  # record the start page as an internal link before crawling, so links back to it are skipped
getAllExternalLinks(url)

In [None]:
# display the set of internal links collected
allIntLinks

In [None]:
# display the set of external links collected
allExtLinks  # WS this from all pages, including internal pages