# Web Crawlers

In [1]:
import datetime
import random
import re
from urllib.parse import urljoin, urlparse
from urllib.request import urlopen

from bs4 import BeautifulSoup

In [2]:
# Download the Kevin Bacon article and parse it with Python's built-in HTML parser
html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')

# Lazily pull the target out of every anchor that actually carries an href;
# <a> tags without one (e.g. named anchors) are skipped
hrefs = (link.attrs['href'] for link in bs.find_all('a') if 'href' in link.attrs)
for href in hrefs:
    print(href)

#bodyContent
/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
//en.wikipedia.org/wiki/Wikipedia:Contact_us
https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal
/wiki/Special:RecentChanges
/wiki/Wikipedia:File_upload_wizard
/wiki/Main_Page
/wiki/Special:Search
/w/index.php?title=Special:CreateAccount&returnto=Kevin+Bacon
/w/index.php?title=Special:UserLogin&returnto=Kevin+Bacon
/w/index.php?title=Special:CreateAccount&returnto=Kevin+Bacon
/w/index.php?title=Special:UserLogin&returnto=Kevin+Bacon
/wiki/Help:Introduction
/wiki/Special:MyContributions
/wiki/Special:MyTalk
#
#Early_life_and_education
#Acting_career
#Early_work
#1980s
#1990s
#2000s
#2010s
#Other_ventures
#Six_Degrees_of_Kevin_Bacon
#Personal_life
#Accolades
#Awards_and_nominations
#Other_honors
#S

## Retrieving Articles Only

In [3]:
html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')

# Article links all start with /wiki/ and don't contain a ':' anywhere in the URL
articleHref = re.compile('^(/wiki/)((?!:).)*$')

# Only look inside the main content div, not the sidebar/header/footer
bodyContent = bs.find('div', {'id':'bodyContent'})
for link in bodyContent.find_all('a', href=articleHref):
    print(link.attrs['href'])

/wiki/Kevin_Bacon_(disambiguation)
/wiki/Philadelphia
/wiki/Kevin_Bacon_filmography
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
/wiki/Leading_man
/wiki/Character_actor
/wiki/Golden_Globe_Award
/wiki/Screen_Actors_Guild_Award
/wiki/Primetime_Emmy_Award
/wiki/National_Lampoon%27s_Animal_House
/wiki/Footloose_(1984_film)
/wiki/Diner_(1982_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Frost/Nixon_(film)
/wiki/Friday_the_13th_(1980_film)
/wiki/Tremors_(1990_film)
/wiki/The_River_Wild
/wiki/The_Woodsman_(2004_film)
/wiki/Crazy,_Stupid,_Love
/wiki/Patriots_Day_(film)
/wiki/Losing_Chase
/wiki/Loverboy_(2005_film)
/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Miniseries_or_Television_Film
/wiki/Screen_Actors_Guild_Award_for_Outstanding_Performance_by_a_Male_Actor_in_a_Miniseries_or_Television_Movie
/wiki/Michael_Strobl
/wiki/HBO
/wiki/Taking_Chance
/wiki/Fox_Broadcasting_Company
/wik

## Random Walk

In [4]:
def getLinks(articleUrl):
    """Return the article links found in the body of a Wikipedia page.

    articleUrl is a site-relative path such as '/wiki/Kevin_Bacon'; it is
    appended to 'http://en.wikipedia.org'. Only anchors inside the
    'bodyContent' div whose href starts with /wiki/ and contains no ':' are
    returned. The result is a list of anchor tags (read the target from
    tag.attrs['href']); it is empty when the page has no 'bodyContent' div.
    """
    # BeautifulSoup reads the whole response while parsing, so the connection
    # can be closed as soon as the soup is built
    with urlopen(f'http://en.wikipedia.org{articleUrl}') as html:
        bs = BeautifulSoup(html, 'html.parser')
    bodyContent = bs.find('div', {'id':'bodyContent'})
    if bodyContent is None:
        # find() returns None when nothing matches; callers expect a sequence
        return []
    return bodyContent.find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))

In [5]:
# Re-seed so results are different each time. random.seed() with no argument
# already seeds from the OS entropy source / current time; the previous
# strftime('%s') call relied on a platform-specific format code.
random.seed()

links = getLinks('/wiki/Kevin_Bacon')
# Bounded walk so the cell terminates: at most 10 hops, stopping early if an
# article has no outgoing article links (a dead end)
for hop in range(10):
    if not links:
        break
    newArticle = random.choice(links).attrs['href']
    print(newArticle)
    links = getLinks(newArticle)

/wiki/Saturn_Award_for_Best_Actor_in_a_Network_or_Cable_Television_Series
/wiki/Breaking_Bad
/wiki/Chemical_equation
/wiki/Phosphoric_acid
/wiki/Peroxymonosulfuric_acid
/wiki/Hydrogen_disulfide
/wiki/International_Chemical_Identifier
/wiki/Space-filling_model
/wiki/Chemical_element
/wiki/Chemical_symbol


## Recursively crawling an entire site

In [6]:
# Note that here we're collecting all /wiki/ links again, not just articles.
# I'm limiting how far it can go: once currentPageDepth reaches maxPageDepth, no more pages are fetched.
def getAllLinks(pageUrl, pages, maxPageDepth=2, currentPageDepth=0):
    """Recursively collect every /wiki/ link reachable from a Wikipedia page.

    pageUrl          -- site-relative path appended to 'http://en.wikipedia.org'
                        ('' fetches the site root)
    pages            -- set of paths already seen; each newly found path is
                        printed and added to it
    maxPageDepth     -- pages at this depth or deeper are not fetched
    currentPageDepth -- depth of pageUrl; callers normally leave this at 0

    Returns None; the result is accumulated in `pages`.
    """
    # '>=' (not '==') so a depth that starts past the limit still stops
    if currentPageDepth >= maxPageDepth:
        return

    # BeautifulSoup reads the whole response while parsing, so it can be
    # closed before the (possibly long) recursive crawl starts
    with urlopen(f'http://en.wikipedia.org{pageUrl}') as html:
        bs = BeautifulSoup(html, 'html.parser')
    # The href filter guarantees every matched anchor has an href attribute
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        newPage = link.attrs['href']
        if newPage not in pages:
            # We have encountered a new page
            print(newPage)
            pages.add(newPage)
            getAllLinks(newPage, pages, maxPageDepth, currentPageDepth+1)

In [7]:
# Shared "already seen" set; getAllLinks adds every /wiki/ path it discovers to it
pages = set()

# An empty path makes the crawl start from the site root (http://en.wikipedia.org)
getAllLinks('', pages, 2, 0)

/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal
/wiki/Special:RecentChanges
/wiki/Wikipedia:File_upload_wizard
/wiki/Special:Search
/wiki/Special:MyContributions
/wiki/Special:MyTalk
/wiki/Talk:Main_Page
/wiki/Special:WhatLinksHere/Main_Page
/wiki/Special:RecentChangesLinked/Main_Page
/wiki/Wikipedia:File_Upload_Wizard
/wiki/Special:SpecialPages
/wiki/Wikipedia
/wiki/Free_content
/wiki/Encyclopedia
/wiki/Help:Introduction_to_Wikipedia
/wiki/Special:Statistics
/wiki/English_language
/wiki/File:The_Mercury_7_(15258556433).jpg
/wiki/Mercury_Seven
/wiki/Astronaut
/wiki/Project_Mercury
/wiki/NASA
/wiki/Scott_Carpenter
/wiki/Gordon_Cooper
/wiki/John_Glenn
/wiki/Gus_Grissom
/wiki/Wally_Schirra
/wiki/Alan_Shepard
/wiki/Deke_Slayton
/wiki/Project_Gemini
/wiki/Apollo_program
/wiki/Space_Shuttle
/wiki/Apollo_14
/wiki/Apollo_1
/wiki/Apollo_7
/wiki/Atrial_fibr

In [8]:
# Number of distinct /wiki/ paths collected in `pages`
len(pages)

159

## Collecting Data Across an Entire Site

This example crawls Wikipedia and prints some content from each page it visits (the title, the first paragraph, and the edit link). As before, I'm limiting how deep the rabbit hole goes.

In [9]:
def getLinksContent(pageUrl, pages, maxPageDepth=2, currentPageDepth=0):
    """Crawl Wikipedia from pageUrl, printing a short summary of each page.

    For every fetched page this prints the <h1> text, the first <p> inside
    the 'bodyContent' div (if any) and the href of the link inside the
    'ca-edit' element. It then follows every not-yet-seen /wiki/ link.

    pageUrl          -- site-relative path appended to 'http://en.wikipedia.org'
    pages            -- set of paths already seen; new paths are added to it
    maxPageDepth     -- pages at this depth or deeper are not fetched
    currentPageDepth -- depth of pageUrl; callers normally leave this at 0

    Returns None.
    """
    # '>=' (not '==') so a depth that starts past the limit still stops
    if currentPageDepth >= maxPageDepth:
        return

    # BeautifulSoup reads the whole response while parsing, so it can be
    # closed before recursing into linked pages
    with urlopen(f'http://en.wikipedia.org{pageUrl}') as html:
        bs = BeautifulSoup(html, 'html.parser')
    try:
        # Any of these lookups yields None on pages lacking the element, and
        # the attribute access on None raises the AttributeError handled below
        print(bs.h1.get_text())
        #mw-parser-output
        bodyContent = bs.find('div', {'id':'bodyContent'}).find_all('p')
        if bodyContent:
            print(bodyContent[0])
        print(bs.find(id='ca-edit').find('a').attrs['href'])
    except AttributeError:
        print('This page is missing something! Continuing.')

    # The href filter guarantees every matched anchor has an href attribute
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        newPage = link.attrs['href']
        if newPage not in pages:
            # We have encountered a new page
            print('-'*20)
            print(newPage)
            pages.add(newPage)
            getLinksContent(newPage, pages, maxPageDepth, currentPageDepth+1)

In [10]:
# Start from a fresh "already seen" set so earlier crawls don't suppress pages here
pages = set()

# Crawl from one article, passing maxPageDepth=2 and currentPageDepth=0 explicitly
getLinksContent('/wiki/General-purpose_programming_language', pages, 2, 0) 

General-purpose programming language
<p>In <a class="mw-redirect" href="/wiki/Computer_software" title="Computer software">computer software</a>, a <b>general-purpose programming language</b> (<b>GPL</b>) is a <a href="/wiki/Programming_language" title="Programming language">programming language</a> for building <a href="/wiki/Software" title="Software">software</a> in a wide variety of application <a href="/wiki/Domain_(software_engineering)" title="Domain (software engineering)">domains</a>. Conversely, a <a href="/wiki/Domain-specific_language" title="Domain-specific language">domain-specific programming language</a> (DSL) is used within a specific area. For example, <a href="/wiki/Python_(programming_language)" title="Python (programming language)">Python</a> is a GPL, while <a href="/wiki/SQL" title="SQL">SQL</a> is a DSL for <a href="/wiki/Query_language" title="Query language">querying relational databases</a>.
</p>
/w/index.php?title=General-purpose_programming_language&action=

## Crawling across the Internet

As before, I added a maximum number of links to follow.

Note: `urlopen` follows HTTP redirects automatically. If you use the requests library instead, redirect handling may need to be enabled explicitly (`allow_redirects=True`).

In [11]:
# urlparse splits a URL into components; the crawler functions below compare `netloc` (the host)
urlparse('https://www.tomoumer.com')

ParseResult(scheme='https', netloc='www.tomoumer.com', path='', params='', query='', fragment='')

In [14]:
#Retrieves a list of all Internal links found on a page
def getInternalLinks(bs, url):
    """Return the absolute URLs of all same-host links found on a page.

    bs  -- parsed page; only bs.find_all('a') and each tag's .attrs are used
    url -- absolute URL the page was fetched from; relative hrefs are
           resolved against it and its host defines what counts as internal

    Every href is resolved with urljoin, so site-relative ('/about/'),
    page-relative ('contact.html') and protocol-relative ('//host/x') links
    all come back as complete URLs. Links whose resolved host differs from
    the page's host are dropped; that also removes mailto:, javascript: and
    similar hrefs, which have no host at all. Fragment-only links ('#top')
    are skipped because they point back into the same page.

    Returns a de-duplicated list in no particular order.
    """
    netloc = urlparse(url).netloc
    internalLinks = set()
    for link in bs.find_all('a'):
        href = link.attrs.get('href')
        if not href or href.startswith('#'):
            continue
        absolute = urljoin(url, href)
        if urlparse(absolute).netloc == netloc:
            internalLinks.add(absolute)
    return list(internalLinks)
            
#Retrieves a list of all external links found on a page
def getExternalLinks(bs, url):
    """Return the absolute URLs of all links on a page that point to another host.

    bs  -- parsed page; only bs.find_all('a') and each tag's .attrs are used
    url -- absolute URL the page was fetched from; its host defines what
           counts as external

    Each href is resolved with urljoin first, so protocol-relative links
    ('//other.host/x') are returned with the page's scheme filled in and can
    be opened directly. Hrefs with no host after resolution (relative paths,
    mailto:, javascript:, ...) are never external.

    Returns a de-duplicated list in no particular order.
    """
    netloc = urlparse(url).netloc
    externalLinks = set()
    for link in bs.find_all('a'):
        href = link.attrs.get('href')
        if not href:
            continue
        absolute = urljoin(url, href)
        linkNetloc = urlparse(absolute).netloc
        if linkNetloc != '' and linkNetloc != netloc:
            externalLinks.add(absolute)
    return list(externalLinks)

def getRandomExternalLink(startingPage):
    """Pick a random external link from startingPage.

    When the page has no external links, a message is printed and the search
    continues recursively from a randomly chosen internal link of that page.
    """
    response = urlopen(startingPage)
    bs = BeautifulSoup(response, 'html.parser')

    candidates = getExternalLinks(bs, startingPage)
    if candidates:
        return random.choice(candidates)

    # Nothing leads off-site from here: hop to another page on the same site
    print('No external links, looking around the site for one')
    sameSiteLinks = getInternalLinks(bs, startingPage)
    return getRandomExternalLink(random.choice(sameSiteLinks))
    
def followExternalOnly(startingSite, maxFollow=5, currentFollow=0):
    """Hop from site to site by repeatedly following a random external link.

    startingSite  -- absolute URL of the first page
    maxFollow     -- hop count at which to stop
    currentFollow -- hops already taken; callers normally leave this at 0

    Each chosen link is printed and becomes the starting point of the next
    hop. Returns None.
    """
    site = startingSite
    # '<' (not '!=') so a currentFollow that is already past maxFollow stops
    # immediately; a loop also avoids growing the call stack on long walks
    while currentFollow < maxFollow:
        site = getRandomExternalLink(site)
        print(f'Random external link is: {site}')
        currentFollow += 1

In [15]:
# Follow random external links starting from O'Reilly, using the default maxFollow of 5
followExternalOnly('https://www.oreilly.com/')

Random external link is: https://www.oreilly.co.jp/index.shtml
Random external link is: https://makezine.jp/
Random external link is: https://eepurl.com/bJoWZf
Random external link is: https://login.mailchimp.com/signup/email-referral/?aid=f1be389980677f59500fb9203
Random external link is: https://www.creditkarma.com/


## Collect all External Links from a Site

In [16]:
def getAllExternalLinks(url, allExtLinks, allIntLinks, maxPageDepth=3, currentPageDepth=0):
    """Crawl a site depth-first, printing each external link the first time it is seen.

    url              -- absolute URL of the page to fetch
    allExtLinks      -- list of external links seen so far; new ones are appended
    allIntLinks      -- list of internal links already visited or queued; new
                        ones are appended and then crawled recursively
    maxPageDepth     -- pages at this depth or deeper are not fetched
    currentPageDepth -- depth of url; callers normally leave this at 0

    Returns None; results accumulate in the two lists.
    """
    # '>=' (not '==') so a depth that starts past the limit still stops
    if currentPageDepth >= maxPageDepth:
        return
    # BeautifulSoup reads the whole response while parsing, so it can be
    # closed before recursing into the site's other pages
    with urlopen(url) as response:
        bs = BeautifulSoup(response, 'html.parser')
    internalLinks = getInternalLinks(bs, url)
    externalLinks = getExternalLinks(bs, url)
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.append(link)
            print(link)

    for link in internalLinks:
        if link not in allIntLinks:
            # Record before recursing so pages that link to each other
            # don't cause the same page to be crawled twice
            allIntLinks.append(link)
            getAllExternalLinks(link, allExtLinks, allIntLinks, maxPageDepth, currentPageDepth + 1)

In [17]:
# Collects a list of all external URLs found on the site
startUrl = 'https://www.oreilly.com/'
allExtLinks = []
# Seed the visited list with the exact URL being crawled. The previous seed,
# 'https://oreilly.com', has a different host than the start page, so it could
# never match a discovered internal link and the start page was not actually
# marked as visited.
allIntLinks = [startUrl]

getAllExternalLinks(startUrl, allExtLinks, allIntLinks, 3, 0)

https://oreilly.hk/
https://learning.oreilly.com/search/?query=author%3A%22Ken%20Kousen%22&extended_publisher_data=true&highlight=true&include_assessments=false&include_case_studies=true&include_courses=true&include_playlists=true&include_collections=true&include_notebooks=true&include_sandboxes=true&include_scenarios=true&is_academic_institution_account=false&source=user&sort=date_added&facet_json=true&json_facets=true&page=0&include_facets=false
https://oreilly.id/
https://channelstore.roku.com/details/c8a2d0096693eb9455f6ac165003ee06/oreilly
https://learning.oreilly.com/search/?query=author%3A%22Arianne%20Dee%22&extended_publisher_data=true&highlight=true&include_assessments=false&include_case_studies=true&include_courses=true&include_playlists=true&include_collections=true&include_notebooks=true&include_sandboxes=true&include_scenarios=true&is_academic_institution_account=false&source=suggestion&sort=date_added&facet_json=true&json_facets=true&page=0&include_facets=false
https://ww