# Capítulo 3 - Escrevendo Web Crawlers

In [2]:
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')

bs = BeautifulSoup(html, 'html.parser')
for link in bs.find_all('a'):
	if 'href' in link.attrs:
		print(link.attrs['href'])


/wiki/Wikipedia:Protection_policy#semi
#mw-head
#searchInput
/wiki/Kevin_Bacon_(disambiguation)
/wiki/File:Kevin_Bacon_SDCC_2014.jpg
/wiki/Philadelphia,_Pennsylvania
/wiki/Kevin_Bacon_filmography
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
#cite_note-1
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
/wiki/Holly_Near
/wiki/Wikipedia:Citation_needed
http://baconbros.com/
#cite_note-2
#cite_note-actor-3
/wiki/Footloose_(1984_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Balto_(film)
/wiki/Sleepers
/wiki/The_Woodsman_(2004_film)
/wiki/Animal_House
/wiki/Diner_(1982_film)
/wiki/Tremors_(1990_film)
/wiki/Crazy,_Stupid,_Love
/wiki/Friday_the_13th_(1980_film)
/wiki/Flatliners
/wiki/The_River_Wild
/wiki/Wild_Things_(film)
/wiki/Stir_of_Echoes
/wiki/Hollow_Man
/wiki/Frost/Nixon_(film)
/wiki/X-Men:_First_Class
/wiki/Black_Mass_(film)
/wiki/Patriots_Day_(film)
/wiki/Fox_Broadcasting_Company
/wiki/The_Following
/wiki/HBO
/wiki/Taking_Chan

In [3]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')

bs = BeautifulSoup(html, 'html.parser')

links = bs.find('div', {'id': 'bodyContent'}).find_all('a',href=re.compile('^(/wiki/)((?!:).)*$'))

for link in links:
	print(link.attrs['href'])

/wiki/Kevin_Bacon_(disambiguation)
/wiki/Philadelphia,_Pennsylvania
/wiki/Kevin_Bacon_filmography
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
/wiki/Holly_Near
/wiki/Footloose_(1984_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Balto_(film)
/wiki/Sleepers
/wiki/The_Woodsman_(2004_film)
/wiki/Animal_House
/wiki/Diner_(1982_film)
/wiki/Tremors_(1990_film)
/wiki/Crazy,_Stupid,_Love
/wiki/Friday_the_13th_(1980_film)
/wiki/Flatliners
/wiki/The_River_Wild
/wiki/Wild_Things_(film)
/wiki/Stir_of_Echoes
/wiki/Hollow_Man
/wiki/Frost/Nixon_(film)
/wiki/Black_Mass_(film)
/wiki/Patriots_Day_(film)
/wiki/Fox_Broadcasting_Company
/wiki/The_Following
/wiki/HBO
/wiki/Taking_Chance
/wiki/Golden_Globe_Award
/wiki/Screen_Actors_Guild_Award
/wiki/Primetime_Emmy_Award
/wiki/Streaming_television
/wiki/I_Love_Dick_(TV_series)
/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or

In [4]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re

random.seed(datetime.datetime.now())

def getLinks(articleURL):
	html = urlopen('http://en.wikipedia.org{}'.format(articleURL))
	bs = BeautifulSoup(html, 'html.parser')
	return bs.find('div', {'id':'bodyContent'}).find_all('a',href=re.compile('^(/wiki/)((?!:).)*$'))

links = getLinks('/wiki/Kevin_Bacon')
while len(links) > 0:
	newArticle = links[random.randint(0, len(links)-1)].attrs['href']
	print(newArticle)
	links = getLinks(newArticle)

since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(datetime.datetime.now())


/wiki/Will_Smith
/wiki/Meryl_Streep
/wiki/Jeanne_Moreau


KeyboardInterrupt: 

### Rastreando um Site Completo

In [5]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
def getLinks(pageUrl):
	global pages
	html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
	bs = BeautifulSoup(html, 'html.parser')
	links = bs.find_all('a',href=re.compile('^(/wiki/)'))
	for link in links:
		if 'href' in link.attrs:
			if link.attrs['href'] not in pages:
				# Encontramos uma pagina nova
				newPage = link.attrs['href']
				print(newPage)
				pages.add(newPage)
				getLinks(newPage)

In [6]:
getLinks('')

/wiki/Wikipedia
/wiki/Main_Page
/wiki/Free_content
/wiki/Wikipedia:Protection_policy#semi
/wiki/Wikipedia:Requests_for_page_protection
/wiki/Wikipedia:Requests_for_page_protection/Administrator_instructions
/wiki/Wikipedia:Protection_policy
/wiki/Wikipedia:Lists_of_protected_pages
/wiki/Wikipedia:Protection_policy#Semi-protection
/wiki/Wikipedia:Perennial_proposals
/wiki/Wikipedia:Reliable_sources/Perennial_sources
/wiki/Wikipedia:Reliable_sources
/wiki/Wikipedia:WikiProject_Reliability
/wiki/Wikipedia:WRE
/wiki/File:People_icon.svg
/wiki/Special:WhatLinksHere/File:People_icon.svg
/wiki/Help:What_links_here
/wiki/Wikipedia:Project_namespace#How-to_and_information_pages
/wiki/Wikipedia:Policies_and_guidelines
/wiki/Wikipedia:WikiProject_Politics
/wiki/Wikipedia:WikiProject_Poland
/wiki/Wikipedia:Protection_policy#pending
/wiki/Wikipedia:WikiProject_Parliamentary_Procedure
/wiki/User:Emb021
/wiki/File:Flag_of_the_United_States.svg
/wiki/Scalable_Vector_Graphics
/wiki/SVG_(disambiguation)

KeyboardInterrupt: 

### Coletando dados de um site completo

In [21]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
def getLinks(pageUrl):
    global pages
    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    try:
        print(bs.h1.get_text())
        print(bs.find(id ='mw-content-text').find_all('p')[0])
        print(bs.find(id='ca-edit').find('span').find('a').attrs['href'])
    except AttributeError:
        print('Esta página não foi encontrada. Continue!!')

    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                #We have encountered a new page
                newPage = link.attrs['href']
                print('-'*20)
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)

In [22]:
getLinks('')

Main Page
<p><b><a href="/wiki/Zufar_ibn_al-Harith_al-Kilabi" title="Zufar ibn al-Harith al-Kilabi">Zufar ibn al-Harith al-Kilabi</a></b> (died <abbr title="circa">c.</abbr><span style="white-space:nowrap;"> 695</span>) was a Muslim commander from the <a href="/wiki/Tribes_of_Arabia" title="Tribes of Arabia">Arab tribe</a> of <a href="/wiki/Banu_Amir" title="Banu Amir">Banu Amir</a>, and the leader of the <a href="/wiki/Qays" title="Qays">Qays</a> faction in the late 7th century. During the <a href="/wiki/First_Fitna" title="First Fitna">First Muslim Civil War</a> he led his tribe in <a href="/wiki/Aisha" title="Aisha">A'isha</a>'s army against Caliph <a href="/wiki/Ali" title="Ali">Ali</a> at the <a href="/wiki/Battle_of_the_Camel" title="Battle of the Camel">Battle of the Camel</a> in 656. In 657 he fought under <a href="/wiki/Mu%27awiya_I" title="Mu'awiya I">Mu'awiya</a> against Ali at the <a href="/wiki/Battle_of_Siffin" title="Battle of Siffin">Battle of Siffin</a>. During the <a 

KeyboardInterrupt: 

### Rastreando pela Internet

In [34]:
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
random.seed(datetime.datetime.now())

#Retrieves a list of all Internal links found on a page
def getInternalLinks(bs, includeUrl):
    includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme, urlparse(includeUrl).netloc)
    internalLinks = []
    #Finds all links that begin with a "/"
    for link in bs.find_all('a', href=re.compile('^(/|.*'+includeUrl+')')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if(link.attrs['href'].startswith('/')):
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks

#Retrieves a list of all external links found on a page
def getExternalLinks(bs, excludeUrl):
    externalLinks = []
    #Finds all links that start with "http" that do
    #not contain the current URL
    for link in bs.find_all('a', href=re.compile('^(http|www)((?!'+excludeUrl+').)*$')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bs = BeautifulSoup(html, 'html.parser')
    externalLinks = getExternalLinks(bs, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        print('No external links, looking around the site for one')
        domain = '{}://{}'.format(urlparse(startingPage).scheme, urlparse(startingPage).netloc)
        internalLinks = getInternalLinks(bs, domain)
        return getRandomExternalLink(internalLinks[random.randint(0,
                                                                  len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]

def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print('Random external link is: {}'.format(externalLink))
    followExternalOnly(externalLink)

since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(datetime.datetime.now())


In [35]:
followExternalOnly('https://www.globo.com')

Random external link is: https://globoesporte.globo.com/futebol/copa-do-brasil/
Random external link is: https://ge.globo.com/
Random external link is: http://sportv.globo.com/site/
Random external link is: https://ge.globo.com/esports/free-fire/
Random external link is: http://g1.globo.com/principios-editoriais-do-grupo-globo.html
Random external link is: http://revistaepocasp.globo.com/


URLError: <urlopen error [WinError 10060] Uma tentativa de conexão falhou porque o componente conectado não respondeu
corretamente após um período de tempo ou a conexão estabelecida falhou
porque o host conectado não respondeu>

In [40]:
# Collects a list of all external URLs found on the site
allExtLinks = set()
allIntLinks = set()


def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    domain = '{}://{}'.format(urlparse(siteUrl).scheme,
                              urlparse(siteUrl).netloc)
    bs = BeautifulSoup(html, 'html.parser')
    internalLinks = getInternalLinks(bs, domain)
    externalLinks = getExternalLinks(bs, domain)

    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            getAllExternalLinks(link)

site = "https://www.infobae.com"
allIntLinks.add(site)
getAllExternalLinks(site)

https://www.infobae.com
https://www.infobae.com/ultimas-noticias-america/
https://www.infobae.com/america/mexico/
https://www.infobae.com/america/venezuela/
https://www.infobae.com/america/eeuu/
https://www.infobae.com/america/colombia/
https://www.infobae.com/america/america-latina/
https://www.infobae.com/america/entretenimiento/
https://www.infobae.com/que-puedo-ver/
https://www.infobae.com/america/deportes/
https://www.infobae.com/america/mundo/
https://www.infobae.com/america/tecno/
https://www.infobae.com/latinpower/
https://www.infobae.com/america/perrosygatos/
https://www.infobae.com/america/fotos/
https://www.infobae.com/america/cultura/
https://www.infobae.com/america/ciencia-america
https://www.infobae.com/america/the-new-york-times/
https://www.infobae.com/america/agencias/
https://www.infobae.com/america/inhouse
https://www.infobae.com/america/soluciones/
https://www.infobae.com/america/peru/
https://www.infobae.com/america/pandora-papers/
https://www.infobae.com/america/e

HTTPError: HTTP Error 404: Not Found