<a href="https://colab.research.google.com/github/ttury/Web-Scraping-with-Python/blob/main/%EC%99%B8%EB%B6%80%EB%A7%81%ED%81%AC_%EA%B5%AC%EB%B6%84%EB%AA%BB%ED%95%B4%EC%84%9C_fail(Ch_3).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Download the Wikipedia "Dream" article and print the href of every anchor
# tag that has one.
html = urlopen("https://en.wikipedia.org/wiki/Dream")
bs = BeautifulSoup(html, "html.parser")
hrefs = [anchor.attrs["href"] for anchor in bs.findAll("a") if "href" in anchor.attrs]
for href in hrefs:
  print(href)

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

# Print the article links found inside the body of the Kevin Bacon page.
# The pattern keeps only hrefs that start with /wiki/ and contain no colon.
html = urlopen("https://en.wikipedia.org/wiki/Kevin_Bacon")
bs = BeautifulSoup(html, "html.parser")
articleHref = re.compile('^(/wiki/)((?!:).)*$')
bodyContent = bs.find("div", {"id":"bodyContent"})
for anchor in bodyContent.findAll('a', href = articleHref):
  if 'href' not in anchor.attrs:
    continue
  print(anchor.attrs["href"])

In [None]:
# 케빈 베이컨 문서 내 항목 링크 중 랜덤으로 선택해 이동

from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re

# Seed the generator from the current time. random.seed() rejects datetime
# objects on Python 3.11+ (TypeError), so pass the float timestamp instead.
random.seed(datetime.datetime.now().timestamp())

def getLinks(articleUrl):
  """Return the article anchors found in the body of a Wikipedia page.

  articleUrl is appended to "http://en.wikipedia.org" to form the address to
  fetch. The result holds the <a> tags inside the div with id "bodyContent"
  whose href starts with /wiki/ and contains no colon.
  """
  articleHref = re.compile("^(/wiki/)((?!:).)*$")
  response = urlopen("http://en.wikipedia.org{}".format(articleUrl))
  soup = BeautifulSoup(response, "html.parser")
  bodyContent = soup.find("div", {"id":"bodyContent"})
  return bodyContent.findAll("a", href = articleHref)

# Random walk: start at the Kevin Bacon article and keep jumping to a randomly
# chosen article link until a page yields no links.
links = getLinks('/wiki/Kevin_Bacon')
while len(links) > 0:
  newArticle = random.choice(links).attrs['href']
  print(newArticle)
  links = getLinks(newArticle)

In [None]:
# 예외 처리 버전

from urllib.request import urlopen
from urllib.request import HTTPError
from urllib.request import URLError
from bs4 import BeautifulSoup
import re
import datetime
import random

# Seed the generator from the current time. random.seed() rejects datetime
# objects on Python 3.11+ (TypeError), so pass the float timestamp instead.
random.seed(datetime.datetime.now().timestamp())

def getLinks(articleUrl):
  """Return the article anchors of a Wikipedia page, or None on failure.

  articleUrl is appended to 'https://en.wikipedia.org' to form the address to
  fetch. On success the result holds the <a> tags inside the div with id
  'bodyContent' whose href starts with /wiki/ and contains no colon.

  None is returned when the page cannot be fetched (HTTPError / URLError) or
  when the page has no 'bodyContent' div.
  """
  try:
    # The with-block closes the response once the document has been parsed.
    with urlopen('https://en.wikipedia.org{}'.format(articleUrl)) as html:
      bs = BeautifulSoup(html, 'html.parser')
  except (HTTPError, URLError):
    return None
  # find() returns None when the div is absent; check it here rather than
  # letting the chained call fail with AttributeError.
  body = bs.find('div', {'id' : 'bodyContent'})
  if body is None:
    return None
  return body.find_all('a', href = re.compile('^(/wiki/)((?!:).)*$'))

# Random walk from the Kevin Bacon article. The loop ends when a page has no
# article links or when getLinks() reports a failure by returning None; a
# plain truthiness test covers both without ever calling len(None).
links = getLinks('/wiki/Kevin_Bacon')
while links:
  newArticle = random.choice(links).attrs['href']
  print(newArticle)
  links = getLinks(newArticle)

In [None]:
# 위키백과 메인 페이지 내 다른 링크로 이동(중복 제외)

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()  # every /wiki/ href discovered so far
def getLinks(pageUrl):
  """Crawl Wikipedia depth-first from pageUrl, printing each new /wiki/ href.

  pageUrl is appended to 'http://en.wikipedia.org' to form the address to
  fetch. Every href starting with /wiki/ that is not yet in the module-level
  set `pages` is printed, recorded there, and then crawled in turn.

  The traversal order matches a recursive depth-first crawl, but an explicit
  stack is used so a long chain of pages cannot exceed Python's recursion
  limit.
  """
  wikiHref = re.compile('^(/wiki/)')

  def linksOn(url):
    # Fetch one page and return an iterator over its /wiki/ anchors.
    with urlopen('http://en.wikipedia.org{}'.format(url)) as html:
      bs = BeautifulSoup(html, 'html.parser')
    return iter(bs.find_all('a', href = wikiHref))

  # Each stack entry is the partially consumed link iterator of one page.
  pending = [linksOn(pageUrl)]
  while pending:
    link = next(pending[-1], None)
    if link is None:
      # This page is exhausted; resume its parent.
      pending.pop()
      continue
    newPage = link.attrs['href']
    if newPage not in pages:  # new page found
      print(newPage)
      pages.add(newPage)
      pending.append(linksOn(newPage))

# An empty path makes the crawl start from the site's front page.
getLinks(pageUrl = '')

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()  # hrefs already discovered; used to skip pages seen before
def getLinks(pageUrl):
  """Crawl Wikipedia depth-first from pageUrl, printing details of each page.

  pageUrl is appended to 'http://en.wikipedia.org' to form the address to
  fetch. For every fetched page the title, first paragraph and edit link are
  printed. Each href starting with /wiki/ that is not yet in the module-level
  set `pages` is recorded there, printed after a separator line, and crawled
  in turn.

  The traversal order matches a recursive depth-first crawl, but an explicit
  stack is used so a long chain of pages cannot exceed Python's recursion
  limit.
  """
  wikiHref = re.compile('^(/wiki/)')

  def visit(url):
    # Fetch one page, print its details, return an iterator over its links.
    with urlopen('http://en.wikipedia.org{}'.format(url)) as html:
      bs = BeautifulSoup(html, 'html.parser')

    # Print page information. The lookups run in the order title -> first
    # paragraph -> edit link; the first missing element raises AttributeError
    # and skips the remaining ones.
    try:
      print(bs.h1.get_text())  # title
      print(bs.find(id = 'mw-content-text').find('p').get_text())  # first paragraph
      print(bs.find(id = 'ca-edit').find('a').attrs['href'])  # edit page link
    except AttributeError:
      print('This page is missing something! No Worries though!')

    return iter(bs.find_all('a', href = wikiHref))

  # Each stack entry is the partially consumed link iterator of one page.
  pending = [visit(pageUrl)]
  while pending:
    link = next(pending[-1], None)
    if link is None:
      # This page is exhausted; resume its parent.
      pending.pop()
      continue
    newPage = link.attrs['href']
    if newPage not in pages:
      pages.add(newPage)
      print('---------------------------\n' + newPage)
      pending.append(visit(newPage))

# An empty path makes the crawl start from the site's front page.
getLinks(pageUrl = '')

In [None]:
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
# Seed the generator from the current time. random.seed() rejects datetime
# objects on Python 3.11+ (TypeError), so pass the float timestamp instead.
random.seed(datetime.datetime.now().timestamp())

def getInternalLinks(bs, includeUrl):
  """Return the absolute URLs of the links on a page that stay on its site.

  bs is the parsed page (anything offering BeautifulSoup's find_all).
  includeUrl is any URL on the site; only its scheme and host are used.

  Anchors whose href starts with '/' or contains the site's scheme://host are
  collected. Relative hrefs are made absolute, and each URL appears once, in
  the order first seen.
  """
  site = urlparse(includeUrl)
  includeUrl = '{}://{}'.format(site.scheme, site.netloc)
  # Escape the site prefix so characters like '.' match literally.
  pattern = re.compile('^(/|.*' + re.escape(includeUrl) + ')')
  internalLinks = []
  seen = set()
  for link in bs.find_all('a', href = pattern):
    href = link.attrs['href']
    if href.startswith('//'):
      # Protocol-relative URL: it names its own host, so it is internal only
      # when that host is this site's host.
      if urlparse(href).netloc != site.netloc:
        continue
      href = '{}:{}'.format(site.scheme, href)
    elif href.startswith('/'):
      href = includeUrl + href
    # Deduplicate on the absolute form, which is what gets stored.
    if href not in seen:
      seen.add(href)
      internalLinks.append(href)
  return internalLinks
  

def getExternalLinks(bs, excludeUrl):
  """Return the hrefs on a page that point away from excludeUrl.

  bs is the parsed page (anything offering BeautifulSoup's find_all).
  excludeUrl is the text (e.g. the site's host name) that marks a link as
  internal; any href containing it is left out.

  Only hrefs beginning with 'https' or 'www' are considered, so plain
  'http://' links are not matched by this pattern. Each href appears once, in
  the order first seen.
  """
  # Escape excludeUrl so characters like '.' match literally.
  pattern = re.compile('^(https|www)((?!' + re.escape(excludeUrl) + ').)*$')
  # dict.fromkeys drops duplicates while keeping first-seen order.
  return list(dict.fromkeys(link.attrs['href'] for link in bs.find_all('a', href = pattern)))

def getRandomExternalLink(startingPage):
  """Return a randomly chosen external link reachable from startingPage.

  The page is fetched and its external links collected. If it has none, a
  random internal link is followed and the search repeats from that page.

  Raises ValueError when the page offers neither external nor internal links,
  since there is nowhere left to look.
  """
  # The with-block closes the response once the document has been parsed.
  with urlopen(startingPage) as html:
    bs = BeautifulSoup(html, 'html.parser')
  location = urlparse(startingPage)
  externalLinks = getExternalLinks(bs, location.netloc)
  if externalLinks:
    return random.choice(externalLinks)

  print('No external links, looking around the site for one')
  domain = '{}://{}'.format(location.scheme, location.netloc)
  internalLinks = getInternalLinks(bs, domain)
  if not internalLinks:
    raise ValueError('No internal or external links found on {}'.format(startingPage))
  return getRandomExternalLink(random.choice(internalLinks))

def followExternalOnly(startingPage):
  """Hop from site to site forever, printing each external link followed.

  Starting at startingPage, a random external link is picked, printed, and
  used as the next starting page. The function does not return on its own; it
  ends only when an exception propagates out of getRandomExternalLink().
  """
  # A loop rather than self-recursion, so a long run cannot exceed Python's
  # recursion limit.
  while True:
    externalLink = getRandomExternalLink(startingPage)
    print('random external link is: {}'.format(externalLink))
    startingPage = externalLink

# Begin the external-link walk at okky.kr.
followExternalOnly(startingPage = 'https://okky.kr/')

In [None]:
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

# Seed the generator from the current time. random.seed() rejects datetime
# objects on Python 3.11+ (TypeError), so pass the float timestamp instead.
random.seed(datetime.datetime.now().timestamp())

allExtLinks = set()  # every external link seen so far
allIntLinks = set()  # every internal link already queued for crawling

def getInternalLinks(bs, includeUrl):
  """Return the absolute URLs of the links on a page that stay on its site.

  bs is the parsed page (anything offering BeautifulSoup's find_all).
  includeUrl is any URL on the site; only its scheme and host are used.

  Anchors whose href starts with '/' or contains the site's scheme://host are
  collected. Relative hrefs are made absolute, and each URL appears once, in
  the order first seen.
  """
  site = urlparse(includeUrl)
  includeUrl = '{}://{}'.format(site.scheme, site.netloc)
  # Escape the site prefix so characters like '.' match literally.
  pattern = re.compile('^(/|.*' + re.escape(includeUrl) + ')')
  internalLinks = []
  seen = set()
  for link in bs.find_all('a', href = pattern):
    href = link.attrs['href']
    if href.startswith('//'):
      # Protocol-relative URL: it names its own host, so it is internal only
      # when that host is this site's host.
      if urlparse(href).netloc != site.netloc:
        continue
      href = '{}:{}'.format(site.scheme, href)
    elif href.startswith('/'):
      href = includeUrl + href
    # Deduplicate on the absolute form, which is what gets stored.
    if href not in seen:
      seen.add(href)
      internalLinks.append(href)
  return internalLinks
  

def getExternalLinks(bs, excludeUrl):
  """Return the hrefs on a page that point away from excludeUrl.

  bs is the parsed page (anything offering BeautifulSoup's find_all).
  excludeUrl is the text (e.g. the site's scheme://host) that marks a link as
  internal; any href containing it is left out.

  Two lookaheads must both hold: the href begins with 'https' or 'www', and
  excludeUrl occurs nowhere in it. Plain 'http://' links are therefore not
  matched. Each href appears once, in the order first seen.
  """
  # Escape excludeUrl so characters like '.' match literally.
  pattern = re.compile('(?=^(https|www))(?=((?!' + re.escape(excludeUrl) + ').)*$)')
  # dict.fromkeys drops duplicates while keeping first-seen order.
  return list(dict.fromkeys(link.attrs['href'] for link in bs.find_all('a', href = pattern)))

def findAllExtLinksInThisSite(siteUrl):
  """Crawl a site depth-first from siteUrl and print its external links.

  For each page fetched, the page's list of external links is printed, and
  then each external link not yet in the module-level set allExtLinks is
  recorded there and printed on its own line. Each internal link not yet in
  allIntLinks is recorded there and crawled in turn.

  The traversal order matches a recursive depth-first crawl, but an explicit
  stack is used so a large site cannot exceed Python's recursion limit.
  """
  def scan(pageUrl):
    # Fetch one page, report its external links, return its internal links.
    with urlopen(pageUrl) as html:
      bs = BeautifulSoup(html, 'html.parser')
    location = urlparse(pageUrl)
    domain = '{}://{}'.format(location.scheme, location.netloc)
    internalLinks = getInternalLinks(bs, domain)
    externalLinks = getExternalLinks(bs, domain)
    print(externalLinks)

    for link in externalLinks:
      if link not in allExtLinks:
        allExtLinks.add(link)
        print(link)
    return iter(internalLinks)

  # Each stack entry is the partially consumed internal-link iterator of a page.
  pending = [scan(siteUrl)]
  while pending:
    link = next(pending[-1], None)
    if link is None:
      # This page is exhausted; resume its parent.
      pending.pop()
    elif link not in allIntLinks:
      allIntLinks.add(link)
      pending.append(scan(link))

# Mark the start page as already visited, then crawl the site from it.
startUrl = 'https://www.oreilly.com'
allIntLinks.add(startUrl)
findAllExtLinksInThisSite(startUrl)