In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# URLs that have already been fetched; starts empty and is filled in by visit().
links_seen = set()
# Entry page of the crawl.
base_url = 'http://www.webscrapingfordatascience.com/crawler/'

def visit(url, links_seen):
        """Recursively crawl from ``url``, printing each newly found page.

        ``url`` is fetched, parsed, and added to ``links_seen``. Every ``<a>``
        tag with an ``href`` is resolved against ``url``; targets not yet in
        ``links_seen`` are announced and visited in turn.
        """
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        links_seen.add(url)

        for anchor in soup.find_all('a'):
                href = anchor.get('href')  # usually a relative URL
                if href is not None:
                        # urljoin also copes with absolute hrefs: they are returned as-is.
                        target = urljoin(url, href)
                        if target not in links_seen:
                                print('Found a new Page:', target)
                                visit(target, links_seen)

# Start the recursive crawl at the entry page, sharing the module-level seen-set.
visit(base_url, links_seen)        

In [1]:
!pip install records==0.5.2
!pip install sqlalchemy==1.2
import sqlalchemy
import requests
import records
import sqlite3
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from sqlalchemy.exc import IntegrityError



In [None]:
# SQLite database that persists the crawl state.
db = records.Database(
    'sqlite:///crawler_database.db'
)

# links: one row per discovered URL; visited_at stays NULL until the page is crawled.
db.query(
    '''CREATE TABLE IF NOT EXISTS links(
url text PRIMARY KEY,
created_at datetime,
visited_at datetime NULL)'''
)

# numbers: integers scraped from each page, unique per (url, number).
db.query(
    '''CREATE TABLE IF NOT EXISTS numbers(
url text, number integer,
PRIMARY KEY
(url, number))'''
)

def store_link(url):
    """Insert ``url`` into the ``links`` table, stamped with the current time.

    An ``IntegrityError`` raised by the insert is ignored, so storing a URL
    that is already present (``url`` is the primary key) is a no-op.
    """
    insert_sql = '''INSERT INTO links (url, created_at)
                   VALUES (:url, CURRENT_TIMESTAMP)'''
    try:
        db.query(insert_sql, url=url)
    except IntegrityError:
        # The link is already stored; nothing to do.
        return

def store_number(url, number):
    """Insert the ``(url, number)`` pair into the ``numbers`` table.

    An ``IntegrityError`` raised by the insert is ignored, so a pair that is
    already present is left untouched.
    """
    insert_sql = '''INSERT INTO numbers (url, number)
                   VALUES (:url, :number)'''
    try:
        db.query(insert_sql, url=url, number=number)
    except IntegrityError:
        # The number is already stored for this URL; nothing to do.
        return

def mark_visited(url):
    """Set ``visited_at`` to the current timestamp on the ``links`` row for ``url``."""
    update_sql = '''UPDATE links SET visited_at = CURRENT_TIMESTAMP
               WHERE url = :url'''
    db.query(update_sql, url=url)

def get_random_unvisited_link():
    """Return the URL of one randomly chosen link whose ``visited_at`` is NULL.

    Returns ``None`` when the query yields no row.
    """
    row = db.query('''SELECT * FROM links
                      WHERE visited_at IS NULL
                      ORDER BY RANDOM() LIMIT 1''').first()
    if row is None:
        return None
    return row.url

def visit(url):
    """Fetch ``url``, store the integers in its table cells, and return its links.

    Every ``<td>`` whose stripped text parses as an integer is recorded with
    ``store_number``. Cells that are empty or non-numeric are skipped instead
    of aborting the visit with a ``ValueError``.

    Returns:
        A list of absolute URLs, one per ``<a>`` tag that has an ``href``,
        each resolved against ``url``.
    """
    html = requests.get(url).text
    html_soup = BeautifulSoup(html, 'html.parser')

    for td in html_soup.find_all('td'):
        try:
            number = int(td.text.strip())
        except ValueError:
            # Empty or non-numeric cell: there is nothing to record.
            continue
        store_number(url, number)

    # urljoin resolves relative hrefs and returns absolute hrefs unchanged.
    return [
        urljoin(url, link.get('href'))
        for link in html_soup.find_all('a')
        if link.get('href') is not None
    ]

# Seed the queue with the entry page, then crawl until no unvisited link remains.
store_link('http://www.webscrapingfordatascience.com/crawler/')
while True:
    url_to_visit = get_random_unvisited_link()
    if url_to_visit is None:
        break
    print('Now visiting...', url_to_visit)
    new_links = visit(url_to_visit)
    for link in new_links:
        store_link(link)
    # Mark the page only after its links have been queued.
    mark_visited(url_to_visit)



In [None]:
import requests
import records
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urldefrag
from sqlalchemy.exc import IntegrityError

# SQLite database that persists the crawl state.
db = records.Database(
    'sqlite:///wikipedia.db'
)

# pages: one row per discovered page; visited_at stays NULL until it is crawled.
db.query(
    '''CREATE TABLE IF NOT EXISTS pages (
            url text PRIMARY KEY,
            page_title text NULL,
            created_at datetime,
            visited_at datetime NULL)'''
)

# links: page-to-page edges; the primary key is the composite (url, url_to).
db.query(
    '''CREATE TABLE IF NOT EXISTS links
            (url text,
             url_to text,
             PRIMARY KEY (url, url_to))'''
)

# Crawl prefix: visit() only queues URLs that start with this string, so it must
# be the article root. With a single page URL plus trailing slash, no resolved
# link (e.g. .../wiki/Foo) ever matched and the crawl ended after the seed page.
base_url = 'https://ja.wikipedia.org/wiki/'

def store_page(url):
    """Insert ``url`` into the ``pages`` table, stamped with the current time.

    An ``IntegrityError`` raised by the insert is ignored, so a page that is
    already present (``url`` is the primary key) is left untouched.
    """
    insert_sql = '''INSERT INTO pages (url, created_at)
                    VALUES (:url, CURRENT_TIMESTAMP)'''
    try:
        db.query(insert_sql, url=url)
    except IntegrityError:
        # The page is already stored; nothing to do.
        return

def store_link(url, url_to):
    """Record in the ``links`` table that page ``url`` links to ``url_to``.

    An ``IntegrityError`` raised by the insert is ignored, so an edge that is
    already present is left untouched.
    """
    insert_sql = '''INSERT INTO links (url, url_to)
                    VALUES (:url,  :url_to)'''
    try:
        db.query(insert_sql, url=url, url_to=url_to)
    except IntegrityError:
        # The link is already stored; nothing to do.
        return

def set_visited(url):
    """Set ``visited_at`` to the current timestamp on the ``pages`` row for ``url``."""
    update_sql = '''UPDATE pages SET
                visited_at = CURRENT_TIMESTAMP
                WHERE url = :url'''
    db.query(update_sql, url=url)

def set_title(url, page_title):
    """Write ``page_title`` into the ``pages`` row for ``url``."""
    update_sql = '''UPDATE pages SET
                page_title = :page_title
                WHERE url = :url'''
    db.query(update_sql, url=url, page_title=page_title)

def get_random_unvisited_page():
    """Return the URL of one randomly chosen page whose ``visited_at`` is NULL.

    Returns ``None`` when the query yields no row.
    """
    row = db.query('''SELECT * FROM pages
                       WHERE visited_at IS NULL
                       ORDER BY RANDOM() LIMIT 1''').first()
    if row is None:
        return None
    return row.url

def visit(url):
    """Crawl one page: store its title and its in-scope links, then mark it visited.

    The title is the text of the element with id ``firstHeading`` (an empty
    string when that element is missing). Each ``<a href>`` is resolved against
    ``base_url`` and stripped of its fragment; targets that start with
    ``base_url`` are stored both as a link from ``url`` and as a page to crawl.
    """
    print('now visiting...', url)
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')

    heading = soup.find(id='firstHeading')
    page_title = heading.text if heading else ''
    print('page title:', page_title)
    set_title(url, page_title)

    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href is None:
            # Anchors without an href have nothing to follow.
            continue
        # Resolve to an absolute URL and drop the fragment identifier.
        target = urldefrag(urljoin(base_url, href))[0]
        if target.startswith(base_url):
            # Anything outside base_url is an external link and is ignored.
            store_link(url, target)
            store_page(target)

    set_visited(url)

# Seed the queue with the start URL, then crawl until no unvisited page remains.
store_page(base_url)

while True:
    url_to_visit = get_random_unvisited_page()
    if url_to_visit is None:
        break
    visit(url_to_visit)

In [22]:
import requests
import records
import re
import os, os.path
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urldefrag
from sqlalchemy.exc import IntegrityError

# SQLite database that persists the crawl state.
db = records.Database(
        'sqlite:///wikipedia2.db'
)

# pages: crawled pages and pages still waiting to be crawled.
db.query(
        '''CREATE TABLE IF NOT EXISTS pages (
                    url text PRIMARY KEY,
                    created_at datetime,
                    html_file text NULL,
                    visited_at datetime NULL)'''
)

# links: the <a> tags found on each page.
db.query(
        '''CREATE TABLE IF NOT EXISTS links (
                    url text,
                    link_url text,
                    PRIMARY KEY (url, link_url))'''
)

# images: the <img> tags found on each page and the file each was saved to.
db.query(
        '''CREATE TABLE IF NOT EXISTS images (
                    url text,
                    img_url text,
                    img_file text,
                    PRIMARY KEY  (url, img_url))'''
)

# Only URLs under this prefix are queued for crawling (see should_visit).
base_url = 'http://en.wikipedia.org/wiki/'
# Directory that receives downloaded images and saved HTML pages.
file_store = './downloads/'

# exist_ok=True keeps re-runs safe and avoids the check-then-create race of
# testing os.path.exists() before calling makedirs().
os.makedirs(file_store, exist_ok = True)

def url_to_file_name(url):
        """Turn ``url`` into a string usable as a file name.

        The value is converted to ``str``, trimmed, and its spaces replaced by
        underscores; every remaining character that is not a word character,
        ``-`` or ``.`` is then removed.
        """
        underscored = str(url).strip().replace(' ', '_')
        cleaned = re.sub(r'(?u)[^-\w.]', '', underscored)
        return cleaned

def download(url, filename):
        """Stream the body of ``url`` into ``filename`` inside ``file_store``.

        The body is written in chunks of ``4096*4`` bytes rather than being
        loaded into memory at once. The streamed response is used as a context
        manager so its connection is released even if writing the file fails.
        """
        target_path = os.path.join(file_store, filename)
        with requests.get(url, stream = True) as res:
                with open(target_path, 'wb') as the_image:
                        for byte_chunk in res.iter_content(chunk_size = 4096*4):
                                the_image.write(byte_chunk)

def store_page(url):
        """Insert ``url`` into the ``pages`` table, stamped with the current time.

        An ``IntegrityError`` raised by the insert is ignored, so a page that
        is already present (``url`` is the primary key) is left untouched.
        """
        insert_sql = '''INSERT INTO pages (url, created_at)
                                    VALUES (:url, CURRENT_TIMESTAMP)'''
        try:
                db.query(insert_sql, url=url)
        except IntegrityError:
                return

def store_link(url, link_url):
        """Record in the ``links`` table that page ``url`` contains ``link_url``.

        An ``IntegrityError`` raised by the insert is ignored, so a pair that
        is already present is left untouched.
        """
        insert_sql = '''INSERT INTO links (url, link_url)
                                    VALUES (:url, :link_url)'''
        try:
                db.query(insert_sql, url=url, link_url=link_url)
        except IntegrityError:
                return
    
def store_image(url, img_url, img_file):
        """Record in ``images`` that page ``url`` shows ``img_url``, saved as ``img_file``.

        An ``IntegrityError`` raised by the insert is ignored, so a
        ``(url, img_url)`` pair that is already present is left untouched.
        """
        insert_sql = '''INSERT INTO images (url, img_url, img_file)
                                    VALUES (:url, :img_url, :img_file)'''
        try:
                db.query(insert_sql, url=url, img_url=img_url, img_file=img_file)
        except IntegrityError:
                return

def set_visited(url, html_file):
        """Mark the ``pages`` row for ``url`` as visited now and store ``html_file`` on it."""
        update_sql = '''UPDATE pages
                            SET visited_at = CURRENT_TIMESTAMP,
                                    html_file = :html_file
                            WHERE url = :url'''
        db.query(update_sql, html_file=html_file, url=url)

def get_random_unvisited_page():
        """Return the URL of one randomly chosen page whose ``visited_at`` is NULL.

        Returns ``None`` when the query yields no row.
        """
        row = db.query('''SELECT * FROM pages
                                      WHERE visited_at IS NULL
                                      ORDER BY RANDOM() LIMIT 1''').first()
        if row is None:
                return None
        return row.url

def should_visit(link_url):
        """Return ``link_url`` without its fragment if it lies under ``base_url``, else ``None``."""
        # urldefrag returns (url_without_fragment, fragment); only the first part is needed.
        without_fragment = urldefrag(link_url)[0]
        return without_fragment if without_fragment.startswith(base_url) else None

def visit(url):
        """Crawl one page: record its links and images, save its HTML, mark it visited.

        Every ``<a href>`` is resolved against ``base_url`` and stored as a
        link; those accepted by ``should_visit`` are also queued as pages.
        Every ``<img src>`` is resolved against ``base_url``, downloaded, and
        recorded together with its local file name. Finally the page's HTML is
        written into ``file_store`` and the page is marked visited.
        """
        print('now visiting...', url)
        html = requests.get(url).text
        html_soup = BeautifulSoup(html, 'html.parser')

        # Record every <a> link and queue the ones that should be crawled.
        for anchor in html_soup.find_all('a'):
                href = anchor.get('href')
                if href is None:
                        continue
                absolute_href = urljoin(base_url, href)
                store_link(url, absolute_href)
                crawl_target = should_visit(absolute_href)
                if crawl_target:
                        # Add the page to the crawl queue.
                        store_page(crawl_target)

        # Download the file behind every <img src> and record where it went.
        for image in html_soup.find_all('img'):
                src = image.get('src')
                if src is None:
                        continue
                absolute_src = urljoin(base_url, src)
                image_file = url_to_file_name(absolute_src)
                download(absolute_src, image_file)
                store_image(url, absolute_src, image_file)

        # Save the HTML content of the page itself.
        html_file = url_to_file_name(url) + '.html'
        with open(os.path.join(file_store, html_file), 'w', encoding = 'utf-8') as the_html:
                the_html.write(html)
        set_visited(url, html_file)

# Seed the queue with the start URL, then crawl until no unvisited page remains.
store_page(base_url)
while True:
        url_to_visit = get_random_unvisited_page()
        if url_to_visit is None:
                break
        visit(url_to_visit)

now visiting... http://en.wikipedia.org/wiki/Wikipedia:Contact_us
now visiting... http://en.wikipedia.org/wiki/Boris_Johnson_Is_a_Fucking_Cunt
now visiting... http://en.wikipedia.org/wiki/2022_Australian_federal_election


KeyboardInterrupt: 