In [1]:
import urllib.request as urlreq
import urllib.error as urlerr
import urllib.parse as urlparse
import urllib.robotparser as urlrp
from bs4 import BeautifulSoup
import re
import datetime
import time
import sys
sys.path.append('../')
from common.utils import *
from common.disk_cache import DiskCache
from common.db_cache import DBCache
from datetime import timedelta

def link_crawler(seed_url, link_regex, delay=3, max_depth=-1, max_urls=-1, user_agent='wswp', proxies=None, num_retries=1, scrape_callback=None, cache=None):
    """Crawl outward from ``seed_url``, following links that match ``link_regex``.

    The site's ``/robots.txt`` is fetched once up front and consulted before
    every download. Pages are taken from the end of the queue (last in,
    first out).

    Args:
        seed_url: URL where the crawl starts; also used to locate robots.txt.
        link_regex: Pattern applied with ``re.search`` to each raw link; only
            matching links are followed.
        delay, user_agent, proxies, num_retries, cache: Forwarded unchanged to
            ``Downloader`` (defined outside this cell).
        max_depth: Links found on a page at exactly this depth are not
            followed. The default ``-1`` never equals a real depth, so the
            crawl depth is unlimited.
        max_urls: Stop after this many pages have been processed. Values
            that are zero or negative mean "no limit".
        scrape_callback: Optional callable invoked as
            ``scrape_callback(url, html)`` for every downloaded page.

    Returns:
        dict mapping every URL that was queued to the depth at which it was
        discovered (the seed is depth 0).
    """
    crawl_queue = [seed_url]
    seen = {seed_url: 0}
    num_urls = 0
    rp = urlrp.RobotFileParser()
    rp.set_url(urlparse.urljoin(seed_url, '/robots.txt'))
    rp.read()
    downloader = Downloader(delay=delay, user_agent=user_agent,
                            proxies=proxies, num_retries=num_retries, cache=cache)
    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        if not rp.can_fetch(user_agent, url):
            print('blocked by robots.txt, ', url)
            continue

        html = downloader(url)
        if scrape_callback:
            scrape_callback(url, html)

        # A failed download may yield an empty/None body; skip link extraction
        # instead of crashing on ``.decode()``.
        if html and depth != max_depth:
            # NOTE(review): Downloader is assumed to return bytes; a str body
            # is passed through as-is -- confirm against common.utils.
            page = html.decode() if isinstance(html, bytes) else html
            for link in get_links(page):
                # skip all login pages
                if re.search('login|register', link):
                    continue
                if re.search(link_regex, link):
                    # Resolve against the page the link was found on so that
                    # document-relative links become the correct absolute URL.
                    link = urlparse.urljoin(url, link)
                    # check if this link is already seen
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)

        # Enforce the download cap (previously accepted but ignored).
        num_urls += 1
        if num_urls == max_urls:
            break

    return seen

In [2]:

# First crawl with a DiskCache: time the whole crawl end to end.
t_begin = time.time()
all_links = link_crawler(
    'http://example.webscraping.com',
    '/(index|view)',
    max_depth=2,
    cache=DiskCache(),
)
elapsed = time.time() - t_begin
print("download without cache: %.2f seconds" % elapsed)

Downloading:  http://example.webscraping.com
Downloading:  http://example.webscraping.com/places/default/index/1
Downloading:  http://example.webscraping.com/places/default/index/2
Downloading:  http://example.webscraping.com/places/default/index/0
Downloading:  http://example.webscraping.com/places/default/view/Barbados-20
Downloading:  http://example.webscraping.com/places/default/view/Bangladesh-19
Downloading:  http://example.webscraping.com/places/default/view/Bahrain-18
Downloading:  http://example.webscraping.com/places/default/view/Bahamas-17
Downloading:  http://example.webscraping.com/places/default/view/Azerbaijan-16
Downloading:  http://example.webscraping.com/places/default/view/Austria-15
Downloading:  http://example.webscraping.com/places/default/view/Australia-14
Downloading:  http://example.webscraping.com/places/default/view/Aruba-13
Downloading:  http://example.webscraping.com/places/default/view/Armenia-12
Downloading:  http://example.webscraping.com/places/default/

In [3]:
# Second crawl with a fresh DiskCache() instance: same arguments as the
# first run, timed the same way for comparison.
t_begin = time.time()
all_links = link_crawler(
    'http://example.webscraping.com',
    '/(index|view)',
    max_depth=2,
    cache=DiskCache(),
)
elapsed = time.time() - t_begin
print("download with cache: %.2f seconds" % elapsed)

download with cache: 0.41 seconds


In [4]:
# Same crawl, but the DiskCache is built with expires=5 seconds
# (presumably the lifetime of a cached entry -- confirm in common.disk_cache).
short_lived_cache = DiskCache(expires=timedelta(seconds=5))
all_links = link_crawler(
    'http://example.webscraping.com',
    '/(index|view)',
    max_depth=2,
    cache=short_lived_cache,
)

Downloading:  http://example.webscraping.com
Downloading:  http://example.webscraping.com/places/default/index/1
Downloading:  http://example.webscraping.com/places/default/index/2
Downloading:  http://example.webscraping.com/places/default/index/0
Downloading:  http://example.webscraping.com/places/default/view/Barbados-20
Downloading:  http://example.webscraping.com/places/default/view/Bangladesh-19
Downloading:  http://example.webscraping.com/places/default/view/Bahrain-18
Downloading:  http://example.webscraping.com/places/default/view/Bahamas-17
Downloading:  http://example.webscraping.com/places/default/view/Azerbaijan-16
Downloading:  http://example.webscraping.com/places/default/view/Austria-15
Downloading:  http://example.webscraping.com/places/default/view/Australia-14
Downloading:  http://example.webscraping.com/places/default/view/Aruba-13
Downloading:  http://example.webscraping.com/places/default/view/Armenia-12
Downloading:  http://example.webscraping.com/places/default/

In [2]:
# Create the database-backed cache once; later cells reuse `db_cache`.
db_cache = DBCache()

# Time a crawl that goes through the DB cache.
# NOTE(review): the label says "without cache", but the recorded output shows
# no "Downloading:" lines -- the cache may already have been populated.
t_begin = time.time()
all_links = link_crawler(
    'http://example.webscraping.com',
    '/(index|view)',
    max_depth=2,
    cache=db_cache,
)
elapsed = time.time() - t_begin
print("download without cache: %.2f seconds" % elapsed)

download without cache: 0.42 seconds


In [3]:
# Repeat the crawl against the same `db_cache` and time it.
t_begin = time.time()
all_links = link_crawler(
    'http://example.webscraping.com',
    '/(index|view)',
    max_depth=2,
    cache=db_cache,
)
elapsed = time.time() - t_begin
print("download with cache: %.2f seconds" % elapsed)

download with cache: 0.43 seconds


In [3]:
# Change the expiry on the existing DB cache to 5 seconds, then crawl
# immediately and time it.
# NOTE(review): the label says "without cache", but the recorded output shows
# no "Downloading:" lines -- entries appear to be served from cache here.
db_cache.update_expire(expires=timedelta(seconds=5))
t_begin = time.time()
all_links = link_crawler(
    'http://example.webscraping.com',
    '/(index|view)',
    max_depth=2,
    cache=db_cache,
)
elapsed = time.time() - t_begin
print("download without cache: %.2f seconds" % elapsed)

timestamp_1
download without cache: 0.40 seconds


In [5]:
# Crawl once more with `db_cache`; the recorded output shows every page
# being downloaded again on this run.
t_begin = time.time()
all_links = link_crawler(
    'http://example.webscraping.com',
    '/(index|view)',
    max_depth=2,
    cache=db_cache,
)
elapsed = time.time() - t_begin
print("download without cache: %.2f seconds" % elapsed)

Downloading:  http://example.webscraping.com
Downloading:  http://example.webscraping.com/places/default/index/1
Downloading:  http://example.webscraping.com/places/default/index/2
Downloading:  http://example.webscraping.com/places/default/index/0
Downloading:  http://example.webscraping.com/places/default/view/Barbados-20
Downloading:  http://example.webscraping.com/places/default/view/Bangladesh-19
Downloading:  http://example.webscraping.com/places/default/view/Bahrain-18
Downloading:  http://example.webscraping.com/places/default/view/Bahamas-17
Downloading:  http://example.webscraping.com/places/default/view/Azerbaijan-16
Downloading:  http://example.webscraping.com/places/default/view/Austria-15
Downloading:  http://example.webscraping.com/places/default/view/Australia-14
Downloading:  http://example.webscraping.com/places/default/view/Aruba-13
Downloading:  http://example.webscraping.com/places/default/view/Armenia-12
Downloading:  http://example.webscraping.com/places/default/