In [1]:
import urllib.request as urlreq
import urllib.error as urlerr
import urllib.parse as urlparse
import urllib.robotparser as urlrp
import re
import datetime
import time
import sys
sys.path.append('../')
from common.utils import *
from common.db_cache import *
from common.sequential_crawler import *
from common.threaded_crawler import *
from common.process_crawler import *
import csv
from zipfile import ZipFile
from io import BytesIO, TextIOWrapper

class AlexaCallback():
    """Scrape callback that expands the Alexa top-sites archive into seed URLs.

    The instance is called by a crawler as ``callback(url, html)``. When
    ``url`` is the Alexa archive (``seed_url``), the downloaded zip payload is
    unpacked and the site names in its first member (a ``rank,website`` CSV)
    are returned as ``http://`` URLs.

    Args:
        max_urls: Maximum number of URLs to return. ``None`` means no limit;
            zero or a negative number yields an empty list.
    """

    def __init__(self, max_urls=50):
        self.max_urls = max_urls
        self.seed_url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'

    def __call__(self, url, html):
        """Return the list of site URLs found in the Alexa archive.

        Args:
            url: URL that was just downloaded.
            html: Raw bytes of the downloaded resource (the zip archive when
                ``url`` is ``seed_url``).

        Returns:
            A list of ``'http://<website>'`` strings (at most ``max_urls``)
            when ``url`` equals ``seed_url``; ``None`` for any other URL.
        """
        if url != self.seed_url:
            # Not the archive: nothing to contribute for this page.
            return None

        limit = self.max_urls
        if limit is not None and limit <= 0:
            return []

        urls = []
        with ZipFile(BytesIO(html)) as zf:
            members = zf.namelist()
            if not members:
                # Empty archive: no CSV to read.
                return urls
            # Explicit encoding keeps decoding independent of the platform
            # locale; newline='' is what the csv module expects from its input.
            with zf.open(members[0]) as raw, \
                    TextIOWrapper(raw, encoding='utf-8', newline='') as text:
                for row in csv.reader(text):
                    # csv.reader yields [] for blank lines; skip those and any
                    # row that lacks a website column instead of crashing.
                    if len(row) < 2:
                        continue
                    website = row[1].strip()
                    if not website:
                        continue
                    urls.append('http://' + website)
                    if limit is not None and len(urls) >= limit:
                        break

        return urls


In [2]:
# Sequential baseline: crawl the Alexa seed list with link_crawler and report
# how long the whole run takes.
scrape_callback = AlexaCallback()

cache = DBCache()
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments as time.time() can.
start = time.perf_counter()
all_links = link_crawler(
    scrape_callback.seed_url,
    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    scrape_callback=scrape_callback,
    cache=cache,
    ignore_robots=True,  # presumably skips robots.txt checks -- confirm in common.sequential_crawler
)
end = time.perf_counter()
print("sequential download: %.2f seconds" % (end-start))

Downloading:  http://s3.amazonaws.com/alexa-static/top-1m.csv.zip
Downloading:  http://bing.com
Downloading:  http://mail.ru
Downloading:  http://google.com.hk
Downloading:  http://office.com
Downloading:  http://microsoft.com
Downloading:  http://ebay.com
Downloading:  http://ok.ru
Downloading:  http://hao123.com
Downloading:  http://alipay.com
Downloading:  http://google.ca
Downloading:  http://xvideos.com
Downloading:  http://pages.tmall.com
Downloading:  http://t.co
Downloading:  http://google.com.mx
Downloading:  http://yahoo.co.jp
Downloading:  http://pornhub.com
Downloading:  http://google.es
Downloading:  http://twitch.tv
Downloading:  http://google.it
Downloading:  http://netflix.com
Downloading:  http://linkedin.com
Downloading:  http://google.ru
Downloading:  http://google.fr
Downloading:  http://yandex.ru
Downloading:  http://google.com.br
Downloading:  http://google.co.uk
Downloading:  http://google.de
Downloading:  http://weibo.com
Downloading:  http://jd.com
Downloading:

In [3]:
# Threaded run: same seed list, crawled with up to 10 worker threads.
# Relies on `cache` and `scrape_callback` created by the sequential cell.
# NOTE(review): clear() presumably empties the stored pages so this run does
# not reuse the sequential run's downloads -- confirm against DBCache.
cache.clear()
cache = DBCache()
# perf_counter() is monotonic, which makes it the appropriate clock for
# measuring elapsed time.
start = time.perf_counter()
threaded_crawler(
    scrape_callback.seed_url,
    scrape_callback=scrape_callback,
    cache=cache,
    max_threads=10,
    timeout=10,
)
end = time.perf_counter()
print("threaded download: %.2f seconds" % (end-start))

Downloading:  http://s3.amazonaws.com/alexa-static/top-1m.csv.zip
Downloading:  http://bing.com
Downloading:  http://mail.ru
Downloading:  http://google.com.hkDownloading: 
 Downloading: http://microsoft.com 
Downloading: http://office.comDownloading:  Downloading: Downloading: 
 http://ok.ru  http://google.cahttp://alipay.com
http://xvideos.com


Downloading:  http://ebay.com
Downloading:  http://hao123.com
Downloading:  http://pages.tmall.com
Downloading:  http://t.co
Downloading:  http://google.com.mx
Downloading:  http://yahoo.co.jp
Downloading:  http://pornhub.com
Downloading:  http://google.es
Downloading:  http://twitch.tv
Downloading:  http://google.it
Downloading:  http://netflix.com
Downloading:  http://linkedin.com
Downloading:  http://google.ru
Downloading:  http://google.fr
Downloading:  http://yandex.ru
Downloading:  http://google.com.br
Downloading:  http://google.co.uk
Downloading:  http://google.de
Downloading:  http://login.tmall.com
Downloading:  http://weibo.com
Dow

In [4]:
# Multiprocessing doesn't work in an interactive notebook; run process_test.py instead.