In [1]:
# download函数
    # 添加缓存机制
    # 限速添加至下载类中，读取缓存则不用限速
    
import urllib
class Downloader:
    def __init__(self, delay=5, user_agent='yuhao', num_retries=1, cache=None):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.num_retries = num_retries
        self.cache = cache
    def __call__(self, url):
        result = None
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                #url is not available in cache
                pass
            else:
                if result['code'] == None:
                     return result['html']
                if self.num_retries > 0 and 500 <= result['code'] <600:
                    result = None
        if result is None:
            self.throttle.wait(url)
            headers = {'User-agent':self.user_agent}
            result = self.download(url, headers, self.num_retries)
            if self.cache:
                self.cache[url] = result
            return result['html']
    def download(self, url, headers, num_retries, data=None):
        print ('Downloading:',url)
        # 创建请求而不是url， url读取会被禁止, 设置代理(默认代理会被拒)
        req = urllib.request.Request(url, headers={'User-agent':user_agent})
        code = None
        try:
            # 读取后编码成字符串，在后面re.findall中才能匹配
            html = urllib.request.urlopen(req).read().decode('utf-8')

        except urllib.error.HTTPError as e:
            print ('Download error:',e.reason)
            html = None
            code = e.code
            if(num_retries > 0):
                # 处理500-600的服务器错误
                if hasattr(e, 'code') and 500 <= e.code <600:
                    return download(url, num_retries - 1)
        
        return {'html':html,'code':code}
        
    




In [2]:
# 链接爬虫4.0
   # 加入缓存
    
    
# 爬虫限速
# Throttle类记录每个域名最近访问时间
class Throttle:
    def __init__(self, delay):
        self.delay = delay
        self.domains = {}
    def wait(self, url):
        domain = urllib.parse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()    

        
import re
import datetime
import csv
import urllib.robotparser as r_p
import time
import lxml.html

rp = r_p.RobotFileParser()
rp.set_url('http://example.webscraping.com/robots.txt')
rp.read()

user_agent = 'GoodCrawler'



# 链接爬取
def link_crawler4(seed_url, link_regex, num_retries=1, delay=5, max_depth=2, cache=None):
    max_depth = 2
    crawl_queue = [seed_url]
    # 避免重复url
    seen = {seed_url:0}
    
    num_urls = 0
    D= Downloader(delay=delay, user_agent=user_agent, num_retries=num_retries, cache=cache)
    
    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        print( rp.can_fetch(user_agent, url))
        # 检查robots.txt是否当前代理可以爬取
        if not rp.can_fetch(user_agent, url):
            print("Blocked by robots.txt", user_agent, url)
            return
        html = D(url)
        
        if depth != max_depth:
            for link in get_links(html):
                if re.match(link_regex, link):
                    link = urllib.parse.urljoin(seed_url, link)
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)

def get_links(html):
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)


''' 
link_crawler4('http://example.webscraping.com', '/(places/default/view|places/default/index)', 
              delay = 3, num_retries = 2,  max_depth = -1)
'''
    

" \nlink_crawler4('http://example.webscraping.com', '/(places/default/view|places/default/index)', \n              delay = 3, num_retries = 2,  max_depth = -1)\n"

In [3]:
'''


# pymongo test
from pymongo import MongoClient
client = MongoClient('localhost',27017)

url = 'http://example.webscraping.com/view/Antigua-and-Barbuda-10'
html = '...'
db = client.cache
#db.webpage.insert_one({"url":url, 'html':html})
#db.webpage.insert_one({"url":url + '???','html':"???"})
db.webpage.find({"url":url}).count()

#db.webpage.delete_many({})
#db.webpage.update_one({'_id': url}, { "$set": { "html": "3"} }, upsert=True)
'''

'\n\n\n# pymongo test\nfrom pymongo import MongoClient\nclient = MongoClient(\'localhost\',27017)\n\nurl = \'http://example.webscraping.com/view/Antigua-and-Barbuda-10\'\nhtml = \'...\'\ndb = client.cache\n#db.webpage.insert_one({"url":url, \'html\':html})\n#db.webpage.insert_one({"url":url + \'???\',\'html\':"???"})\ndb.webpage.find({"url":url}).count()\n\n#db.webpage.delete_many({})\n#db.webpage.update_one({\'_id\': url}, { "$set": { "html": "3"} }, upsert=True)\n'

In [4]:
# 使用MongoDB实现cache类
    # 创建index时，expireAfterSeconds属性如果为utc时间，则会被数据库自动删除，否则不会
from datetime import datetime, timedelta
from pymongo import MongoClient

class MongoCache:
    def __init__(self, client=None, expires=timedelta(days=30)):
        self.client = MongoClient('localhost',27017)
        self.db = client.cache
        self.db.webpage.create_index('timestamp', expireAfterSeconds=expires.total_seconds())
    
    def __getitem__(self, url):
        record = self.db.webpage.find_one({'_id':url})
        if record:
            return record['result']
        else:
            raise KeyError(url + 'does not exist')
            
    def __setitem__(self, url, result):
        record = {'result': result, 'timestamp':datetime.now()}
        self.db.webpage.update( {'_id': url}, {'$set': record},  upsert=True)

client = MongoClient('localhost',27017)
cache = MongoCache(client=client, expires=timedelta())



link_crawler4('http://example.webscraping.com', '/(places/default/view|places/default/index)', 
              delay = 3, cache = cache, num_retries = 2,  max_depth = -1)

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [5]:
'''
# 压缩数据库
import pickle
import zlib
from bson.binary import Binary

class MongoCache:
    def __init__(self, client=None, expires=timedelta(days=30)):
        self.client = MongoClient('localhost',27017)
        self.db = client.cache
        self.db.webpage.create_index('timestamp', expireAfterSeconds=expires.total_seconds())
    
    def __getitem__(self, url):
        record = self.db.webpage.find_one({'_id':url})
        if record:
            return pickle.loads(zlib.decompress(record['result']))
        else:
            raise KeyError(url + 'does not exist')
            
    def __setitem__(self, url, result):
        record = {'result': Binary(zlib.compress(pickle.dumps(result))),
                  'timestamp':datetime.utcnow()}
        self.db.webpage.update( {'_id': url}, {'$set': record},  upsert=True)
'''



"\n# 压缩数据库\nimport pickle\nimport zlib\nfrom bson.binary import Binary\n\nclass MongoCache:\n    def __init__(self, client=None, expires=timedelta(days=30)):\n        self.client = MongoClient('localhost',27017)\n        self.db = client.cache\n        self.db.webpage.create_index('timestamp', expireAfterSeconds=expires.total_seconds())\n    \n    def __getitem__(self, url):\n        record = self.db.webpage.find_one({'_id':url})\n        if record:\n            return pickle.loads(zlib.decompress(record['result']))\n        else:\n            raise KeyError(url + 'does not exist')\n            \n    def __setitem__(self, url, result):\n        record = {'result': Binary(zlib.compress(pickle.dumps(result))),\n                  'timestamp':datetime.utcnow()}\n        self.db.webpage.update( {'_id': url}, {'$set': record},  upsert=True)\n"