In [1]:
# Downloader类
    
import time
import urllib
import urllib.error
import urllib.parse
import urllib.request
class Downloader:
    """Callable page downloader with throttling, 5xx retries and optional caching.

    Calling an instance with a URL returns the decoded page body, or ``None``
    when the server answered with an HTTP error.

    Args:
        delay: Value handed to ``Throttle``; the minimum number of seconds
            between two requests to the same domain.
        user_agent: Value sent in the ``User-agent`` request header.
        num_retries: How many times a 5xx response is retried.
        cache: Optional mapping-like object (``cache[url]`` raising
            ``KeyError`` on a miss, ``cache[url] = result`` to store).
            Stored values are dicts of the form ``{'html': ..., 'code': ...}``.
    """

    def __init__(self, delay=5, user_agent='yuhao', num_retries=1, cache=None):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.num_retries = num_retries
        self.cache = cache

    def __call__(self, url):
        """Return the HTML for ``url``, served from the cache when possible.

        A cached entry is discarded and re-downloaded only when it recorded a
        5xx status and retries are enabled. Fresh downloads are throttled and
        written back to the cache.
        """
        result = None
        # Compare against None instead of relying on truthiness: an empty
        # cache (e.g. ``{}``) is falsy but must still be read and populated.
        if self.cache is not None:
            try:
                result = self.cache[url]
            except KeyError:
                # url is not available in cache
                pass
            else:
                code = result['code']
                if self.num_retries > 0 and code is not None and 500 <= code < 600:
                    # Cached server error: ignore it and try the download again.
                    result = None
        if result is None:
            self.throttle.wait(url)
            headers = {'User-agent': self.user_agent}
            result = self.download(url, headers, self.num_retries)
            if self.cache is not None:
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, num_retries, data=None):
        """Fetch ``url`` once, retrying up to ``num_retries`` times on 5xx errors.

        Args:
            url: Address to request.
            headers: Request headers; falls back to the instance's User-agent
                header when ``None``.
            num_retries: Remaining retry budget for 5xx responses.
            data: Optional request body (bytes) forwarded to
                ``urllib.request.Request``.

        Returns:
            ``{'html': str or None, 'code': int or None}`` where ``code`` is
            ``None`` on success and the HTTP status on an ``HTTPError``.

        Raises:
            Anything other than ``urllib.error.HTTPError`` raised by the
            request or by UTF-8 decoding propagates to the caller.
        """
        print ('Downloading:',url)
        if headers is None:
            headers = {'User-agent': self.user_agent}
        # Build a Request (instead of passing the bare url) so a custom
        # User-agent is sent; the default urllib agent tends to be rejected.
        req = urllib.request.Request(url, data=data, headers=headers)
        code = None
        try:
            # Decode to str so later regex / parser calls receive text.
            # The with-block closes the connection as soon as the body is read.
            with urllib.request.urlopen(req) as response:
                html = response.read().decode('utf-8')
        except urllib.error.HTTPError as e:
            print ('Download error:',e.reason)
            html = None
            code = e.code
            # Only server-side (5xx) errors are worth retrying.
            if num_retries > 0 and 500 <= e.code < 600:
                return self.download(url, headers, num_retries - 1, data)

        return {'html':html,'code':code}
        


import re
from datetime import datetime
#import csv
#import urllib.robotparser as r_p
#import time
#import lxml.html

# Throttle records when each domain was last accessed.
class Throttle:
    """Enforce a minimum delay between consecutive requests to the same domain.

    Args:
        delay: Minimum number of seconds between two requests to one domain.
            A value of 0 or less disables throttling.

    Attributes are plain data: ``delay`` as given and ``domains``, a dict
    mapping a URL's network location to the ``datetime`` of its last access.
    """

    def __init__(self, delay):
        self.delay = delay
        self.domains = {}

    def wait(self, url):
        """Sleep if the domain of ``url`` was accessed less than ``delay`` seconds ago.

        The access time for the domain is updated after any sleep.
        """
        domain = urllib.parse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            # total_seconds() gives the full elapsed time. The .seconds
            # attribute is only the seconds *component* of the timedelta: it
            # drops whole days and fractions, which made a domain visited a
            # day ago look as if it had been visited moments ago.
            elapsed = (datetime.now() - last_accessed).total_seconds()
            sleep_secs = self.delay - elapsed
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()

        




In [2]:
# The search page is rendered dynamically, so reading the raw HTML directly
# yields an empty result set.
import lxml.html
# D is reused by the cells below.
D = Downloader()
html = D('http://example.webscraping.com/places/default/search')
tree = lxml.html.fromstring(html)
# Select the result links; the captured output for this cell is an empty list.
tree.cssselect('div#results a')

Downloading: http://example.webscraping.com/places/default/search


[]

In [3]:
# Inspect the response of the AJAX search request (JSON format).
html = D('http://example.webscraping.com/places/ajax/search.json?page=0&page_size=10&search_term=a')
#print(html)
import json
# Parse the JSON body; as the last expression, the parsed dict is displayed.
json.loads(html)

Downloading: http://example.webscraping.com/places/ajax/search.json?page=0&page_size=10&search_term=a


{'records': [{'pretty_link': '<div><a href="/places/default/view/Afghanistan-1"><img src="/places/static/images/flags/af.png" /> Afghanistan</a></div>',
   'country': 'Afghanistan',
   'id': 7049953},
  {'pretty_link': '<div><a href="/places/default/view/Aland-Islands-2"><img src="/places/static/images/flags/ax.png" /> Aland Islands</a></div>',
   'country': 'Aland Islands',
   'id': 7049954},
  {'pretty_link': '<div><a href="/places/default/view/Albania-3"><img src="/places/static/images/flags/al.png" /> Albania</a></div>',
   'country': 'Albania',
   'id': 7049955},
  {'pretty_link': '<div><a href="/places/default/view/Algeria-4"><img src="/places/static/images/flags/dz.png" /> Algeria</a></div>',
   'country': 'Algeria',
   'id': 7049956},
  {'pretty_link': '<div><a href="/places/default/view/American-Samoa-5"><img src="/places/static/images/flags/as.png" /> American Samoa</a></div>',
   'country': 'American Samoa',
   'id': 7049957},
  {'pretty_link': '<div><a href="/places/default

In [7]:
# Extract the country names: for each of the 26 letters, query the AJAX search
# endpoint and walk through every result page, collecting the 'country' field.
import json
import string
template_url = 'http://example.webscraping.com/places/ajax/search.json?page={}&page_size=10&search_term={}'
countries = set()

for letter in string.ascii_lowercase:
    page = 0
    while True:
        html = D(template_url.format(page, letter))
        try:
            ajax = json.loads(html)
        except (TypeError, ValueError) as e:
            # TypeError: D returned None because the download failed.
            # ValueError: the body was not valid JSON.
            print(e)
            ajax = None
        else:
            countries.update(record['country'] for record in ajax['records'])
        page += 1
        # Stop on a failed page or once the last page for this letter is read.
        if ajax is None or page >= ajax['num_pages']:
            break

# Use a with-block so the file is flushed and closed deterministically.
with open('countries.txt','w') as countries_file:
    countries_file.write('\n'.join(sorted(countries)))
        

Downloading: http://example.webscraping.com/places/ajax/search.json?page=0&page_size=10&search_term=a
Downloading: http://example.webscraping.com/places/ajax/search.json?page=1&page_size=10&search_term=a
Downloading: http://example.webscraping.com/places/ajax/search.json?page=2&page_size=10&search_term=a
Downloading: http://example.webscraping.com/places/ajax/search.json?page=3&page_size=10&search_term=a
Downloading: http://example.webscraping.com/places/ajax/search.json?page=4&page_size=10&search_term=a
Downloading: http://example.webscraping.com/places/ajax/search.json?page=5&page_size=10&search_term=a
Downloading: http://example.webscraping.com/places/ajax/search.json?page=6&page_size=10&search_term=a
Downloading: http://example.webscraping.com/places/ajax/search.json?page=7&page_size=10&search_term=a
Downloading: http://example.webscraping.com/places/ajax/search.json?page=8&page_size=10&search_term=a
Downloading: http://example.webscraping.com/places/ajax/search.json?page=9&page_si

KeyboardInterrupt: 

In [10]:
# Observation: the search endpoint appears to match the search term as a
# regular expression, so a single '.' is tried here in place of a letter.
url = 'http://example.webscraping.com/places/ajax/search.json?page=0&page_size=10&search_term='
# Show how many result pages the '.' query reports.
json.loads(D(url + '.'))['num_pages']

Downloading: http://example.webscraping.com/places/ajax/search.json?page=0&page_size=10&search_term=.


26