In [1]:
# Downloader class
    
import time
import urllib
import urllib.error
import urllib.parse
import urllib.request
class Downloader:
    """Callable page downloader with per-domain throttling, retries and optional caching.

    Args:
        delay: Minimum number of seconds between two requests to the same
            domain; handed to ``Throttle``.
        user_agent: Value sent in the ``User-agent`` request header.
        num_retries: How many times a request is repeated after the server
            answers with a 5xx status.
        cache: Optional mapping-like object supporting ``cache[url]`` lookup
            (raising ``KeyError`` on a miss) and ``cache[url] = result``
            assignment. ``None`` disables caching.
    """

    def __init__(self, delay=5, user_agent='yuhao', num_retries=1, cache=None):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.num_retries = num_retries
        self.cache = cache

    def __call__(self, url):
        """Return the decoded body of ``url``, or ``None`` if the download failed.

        A cached result is reused unless it recorded a 5xx status and retries
        are enabled, in which case the page is downloaded again.
        """
        result = None
        # Compare against None explicitly: an empty cache is falsy but must
        # still be consulted and, more importantly, populated.
        if self.cache is not None:
            try:
                result = self.cache[url]
            except KeyError:
                # url is not available in cache
                pass
            else:
                code = result['code']
                # A cached server error is worth another attempt.
                if code is not None and self.num_retries > 0 and 500 <= code < 600:
                    result = None
        if result is None:
            self.throttle.wait(url)
            headers = {'User-agent': self.user_agent}
            result = self.download(url, headers, self.num_retries)
            if self.cache is not None:
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, num_retries, data=None):
        """Fetch ``url`` and return ``{'html': <str or None>, 'code': <int or None>}``.

        Args:
            url: Address to request.
            headers: Request headers; when empty or ``None`` a ``User-agent``
                header built from ``self.user_agent`` is used instead.
            num_retries: Remaining number of retries after a 5xx response.
            data: Optional request body (bytes); ``None`` issues a GET.

        Returns:
            On success ``html`` is the body decoded as UTF-8 and ``code`` is
            ``None``. On an HTTP error ``html`` is ``None`` and ``code`` is the
            HTTP status of the last attempt.
        """
        # A Request object is used (rather than a bare url) so that custom
        # headers can be sent; the default urllib user agent gets rejected.
        req = urllib.request.Request(
            url, data=data, headers=headers or {'User-agent': self.user_agent})
        while True:
            print ('Downloading:',url)
            try:
                # Close the response deterministically once the body is read.
                with urllib.request.urlopen(req) as response:
                    # Decode to str so that later regex / parsing code can match on it.
                    html = response.read().decode('utf-8')
                return {'html': html, 'code': None}
            except urllib.error.HTTPError as e:
                print ('Download error:',e.reason)
                # Only 5xx server errors are retried; anything else is final.
                if num_retries > 0 and 500 <= e.code < 600:
                    num_retries -= 1
                    continue
                return {'html': None, 'code': e.code}
        


import re
from datetime import datetime
#import csv
#import urllib.robotparser as r_p
#import time
#import lxml.html

# The Throttle class records the most recent access time for each domain
class Throttle:
    """Enforce a minimum delay between consecutive requests to the same domain.

    Args:
        delay: Minimum number of seconds between two requests to one domain.
            A value of 0 or less disables throttling.
    """

    def __init__(self, delay):
        self.delay = delay
        # Maps a domain (the URL's netloc) to the datetime of its last access.
        self.domains = {}

    def wait(self, url):
        """Sleep if the domain of ``url`` was accessed less than ``delay`` seconds ago.

        The access time recorded for the domain is updated on every call.
        """
        domain = urllib.parse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            # total_seconds() covers the whole interval; the .seconds attribute
            # is only the seconds component and ignores days and fractions.
            elapsed = (datetime.now() - last_accessed).total_seconds()
            sleep_secs = self.delay - elapsed
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()

        




In [2]:
# The search page is dynamic: the HTML downloaded directly contains no result links
import lxml.html
D = Downloader()
html = D('http://example.webscraping.com/places/default/search')
tree = lxml.html.fromstring(html)
# Select the links inside the results container; the recorded output is an empty list
tree.cssselect('div#results a')

Downloading: http://example.webscraping.com/places/default/search


[]

In [3]:
# Inspect the result of the AJAX request (JSON format)
html = D('http://example.webscraping.com/places/ajax/search.json?page=0&page_size=10&search_term=a')
#print(html)
import json
# Parse the JSON text; as the last expression of the cell its value is displayed
json.loads(html)

Downloading: http://example.webscraping.com/places/ajax/search.json?page=0&page_size=10&search_term=a


{'records': [{'pretty_link': '<div><a href="/places/default/view/Afghanistan-1"><img src="/places/static/images/flags/af.png" /> Afghanistan</a></div>',
   'country': 'Afghanistan',
   'id': 7050709},
  {'pretty_link': '<div><a href="/places/default/view/Aland-Islands-2"><img src="/places/static/images/flags/ax.png" /> Aland Islands</a></div>',
   'country': 'Aland Islands',
   'id': 7050710},
  {'pretty_link': '<div><a href="/places/default/view/Albania-3"><img src="/places/static/images/flags/al.png" /> Albania</a></div>',
   'country': 'Albania',
   'id': 7050711},
  {'pretty_link': '<div><a href="/places/default/view/Algeria-4"><img src="/places/static/images/flags/dz.png" /> Algeria</a></div>',
   'country': 'Algeria',
   'id': 7050712},
  {'pretty_link': '<div><a href="/places/default/view/American-Samoa-5"><img src="/places/static/images/flags/as.png" /> American Samoa</a></div>',
   'country': 'American Samoa',
   'id': 7050713},
  {'pretty_link': '<div><a href="/places/default

In [4]:
# Disabled cell: everything below is wrapped in a triple-quoted string, so it is
# evaluated as a plain string literal (echoed as the cell output) and never executed.
# The wrapped code loops over every lowercase letter as search term, pages through
# the AJAX search results via D, collects each record's 'country' into a set and
# writes the sorted names to countries.txt.
'''
# 剥离国家信息，通过26次ajax请求下载页面
import json
import string
template_url = 'http://example.webscraping.com/places/ajax/search.json?page={}&page_size=10&search_term={}'
countries = set()

for letter in string.ascii_lowercase:
    page = 0
    while True:
        html = D(template_url.format(page, letter))
        try:
            ajax = json.loads(html)
        except ValueError as e:
            print(e)
            ajax = None
        else:
            for record in ajax['records']:
                countries.add(record['country'])
        page += 1
        if ajax is None or page >= ajax['num_pages']:
            break
    
open('countries.txt','w').write('\n'.join(sorted(countries)))
'''      

"\n# 剥离国家信息，通过26次ajax请求下载页面\nimport json\nimport string\ntemplate_url = 'http://example.webscraping.com/places/ajax/search.json?page={}&page_size=10&search_term={}'\ncountries = set()\n\nfor letter in string.ascii_lowercase:\n    page = 0\n    while True:\n        html = D(template_url.format(page, letter))\n        try:\n            ajax = json.loads(html)\n        except ValueError as e:\n            print(e)\n            ajax = None\n        else:\n            for record in ajax['records']:\n                countries.add(record['country'])\n        page += 1\n        if ajax is None or page >= ajax['num_pages']:\n            break\n    \nopen('countries.txt','w').write('\n'.join(sorted(countries)))\n"

In [5]:
# Observation: the search endpoint appears to match the search term as a regular
# expression, so a single '.' is used as the term here and the number of result
# pages reported by the server is displayed.
url = 'http://example.webscraping.com/places/ajax/search.json?page=0&page_size=10&search_term='
json.loads(D(url + '.'))['num_pages']

Downloading: http://example.webscraping.com/places/ajax/search.json?page=0&page_size=10&search_term=.


26

In [6]:
# Render the page with Selenium, which is simpler than calling the AJAX endpoint by hand.
# The Chrome webdriver has to be downloaded first:
#   http://chromedriver.storage.googleapis.com/index.html
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
try:
    # An implicit wait only applies to element lookups made after it is set, so
    # configure it before the first lookup: each lookup polls for up to 30 seconds.
    driver.implicitly_wait(30)

    # Show the search page in the browser
    driver.get('http://example.webscraping.com/places/default/search')
    # Simulate keyboard input into the search box
    driver.find_element(By.ID, 'search_term').send_keys('.')

    # Use a JavaScript statement to set the text of the page-size option directly
    js = "document.getElementById('page_size').options[1].text='1000'"
    driver.execute_script(js)

    driver.find_element(By.ID, 'search').click()

    # Wait for the AJAX request to finish: this lookup blocks until result links
    # appear or the 30 second implicit wait runs out.
    result_links = driver.find_elements(By.CSS_SELECTOR, 'div#results a')
finally:
    # Always end the browser session, even when one of the steps above raised.
    driver.quit()



NameError: name 'drive' is not defined