In [74]:
import asyncio
import time
import nest_asyncio
import aiohttp
from bs4 import BeautifulSoup
import logging
import random

nest_asyncio.apply()

# aiohttp
aiohttp 相当于异步版的 requests：一个基于 asyncio 的 HTTP 客户端库。

核心框架
> 设计一个异步的框架，生成一个事件循环

> 创建一个专门去爬取网页的协程，利用aiohttp去爬取网站内容

> 生成多个待爬取页面的url地址，组建一个异步的tasks，扔到事件循环里面

> 每个页面爬取完毕后，用BeautifulSoup解析该网页，提取需要的数据（下面的代码提取的是页面上的百分比数值）

使用方法
> 与requests类似，使用ClientSession来管理会话

## 参考别人的代码

In [68]:
class AsnycGrab(object):
    """Fetch a list of pages concurrently and extract one percentage value per page.

    A fixed number of worker coroutines pull URLs from a shared
    ``asyncio.Queue``; each page is downloaded with aiohttp, parsed with
    BeautifulSoup, and the parsed number is stored in ``self.results``.

    Attributes:
        urls: The URLs to fetch, as given to the constructor.
        results: Mapping of the final response URL (as reported by aiohttp)
            to the parsed float value. Filled in by ``eventloop()``.
        max_threads: Maximum number of worker coroutines run at once.
            Despite the name these are coroutines, not OS threads.
    """

    # Headers sent with every request.
    _HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        # The previous value "keep - alive" (with spaces) is not a valid token.
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }

    # Encoding used to decode response bodies.
    _ENCODING = 'gbk'

    def __init__(self, url_list, max_threads):
        """Store the work list and the concurrency limit.

        Args:
            url_list: URLs to fetch.
            max_threads: Maximum number of concurrent worker coroutines.
        """
        self.urls = url_list
        self.results = {}
        self.max_threads = max_threads
        # Session shared by all workers while eventloop() is running.
        self._session = None

    def __parse_results(self, url, html):
        """Extract the percentage value from ``html`` and store it under ``url``.

        The value is looked up in ``div.board-hq > p`` first (second
        whitespace-separated token); if that layout is missing, the text of
        the first ``dd`` inside ``div.board-infos.fr`` is used instead.

        Raises:
            AttributeError: If neither layout is present in the page.
            ValueError: If the extracted text is not a number.
        """
        soup = BeautifulSoup(html, 'lxml')
        try:
            percent = soup.find("div", class_="board-hq").find("p").get_text().split()[1]
        except (AttributeError, IndexError):
            # find() returned None or the <p> had fewer than two tokens:
            # fall back to the alternative page layout.
            percent = soup.find("div", class_="board-infos fr").find("dd").get_text()
        self.results[url] = float(percent.replace("%", ""))

    async def _fetch(self, session, url):
        """GET ``url`` through ``session`` and return ``(response.url, html)``."""
        async with session.get(url, headers=self._HEADERS) as response:
            # Explicit check instead of ``assert`` so it survives ``python -O``.
            if response.status != 200:
                raise aiohttp.ClientResponseError(
                    response.request_info,
                    response.history,
                    status=response.status,
                    message='Unexpected status for {}'.format(url),
                    headers=response.headers,
                )
            # aiohttp ignores a ``response.encoding`` attribute; the encoding
            # has to be passed to text(). Undecodable bytes are replaced so a
            # single stray byte does not discard the whole page.
            html = await response.text(encoding=self._ENCODING, errors='replace')
            return response.url, html

    async def get_body(self, url):
        """Download ``url`` and return ``(final_url, html_text)``.

        Uses the session opened by ``eventloop()`` when one is active;
        otherwise a temporary session is created for this single request so
        the method still works when called on its own.

        Raises:
            aiohttp.ClientResponseError: If the response status is not 200.
        """
        session = getattr(self, '_session', None)
        if session is not None:
            return await self._fetch(session, url)
        async with aiohttp.ClientSession() as own_session:
            return await self._fetch(own_session, url)

    async def get_results(self, url):
        """Fetch and parse one URL, recording the value in ``self.results``.

        Returns:
            The string ``'Completed'``.
        """
        url, html = await self.get_body(url)
        self.__parse_results(url, html)
        return 'Completed'

    async def handle_tasks(self, task_id, work_queue):
        """Worker loop: process URLs from ``work_queue`` until it is empty.

        A failure on one URL is logged with its traceback and does not stop
        the worker.

        Args:
            task_id: Index of this worker (not used by the loop itself).
            work_queue: ``asyncio.Queue`` of URLs shared by all workers.
        """
        while True:
            try:
                current_url = work_queue.get_nowait()
            except asyncio.QueueEmpty:
                return
            try:
                await self.get_results(current_url)
            except Exception:
                logging.exception('Error for {}'.format(current_url))

    async def _crawl(self):
        """Fill the queue, open one shared session and run the workers."""
        # Built inside the coroutine so the queue belongs to the running loop.
        queue = asyncio.Queue()
        for url in self.urls:
            queue.put_nowait(url)
        # Workers beyond the number of URLs would exit immediately.
        worker_count = min(self.max_threads, queue.qsize())
        async with aiohttp.ClientSession() as session:
            self._session = session
            try:
                await asyncio.gather(
                    *(self.handle_tasks(task_id, queue) for task_id in range(worker_count))
                )
            finally:
                self._session = None

    def eventloop(self):
        """Fetch every URL in ``self.urls`` and block until all are processed.

        Inside an already running loop (e.g. a Jupyter kernel) the crawl is
        driven with ``loop.run_until_complete``, which requires
        ``nest_asyncio.apply()`` to have been called; otherwise
        ``asyncio.run`` is used.

        Raises:
            ValueError: If there are URLs to fetch but ``max_threads`` < 1.
        """
        if not self.urls:
            return
        if self.max_threads < 1:
            raise ValueError('max_threads must be at least 1')
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running: plain script usage.
            asyncio.run(self._crawl())
        else:
            loop.run_until_complete(self._crawl())


if __name__ == '__main__':
    # Pages to fetch in this demo run.
    target_urls = [
        "http://q.10jqka.com.cn/gn/detail/code/301558/",
        "http://q.10jqka.com.cn/gn/detail/code/300800/",
        "http://q.10jqka.com.cn/gn/detail/code/301496/",
        "http://q.10jqka.com.cn/gn/detail/code/304582/",
        "http://q.10jqka.com.cn/gn/detail/code/301259/",
        "http://q.10jqka.com.cn/gn/detail/code/307408/",
        "http://q.10jqka.com.cn/gn/detail/code/300168/",
    ]
    # Fetch with at most 5 concurrent workers, then show {url: value}.
    async_example = AsnycGrab(target_urls, 5)
    async_example.eventloop()
    print(async_example.results)

{URL('http://q.10jqka.com.cn/gn/detail/code/301259/'): 0.32, URL('http://q.10jqka.com.cn/gn/detail/code/304582/'): 0.76, URL('http://q.10jqka.com.cn/gn/detail/code/300168/'): 0.87, URL('http://q.10jqka.com.cn/gn/detail/code/301558/'): 1.7, URL('http://q.10jqka.com.cn/gn/detail/code/301496/'): -0.71, URL('http://q.10jqka.com.cn/gn/detail/code/307408/'): 0.57, URL('http://q.10jqka.com.cn/gn/detail/code/300800/'): 0.23}
