# AsyncCrawler

Source: http://edmundmartin.com/writing-a-web-crawler-in-python-3-5-using-asyncio/

In [1]:
import asyncio
import logging
from datetime import datetime
from urllib.parse import urljoin, urlparse

import aiohttp
from lxml import html as lh
from tqdm import tqdm, tnrange
 
class AsyncCrawler:
    """Same-site web crawler that fetches pages concurrently with aiohttp.

    The crawl proceeds in rounds: every URL queued for a round is fetched
    concurrently, the links found on those pages that stay on the start
    URL's scheme and host become the queue for the next round, and each
    fetched page is reduced to a small dict by :meth:`parser`.
    """

    _log = logging.getLogger(__name__)

    def __init__(self, start_url, count, max_concurrency=200):
        """Prepare a crawl.

        Args:
            start_url: First page to fetch; its scheme and host define the
                site the crawl is restricted to.
            count: Number of crawl rounds to run.
            max_concurrency: Upper bound on simultaneous HTTP requests.
        """
        self.start_url = start_url
        parsed_start = urlparse(start_url)
        self.base_url = '{}://{}'.format(parsed_start.scheme, parsed_start.netloc)
        self.count = count
        self.seen_urls = set()
        # The attribute name is misspelled; it is kept as-is because code
        # outside this class may already refer to it.
        self.bounded_sempahore = asyncio.BoundedSemaphore(max_concurrency)
        print(f'{datetime.now()} [start crawling]', end='\r')

    async def http_request(self, url):
        """Fetch ``url`` and return the raw response body.

        Returns:
            The body as bytes, or ``None`` when the request failed. Failures
            are logged rather than raised so one bad page cannot abort the
            whole round.
        """
        # Fixed-width field plus '\r' overwrites the previous progress line.
        print(f'{datetime.now()} {url:200}', end='\r')
        async with self.bounded_sempahore:
            try:
                async with self.session.get(url, timeout=10) as response:
                    return await response.read()
            except Exception as exc:
                # Best-effort crawl: record the failure and move on.
                self._log.warning('fetch failed for %s: %r', url, exc)
                return None

    def find_urls(self, html, page_url=None):
        """Return the not-yet-seen, same-site links found in ``html``.

        Args:
            html: Page markup (str or bytes) to scan for ``<a href>`` values.
            page_url: URL the markup was fetched from, used to resolve
                relative links. Falls back to the site root when omitted.

        Returns:
            List of absolute URLs (duplicates possible) whose scheme and
            host match the start URL and that are not in ``seen_urls``.
        """
        # Compare parsed (scheme, host) instead of a string prefix so hosts
        # that merely begin with the site's host name are rejected.
        site = tuple(urlparse(self.base_url)[:2])
        resolve_against = page_url or self.base_url
        found_urls = []
        dom = lh.fromstring(html)
        for href in dom.xpath('//a/@href'):
            try:
                url = urljoin(resolve_against, href)
                target = tuple(urlparse(url)[:2])
            except ValueError:
                # A malformed href should not discard the rest of the page.
                continue
            if target == site and url not in self.seen_urls:
                found_urls.append(url)
        return found_urls

    async def extract_async(self, url):
        """Fetch one page and collect its outgoing same-site links.

        Returns:
            ``(url, data, links)`` where ``url`` is the page that was
            fetched, ``data`` its raw body and ``links`` a sorted list of
            unique URLs found on it (possibly empty); ``None`` when the
            fetch produced no data.
        """
        data = await self.http_request(url)
        if not data:
            return None
        found_urls = set(self.find_urls(data, url))
        return url, data, sorted(found_urls)

    async def extract_multi_async(self, to_fetch):
        """Fetch every not-yet-seen URL in ``to_fetch`` concurrently.

        Each URL is added to ``seen_urls`` before it is fetched, so
        duplicates within ``to_fetch`` are requested only once.

        Returns:
            List of ``(url, data, links)`` tuples for the pages that were
            fetched and parsed successfully, in request order.
        """
        urls = []
        for url in to_fetch:
            if url in self.seen_urls:
                continue
            self.seen_urls.add(url)
            urls.append(url)

        outcomes = await asyncio.gather(
            *(self.extract_async(url) for url in urls),
            return_exceptions=True,
        )

        results = []
        for url, outcome in zip(urls, outcomes):
            if isinstance(outcome, BaseException):
                # Typically markup lxml could not parse; skip this page only.
                self._log.warning('could not process %s: %r', url, outcome)
            elif outcome is not None:
                results.append(outcome)
        return results

    def parser(self, data):
        """Extract the fields of interest from a fetched page.

        Returns:
            ``{'title': text}`` with the text of the first ``<title>``
            element, or ``None`` as the value when the page has no title.
        """
        dom = lh.fromstring(data)
        titles = dom.xpath('//title')
        return {'title': titles[0].text if titles else None}

    async def crawl_async(self):
        """Run the crawl for ``count`` rounds.

        Returns:
            List of ``(round_index, url, parsed)`` tuples, one per page that
            was fetched successfully, where ``parsed`` is the dict produced
            by :meth:`parser`.
        """
        to_fetch = [self.start_url]
        results = []
        # The context manager closes the session even if a round raises.
        async with aiohttp.ClientSession() as session:
            self.session = session
            for c in tnrange(self.count):
                batch = await self.extract_multi_async(to_fetch)
                to_fetch = []
                for url, data, found_urls in batch:
                    results.append((c, url, self.parser(data)))
                    to_fetch.extend(found_urls)
        return results
        

In [2]:
async def main(start_url, rounds):
    """Create the crawler inside the running event loop and run the crawl.

    Building the crawler here, rather than at module level, guarantees its
    semaphore belongs to the loop that actually executes the crawl.

    Returns:
        The list produced by ``AsyncCrawler.crawl_async``.
    """
    crawler = AsyncCrawler(start_url, rounds)
    return await crawler.crawl_async()


def _report(task):
    """Print the outcome of a finished crawl task instead of hiding it."""
    if task.cancelled():
        print('crawl cancelled')
    elif task.exception() is not None:
        print(f'crawl failed: {task.exception()!r}')
    else:
        print(len(task.result()))


url = 'https://en.wikipedia.org/wiki/Wiki'

try:
    asyncio.get_running_loop()
except RuntimeError:
    # No loop is running (plain script): drive the crawl to completion here.
    result = asyncio.run(main(url, 100))
    print(len(result))
else:
    # A loop is already running (e.g. a Jupyter kernel), where
    # run_until_complete() would raise. Schedule the crawl on that loop and
    # report when it finishes; `await future` in a later cell gives the result.
    future = asyncio.ensure_future(main(url, 100))
    future.add_done_callback(_report)

2018-11-18 01:13:30.371760 [start crawling]

HBox(children=(IntProgress(value=0), HTML(value='')))

2018-11-18 01:13:49.878193 https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes                                                                                                                                                   
