In [1]:
# Jupyter already runs its own asyncio event loop. nest_asyncio patches asyncio so
# that loop.run_until_complete() can be called from notebook cells while that loop
# is running (see the nest_asyncio documentation).
import nest_asyncio
nest_asyncio.apply()


In [15]:
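# Chapter07-01
# Asyncio
# Asynchronous I/O with coroutines
# Generator -> used to return a series of objects one at a time
# Non-blocking asynchronous processing

# Asyncio web-scraping exercise
# aiohttp is recommended (urlopen is blocking, so it is run in a thread pool here)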
import asyncio
import timeit
from urllib.request import urlopen
from concurrent.futures import ThreadPoolExecutor
import threading


# Wall-clock reference point; the elapsed time is computed from it at the end of the cell.
start = timeit.default_timer()

# Pages to download concurrently.
urls = [
    'http://daum.net',
    'https://naver.com',
    'http://mlbpark.donga.com/',
    'https://tistory.com',
    'https://wemakeprice.com/',
]


def wrap_urlopen(url: str):
    """Open ``url`` with the blocking ``urlopen`` and log the executing thread.

    Prints a "Start" line before the request and a "Done" line after it, each
    tagged with the name of the thread that ran the call.

    Args:
        url: The URL to open.

    Returns:
        The response object returned by ``urlopen``. It is returned unread and
        open; the caller is responsible for reading and closing it.
    """
    # Thread.getName() is deprecated since Python 3.10; the ``name`` attribute
    # yields the same value without a DeprecationWarning.
    thread_name = threading.current_thread().name
    print('Thread Name :', thread_name, 'Start', url)
    res = urlopen(url)
    print('Thread Name :', thread_name, 'Done', url)
    return res

async def fetch(url: str, executor: ThreadPoolExecutor):
    """Download ``url`` on ``executor`` and return the first 10 bytes of its body.

    Both the request (``wrap_urlopen``) and the body read are blocking calls, so
    each is handed to the thread pool instead of running on the event-loop thread.

    Args:
        url: The URL to fetch.
        executor: Thread pool that runs the blocking calls.

    Returns:
        The first 10 bytes of the response body (fewer if the body is shorter).
    """
    # Use the loop that is actually running this coroutine instead of relying on
    # a module-level ``loop`` global being defined before the first call.
    loop = asyncio.get_running_loop()
    resp = await loop.run_in_executor(executor, wrap_urlopen, url)
    try:
        # read() performs blocking network I/O too; keep it off the event loop.
        body = await loop.run_in_executor(executor, resp.read)
    finally:
        # Release the connection even when reading fails.
        resp.close()
    return body[0:10]


async def main():
    """Fetch every URL in ``urls`` concurrently and print one result line per URL.

    A pool of up to 10 worker threads is created for the duration of the call
    and shut down before the results are printed.
    """
    # The context manager shuts the pool down on exit so its worker threads are
    # not left behind after each run of the cell.
    with ThreadPoolExecutor(max_workers=10) as executor:
        # gather() schedules the coroutines itself and returns the results in
        # the same order as ``urls``.
        results = await asyncio.gather(*(fetch(url, executor) for url in urls))

    for url, result in zip(urls, results):
        print(f"{url} -->  {result}")


# Keep ``loop`` defined at module level: other code in this cell may look it up
# as a global.
loop = asyncio.get_event_loop()
# Drive main() to completion; returns once every URL has been fetched and printed.
loop.run_until_complete(main())
# Elapsed wall-clock seconds since ``start`` was taken near the top of the cell.
duration = timeit.default_timer() - start
print('Total Running Time: {}'.format(duration))


Thread Name : ThreadPoolExecutor-10_0 Start http://daum.net
Thread Name : ThreadPoolExecutor-10_1 Start https://naver.com
Thread Name : ThreadPoolExecutor-10_2 Start http://mlbpark.donga.com/
Thread Name : ThreadPoolExecutor-10_3 Start https://tistory.com
Thread Name : ThreadPoolExecutor-10_4 Start https://wemakeprice.com/
Thread Name : ThreadPoolExecutor-10_0 Done http://daum.net
Thread Name : ThreadPoolExecutor-10_1 Done https://naver.com
Thread Name : ThreadPoolExecutor-10_2 Done http://mlbpark.donga.com/
Thread Name : ThreadPoolExecutor-10_3 Done https://tistory.com
Thread Name : ThreadPoolExecutor-10_4 Done https://wemakeprice.com/
http://daum.net -->  b'<!DOCTYPE '
https://naver.com -->  b'\n<!doctype'
http://mlbpark.donga.com/ -->  b'<!DOCTYPE '
https://tistory.com -->  b'\n\t<!doctyp'
https://wemakeprice.com/ -->  b'\x1f\x8b\x08\x00\x00\x00\x00\x00\x04\x03'
Total Running Time: 0.4606426999998803


In [17]:
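# Chapter07-02
# Asyncio
# Asynchronous I/O with coroutines
# Generator -> used to return a series of objects one at a time
# Non-blocking asynchronous processing

# Asyncio web-scraping exercise
# Adds Beautiful Soup for HTML parsing
# Combined with a scheduler, this can collect data periodically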

import asyncio
import timeit
from urllib.request import urlopen
from concurrent.futures import ThreadPoolExecutor
import threading
from bs4 import BeautifulSoup


# Wall-clock reference point; the elapsed time is computed from it at the end of the cell.
start = timeit.default_timer()

# Pages whose <title> will be collected concurrently.
urls = [
    'http://daum.net',
    'https://naver.com',
    'http://mlbpark.donga.com/',
    'https://tistory.com',
    'https://www.inflearn.com/',
]


def get_title_from_url(url: str):
    """Download ``url`` and return the ``title`` of the parsed page.

    This is a blocking function. It prints a "Start" line before the request
    and a "Done" line after parsing, each tagged with the executing thread's name.

    Args:
        url: The URL of the page to download.

    Returns:
        ``soup.title`` of the page parsed with BeautifulSoup's ``'html.parser'``.
    """
    # Thread.getName() is deprecated since Python 3.10; use the ``name`` attribute.
    thread_name = threading.current_thread().name
    print('Thread Name :', thread_name, 'Start', url)

    # Close the response as soon as the body has been read so the connection
    # is not leaked.
    with urlopen(url) as resp:
        html = resp.read()
    soup = BeautifulSoup(html, 'html.parser')

    # To inspect the full page source: print(soup.prettify())
    # BeautifulSoup selectors could be used here to extract other information;
    # this example only collects the page title.
    title = soup.title
    print('Thread Name :', thread_name, 'Done', url)
    return title

async def fetch(url: str, executor: ThreadPoolExecutor):
    """Run the blocking ``get_title_from_url(url)`` on ``executor`` and await it.

    Args:
        url: The URL whose page title should be fetched.
        executor: Thread pool that runs the blocking download and parse.

    Returns:
        Whatever ``get_title_from_url(url)`` returns.
    """
    # Use the loop that is actually running this coroutine instead of relying on
    # a module-level ``loop`` global being defined before the first call.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(executor, get_title_from_url, url)


async def main():
    """Collect the page title of every URL in ``urls`` concurrently and print them.

    A pool of up to 10 worker threads is created for the duration of the call
    and shut down before the results are printed.
    """
    # The context manager shuts the pool down on exit so its worker threads are
    # not left behind after each run of the cell.
    with ThreadPoolExecutor(max_workers=10) as executor:
        # gather() schedules the coroutines itself and returns the results in
        # the same order as ``urls``.
        results = await asyncio.gather(*(fetch(url, executor) for url in urls))

    for url, result in zip(urls, results):
        print(f"{url} -->  {result}")


# Keep ``loop`` defined at module level: other code in this cell may look it up
# as a global.
loop = asyncio.get_event_loop()
# Drive main() to completion; returns once every title has been fetched and printed.
loop.run_until_complete(main())
# Elapsed wall-clock seconds since ``start`` was taken near the top of the cell.
duration = timeit.default_timer() - start
print('Total Running Time: {}'.format(duration))


Thread Name : ThreadPoolExecutor-12_0 Start http://daum.net
Thread Name : ThreadPoolExecutor-12_1 Start https://naver.com
Thread Name : Thread Name : ThreadPoolExecutor-12_3 Start https://tistory.com
ThreadPoolExecutor-12_2 Thread Name : ThreadPoolExecutor-12_4 Start Start http://mlbpark.donga.com/
https://www.inflearn.com/
Thread Name : ThreadPoolExecutor-12_2 Done http://mlbpark.donga.com/
Thread Name : ThreadPoolExecutor-12_1 Done https://naver.com
Thread Name : ThreadPoolExecutor-12_0 Done http://daum.net
Thread Name : ThreadPoolExecutor-12_3 Done https://tistory.com
Thread Name : ThreadPoolExecutor-12_4 Done https://www.inflearn.com/
http://daum.net -->  <title>Daum</title>
https://naver.com -->  <title>NAVER</title>
http://mlbpark.donga.com/ -->  <title>↗ 파크에 오면 즐겁다 MLBPARK</title>
https://tistory.com -->  <title>TISTORY</title>
https://www.inflearn.com/ -->  <title>인프런 - 프로가 되는 온라인 클래스 | 온라인 강의 플랫폼</title>
Total Running Time: 0.7316460000001825
