import aiohttp
import asyncio

async def fetch(session, url):
    """Download ``url`` through ``session`` and return ``(body_text, status)``.

    ``session`` must expose ``get(url)`` returning an async context manager
    whose value offers an awaitable ``text()`` and a ``status`` attribute.
    """
    async with session.get(url) as resp:
        body = await resp.text()
        return body, resp.status

async def main():
    """Fetch one page with a short-lived client session and print a preview.

    Prints the first 100 characters of the body, then the status code.
    """
    target = 'https://cuiqingcai.com'
    async with aiohttp.ClientSession() as session:
        fetched = await fetch(session, target)
        html, status = fetched
        print(f'html: {html[:100]}...')
        print(f'status: {status}')

if __name__ == '__main__':
    # asyncio.run() creates a fresh event loop, runs main() to completion and
    # closes the loop afterwards. It replaces the get_event_loop() /
    # run_until_complete() pair, which is deprecated when no loop is running.
    asyncio.run(main())

In [3]:
import aiohttp
import asyncio
import nest_asyncio
# NOTE(review): nest_asyncio.apply() is expected to patch asyncio so that
# run_until_complete() below can be called while the notebook kernel's own
# event loop is already running — confirm against the nest_asyncio docs.
nest_asyncio.apply()
async def fetch(session, url):
    """Issue a GET for ``url`` and hand back ``(body_text, status_code)``."""
    async with session.get(url) as reply:
        text = await reply.text()
        code = reply.status
        return text, code
async def main():
    """Fetch https://cuiqingcai.com and print a 100-character preview and the status."""
    async with aiohttp.ClientSession() as client:
        page = await fetch(client, 'https://cuiqingcai.com')
        html, status = page
        print(f'html: {html[:100]}...')
        print(f'status: {status}')
# Entry point: run main() to completion on the current event loop.
if __name__ == '__main__':
    # `loop` stays bound at module level after this cell has run.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())

html: <!DOCTYPE html>
<html lang="zh-CN">

<head>
  <meta charset="UTF-8">
  <meta name="viewport" content...
status: 200


In [None]:
import aiohttp
import asyncio
import nest_asyncio
# NOTE(review): nest_asyncio.apply() is expected to patch asyncio so that
# run_until_complete() below can be called while an event loop is already
# running (as in a notebook kernel) — confirm against the nest_asyncio docs.
nest_asyncio.apply()
async def main():
    """GET https://httpbin.org/get under a ``ClientTimeout(total=1)`` and print the status."""
    limit = aiohttp.ClientTimeout(total=1)
    # One flat ``async with`` acquires the session first, then the response,
    # and releases them in reverse order — same as two nested statements.
    async with aiohttp.ClientSession(timeout=limit) as session, \
            session.get('https://httpbin.org/get') as response:
        print('status:', response.status)
# Entry point: run main() to completion on the current event loop.
if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(main())


import aiohttp
import asyncio
async def main():
    """POST a small dict via ``data=`` to httpbin and print the response body."""
    payload = {'name': 'germey', 'age': 25}
    async with aiohttp.ClientSession() as session:
        async with session.post('http://httpbin.org/post', data=payload) as resp:
            body = await resp.text()
            print(body)

# Entry point. The module name of a directly executed script is '__main__'
# (double underscores); the previous '_main_' comparison was never true, so
# main() was silently skipped.
if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(main())

In [2]:
import asyncio
import aiohttp
import logging

# Root logger: INFO and above, formatted as "<time> - <LEVEL>: <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
)

# URL templates for the listing pages and the per-item pages.
# NOTE(review): both templates have the same "<host>/<value>" shape, so an
# offset and an id resolve to the same kind of URL — confirm against the
# site's actual API paths.
INDEX_URL = 'https://spa16.scrape.center/{offset}'
DETAIL_URL = 'https://spa16.scrape.center/{id}'

PAGE_SIZE = 18      # offset step between consecutive index pages
PAGE_NUMBER = 100   # number of index pages requested
CONCURRENCY = 5     # initial value of the request semaphore

In [3]:
# Caps the number of requests in flight at CONCURRENCY.
semaphore = asyncio.Semaphore(CONCURRENCY)
# Shared aiohttp.ClientSession; main() assigns it before any request is made.
session = None


async def scrape_api(url):
    """Fetch ``url`` with the shared session and return its decoded JSON body.

    The request runs while holding ``semaphore``, so at most CONCURRENCY
    requests are active at once.

    Args:
        url: Absolute URL to request with GET.

    Returns:
        The parsed JSON document, or ``None`` when the request fails or
        times out (the failure is logged together with its traceback).
    """
    async with semaphore:
        try:
            logging.info('scraping %s', url)
            async with session.get(url) as response:
                return await response.json()
        # A timeout can surface as a plain asyncio.TimeoutError, which is not
        # an aiohttp.ClientError; without listing it here a single slow page
        # propagates out and aborts the caller's whole gather().
        except (aiohttp.ClientError, asyncio.TimeoutError):
            logging.error('error occurred while scraping %s', url, exc_info=True)
            return None


In [4]:
async def scrape_index(page):
    """Scrape the index listing for 1-based ``page`` and return scrape_api's result."""
    # Page 1 starts at offset 0; each later page advances by PAGE_SIZE.
    offset = (page - 1) * PAGE_SIZE
    index_url = INDEX_URL.format(offset=offset)
    return await scrape_api(index_url)

In [5]:
import json
async def main():
    """Open the shared session, scrape every index page and return the results.

    Returns:
        list: One entry per page, in page order (1..PAGE_NUMBER). Each entry
        is whatever ``scrape_index`` returned for that page.
    """
    global session
    # Left open on purpose: the detail-scraping step reuses this session and
    # closes it once it is done.
    session = aiohttp.ClientSession()
    pages = range(1, PAGE_NUMBER + 1)
    # gather() schedules the coroutines itself and keeps results in input order.
    results = await asyncio.gather(*(scrape_index(page) for page in pages))
    logging.info('results %s', json.dumps(results, ensure_ascii=False, indent=2))
    # Returned so the caller can use the pages; previously the list was only
    # logged and then discarded with the function's local scope.
    return results


if __name__ == '__main__':
    # Bind the pages at module level: the id-collection step reads ``results``.
    results = asyncio.get_event_loop().run_until_complete(main())

2024-01-19 17:24:09,478 - INFO: scraping https://spa16.scrape.center/0
2024-01-19 17:24:09,480 - INFO: scraping https://spa16.scrape.center/18
2024-01-19 17:24:09,481 - INFO: scraping https://spa16.scrape.center/36
2024-01-19 17:24:09,483 - INFO: scraping https://spa16.scrape.center/54
2024-01-19 17:24:09,484 - INFO: scraping https://spa16.scrape.center/72
2024-01-19 17:24:10,343 - ERROR: error occurred while scraping https://spa16.scrape.center/0
Traceback (most recent call last):
  File "C:\Users\59427\AppData\Local\Temp\ipykernel_12228\3986696478.py", line 7, in scrape_api
    async with session.get(url) as response:
  File "d:\anaconda3\Lib\site-packages\aiohttp\client.py", line 1141, in __aenter__
    self._resp = await self._coro
                 ^^^^^^^^^^^^^^^^
  File "d:\anaconda3\Lib\site-packages\aiohttp\client.py", line 560, in _request
    await resp.start(conn)
  File "d:\anaconda3\Lib\site-packages\aiohttp\client_reqrep.py", line 899, in start
    message, payload = awai

TimeoutError: 

In [None]:
# Collect the id of every item across all scraped index pages.
ids = []
for index_data in results:
    # A page whose request failed yields a falsy entry — skip it.
    if not index_data:
        continue
    # This loop must sit inside the page loop: when it was dedented to the
    # top level it ran once, after the page loop, on the last page only.
    # ``or []`` tolerates a page that has no 'results' key.
    for item in index_data.get('results') or []:
        ids.append(item.get('id'))

In [6]:
from motor.motor_asyncio import AsyncIOMotorClient
# MongoDB settings: server URI, database name and collection name.
MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
MONGO_DB_NAME = 'books'
MONGO_COLLECTION_NAME = 'books'
# Module-level handles; save_data() writes through ``collection``.
client = AsyncIOMotorClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DB_NAME]
collection = db[MONGO_COLLECTION_NAME]
async def save_data(data):
    """Upsert ``data`` into ``collection``, matched on its ``id`` field.

    The payload is always logged first. Falsy ``data`` is then skipped and the
    coroutine returns ``None``; otherwise the awaited result of
    ``collection.update_one`` is returned.
    """
    logging.info('saving data %s', data)
    if not data:
        return None
    selector = {'id': data.get('id')}
    change = {'$set': data}
    return await collection.update_one(selector, change, upsert=True)

async def scrape_detail(id):
    """Fetch the detail document for ``id`` and pass it to ``save_data``."""
    detail_url = DETAIL_URL.format(id=id)
    detail = await scrape_api(detail_url)
    await save_data(detail)

In [7]:
scrape_detail_tasks = [asyncio.ensure_future(scrape_detail(id)) for id in ids]
await asyncio.wait(scrape_detail_tasks)
await session.close()

NameError: name 'ids' is not defined