In [None]:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import logging

from urllib.parse import urljoin

# Emit timestamp, level and message for every record at INFO level and above.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

# Template for index (listing) pages; `{page}` is filled in by scrape_index().
INDEX_URL = 'https://dynamic2.scrape.center/page/{page}'
# Timeout passed to WebDriverWait below.
TIME_OUT = 20
# main() visits pages 1..TOTAL_PAGE inclusive.
TOTAL_PAGE = 10

# One Chrome session and its explicit-wait helper, shared by every function in this cell.
# NOTE: creating the driver here launches a browser as soon as the cell runs.
browser = webdriver.Chrome()
wait = WebDriverWait(browser, TIME_OUT)

def scrape_page(url, condition, locator):
	"""Open `url` in the shared browser and wait until `condition(locator)` is met.

	Args:
		url: address to load.
		condition: expected-condition factory (called with `locator`).
		locator: locator tuple handed to `condition`.

	A TimeoutException is logged with its traceback and swallowed, so the
	function returns None whether or not the wait succeeded.
	"""
	logging.info('scraping %s', url)
	try:
		browser.get(url)
		ready = condition(locator)
		wait.until(ready)
	except TimeoutException:
		logging.error('error occurred while scraping %s', url, exc_info=True)
		
def scrape_index(page):
	"""Load index page number `page` and wait for all `.name` elements to be visible."""
	name_locator = (By.CSS_SELECTOR, '.name')
	scrape_page(
		INDEX_URL.format(page=page),
		condition=EC.visibility_of_all_elements_located,
		locator=name_locator,
	)
	
def parse_index():
	"""Yield the absolute detail-page URL of each item on the currently loaded index page.

	Reads the `href` attribute of every element matching `#index .item .name`
	in the shared browser and resolves it against INDEX_URL. Elements that
	carry no href are skipped.

	Yields:
		str: absolute URL of a detail page.
	"""
	# find_elements(By.CSS_SELECTOR, ...) is the locator API available in both
	# Selenium 3 and 4; the find_elements_by_* helpers were removed in Selenium 4.
	elements = browser.find_elements(By.CSS_SELECTOR, '#index .item .name')
	for element in elements:
		href = element.get_attribute('href')
		if href:
			yield urljoin(INDEX_URL, href)
		
def main():
	"""Visit every index page, log the detail URLs found on it, then shut the browser down.

	The browser is released in a `finally` block so it is cleaned up even when
	a page fails with an unexpected exception.
	"""
	try:
		for page in range(1, TOTAL_PAGE + 1):
			scrape_index(page)
			# Materialise the generator once so the logged value is a plain list.
			detail_urls = list(parse_index())
			logging.info('details urls %s', detail_urls)
	finally:
		# quit() ends the whole WebDriver session (all windows plus the driver
		# process); close() would only close the current window.
		browser.quit()
# Kick off the crawl as soon as this cell executes.
main()

2021-06-19 12:15:32,126 - INFO: scraping https://dynamic2.scrape.center/page/1
2021-06-19 12:15:34,780 - INFO: details urls ['https://dynamic2.scrape.center/detail/ZWYzNCN0ZXVxMGJ0dWEjKC01N3cxcTVvNS0takA5OHh5Z2ltbHlmeHMqLSFpLTAtbWIx', 'https://dynamic2.scrape.center/detail/ZWYzNCN0ZXVxMGJ0dWEjKC01N3cxcTVvNS0takA5OHh5Z2ltbHlmeHMqLSFpLTAtbWIy', 'https://dynamic2.scrape.center/detail/ZWYzNCN0ZXVxMGJ0dWEjKC01N3cxcTVvNS0takA5OHh5Z2ltbHlmeHMqLSFpLTAtbWIz', 'https://dynamic2.scrape.center/detail/ZWYzNCN0ZXVxMGJ0dWEjKC01N3cxcTVvNS0takA5OHh5Z2ltbHlmeHMqLSFpLTAtbWI0', 'https://dynamic2.scrape.center/detail/ZWYzNCN0ZXVxMGJ0dWEjKC01N3cxcTVvNS0takA5OHh5Z2ltbHlmeHMqLSFpLTAtbWI1', 'https://dynamic2.scrape.center/detail/ZWYzNCN0ZXVxMGJ0dWEjKC01N3cxcTVvNS0takA5OHh5Z2ltbHlmeHMqLSFpLTAtbWI2', 'https://dynamic2.scrape.center/detail/ZWYzNCN0ZXVxMGJ0dWEjKC01N3cxcTVvNS0takA5OHh5Z2ltbHlmeHMqLSFpLTAtbWI3', 'https://dynamic2.scrape.center/detail/ZWYzNCN0ZXVxMGJ0dWEjKC01N3cxcTVvNS0takA5OHh5Z2ltbHlmeHMqLSFpLTAtbW

In [None]:
import asyncio
import aiohttp
import logging
import json
from motor.motor_asyncio import AsyncIOMotorClient

In [None]:
import asyncio
import aiohttp
import logging
import json
from motor.motor_asyncio import AsyncIOMotorClient

# Emit timestamp, level and message for every record at INFO level and above.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# List endpoint; `limit=18` is hard-coded here and must stay in sync with PAGE_SIZE.
INDEX_URL = 'https://dynamic5.scrape.center/api/book/?limit=18&offset={offset}'
# Detail endpoint; `{id}` is filled in by scrape_detail().
DETAIL_URL = 'https://dynamic5.scrape.center/api/book/{id}'
# Items per index page; used to compute the `offset` query parameter.
PAGE_SIZE = 18
# main() requests index pages 1..PAGE_NUMBER inclusive.
PAGE_NUMBER = 100
# Upper bound on simultaneous requests, enforced by `semaphore` in scrape_api().
CONCURRENCY = 5

semaphore = asyncio.Semaphore(CONCURRENCY)
# Shared aiohttp session; assigned inside main() and None until then.
session = None

async def scrape_api(url):
    """GET `url` through the shared aiohttp session and return the decoded JSON body.

    The module-level semaphore limits how many requests are in flight at once.

    Args:
        url: API address to request.

    Returns:
        The parsed JSON payload, or None when the request failed or timed out
        (the failure is logged with its traceback).
    """
    async with semaphore:
        try:
            logging.info('scraping %s', url)
            async with session.get(url) as response:
                return await response.json()
        # A total-timeout surfaces as asyncio.TimeoutError, which is not an
        # aiohttp.ClientError; catch both so one slow request is logged rather
        # than aborting every task gathered alongside it.
        except (aiohttp.ClientError, asyncio.TimeoutError):
            logging.error('error occurred while scraping %s', url, exc_info=True)
            return None
            
async def scrape_index(page):
    """Fetch the index API page numbered `page` (1-based) and return scrape_api()'s result."""
    offset = PAGE_SIZE * (page - 1)
    index_json = await scrape_api(INDEX_URL.format(offset=offset))
    return index_json

# MongoDB target for scraped records; the address points at a local server.
MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
MONGO_DB_NAME = 'books'
MONGO_COLLECTION_NAME = 'books'
# Async (motor) client plus database/collection handles used by save_data().
client = AsyncIOMotorClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DB_NAME]
collection = db[MONGO_COLLECTION_NAME]

async def save_data(data):
    """Upsert one scraped record into the MongoDB collection, keyed by its `id` field.

    Args:
        data: record dict; a falsy value (None, {}) is logged and ignored.

    Returns:
        The result of `collection.update_one`, or None when there was nothing to save.
    """
    logging.info('saving data %s', data)
    if not data:
        return None
    # The filter key must be 'id'. The earlier 'id)' typo made the filter
    # {'id': None}, which never matched a stored record, so every run inserted
    # a fresh document instead of updating the existing one.
    return await collection.update_one({'id': data.get('id')}, {'$set': data}, upsert=True)
    
async def scrape_detail(id):
    """Fetch the detail record for `id` from the API and pass it to save_data()."""
    detail_json = await scrape_api(DETAIL_URL.format(id=id))
    await save_data(detail_json)

async def main():
    """Crawl every index page, then fetch and store the detail record of each listed id.

    Index pages 1..PAGE_NUMBER are requested concurrently. Ids are collected
    from their `results` lists, then one scrape_detail() task is run per id.
    The shared aiohttp session is always closed, even when a step raises.
    """
    global session
    session = aiohttp.ClientSession()
    try:
        index_tasks = [asyncio.ensure_future(scrape_index(page)) for page in range(1, PAGE_NUMBER + 1)]
        results = await asyncio.gather(*index_tasks)

        # Failed index requests yield None, and a payload may lack 'results';
        # both are skipped rather than crashing the crawl.
        ids = [
            item.get('id')
            for index_data in results if index_data
            for item in (index_data.get('results') or [])
        ]

        detail_tasks = [asyncio.ensure_future(scrape_detail(id)) for id in ids]
        # asyncio.wait() raises ValueError when given an empty collection.
        if detail_tasks:
            await asyncio.wait(detail_tasks)
    finally:
        await session.close()
            
if __name__ == '__main__':
    # NOTE(review): inside a running Jupyter kernel the event loop may already be
    # running, in which case run_until_complete() fails; `await main()` in a cell
    # is the usual alternative — confirm for the target environment.
    asyncio.get_event_loop().run_until_complete(main())

In [None]:
import pyppeteer.chromium_downloader
# Show pyppeteer.__chromium_revision__ (the label reads "default version is").
print('默认版本是：{}'.format(pyppeteer.__chromium_revision__))
# Show pyppeteer.__pyppeteer_home__ (the label reads "install path is").
print('安装路径是：{}'.format(pyppeteer.__pyppeteer_home__))


In [None]:
import asyncio
from pyppeteer import launch
from pyquery import PyQuery as pq

async def main():
    """Render the demo site in a browser and print the text of every `.item .name` element.

    The page HTML is handed to PyQuery after the selector has appeared. The
    browser is closed in a `finally` block so a failed navigation or wait does
    not leave a browser process behind.
    """
    browser = await launch()
    try:
        page = await browser.newPage()
        await page.goto('https://dynamic2.scrape.center/')
        await page.waitForSelector('.item .name')
        doc = pq(await page.content())
        names = [item.text() for item in doc('.item .name').items()]
        print('Names:', names)
    finally:
        await browser.close()
# Drive main() to completion on the current event loop.
# NOTE(review): may fail inside a kernel whose loop is already running — confirm.
asyncio.get_event_loop().run_until_complete(main())

In [None]:
import asyncio
from pyppeteer import launch

# Requested browser window size in pixels.
width, height = 1366, 768

async def main():
    """Open taobao.com in a visible browser backed by a persistent profile, then close it.

    The profile lives in `./userdata`. The browser stays open for 100 seconds
    and is closed in a `finally` block even if navigation fails.
    """
    browser = await launch(
        headless=False,
        userDataDir='./userdata',
        # The flag is spelled `--window-size`; the earlier `--windows-size`
        # spelling does not match it, so the requested size never applied.
        args=['--disable-infobars', f'--window-size={width},{height}'],
    )
    try:
        page = await browser.newPage()
        await page.goto('https://www.taobao.com')
        await asyncio.sleep(100)
    finally:
        await browser.close()
                                                                           
# Drive main() to completion on the current event loop.
# NOTE(review): may fail inside a kernel whose loop is already running — confirm.
asyncio.get_event_loop().run_until_complete(main())

In [None]:
import asyncio
from pyppeteer import launch

# Browser window and viewport size in pixels.
width, height = 1366, 768

async def main():
    """Open baidu.com in an incognito context of a visible browser, then close it.

    The viewport is set to `width` x `height`. The browser stays open for 30
    seconds and is closed in a `finally` block even if an earlier step raises.
    """
    browser = await launch(
        headless=False,
        userDataDir='./userdata',
        args=['--disable-infobars', f'--window-size={width},{height}'],
    )
    try:
        context = await browser.createIncognitoBrowserContext()
        page = await context.newPage()
        await page.setViewport({'width': width, 'height': height})
        await page.goto('https://www.baidu.com')
        await asyncio.sleep(30)
    finally:
        await browser.close()
                                                                           
# Drive main() to completion on the current event loop.
# NOTE(review): may fail inside a kernel whose loop is already running — confirm.
asyncio.get_event_loop().run_until_complete(main())

In [None]:
import asyncio
from pyppeteer import launch
from pyquery import PyQuery as pq

# Browser window and viewport size in pixels.
width, height = 1366, 768

async def main():
    """Show that `J`/`querySelector` and `JJ`/`querySelectorAll` return matching results.

    Loads the demo site, waits for `.item .name`, runs the four selector
    calls and prints each result. The browser is closed in a `finally` block.
    """
    browser = await launch(headless=False, args=['--disable-infobars', f'--window-size={width},{height}'])
    try:
        page = await browser.newPage()
        await page.setViewport({'width': width, 'height': height})

        await page.goto('https://dynamic2.scrape.center/')
        await page.waitForSelector('.item .name')

        j_result1 = await page.J('.item .name')
        j_result2 = await page.querySelector('.item .name')
        jj_result1 = await page.JJ('.item .name')
        # The method is `querySelectorAll`; `querySelectorALL` does not exist
        # and raised AttributeError.
        jj_result2 = await page.querySelectorAll('.item .name')

        print('J Result1:', j_result1)
        print('J Result2:', j_result2)
        print('JJ Result1:', jj_result1)
        print('JJ Result2:', jj_result2)
    finally:
        await browser.close()
    
# Drive main() to completion on the current event loop.
# NOTE(review): may fail inside a kernel whose loop is already running — confirm.
asyncio.get_event_loop().run_until_complete(main())

In [None]:
import logging
import asyncio
from pyppeteer import launch
from pyppeteer.errors import TimeoutError


# Emit timestamp, level and message for every record at INFO level and above.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

# Template for index (listing) pages; `{page}` is filled in by scrape_index().
INDEX_URL = 'https://dynamic2.scrape.center/page/{page}'
# Selector wait timeout in seconds; scrape_page() multiplies it by 1000 before use.
TIME_OUT = 10
# main() visits pages 1..TOTAL_PAGE inclusive.
TOTAL_PAGE = 10
# Used for both the --window-size launch flag and the tab viewport in init().
WINDOW_WIDTH, WINDOW_HEIGHT = 1366, 768
# Passed to launch(headless=...) in init().
HEADLESS = False

# Shared browser and page handles; populated by init().
browser, tab = None, None

async def init():
    """Launch the shared browser, open one tab and size its viewport.

    Stores the results in the module-level `browser` and `tab` names.
    """
    global browser, tab
    launch_args = ['--disable-infobars', f'--window-size={WINDOW_WIDTH},{WINDOW_HEIGHT}']
    browser = await launch(headless=HEADLESS, args=launch_args)
    tab = await browser.newPage()
    viewport = {'width': WINDOW_WIDTH, 'height': WINDOW_HEIGHT}
    await tab.setViewport(viewport)

async def scrape_page(url, selector):
    """Navigate the shared tab to `url` and wait for `selector` to appear.

    The wait is capped at TIME_OUT seconds. A TimeoutError (the name imported
    from pyppeteer.errors) is logged with its traceback and swallowed.
    """
    logging.info('Scraping %s', url)
    try:
        await tab.goto(url)
        wait_options = {'timeout': TIME_OUT * 1000}
        await tab.waitForSelector(selector, options=wait_options)
    except TimeoutError:
        logging.error('error occurred while scraping %s', url, exc_info=True)
            
async def scrape_index(page):
    """Load index page number `page` and wait for its `.item .name` elements."""
    await scrape_page(INDEX_URL.format(page=page), '.item .name')
    
async def parse_index():
    """Return the `href` of every `.item .name` element on the current page."""
    collect_hrefs = 'nodes => nodes.map(node => node.href)'
    hrefs = await tab.querySelectorAllEval('.item .name', collect_hrefs)
    return hrefs

async def scrape_detail(url):
    """Load the detail page at `url` and wait for its `h2` heading."""
    heading_selector = 'h2'
    await scrape_page(url, heading_selector)
    
async def parse_detail():
    """Extract the record shown on the detail page currently loaded in the shared tab.

    Returns:
        dict with keys 'url', 'name', 'categories', 'cover', 'score', 'drama'.
    """
    # The arrow function's parameter must be the name used in its body. The
    # previous `nodes => node.…` form referenced an undefined `node` and
    # raised a ReferenceError inside the page.
    inner_text = 'node => node.innerText'
    url = tab.url
    name = await tab.querySelectorEval('h2', inner_text)
    categories = await tab.querySelectorAllEval(
        '.categories button span', 'nodes => nodes.map(node => node.innerText)')
    cover = await tab.querySelectorEval('.cover', 'node => node.src')
    score = await tab.querySelectorEval('.score', inner_text)
    drama = await tab.querySelectorEval('.drama p', inner_text)
    return {
        'url': url,
        'name': name,
        'categories': categories,
        'cover': cover,
        'score': score,
        'drama': drama
    }

async def main():
    """Crawl every index page, visit each linked detail page and log its parsed record.

    The browser opened by init() is closed in a `finally` block.
    """
    await init()
    try:
        for page in range(1, TOTAL_PAGE + 1):
            await scrape_index(page)
            detail_urls = await parse_index()
            for detail_url in detail_urls:
                await scrape_detail(detail_url)
                # parse_detail() is a coroutine function; without `await` the
                # log showed a coroutine object and the page was never parsed.
                detail_data = await parse_detail()
                logging.info('data is : %s', detail_data)
    finally:
        await browser.close()

if __name__ == '__main__':
    # NOTE(review): inside a running Jupyter kernel the event loop may already be
    # running, in which case run_until_complete() fails; `await main()` in a cell
    # is the usual alternative — confirm for the target environment.
    asyncio.get_event_loop().run_until_complete(main())

In [None]:
import logging
import asyncio
from pyppeteer import launch
from pyppeteer.errors import TimeoutError
import json
from os import makedirs
from os.path import exists

# Emit timestamp, level and message for every record at INFO level and above.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

# Template for index (listing) pages; `{page}` is filled in by scrape_index().
INDEX_URL = 'https://dynamic2.scrape.center/page/{page}'
# Selector wait timeout in seconds; scrape_page() multiplies it by 1000 before use.
TIME_OUT = 10
# main() visits pages 1..TOTAL_PAGE inclusive.
TOTAL_PAGE = 10
# Used for both the --window-size launch flag and the tab viewport in init().
WINDOW_WIDTH, WINDOW_HEIGHT = 1366, 768
# Passed to launch(headless=...) in init().
HEADLESS = False

# Shared browser and page handles; populated by init().
browser, tab = None, None

async def init():
    """Start the shared browser and prepare a single tab with a fixed viewport.

    Populates the module-level `browser` and `tab` names.
    """
    global browser, tab
    size_flag = f'--window-size={WINDOW_WIDTH},{WINDOW_HEIGHT}'
    browser = await launch(headless=HEADLESS, args=['--disable-infobars', size_flag])
    tab = await browser.newPage()
    await tab.setViewport(dict(width=WINDOW_WIDTH, height=WINDOW_HEIGHT))

async def scrape_page(url, selector):
    """Point the shared tab at `url`, then wait for `selector` to show up.

    The wait lasts at most TIME_OUT seconds. A TimeoutError (the name imported
    from pyppeteer.errors) is logged with its traceback and not re-raised.
    """
    logging.info('Scraping %s', url)
    try:
        await tab.goto(url)
        timeout_ms = TIME_OUT * 1000
        await tab.waitForSelector(selector, options={'timeout': timeout_ms})
    except TimeoutError:
        logging.error('error occurred while scraping %s', url, exc_info=True)
            
async def scrape_index(page):
    """Open the index page with number `page` and wait for `.item .name` to appear."""
    item_name_selector = '.item .name'
    await scrape_page(INDEX_URL.format(page=page), item_name_selector)
    
async def parse_index():
    """Collect the `href` of each `.item .name` element on the page in the shared tab."""
    selector = '.item .name'
    page_function = 'nodes => nodes.map(node => node.href)'
    return await tab.querySelectorAllEval(selector, page_function)

async def scrape_detail(url):
    """Open the detail page at `url` and wait until an `h2` element is present."""
    await scrape_page(url, selector='h2')
    
async def parse_detail():
    """Read the record fields from the detail page currently loaded in the shared tab.

    Returns:
        dict with keys 'url', 'name', 'categories', 'cover', 'score', 'drama'.
    """
    # Each page function must use the parameter it declares. The former
    # `nodes => node.…` snippets referenced an undefined `node` and raised a
    # ReferenceError when evaluated in the page.
    text_of = 'node => node.innerText'
    texts_of = 'nodes => nodes.map(node => node.innerText)'
    src_of = 'node => node.src'

    url = tab.url
    name = await tab.querySelectorEval('h2', text_of)
    categories = await tab.querySelectorAllEval('.categories button span', texts_of)
    cover = await tab.querySelectorEval('.cover', src_of)
    score = await tab.querySelectorEval('.score', text_of)
    drama = await tab.querySelectorEval('.drama p', text_of)
    return {
        'url': url,
        'name': name,
        'categories': categories,
        'cover': cover,
        'score': score,
        'drama': drama
    }

# Directory that receives one JSON file per scraped record (see save_data).
RESULTS_DIR = 'pyppeteer_results'
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create
# race in `exists(...) or makedirs(...)`.
makedirs(RESULTS_DIR, exist_ok=True)
async def save_data(data):
    """Write one record to `<RESULTS_DIR>/<name>.json` as UTF-8 JSON.

    Args:
        data: record dict; its 'name' value becomes the file name.
    """
    # The original read `date.get(...)`, which raised NameError on every call.
    name = data.get('name')
    data_path = f'{RESULTS_DIR}/{name}.json'
    # The context manager guarantees the handle is flushed and closed.
    with open(data_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=2)

async def main():
    """Crawl all index pages; parse, log and save the record of every linked detail page.

    The browser started by init() is closed in the `finally` clause.
    """
    await init()
    try:
        page_numbers = range(1, TOTAL_PAGE + 1)
        for page_number in page_numbers:
            await scrape_index(page_number)
            for detail_url in await parse_index():
                await scrape_detail(detail_url)
                record = await parse_detail()
                logging.info('data is : %s', record)
                await save_data(record)
    finally:
        await browser.close()

if __name__ == '__main__':
    # NOTE(review): inside a running Jupyter kernel the event loop may already be
    # running, in which case run_until_complete() fails; `await main()` in a cell
    # is the usual alternative — confirm for the target environment.
    asyncio.get_event_loop().run_until_complete(main())