In [None]:
import asyncio
import time
from urllib.parse import urljoin

import aiofiles
import aiohttp
from bs4 import BeautifulSoup

In [None]:
# nest_asyncio.apply() patches asyncio so that run_until_complete() may be
# called re-entrantly (per the nest_asyncio README). The final cell drives the
# scraper with loop.run_until_complete(); presumably this patch is needed
# because the notebook kernel already has an event loop running -- confirm
# before removing.
import nest_asyncio
nest_asyncio.apply()

In [None]:
async def scrape_page(session, page, base_url, headers):
    """Download one catalogue page and pass its HTML to ``parse_and_save``.

    Args:
        session: Object whose ``get(url, headers=...)`` returns an async
            context manager yielding a response with ``status`` and an
            awaitable ``text()``.
        page: Page number; the request goes to
            ``{base_url}catalogue/page-{page}.html``.
        base_url: Prefix the page URL is built from; also forwarded to
            ``parse_and_save``.
        headers: Headers sent with the request.

    Returns:
        Whatever ``parse_and_save`` returns when the response status is 200.
        ``False`` when the status is anything else or when any exception is
        raised while fetching (the error is printed, not propagated).
    """
    page_url = f"{base_url}catalogue/page-{page}.html"
    try:
        async with session.get(page_url, headers=headers) as resp:
            status = resp.status
            if status == 200:
                print(f"Page {page}: Scraping (Status {status})")
                html = await resp.text()
                return await parse_and_save(html, page, base_url)

            # Any non-200 answer is reported as a failure to the caller.
            print(f"Page {page}: Status {status} - Stopping")
            return False
    except Exception as err:
        print(f"Page {page}: Error - {err}")
        return False

In [None]:
async def parse_and_save(html, page, base_url):
    """Extract the books listed on one catalogue page and save them as Markdown.

    Titles are read from the ``title`` attribute of ``li > article > h3 > a``
    links, prices from the text of ``p`` elements matched with
    ``"price_color"`` and cover images from the ``src`` of ``img`` elements
    matched with ``"thumbnail"``. One Markdown entry per book is written to
    ``Page{page}.md`` in the current working directory (the file is
    overwritten if it exists).

    Args:
        html: Raw HTML of the catalogue page.
        page: Page number, used for the output file name and log messages.
        base_url: URL that image ``src`` values are resolved against.

    Returns:
        ``True`` when the file was written, ``False`` when parsing or writing
        raised (the error is printed, not propagated).
    """
    try:
        soup = BeautifulSoup(html, "html.parser")
        title_links = soup.select("li > article > h3 > a")
        price_tags = soup.find_all("p", "price_color")
        thumbnails = soup.find_all("img", "thumbnail")

        book_names = [link.get("title") for link in title_links]
        book_prices = [tag.get_text() for tag in price_tags]
        # urljoin resolves relative paths such as "../media/x.jpg" into a
        # normalised absolute URL instead of leaving ".." segments in it, and
        # leaves already-absolute src values untouched.
        image_urls = [urljoin(base_url, img.get("src", "")) for img in thumbnails]

        if len({len(book_names), len(book_prices), len(image_urls)}) > 1:
            print(f"Page {page}: Warning - mismatched element counts, extra items skipped")

        # zip() pairs the three lists safely; an index loop would raise
        # IndexError half-way through writing if one list were shorter.
        entries = [
            (
                f'\n<font size="7">{name}</font> *<font size="3">Price: {price}</font>*\n'
                f"\n![{name}]({image_url})\n"
                "\n---\n"
            )
            for name, price, image_url in zip(book_names, book_prices, image_urls)
        ]

        async with aiofiles.open(f"Page{page}.md", "w", encoding="utf-8") as f:
            await f.write("".join(entries))

        print(f"Page {page} has been Saved.")
        return True

    except Exception as e:
        print(f"Page {page}: Parsing error - {str(e)}")
        return False


In [None]:
async def main(base_url, headers="", start_page=1, max_concurrent=10):
    """Scrape consecutive catalogue pages concurrently until one of them fails.

    Pages are requested in increasing order starting at ``start_page``, with
    at most ``max_concurrent`` ``scrape_page`` tasks in flight. As soon as any
    page reports failure (``scrape_page`` returned a falsy value or raised),
    no further pages are started. Tasks already in flight at that moment are
    still awaited so their results count and they finish before the HTTP
    session is closed.

    Args:
        base_url: Prefix forwarded to ``scrape_page`` for building page URLs.
        headers: Headers given to the session and to every request.
        start_page: Number of the first page to fetch.
        max_concurrent: Upper bound on simultaneously running page tasks; also
            used as the connector's connection limit.

    Returns:
        None. Progress and the highest successfully scraped page number are
        printed.
    """
    connector = aiohttp.TCPConnector(limit=max_concurrent)
    timeout = aiohttp.ClientTimeout(total=30)

    async with aiohttp.ClientSession(
        connector=connector,
        timeout=timeout,
        headers=headers,
    ) as session:
        # Maps each running task to the page it is scraping. Tasks finish out
        # of order, so the page number cannot be derived from counters.
        in_flight = {}
        next_page = start_page
        last_valid_page = start_page - 1
        keep_launching = True

        while True:
            # Top the pool up to the concurrency limit while no page has failed.
            while keep_launching and len(in_flight) < max_concurrent:
                task = asyncio.create_task(
                    scrape_page(session, next_page, base_url, headers)
                )
                in_flight[task] = next_page
                next_page += 1

            # asyncio.wait() rejects an empty collection; nothing left to do.
            if not in_flight:
                break

            done, _ = await asyncio.wait(
                list(in_flight),
                return_when=asyncio.FIRST_COMPLETED,
            )

            for task in done:
                finished_page = in_flight.pop(task)
                try:
                    success = task.result()
                except Exception as e:
                    print(f"Task error: {str(e)}")
                    success = False

                if success:
                    last_valid_page = max(last_valid_page, finished_page)
                elif keep_launching:
                    # First failure: stop starting new pages, but keep
                    # looping so the remaining in-flight tasks are drained.
                    keep_launching = False
                    print(f"Stopping at page {finished_page} due to error")

        print(f"Last valid page: {last_valid_page}")


In [None]:
# Site root. It must end with "/" because page URLs are built by appending
# "catalogue/page-N.html" directly to it.
base_url = "http://books.toscrape.com/"
# Browser-like User-Agent sent with the requests.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}


print("Starting asynchronous scraping...")
# perf_counter() is a monotonic clock meant for measuring intervals; a
# time.time() difference can be skewed by system clock adjustments.
start_time = time.perf_counter()


# Drives the coroutine to completion from synchronous cell code. This relies
# on nest_asyncio.apply() having been run in an earlier cell -- presumably
# because the notebook kernel already has an event loop running.
loop = asyncio.get_event_loop()
loop.run_until_complete(main(
    base_url=base_url,
    headers=headers,
    start_page=1,
    max_concurrent=5,
))

duration = time.perf_counter() - start_time
print(f"Scraping completed in {duration:.2f} seconds")