In [21]:
import asyncio
import os
import re
from urllib.parse import urljoin

import aiofiles
import bs4
import httpx
import requests

In [2]:
# Name of the Fandom wiki to scrape; it is used as the host's subdomain.
FANDOM = "sled"
# Scheme + host of the wiki, without a trailing slash.
WIKI_URL = "https://" + FANDOM + ".fandom.com"
# Path prefix used both to recognise article links and to build the landing URL.
URL_TAIL = "/ru/wiki/"
# Landing URL of the wiki (host followed by the article path prefix).
URL = WIKI_URL + URL_TAIL
URL

'https://sled.fandom.com/ru/wiki/'

In [3]:
# Request headers reused by the synchronous `requests` calls in this notebook;
# they carry a desktop-browser User-Agent string.
headers = {
    "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}
# Probe the wiki landing page. `requests` never times out by default, so an
# explicit timeout keeps this cell from hanging forever on a stalled connection.
resp = requests.get(URL, headers=headers, timeout=30)
resp

<Response [200]>

In [33]:
def _get_page_links(page_tag: bs4.Tag) -> list[str]:
    """Collect absolute URLs of wiki article links found under ``page_tag``.

    Args:
        page_tag: Parsed element whose descendants are searched.

    Returns:
        One absolute URL per ``<a>`` tag whose ``href`` contains ``URL_TAIL``,
        in the order the tags are returned by ``find_all``.
    """
    # Search the tag that was passed in. The previous version read a global
    # named `body`, which is not defined anywhere in this notebook.
    # `class_=False` is kept from the original filter — presumably it limits the
    # match to anchors that carry no class attribute; TODO confirm.
    links_soup = page_tag.find_all("a", href=re.compile(re.escape(URL_TAIL)), class_=False)  # type: ignore
    # urljoin resolves site-relative hrefs against the wiki host and leaves
    # already-absolute hrefs untouched, so the host is never prepended twice.
    return [urljoin(WIKI_URL, link.get("href")) for link in links_soup]


def _get_img_tags(page_tag: bs4.Tag) -> bs4.ResultSet[bs4.Tag]:
    """Return the ``<img>`` tags under ``page_tag`` served from static.wikia.nocookie.net.

    Args:
        page_tag: Parsed element whose descendants are searched.

    Returns:
        The ``find_all`` result holding every ``<img>`` whose ``src`` contains
        the ``static.wikia.nocookie.net`` host name.
    """
    # Search the tag that was passed in. The previous version read a global
    # named `body`, which is not defined anywhere in this notebook.
    # The host name is escaped so that "." matches a literal dot only.
    host_pattern = re.compile(re.escape("static.wikia.nocookie.net"))
    return page_tag.find_all("img", src=host_pattern)  # type: ignore

In [9]:
def parse_page(url):
    """Fetch ``url`` synchronously and return the body as a BeautifulSoup tree.

    The request is sent with the notebook-level ``headers``. The raw response
    bytes are parsed with the ``lxml`` parser and decoded as UTF-8. The HTTP
    status code is not inspected.
    """
    page_bytes = requests.get(url, headers=headers).content
    return bs4.BeautifulSoup(page_bytes, "lxml", from_encoding="utf8")

In [11]:
async def _download_image(img_tag: bs4.Tag, file_path: str):
    """Download the image referenced by ``img_tag`` into the ``file_path`` directory.

    The file is named ``<alt>_<data-image-name>`` after the tag's attributes.
    Nothing is written when the tag has no ``src`` or the server does not
    answer with HTTP 200.

    Args:
        img_tag: ``<img>`` tag providing ``src``, ``alt`` and ``data-image-name``.
        file_path: Existing directory the image is written to.
    """
    src = img_tag.get("src")
    if not src:
        # Without a source URL there is nothing to request.
        return
    alt_name = img_tag.get("alt")
    file_name = img_tag.get("data-image-name")
    # Attribute values may contain path separators, which would point the
    # output path at a directory that does not exist; neutralise them.
    target_name = re.sub(r"[\\/]", "_", f"{alt_name}_{file_name}")
    async with httpx.AsyncClient() as client:
        response = await client.get(src)  # type: ignore
        if response.status_code == 200:
            async with aiofiles.open(f"{file_path}/{target_name}", "wb") as f:
                await f.write(response.content)
            print(f"Downloaded {file_name}")


async def download_images(img_soup: bs4.ResultSet[bs4.Tag], file_path: str):
    """Download every image tag in ``img_soup`` into ``file_path`` concurrently.

    One asyncio task per tag is created up front, then all of them are awaited
    together through ``asyncio.gather``.
    """
    pending = [
        asyncio.create_task(_download_image(img_tag, file_path))
        for img_tag in img_soup
    ]
    # Suspend until the whole batch of download tasks has been awaited.
    await asyncio.gather(*pending)

In [32]:
async def parse_url(url: str):
    """Asynchronously fetch ``url`` and parse the body into a BeautifulSoup tree.

    Returns the parsed document when the server answers with HTTP 200 and
    ``None`` for any other status code. The body is parsed with the ``lxml``
    parser and decoded as UTF-8.
    """
    async with httpx.AsyncClient() as http:
        reply = await http.get(url)
        if reply.status_code != 200:
            # Any non-200 answer yields no document.
            return None
        return bs4.BeautifulSoup(reply.content, "lxml", from_encoding="utf8")

In [None]:
page_links = []
page_soup = await parse_url("https://sled.fandom.com/ru/wiki/%D0%9E%D0%BB%D1%8C%D0%B3%D0%B0_%D0%94%D1%83%D0%BD%D0%B0%D0%B5%D0%B2%D0%B0")
pagetag = page_soup.select_one("#mw-content-text")
try:
    title = page_soup.title.text.rstrip("| След вики | Fandom")
except:
    title = page_soup.title.text
savedir = f"data/{title}"
os.makedirs(savedir, exist_ok=True)
text = pagetag.get_text()
with open(f"{savedir}/{title}.txt", "w", encoding="utf8") as f:
    f.write(text) # save text to txt file
page_links.extend(_get_page_links(pagetag))
os.makedirs(f"{savedir}/imgs", exist_ok=True)
await download_images(_get_img_tags(pagetag), f"{savedir}/imgs") # save imgs to files