In [1]:
import json
import os
from urllib.parse import urljoin

import pandas as pd
import requests as r
from bs4 import BeautifulSoup
from pqdm.threads import pqdm
from tqdm import tqdm

In [2]:
# Root URL of the site being scraped. Relative hrefs found on its pages are
# made absolute relative to this value, so keep the trailing slash.
BASE_URL = "https://birds.kg/"


def get_all_bird_links(timeout=30):
    """Collect the absolute URLs of the family pages linked from the home page.

    Fetches ``BASE_URL`` and keeps every anchor whose ``href`` contains
    ``"family.php"``.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        list[str]: Family-page URLs in document order (duplicates are kept).

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    res = r.get(BASE_URL, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page and silently
    # returning an empty list.
    res.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(res.text, "html.parser")

    # urljoin also copes with absolute or root-relative hrefs, which plain
    # string concatenation would turn into malformed URLs.
    return [
        urljoin(BASE_URL, anchor["href"])
        for anchor in soup.find_all("a", href=True)
        if "family.php" in anchor["href"]
    ]

In [3]:
def get_all_species_links(bird_links, timeout=30):
    """Collect the unique species-page URLs linked from the given family pages.

    Each URL in ``bird_links`` is fetched (with a progress bar) and every
    anchor whose ``href`` contains ``"v2taxon.php"`` is recorded.

    Args:
        bird_links: Iterable of family-page URLs to visit.
        timeout: Seconds to wait for each page before giving up.

    Returns:
        list[str]: De-duplicated species URLs, sorted so that repeated runs
        produce the same order (a bare ``list(set)`` does not).

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    species_links = set()

    for family_url in tqdm(bird_links):
        page = r.get(family_url, timeout=timeout)
        # Explicit parser: keeps results independent of the installed extras.
        soup = BeautifulSoup(page.text, "html.parser")

        # A separate loop variable: the original rebound the outer ``link``
        # inside this inner loop, which made the code easy to misread.
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            if "v2taxon.php" in href:
                # hrefs are resolved against the site root, as before.
                species_links.add(urljoin(BASE_URL, href))

    return sorted(species_links)

In [4]:
def get_bird_info(link, timeout=30):
    """Scrape the names and description of one species from its Russian page.

    Args:
        link: Species-page URL; its ``l=en`` language parameter is switched
            to ``l=ru`` before the page is fetched.
        timeout: Seconds to wait for the page before giving up.

    Returns:
        dict: Keys ``url`` (the URL actually fetched), ``russian`` (text of
        the first ``<h1>``), ``latin`` (text of the first ``<h2>`` up to the
        first ``"("``), ``english`` and ``description``. ``description`` is
        ``None`` when no paragraph longer than 100 characters exists.

    Raises:
        requests.RequestException: On network failure or timeout.
        IndexError, AttributeError: When the page lacks the expected
            headings, the ``section.orta`` element, or enough ``<div>``s.
    """
    # Only rewrite the language parameter; replacing every "en" would corrupt
    # any URL that contains those letters elsewhere.
    link = link.replace("l=en", "l=ru")
    # NOTE(review): the parser is left to bs4's default here on purpose. The
    # positional div lookup below may depend on how the parser builds the
    # tree, so confirm against the live site before pinning one.
    soup = BeautifulSoup(r.get(link, timeout=timeout).text)
    content = soup.find("section", class_="orta")

    info = {
        "url": link,
        "russian": soup.find_all("h1")[0].text.strip(),
        "latin": soup.find_all("h2")[0].text.split("(")[0].strip(),
        # Seventh <div> of the section, text before the first "|" --
        # presumably the list of foreign names; verify if the layout changes.
        "english": content.find_all("div")[6].text.split("|")[0].strip(),
        # Always present so every record has the same keys.
        "description": None,
    }

    # The first paragraph with more than 100 characters is the description.
    for p in content.find_all("p"):
        if len(p.text) > 100:
            first = p.contents[0]
            if isinstance(first, str):
                # Leading text node only, i.e. the text before any child tag.
                info["description"] = first.strip()
            else:
                # The paragraph starts with a tag; calling .strip() on a Tag
                # raises TypeError, so fall back to the paragraph's full text.
                info["description"] = p.get_text(" ", strip=True)
            break

    return info

In [5]:
def get_bird_images(link, timeout=30):
    """List the photo URLs shown on the gallery page of one species.

    The gallery URL is derived from the species URL by switching ``l=en`` to
    ``l=ru``, replacing ``taxon`` with ``taxgal`` and appending ``&p=0``.

    Args:
        link: Species-page URL.
        timeout: Seconds to wait for the gallery page before giving up.

    Returns:
        list[str]: Absolute image URLs, in document order, for every ``<img>``
        whose ``data-src`` starts with ``"photos/"``.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    # Only rewrite the language parameter rather than every "en" in the URL.
    # NOTE(review): "&p=0" looks like a page index, so only one gallery page
    # would be read -- confirm whether further pages exist.
    gallery_url = link.replace("l=en", "l=ru").replace("taxon", "taxgal") + "&p=0"
    page = r.get(gallery_url, timeout=timeout)
    # Explicit parser: keeps results independent of the installed extras.
    soup = BeautifulSoup(page.text, "html.parser")

    bird_images = []
    for image in soup.find_all("img"):
        src = image.get("data-src", "")
        if src.startswith("photos/"):
            # Dropping the "s" before ".jpg" presumably turns the thumbnail
            # name into the full-size file name -- verify against the site.
            bird_images.append(urljoin(BASE_URL, src.replace("s.jpg", ".jpg")))
    return bird_images

In [6]:
def process_specie(link, timeout=30):
    """Save the metadata and all gallery photos of one species to disk.

    Creates a folder named after the Latin name (spaces replaced by
    underscores) in the current working directory, writes ``info.json`` into
    it and downloads every gallery image as ``01.jpg``, ``02.jpg``, ...

    Args:
        link: Species-page URL.
        timeout: Seconds to wait for each image download before giving up.

    Returns:
        None. A failed image download is reported on stdout and skipped.

    Raises:
        Exception: Whatever ``get_bird_info`` / ``get_bird_images`` raise;
            only per-image failures are handled here.
    """
    info = get_bird_info(link)

    # create folder
    folder = info["latin"].replace(" ", "_")
    os.makedirs(folder, exist_ok=True)

    # save info to file; ensure_ascii=False keeps the Cyrillic text readable
    # instead of \uXXXX escapes (the file is opened as UTF-8 for that reason)
    with open(os.path.join(folder, "info.json"), "w", encoding="utf-8") as file:
        json.dump(info, file, ensure_ascii=False)

    # download all images
    images = get_bird_images(link)
    for idx, image in enumerate(images, start=1):
        try:
            # Download first and open the file only afterwards, so a failed
            # request does not leave an empty .jpg behind.
            response = r.get(image, timeout=timeout)
            # Do not save an HTTP error page under a .jpg name.
            response.raise_for_status()
            with open(os.path.join(folder, f"{idx:02d}.jpg"), "wb") as file:
                file.write(response.content)
        except Exception as e:
            print(f"Could not donwload image from {image}, error: {e}")

In [7]:
# Crawl the site index: first the family pages linked from the home page, then
# every species page linked from those. One HTTP request is made per family
# page, so this cell takes a while (about a minute in the recorded run).
bird_links = get_all_bird_links()
species_links = get_all_species_links(bird_links)

100%|█████████████████████████████████████████████████████| 59/59 [00:58<00:00,  1.01it/s]


In [8]:
# Fetch the metadata of every species one by one. A page that cannot be
# fetched or parsed is reported and skipped so a single bad page does not
# abort the whole crawl.
all_infos = []
for link in tqdm(species_links):
    try:
        all_infos.append(get_bird_info(link))
    # Catch Exception, not everything: a bare ``except:`` also swallows
    # KeyboardInterrupt, which makes this long loop impossible to stop.
    except Exception as e:
        # Include the reason so a failure can be diagnosed without a re-run.
        print(f"Failed to parse info from {link}, error: {e}")

 83%|██████████████████████████████████████████▎        | 274/330 [06:03<01:16,  1.37s/it]

Failed to parse info from https://birds.kg/v2taxon.php?s=372&l=en


100%|███████████████████████████████████████████████████| 330/330 [07:14<00:00,  1.32s/it]


In [None]:
# Run process_specie for every species link through pqdm's thread-based
# runner with n_jobs=10; each call writes info.json and the photos into a
# per-species folder.
# NOTE(review): process_specie calls get_bird_info again, so every species
# page already fetched for `all_infos` is requested a second time here.
pqdm(species_links, process_specie, n_jobs=10)

In [11]:
# Persist the collected metadata: one row per successfully parsed species,
# written to CSV without the DataFrame index column.
df = pd.DataFrame(all_infos)
df.to_csv("2021-07-28-birds_and_image.csv", index=False)