### Задание: 

Для каждой фамилии из заданного списка отправить запрос к сайту forebears.io и выгрузить базу данных с количеством носителей мужской и женской форм этой фамилии в России.

Требуются две реализации: через Selenium и с использованием только HTTP-запросов.

Selenium code:

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import lxml

# Chrome options: headless, GPU disabled, 'eager' page-load strategy.
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.page_load_strategy = 'eager'

# One browser instance is shared by every scrape() call in this cell.
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, timeout=5)

# Progress counters: `i` is bumped by scrape(), `j` by parse().
i, j = 0, 1


def scrape(surname):
    """Open the forebears.io page for *surname* and return its HTML source.

    Side effects: increments the module-level counter ``i`` and navigates the
    shared ``driver``. The function sleeps for one second after navigation
    before reading ``driver.page_source``.
    """
    global i
    i = i + 1
    driver.get("https://forebears.io/surnames/{}".format(surname))
    time.sleep(1)  # fixed pause before the page source is read
    return driver.page_source


def parse(html):
    """Extract the Russia bearer count from a forebears.io surname page.

    Prints the module-level counters ``i`` and ``j`` as a progress trace and
    increments ``j`` for every page that passes the sanity check.

    Args:
        html: Page source, as returned by ``scrape``.

    Returns:
        ``-1`` when the page contains neither ``'bear'`` nor
        ``'Approximately'``; the count text of the Russia row with thousands
        separators removed (a string such as ``'12345'``) when that row
        exists; ``0`` when there is no table or no Russia row.
    """
    global j
    print(i, j)
    if 'bear' not in html and 'Approximately' not in html:
        return -1
    j += 1

    soup = BeautifulSoup(html, 'lxml')
    table = soup.find('table', {'class': 'table'})
    if not table:
        return 0

    for row in table.find_all('tr'):
        cols = row.find_all('td')
        if len(cols) < 2:
            continue
        country = cols[0].get_text(strip=True).lower()
        if country in ('russia', 'россия'):
            # Strip thousands separators so the stored value has the same
            # format as the one produced by scrape_and_parse().
            return cols[1].get_text(strip=True).replace(',', '')

    return 0

def insert_data(index, count, male):
    """Write *count* into the module-level DataFrame ``df`` at row *index*.

    The value is stored in ``male_cnt`` when *male* compares equal to
    ``True`` and in ``female_cnt`` otherwise.
    """
    column = 'male_cnt' if male == True else 'female_cnt'  # noqa: E712
    df.at[index, column] = count

# Row window for this run. `k` counts rows starting from 1, so rows with
# minimum <= k <= limit are processed.
# NOTE(review): the HTTP cell slices with a 0-based offset
# (df.iloc[minimum:limit]); confirm which convention is intended.
k = 0
minimum = 0
limit = len(df)

# Clamp the window to sane values.
minimum = max(minimum, 0)
limit = max(limit, minimum + 1)
limit = min(limit, len(df))


try:
    for k, (index, row) in enumerate(df.iterrows(), start=1):
        if k < minimum:
            continue
        if k > limit:
            break
        try:
            ms = row['male_eng']
            ws = row['female_eng']
            mc = parse(scrape(ms))
            wc = parse(scrape(ws))
            insert_data(index, mc, True)
            insert_data(index, wc, False)
            print('MC:', mc, 'WC:', wc)
        except Exception as e:
            # Best effort: report the failure and move on to the next row.
            print(e)
finally:
    # Always shut the browser down, even on interruption; otherwise the
    # headless Chrome process started above is left running.
    driver.quit()

HTTP code:

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import quote
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import fake_useragent

# Shared HTTP session and User-Agent generator used by scrape_and_parse().
ua = fake_useragent.FakeUserAgent()
session = requests.Session()

# "Risky" variant: one fixed User-Agent string. It is overwritten right
# below, so it never reaches the session.
headers = {
    'User-Agent': ' '.join([
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'AppleWebKit/537.36 (KHTML, like Gecko)',
        'Chrome/90.0.4430.93 Safari/537.36',
    ])
}

# "Optimal" variant: a randomly picked User-Agent; this one is applied.
headers = {'User-Agent': ua.random}
session.headers.update(headers)

# Retry policy: total=5 retries for the listed status codes, backoff_factor=1.
retry_params = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
)

# Use the same retrying adapter for both URL schemes.
adapter = HTTPAdapter(max_retries=retry_params)
session.mount("https://", adapter)
session.mount("http://", adapter)


def _is_forbidden(error):
    """Return True when *error* carries an HTTP response with status 403."""
    response = getattr(error, 'response', None)
    # Read the attribute directly: a requests Response with an error status
    # is falsy, so a truthiness test on it would be wrong.
    return getattr(response, 'status_code', None) == 403


def scrape_and_parse(surname):
    """Fetch the forebears.io page for *surname* and return its Russia count.

    Args:
        surname: Surname string; it is percent-encoded with ``quote`` before
            being placed in the URL.

    Returns:
        The count text of the Russia row with commas removed (a string);
        ``0`` when the page has no table or no Russia row; ``-2`` when the
        request failed with an HTTP 403 response; ``-1`` for any other
        failure.
    """
    # Send the random User-Agent with this request only. The session is
    # shared by all worker threads, so rewriting session.headers here would
    # let concurrent calls overwrite each other's header.
    request_headers = {'User-Agent': ua.random}

    surname_enc = quote(surname)
    url = f"https://forebears.io/surnames/{surname_enc}"
    try:
        print(f"Запрос по: {url}")
        response = session.get(url, headers=request_headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        table = soup.find('table', {'class': 'table'})

        if table:
            for row in table.find_all('tr'):
                cols = row.find_all('td')
                if len(cols) >= 2:
                    country = cols[0].get_text(strip=True).lower()
                    if country in ('russia', 'россия'):
                        return cols[1].get_text(strip=True).replace(',', '')

        return 0

    except requests.exceptions.HTTPError as http_err:
        print(f"Ошибка HTTP на '{surname}': {http_err}")
        # Decide by status code, not by searching the message for "403":
        # the message also contains the URL and could match by accident.
        return -2 if _is_forbidden(http_err) else -1

    except requests.exceptions.RequestException as req_err:
        print(f"Сетевая ошибка на '{surname}': {req_err}")
        return -2 if _is_forbidden(req_err) else -1

    except Exception as e:
        # Worker boundary: report anything unexpected as a generic failure.
        print(f"Другая ошибка на '{surname}': {e}")
        return -2 if _is_forbidden(e) else -1


def process_row(row):
    """Look up both surname forms of *row* and return ``(male, female)`` counts.

    Reads the ``male_eng`` and ``female_eng`` fields of *row* and passes each
    to ``scrape_and_parse``, male form first.
    """
    male_surname, female_surname = row['male_eng'], row['female_eng']
    male_count = scrape_and_parse(male_surname)
    female_count = scrape_and_parse(female_surname)
    return male_count, female_count


# Start offset of this run (0-based position in df), never negative.
minimum = max(2720, 0)

# End of the window: at least one past the start, but never beyond the frame.
limit = min(max(len(df), minimum + 1), len(df))

# Rows handed to the thread pool in main().
subset = df.iloc[minimum:limit]


def main():
    """Scrape every row of ``subset`` in a thread pool and store the counts.

    Each row is submitted to ``process_row``. As results arrive, the male and
    female counts are written to the ``male_cnt`` / ``female_cnt`` columns of
    the module-level ``df``. When either lookup reports ``-2`` (HTTP 403), or
    an exception message mentions 403, all not-yet-started tasks are
    cancelled and the function returns early.
    """
    with ThreadPoolExecutor() as executor:
        future_to_index = {
            executor.submit(process_row, row): index
            for index, row in subset.iterrows()
        }

        def cancel_pending():
            # Leaving the ``with`` block waits for all submitted work, so
            # without this the queued rows would still be requested after
            # the site has answered 403.
            for pending in future_to_index:
                pending.cancel()

        for future in as_completed(future_to_index):
            index = future_to_index[future]
            try:
                mc, wc = future.result()
                # A 403 on either the male or the female lookup stops the run.
                if mc == -2 or wc == -2:
                    cancel_pending()
                    return
                df.at[index, 'male_cnt'] = mc
                df.at[index, 'female_cnt'] = wc
                print(f'Index {index} - MC: {mc}, WC: {wc}')

            except Exception as e:
                print(f"ОШИБКА {index}: {e}")
                if '403' in str(e):
                    cancel_pending()
                    return
                

# Run the threaded HTTP scrape defined above.
main()

# Printed after main() has returned.
print("done")