In [108]:
import re
from typing import Optional, Union, Dict, Tuple, List
from urllib.parse import urljoin

from bs4 import BeautifulSoup


def extract_title(html_string: str) -> Optional[str]:
    """Return the article headline contained in an HTML fragment.

    The headline is taken from the first ``<i itemprop="headline">`` tag.

    Args:
        html_string: HTML markup to search.

    Returns:
        The headline text with surrounding whitespace removed, or ``None``
        when the tag is absent or anything goes wrong while parsing.
    """
    try:
        headline = BeautifulSoup(html_string, 'html.parser').find(
            'i', {'itemprop': 'headline'}
        )
        if not headline:
            return None
        return headline.text.strip()
    except Exception:
        # Best-effort extraction: every failure is reported as "no title".
        return None


def extract_license(html_string: str) -> Optional[str]:
    """Return the license label of an article from an HTML fragment.

    The label is read from the ``title`` attribute of the first
    ``<div class="statitem label-cc">`` element.

    Args:
        html_string: HTML markup to search.

    Returns:
        The value of the ``title`` attribute, or ``None`` when the element
        is absent, has no ``title`` attribute, or parsing fails.
    """
    try:
        parsed = BeautifulSoup(html_string, 'html.parser')
        badge = parsed.find('div', {'class': 'statitem label-cc'})
        if badge:
            # A missing attribute raises KeyError, handled below as "no license".
            return badge['title']
        return None
    except Exception:
        return None


def extract_views(html_string: str) -> Optional[int]:
    """Return the view counter of an article from an HTML fragment.

    The counter is read from the text of the first
    ``<div class="statitem views">`` element.

    Args:
        html_string: HTML markup to search.

    Returns:
        The counter as an integer, or ``None`` when the element is absent,
        its text is not a valid integer, or parsing fails.
    """
    try:
        counter = BeautifulSoup(html_string, 'html.parser').find(
            'div', {'class': 'statitem views'}
        )
        if not counter:
            return None
        # Drop line breaks first, then trim the remaining outer whitespace.
        raw_count = counter.get_text().replace('\n', '')
        return int(raw_count.strip())
    except Exception:
        return None


def extract_downloads(html_string: str) -> Optional[int]:
    """Return the download counter of an article from an HTML fragment.

    The counter is read from the text of the first
    ``<div class="statitem downloads">`` element.

    Args:
        html_string: HTML markup to search.

    Returns:
        The counter as an integer, or ``None`` when the element is absent,
        its text is not a valid integer, or parsing fails.
    """
    try:
        counter = BeautifulSoup(html_string, 'html.parser').find(
            'div', {'class': 'statitem downloads'}
        )
        if not counter:
            return None
        return int(counter.text.strip())
    except Exception:
        return None


def extract_likes(html_string: str) -> Optional[Dict[str, Union[str, int]]]:
    """Pull like/dislike counters out of JSON-like text embedded in a page.

    The text must contain the exact sequence
    ``"likes":"<n>","dislikes":"<n>","user":<n>``; the ``user`` value is
    required for a match but is not returned.

    Args:
        html_string: Text to search.

    Returns:
        ``{"likes": int, "dislikes": int}`` for the first match, or ``None``
        when nothing matches or the input cannot be searched.
    """
    try:
        found = re.search(
            r'"likes":"(\d+)","dislikes":"(\d+)","user":(\d+)', html_string
        )
        if found is None:
            return None
        likes, dislikes = found.group(1, 2)
        return {"likes": int(likes), "dislikes": int(dislikes)}
    except Exception:
        # E.g. a non-string argument makes re.search raise TypeError.
        return None


def extract_author_info(html_string: str) -> Optional[Dict[str, Union[int, str]]]:
    """Pull an author's id and name out of JSON-like text embedded in a page.

    Matches the first ``"id": <digits>, "name": "<non-empty text>"`` pair;
    whitespace after the colons and the comma is optional.

    Args:
        html_string: Text to search.

    Returns:
        ``{"id": int, "name": str}`` for the first match, or ``None`` when
        nothing matches or the input cannot be searched.
    """
    try:
        found = re.search(r'"id":\s*(\d+),\s*"name":\s*"([^"]+)"', html_string)
        if found is None:
            return None
        raw_id, name = found.groups()
        return {"id": int(raw_id), "name": name}
    except Exception:
        # E.g. a non-string argument makes re.search raise TypeError.
        return None


def extract_keywords(html_string: str) -> Tuple[str, ...]:
    """Return the keywords listed in an HTML fragment.

    Keywords are the ``<span>`` elements nested inside the first
    ``<i itemprop="keywords">`` tag.

    Args:
        html_string: HTML markup to search.

    Returns:
        A tuple with the whitespace-stripped text of every non-empty
        ``<span>``, in document order. An empty tuple is returned when the
        container tag is absent, holds no spans, or parsing fails.
    """
    try:
        soup = BeautifulSoup(html_string, 'html.parser')
        container = soup.find('i', {'itemprop': 'keywords'})
        if not container:
            return ()
        # Walk the parsed tree instead of regex-matching "<span>" against
        # ``container.text``: ``.text`` has all markup removed, so such a
        # regex can never match and the result would always be empty.
        stripped = (span.get_text().strip() for span in container.find_all('span'))
        return tuple(keyword for keyword in stripped if keyword)
    except Exception:
        # Best-effort extraction: every failure is reported as "no keywords".
        return ()


def extract_similar_articles(html_string: str) -> List[Dict[str, str]]:
    """Return the articles listed as ``<li>`` items in an HTML fragment.

    Each ``<li>`` is expected to contain an ``<a href=...>`` link and a
    ``<div class="title">`` element. Items missing either part are skipped
    individually, so one malformed entry does not discard the rest.

    Args:
        html_string: HTML markup to search.

    Returns:
        A list of ``{'title': ..., 'link': ...}`` dicts in document order.
        ``title`` is the whitespace-stripped text of the title div and
        ``link`` is the href resolved against ``https://cyberleninka.ru``.
        An empty list is returned when nothing is found or parsing fails.
    """
    try:
        soup = BeautifulSoup(html_string, 'html.parser')
        articles = []
        for item in soup.find_all('li'):
            anchor = item.find('a')
            title_div = item.find('div', {'class': 'title'})
            href = anchor.get('href') if anchor else None
            if not href or not title_div:
                # Not an article entry (no link or no title): ignore it.
                continue
            articles.append({
                'title': title_div.text.strip(),
                # urljoin handles site-relative, slash-less and absolute hrefs.
                'link': urljoin('https://cyberleninka.ru', href),
            })
        return articles
    except Exception:
        # Best-effort extraction: a parsing failure yields "no articles".
        return []

In [110]:
import requests
from bs4 import BeautifulSoup

def parse_cyberleninka_post(url, timeout=10):
    """Download an article page and collect its metadata into a dict.

    Args:
        url: Address of the article page; stored in the result as
            ``str(url)`` under ``'title_href'``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        ``None`` when the request fails (connection problem, timeout or an
        HTTP error status). Otherwise a dict with the keys ``license``,
        ``views``, ``downloads``, ``title``, ``title_href``, ``votes``,
        ``author``, ``keywords``, ``annotation``, ``time`` and ``similar``.
        Fields that cannot be found are ``None`` (``keywords`` falls back to
        an empty tuple and ``similar`` to an empty list).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # The extract_* helpers take markup as text, so each located element is
    # serialised back to a string. A missing element becomes the string
    # "None", in which the helpers find nothing.
    stats_html = str(soup.find('div', {'class': 'infoblock'}))
    title_html = str(soup.find('h1'))
    keywords_html = str(soup.find('div', {'class': 'keywords'}))
    similar_html = str(soup.find('ul', {'class': 'list'}))

    # Votes and author are searched for in the whole page.
    # NOTE(review): the two regexes take the first match anywhere in the
    # document -- confirm they hit the intended embedded data on real pages.
    page_html = str(soup)

    annotation_elem = soup.find('p', {'itemprop': 'description'})
    time_elem = soup.find('time', {'itemprop': 'datePublished'})

    return {
        'license': extract_license(stats_html),
        'views': extract_views(stats_html),
        'downloads': extract_downloads(stats_html),
        'title': extract_title(title_html),
        'title_href': str(url),
        'votes': extract_likes(page_html),
        'author': extract_author_info(page_html),
        'keywords': extract_keywords(keywords_html),
        'annotation': annotation_elem.text.strip() if annotation_elem else None,
        'time': time_elem.text.strip() if time_elem else None,
        'similar': extract_similar_articles(similar_html),
    }

In [111]:
# Smoke test: fetch one live article page (a real HTTP request) and print the parsed dict.
print(parse_cyberleninka_post('https://cyberleninka.ru/article/n/balzakovskiy-vozrast'))

{'license': 'Лицензия Creative Commons', 'views': 460126, 'downloads': 1504, 'title': 'Бальзаковский возраст', 'title_href': 'https://cyberleninka.ru/article/n/balzakovskiy-vozrast', 'votes': None, 'author': None, 'keywords': (), 'annotation': '«Бальзаковский возраст» русская идиома, не существующая в других языках. Это выражение появилось в 1880-е годы, а его самые ранние формы почти на полвека раньше. Вопреки нередко бытующим представлениям, «бальзаковский возраст» с самого начала понимался так же, как он понимается и теперь.', 'time': '2017', 'similar': [{'title': 'Натурщица и шедевр (Бальзак и его продолжатели)', 'link': 'https://cyberleninka.ru/article/n/naturschitsa-i-shedevr-balzak-i-ego-prodolzhateli'}, {'title': 'Образ Вотрена в «Человеческой комедии» Бальзака', 'link': 'https://cyberleninka.ru/article/n/obraz-votrena-v-chelovecheskoy-komedii-balzaka'}, {'title': '«Блаженные слова» (пространство одной строки Осипа Мандельштама)', 'link': 'https://cyberleninka.ru/article/n/blaz