In [163]:
from typing import Optional, Union, Dict, Tuple, List
import re
from bs4 import BeautifulSoup


def extract_title(html_string: str) -> Optional[str]:
    """Return the article headline contained in an HTML fragment.

    The first ``<i itemprop="headline">`` element is located and its text is
    returned with leading/trailing whitespace removed.

    Args:
        html_string: HTML markup to search.

    Returns:
        The stripped headline text, or ``None`` if the element is absent or
        any exception is raised while parsing.
    """
    try:
        headline = BeautifulSoup(html_string, 'html.parser').find(
            'i', {'itemprop': 'headline'}
        )
        if not headline:
            return None
        return headline.text.strip()
    except Exception:
        # Best-effort extraction: any parsing problem is reported as "no title".
        return None

def extract_labels(html_string: str) -> Tuple[str, ...]:
    """Collect label names from the ``<div class="labels">`` container.

    Every nested ``<div class="label">`` that has non-empty text and carries
    no ``itemprop`` attribute is considered; the first such label is then
    dropped and the rest are returned.

    Args:
        html_string: HTML markup expected to contain the labels container.

    Returns:
        A tuple of stripped label texts (possibly empty). An empty tuple is
        also returned when the container is missing or parsing fails.
    """
    try:
        soup = BeautifulSoup(html_string, 'html.parser')
        label_div = soup.find('div', {'class': 'labels'})
        if label_div is None:
            # Explicit guard instead of relying on a swallowed AttributeError.
            return ()

        label_names = []
        for label in label_div.find_all('div', {'class': 'label'}):
            if label.get('itemprop') is not None:
                continue
            text = label.text.strip()
            if text:
                label_names.append(text)

        # NOTE(review): the first remaining label is intentionally skipped,
        # as in the original code - presumably it is not an index name;
        # confirm against the page markup.
        return tuple(label_names[1:])
    except Exception:
        return ()

def extract_views(html_string: str) -> Optional[int]:
    """Read the view counter from an HTML fragment.

    The counter is taken from the first ``<div class="statitem views">``
    element; newlines are removed and the remaining text is converted to int.

    Args:
        html_string: HTML markup to search.

    Returns:
        The view count, or ``None`` if the element is missing, its text is
        not an integer, or parsing raises.
    """
    try:
        counter = BeautifulSoup(html_string, 'html.parser').find(
            'div', {'class': 'statitem views'}
        )
        if not counter:
            return None
        raw_count = counter.get_text().replace('\n', '')
        return int(raw_count.strip())
    except Exception:
        # Covers both parser errors and a non-numeric counter text.
        return None


def extract_downloads(html_string: str) -> Optional[int]:
    """Read the download counter from an HTML fragment.

    The counter is taken from the first ``<div class="statitem downloads">``
    element; its stripped text is converted to int.

    Args:
        html_string: HTML markup to search.

    Returns:
        The download count, or ``None`` if the element is missing, its text
        is not an integer, or parsing raises.
    """
    try:
        counter = BeautifulSoup(html_string, 'html.parser').find(
            'div', {'class': 'statitem downloads'}
        )
        if not counter:
            return None
        raw_count = counter.text.strip()
        return int(raw_count)
    except Exception:
        # Covers both parser errors and a non-numeric counter text.
        return None


def extract_likes(html_string: str) -> Optional[Dict[str, Union[str, int]]]:
    """Extract like/dislike counters embedded as JSON-like text in the page.

    The text must contain ``"likes":"<n>","dislikes":"<n>","user":<n>``; the
    ``user`` field is required for a match but is not returned.

    Args:
        html_string: Raw page text to search.

    Returns:
        ``{"likes": int, "dislikes": int}`` for the first match, or ``None``
        when nothing matches or the search raises (e.g. non-string input).
    """
    try:
        found = re.search(
            r'"likes":"(\d+)","dislikes":"(\d+)","user":(\d+)', html_string
        )
        if found is None:
            return None
        likes_raw, dislikes_raw = found.group(1, 2)
        return {"likes": int(likes_raw), "dislikes": int(dislikes_raw)}
    except Exception:
        return None


def extract_author_info(html_string: str) -> Optional[List[Dict[str, Union[int, str]]]]:
    """Extract ``id``/``name`` pairs embedded as JSON-like text in the page.

    Every occurrence of ``"id": <digits>, "name": "<text>"`` is collected.
    When the same id appears more than once only its first occurrence is
    kept, and the order of first appearance is preserved.

    Args:
        html_string: Raw page text to search.

    Returns:
        A list of ``{"id": int, "name": str}`` dictionaries, or ``None`` when
        nothing matches or the search raises (e.g. non-string input).
    """
    try:
        pairs = re.findall(r'"id":\s*(\d+),\s*"name":\s*"([^"]+)"', html_string)
        # Keyed by id so duplicates are dropped; dicts keep insertion order.
        by_id = {}
        for raw_id, name in pairs:
            author_id = int(raw_id)
            if author_id not in by_id:
                by_id[author_id] = {"id": author_id, "name": name}
        return list(by_id.values()) or None
    except Exception:
        return None


def extract_keywords(html_string: str) -> Tuple[str, ...]:
    """Collect keyword texts from the ``<i itemprop="keywords">`` element.

    Each ``<span>`` inside the element contributes one keyword: its text
    content with any nested markup removed and surrounding whitespace
    stripped.

    The text is read from the parsed tree rather than by re-serialising the
    tag and regex-matching the markup; serialising re-escapes characters such
    as ``&``, so a keyword like ``R&D`` would otherwise come back as
    ``R&amp;D``.

    Args:
        html_string: HTML markup expected to contain the keywords element.

    Returns:
        A tuple of keyword strings in document order; an empty tuple when the
        element is missing or parsing fails.
    """
    try:
        soup = BeautifulSoup(html_string, 'html.parser')
        container = soup.find('i', {'itemprop': 'keywords'})
        if container is None:
            return ()
        # NOTE(review): assumes keyword spans are not nested inside each
        # other; a nested span would be reported as an extra entry.
        return tuple(
            span.get_text().strip() for span in container.find_all('span')
        )
    except Exception:
        return ()


def extract_similar_articles(html_string: str) -> List[Dict[str, str]]:
    """Build a list of related articles from ``<li>`` items in a fragment.

    For every ``<li>`` the first ``<a>`` supplies the relative link and the
    first ``<div class="title">`` supplies the title. Items lacking either
    piece (or an ``href``) are skipped individually, so one malformed entry
    no longer discards the articles that were parsed successfully.

    Args:
        html_string: HTML markup containing the list items.

    Returns:
        A list of ``{"title": str, "link": str}`` dictionaries where ``link``
        is the ``href`` prefixed with ``https://cyberleninka.ru``. The list is
        empty when there are no usable items or parsing fails.
    """
    try:
        soup = BeautifulSoup(html_string, 'html.parser')
        articles = []
        for item in soup.find_all('li'):
            anchor = item.find('a')
            title_div = item.find('div', {'class': 'title'})
            if anchor is None or title_div is None:
                continue
            link = anchor.get('href')
            if not link:
                continue
            articles.append(
                {'title': title_div.text, 'link': 'https://cyberleninka.ru' + link}
            )
        return articles
    except Exception:
        # Unexpected parser failure: keep the original "never raise" contract.
        return []

In [166]:
import requests
from bs4 import BeautifulSoup

def parse_cyberleninka_post(url, timeout=10.0):
    """Download an article page and extract its metadata into a dictionary.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on an
            unresponsive host; a timeout error is a ``RequestException`` and
            therefore results in ``None`` like any other request failure.

    Returns:
        ``None`` if the request fails or returns an error status. Otherwise a
        dict with the keys ``license``, ``views``, ``downloads``, ``title``,
        ``title_href``, ``votes``, ``author``, ``keywords``, ``annotation``,
        ``time`` and ``similar``. Fields whose source element is missing hold
        the corresponding extractor's empty result (``None``, ``()`` or
        ``[]``).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    # Serialise the whole document once; both regex-based extractors use it.
    page_html = str(soup)

    post_dict = {
        'license': None,
        'views': None,
        'downloads': None,
        'title': None,
        'title_href': str(url),
        'votes': None,
        'author': None,
        'keywords': None,
        'annotation': None,
        'time': None,
        'similar': None,
    }

    # The extract_* helpers take markup as a string. A missing element becomes
    # the string "None", for which each helper returns its empty result.
    stat_elem = soup.find('div', {'class': 'infoblock'})
    stat_html = str(stat_elem)
    post_dict['views'] = extract_views(stat_html)
    post_dict['downloads'] = extract_downloads(stat_html)

    title_elem = soup.find('h1')
    post_dict['title'] = extract_title(str(title_elem))

    keywords_elem = soup.find('div', {'class': 'keywords'})
    post_dict['keywords'] = extract_keywords(str(keywords_elem))

    post_dict['votes'] = extract_likes(page_html)
    post_dict['author'] = extract_author_info(page_html)

    annotation_elem = soup.find('p', {'itemprop': 'description'})
    if annotation_elem:
        post_dict['annotation'] = annotation_elem.text.strip()

    time_elem = soup.find('time', {'itemprop': 'datePublished'})
    if time_elem:
        post_dict['time'] = time_elem.text.strip()

    similar_elem = soup.find('ul', {'class': 'list'})
    post_dict['similar'] = extract_similar_articles(str(similar_elem))

    label_elem = soup.find('div', {'class': 'labels'})
    post_dict['license'] = extract_labels(str(label_elem))

    return post_dict

In [169]:
# Sample article pages; only `url` is fetched below, `url2` is an alternative sample.
url = 'https://cyberleninka.ru/article/n/sravnitelnaya-otsenka-effektivnosti-rabeprazola-i-omeprazola-v-terapii-gastroezofagealnoy-reflyuksnoy-bolezni'
url2 = 'https://cyberleninka.ru/article/n/balzakovskiy-vozrast'

# Fetch and parse the first sample, then show the resulting metadata dict.
parsed_post = parse_cyberleninka_post(url)
print(parsed_post)

{'license': ('Scopus', 'ВАК', 'RSCI'), 'views': 97897, 'downloads': 556, 'title': 'Сравнительная оценка эффективности рабепразола и омепразола в терапии гастроэзофагеальной рефлюксной болезни', 'title_href': 'https://cyberleninka.ru/article/n/sravnitelnaya-otsenka-effektivnosti-rabeprazola-i-omeprazola-v-terapii-gastroezofagealnoy-reflyuksnoy-bolezni', 'votes': {'likes': 0, 'dislikes': 0}, 'author': [{'id': 21263225, 'name': 'Гималетдинова Ирина Анатольевна'}, {'id': 21263226, 'name': 'Абсалямова Лэйлэ Равиловна'}, {'id': 21263227, 'name': 'Амиров Наиль Багаувич'}], 'keywords': ('РАБЕПРАЗОЛ', 'ЦИТОХРОМ CYP450', 'ДИСПЕПТИЧЕСКИЕ ЯВЛЕНИЯ', 'ПРИВЕРЖЕННОСТЬ ЛЕЧЕНИЮ', 'ОПРОСНИК GERDG', 'RABEPRAZOLE', 'CYTOCHROME CYP450', 'COMPLIANCE', 'SURVEY GERDG'), 'annotation': 'Гастроэзофагеальная рефлюксная болезнь это состояние, развивающееся, когда рефлюкс содержимого желудка вызывает появление беспокоящих больного симптомов и/или развитие осложнений. Выделение гастроэзофагеальной рефлюксной болезни 