In [108]:
from bs4 import BeautifulSoup
import json
from typing import List, Dict, Union, Optional

def extract_comment_count(html_string: str) -> int:
    """Return the comment count embedded in a fragment of HTML.

    The fragment is parsed with BeautifulSoup, the text of the first
    ``<textarea>`` is decoded as JSON, and its ``'count'`` entry is
    converted to ``int``.

    Args:
        html_string: HTML markup expected to contain a ``<textarea>`` whose
            text is a JSON object with a ``'count'`` key.

    Returns:
        The count as an integer, or ``0`` when no ``<textarea>`` is present
        or anything goes wrong while parsing, decoding or converting.
    """
    try:
        soup = BeautifulSoup(html_string, 'html.parser')
        textarea_tag = soup.find('textarea')
        if textarea_tag is None:
            # Nothing to decode; report "no comments" explicitly.
            return 0
        payload = json.loads(textarea_tag.text.strip())
        return int(payload['count'])
    except Exception:
        # Best-effort extraction: any parsing/decoding problem yields 0.
        # ``Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate.
        return 0

def get_likes_count(html_string: str) -> int:
    """Return the number of likes embedded in a fragment of HTML.

    The fragment is parsed with BeautifulSoup, the text of the first
    ``<textarea class="l-hidden">`` is decoded as JSON, and the nested
    ``['likeData']['count_likes']`` entry is converted to ``int``.

    Args:
        html_string: HTML markup expected to contain a ``<textarea>`` with
            class ``l-hidden`` holding a JSON object.

    Returns:
        The like count as an integer, or ``0`` when the textarea is absent
        or anything goes wrong while parsing, decoding or converting.
    """
    try:
        soup = BeautifulSoup(html_string, 'html.parser')
        textarea_tag = soup.find('textarea', {'class': 'l-hidden'})
        if textarea_tag is None:
            # Nothing to decode; report "no likes" explicitly.
            return 0
        payload = json.loads(textarea_tag.text.strip())
        return int(payload['likeData']['count_likes'])
    except Exception:
        # Best-effort extraction: any parsing/decoding problem yields 0.
        # ``Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate.
        return 0

def parse_links(html_string: str) -> List[str]:
    """Collect the ``href`` values of all anchors in a fragment of HTML.

    Args:
        html_string: HTML markup to scan for ``<a>`` elements.

    Returns:
        The ``href`` attribute of every ``<a>`` that has one, in document
        order. Anchors without an ``href`` are skipped so the result never
        contains ``None``. An empty list is returned if parsing fails.
    """
    try:
        soup = BeautifulSoup(html_string, 'html.parser')
        # ``href=True`` restricts the match to anchors that actually carry
        # the attribute, keeping the result consistent with ``List[str]``.
        return [anchor['href'] for anchor in soup.find_all('a', href=True)]
    except Exception:
        # Best-effort: a parsing failure yields no links. ``Exception``
        # (not a bare ``except``) lets KeyboardInterrupt/SystemExit through.
        return []

def remove_links(html_string: str) -> str:
    """Return the HTML fragment with every ``<a>`` element removed.

    Each anchor is taken out of the parsed tree as a whole, so the text
    inside the anchor is removed together with the tag.

    Args:
        html_string: HTML markup to strip anchors from.

    Returns:
        The re-serialised markup without ``<a>`` elements. If parsing or
        serialisation fails, ``html_string`` is returned unchanged.
    """
    try:
        soup = BeautifulSoup(html_string, 'html.parser')
        for anchor in soup.find_all('a'):
            anchor.extract()
        return str(soup)
    except Exception:
        # Best-effort: fall back to the untouched input. ``Exception``
        # (not a bare ``except``) lets KeyboardInterrupt/SystemExit through.
        return html_string


In [109]:
import requests
from bs4 import BeautifulSoup


def parse_cyberleninka_post(url, timeout=10):
    """Download a post page and extract its metadata into a dictionary.

    The page is fetched with a browser-like ``User-Agent`` and parsed with
    BeautifulSoup. Each field is extracted independently: when one field
    cannot be read, a short ``'<Field> Error'`` message is printed and that
    field keeps its default of ``None`` while the remaining fields are
    still processed.

    Note: despite the function name, the sample URLs used in this notebook
    point to vc.ru, and the CSS classes queried below are whatever that
    markup exposes.

    Args:
        url: Address of the post page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without a timeout the request
            could block indefinitely.

    Returns:
        ``None`` if the HTTP request fails (the error is printed), otherwise
        a dict with the keys ``'title'``, ``'author'``, ``'author_href'``,
        ``'views'``, ``'comments'``, ``'post_href'``, ``'time'``,
        ``'likes'``, ``'links'`` and ``'text'``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f'Request Error: {e}')
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    post_dict = {
        'title': None,
        'author': None,
        'author_href': None,
        'views': None,
        'comments': None,
        'post_href': str(url),
        'time': None,
        'likes': None,
        'links': None,
        'text': None
    }

    # Every section below is best-effort. ``except Exception`` is used
    # instead of a bare ``except`` so Ctrl-C / SystemExit are not swallowed.

    try:
        title_elem = soup.find('h1', {'class': 'content-title'})
        post_dict['title'] = title_elem.text.strip()
    except Exception:
        print('Title Error')

    try:
        auth_elem = soup.find('a', {'class': 'content-header-author__name'})
        if auth_elem is None:
            # No linked author: fall back to the <div> variant, which
            # provides a name but no profile URL.
            auth_elem = soup.find('div', {'class': 'content-header-author__name'})
            post_dict['author'] = auth_elem.text.strip()
        else:
            post_dict['author_href'] = auth_elem['href']
            post_dict['author'] = auth_elem.text.strip()
    except Exception:
        print('Author Error')

    try:
        views_elem = soup.find('span', {'class': 'views__value'})
        post_dict['views'] = int(views_elem.text.strip())
    except Exception:
        print('Views Error')

    try:
        # No None-guard here (unlike likes): a missing element is
        # stringified to 'None', for which extract_comment_count returns 0.
        comments_elem = soup.find('vue', {'name': 'comments-counter'})
        post_dict['comments'] = extract_comment_count(str(comments_elem))
    except Exception:
        print('Comments Error')

    try:
        time_elem = soup.find('time', {'class': 'time'})
        if time_elem:
            post_dict['time'] = int(time_elem['data-date'])
    except Exception:
        print('Time Error')

    try:
        like_elem = soup.find('vue', {'name': 'likes'})
        if like_elem:
            post_dict['likes'] = get_likes_count(str(like_elem))
    except Exception:
        print('Likes Error')

    try:
        paragraphs = soup.find_all('p')
        if paragraphs:
            # Serialise each <p> on its own. Calling str() on the whole
            # result set yields a list repr, which leaked '[', ', ' and ']'
            # into the extracted text.
            annotation_html = ''.join(str(p) for p in paragraphs)
            post_dict['links'] = parse_links(annotation_html)
            post_dict['text'] = remove_links(annotation_html)
    except Exception:
        print('Annotation Error')

    return post_dict


In [110]:
# Sample vc.ru post URLs for trying out the parser.
url ='https://vc.ru/marketing/643838-zachem-biznesu-korporativnaya-kultura-i-kak-ona-konvertiruetsya-v-pribyl'
url2 = 'https://vc.ru/legal/644141-minyust-priznal-inoagentami-ilyu-varlamova-i-glavu-pravozashchitnoy-organizacii-agora-pavla-chikova'
url3 ='https://vc.ru/u/1190575-andrey-vecherniy/610040-specproekt-kak-bandy-podelili-vc-ru'
# Only `url` is parsed in this cell; `url2` and `url3` are defined but not used here.
print(parse_cyberleninka_post(url))

{'title': 'Зачем бизнесу корпоративная культура, и как она конвертируется в прибыль', 'author': 'Antro', 'author_href': 'https://vc.ru/antro', 'views': 1949, 'comments': 106, 'post_href': 'https://vc.ru/marketing/643838-zachem-biznesu-korporativnaya-kultura-i-kak-ona-konvertiruetsya-v-pribyl', 'time': 1679575870, 'likes': 47, 'links': ['http://antro.cx', 'https://www.mckinsey.com/capabilities/people-and-organizational-performance/our-insights/the-organization-blog/6-elements-to-create-a-high-performing-culture', 'https://hbr.org/2013/05/creating-the-best-workplace-on-earth?registration=success', 'https://www.tinypulse.com/hubfs/2018%20Employee%20Retention%20Report.pdf', 'https://www.tinypulse.com/hubfs/2018%20Employee%20Retention%20Report.pdf', 'https://www.randstadusa.com/business/business-insights/employee-retention/your-best-employees-are-leaving-it-personal-practical/', 'https://www.randstadusa.com/business/business-insights/employee-retention/your-best-employees-are-leaving-it-per