In [90]:
import datetime
import html
import re
from typing import Optional, Tuple, Dict, Union
from urllib.parse import urljoin

from bs4 import BeautifulSoup

def extract_username(input_str: str) -> Optional[str]:
    """Return the user name from the first ``/users/<name>`` link in a snippet.

    Args:
        input_str: HTML markup expected to contain a quoted attribute value
            with a ``/users/<name>/`` path in it.

    Returns:
        The ``<name>`` path segment, or ``None`` when no such path is found
        or ``input_str`` is not a string.
    """
    try:
        # Stop the capture at the first "/" or quote so it can never run past
        # the end of the attribute value, and do not require the attribute to
        # be the last one in the tag.
        match = re.search(r'/users/([^/"]+)/?"', input_str)
    except TypeError:
        # Non-string input (for example None): nothing to extract.
        return None
    return match.group(1) if match else None

def extract_timestamp(input_str: str) -> Optional[int]:
    """Convert the ``datetime`` attribute of the first ``<time>`` tag to Unix time.

    The attribute value must look like ``2023-03-06T20:43:50.000Z`` or
    ``2023-03-06T20:43:50Z`` (fractional seconds are optional). The value is
    interpreted relative to 1970-01-01T00:00:00, matching the trailing ``Z``.

    Args:
        input_str: HTML markup containing a ``<time datetime="...">`` tag.

    Returns:
        Whole seconds since the epoch, or ``None`` when there is no ``<time>``
        tag with a ``datetime`` attribute, the value is in neither supported
        format, or ``input_str`` is not a string.
    """
    accepted_formats = ('%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%SZ')
    epoch = datetime.datetime(1970, 1, 1)

    try:
        # The datetime attribute may appear anywhere inside the tag; other
        # attributes (such as title) are not required.
        match = re.search(r'<time\b[^>]*?\sdatetime="([^"]*)"', input_str)
    except TypeError:
        return None
    if match is None:
        return None

    for fmt in accepted_formats:
        try:
            moment = datetime.datetime.strptime(match.group(1), fmt)
        except ValueError:
            # Wrong shape for this format; try the next one.
            continue
        return int((moment - epoch).total_seconds())
    return None

def extract_title(input_str: str) -> Optional[str]:
    """Return the text of the first ``<span>`` element in an HTML snippet.

    Args:
        input_str: HTML markup of the title element.

    Returns:
        The span's inner text with HTML entities decoded and surrounding
        whitespace removed, or ``None`` when there is no ``<span>...</span>``
        pair or ``input_str`` is not a string.
    """
    try:
        # DOTALL lets the title span several lines; [^>]* tolerates
        # attributes on the opening tag.
        match = re.search(r'<span\b[^>]*>(.*?)</span>', input_str, flags=re.DOTALL)
    except TypeError:
        return None
    if match is None:
        return None
    # The input is markup, so "&amp;" and friends must be decoded to get the
    # actual title text.
    return html.unescape(match.group(1)).strip()

def extract_views(input_str: str) -> Optional[int]:
    """Parse an abbreviated counter such as ``54``, ``5.4K`` or ``1.2M``.

    The first number found anywhere in ``input_str`` is used. An optional
    one-letter suffix directly after it scales the value: ``K`` = 10**3,
    ``M`` = 10**6, ``B`` = 10**9, ``T`` = 10**12 (case-insensitive).

    Args:
        input_str: Text or markup containing the counter.

    Returns:
        The counter as an integer (any remaining fraction is truncated), or
        ``None`` when no number is found or ``input_str`` is not a string.
    """
    multipliers = {'': 1, 'k': 10 ** 3, 'm': 10 ** 6, 'b': 10 ** 9, 't': 10 ** 12}

    try:
        # Require at least one digit so a stray "." (e.g. "approx. 5K") is
        # skipped instead of being picked up as the number.
        match = re.search(r'(\d+(?:\.\d*)?|\.\d+)([KkMmBbTt]?)', input_str)
    except TypeError:
        return None
    if match is None:
        return None

    number, suffix = match.groups()
    whole, _, fraction = number.partition('.')
    try:
        digits = int(whole + fraction)
    except ValueError:
        return None
    # Scale with integer arithmetic: float multiplication turns "1.005K" into
    # 1004.9999999999999, which truncates to 1004.
    return digits * multipliers[suffix.lower()] // 10 ** len(fraction)

def extract_hub_names(html: str) -> Tuple[str, ...]:
    """Collect the hub names listed in an article snippet.

    Looks for the ``div.tm-article-snippet__hubs`` container and returns the
    stripped text of every ``span.tm-article-snippet__hubs-item`` inside it.

    Args:
        html: HTML markup that may contain the hubs container.

    Returns:
        Hub names in document order; an empty tuple when the container is
        missing or the markup cannot be processed.
    """
    try:
        soup = BeautifulSoup(html, 'html.parser')
        container = soup.find('div', {'class': 'tm-article-snippet__hubs'})
        if container is None:
            return ()
        items = container.find_all('span', {'class': 'tm-article-snippet__hubs-item'})
        return tuple(item.text.strip() for item in items)
    except Exception:
        # Best-effort extraction: ordinary parsing errors give an empty
        # result, while KeyboardInterrupt/SystemExit still propagate.
        return ()

def parse_votes(s: str) -> Optional[Dict[str, int]]:
    """Extract up- and down-vote counts written as ``↑<n> ... ↓<m>``.

    Args:
        s: Text or markup containing the vote summary.

    Returns:
        ``{'upvotes': n, 'downvotes': m}`` for the first ``↑n`` and the
        nearest ``↓m`` that follows it, or ``None`` when the pattern is
        absent or ``s`` is not a string.
    """
    try:
        # Non-greedy so the first ↑ is paired with the closest ↓ rather than
        # the last one in the text; DOTALL lets the two counters sit on
        # different lines.
        match = re.search(r'↑(\d+).*?↓(\d+)', s, flags=re.DOTALL)
        if match is None:
            return None
        upvotes, downvotes = match.groups()
        return {'upvotes': int(upvotes), 'downvotes': int(downvotes)}
    except (TypeError, ValueError):
        return None

def extract_bookmarks_count(text: str) -> Optional[int]:
    """Read the number inside ``<span class="bookmarks-button__counter">``.

    Args:
        text: HTML markup that may contain the bookmarks counter span.

    Returns:
        The counter value; ``0`` when no counter span holding a plain integer
        is found; ``None`` when ``text`` is not a string.
    """
    # [^>]* keeps the match inside the counter's own opening tag. A lazy
    # ".*?>" could skip past it and pick up digits from a later span.
    pattern = r'<span\s+class="bookmarks-button__counter"[^>]*>\s*(\d+)\s*</span>'
    try:
        match = re.search(pattern, text)
        return int(match.group(1)) if match else 0
    except (TypeError, ValueError):
        return None

def extract_href(string: str) -> Optional[str]:
    """Return the first ``href`` in a snippet as an absolute URL.

    Relative links are resolved against ``https://habr.com``; links that are
    already absolute are returned as they are.

    Args:
        string: HTML markup containing an ``href="..."`` attribute.

    Returns:
        The absolute URL, or ``None`` when there is no ``href``, the value
        cannot be resolved, or ``string`` is not a string.
    """
    try:
        match = re.search(r'href="(.+?)"', string)
        if match is None:
            return None
        # urljoin leaves absolute URLs alone, whereas plain concatenation
        # would produce "https://habr.comhttps://...".
        return urljoin('https://habr.com', match.group(1))
    except (TypeError, ValueError):
        return None

def extract_comments_count(text: str) -> Optional[int]:
    """Return the first run of digits in ``text`` as an integer.

    Args:
        text: Text or markup containing the comments counter.

    Returns:
        The parsed number, or ``None`` when ``text`` contains no digits or is
        not a string.
    """
    try:
        match = re.search(r'\d+', text)
        # Handle "no digits" explicitly instead of relying on an
        # AttributeError from None.group().
        return int(match.group()) if match else None
    except (TypeError, ValueError):
        return None


def extract_tags(input_string):
    """Collect the text of every ``<a class="tm-tags-list__link">`` link.

    Args:
        input_string: HTML markup (str) containing the tag links.

    Returns:
        A list of tag names in document order, with HTML entities decoded and
        surrounding whitespace removed; an empty list when there are no tag
        links or ``input_string`` is not a string.
    """
    # [^"]* / [^>]* keep the match inside one opening tag; DOTALL allows the
    # link text to be wrapped in newlines.
    pattern = r'<a class="tm-tags-list__link" href="[^"]*"[^>]*>(.+?)</a>'
    try:
        raw_tags = re.findall(pattern, input_string, flags=re.DOTALL)
    except TypeError:
        return []
    # The input is markup, so entities such as "&amp;" must be decoded.
    return [html.unescape(tag).strip() for tag in raw_tags]


def extract_text(html_string):
    """Concatenate the text of all ``<p>`` elements in an HTML snippet.

    A ``'.'`` is appended after each paragraph's text, so paragraphs are
    separated by at least one period in the result.

    Args:
        html_string: HTML markup to extract paragraph text from.

    Returns:
        The joined paragraph text (an empty string when there are no ``<p>``
        elements), or ``None`` when the markup cannot be processed.
    """
    paragraph_tags = ('p',)
    try:
        soup = BeautifulSoup(html_string, 'html.parser')
        # Build the pieces first and join once instead of growing a string
        # inside the loop.
        pieces = [
            element.get_text() + '.'
            for tag in paragraph_tags
            for element in soup.find_all(tag)
        ]
        return ''.join(pieces)
    except Exception:
        # Best-effort extraction: ordinary parsing errors give None, while
        # KeyboardInterrupt/SystemExit still propagate.
        return None


def extract_urls(html_string):
    """List the absolute http(s) links found in an HTML snippet.

    Args:
        html_string: HTML markup to scan for ``<a href="...">`` elements.

    Returns:
        The ``href`` values, in document order, that start with ``http://``
        or ``https://`` followed by a host-like run of characters; an empty
        list when there are none or the markup cannot be processed.
    """
    # Compiled once per call rather than once per link.
    absolute_url = re.compile(r'^https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')
    try:
        soup = BeautifulSoup(html_string, 'html.parser')
        return [
            link['href']
            for link in soup.find_all('a', href=True)
            if absolute_url.match(link['href'])
        ]
    except Exception:
        # Best-effort extraction: ordinary parsing errors give an empty list,
        # while KeyboardInterrupt/SystemExit still propagate.
        return []


def get_first_n_sentences(text, n=10):
    """Split ``text`` on ``'.'`` and return the first ``n`` non-empty pieces.

    Args:
        text: The text to split. Anything without a usable ``split`` method
            (for example ``None``) yields an empty list.
        n: Maximum number of pieces to return; applied as a list slice.

    Returns:
        A list of whitespace-stripped pieces, skipping empty ones; an empty
        list when ``text`` cannot be split.
    """
    try:
        stripped = (piece.strip() for piece in text.split('.'))
        return [piece for piece in stripped if piece][:n]
    except (AttributeError, TypeError):
        # Only "not a splittable string" is treated as empty input; anything
        # else (including KeyboardInterrupt) is allowed to surface.
        return []



In [91]:
import requests
from bs4 import BeautifulSoup

def parse_hubr_posts(url, timeout=10):
    """Download one article page and extract its metadata into a dict.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A dict with the keys ``author``, ``author_href``, ``date``, ``title``,
        ``title_href``, ``views``, ``hubs``, ``votes``, ``bookmarks``,
        ``comments``, ``tags``, ``hrefs`` and ``text``. Each value comes from
        the matching ``extract_*`` / ``parse_votes`` helper, except
        ``title_href``, which is ``str(url)``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
    }
    # Without a timeout a stalled connection would block forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page into a dict of
    # empty values.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    def markup(tag, attrs):
        """Serialize the first matching element ('None' when it is missing)."""
        return str(soup.find(tag, attrs))

    author_markup = markup('a', {'class': 'tm-user-info__username'})
    body_markup = markup('div', {'id': 'post-content-body'})

    return {
        'author': extract_username(author_markup),
        'author_href': extract_href(author_markup),
        'date': extract_timestamp(
            markup('span', {'class': 'tm-article-datetime-published'})),
        'title': extract_title(
            markup('h1', {'class': 'tm-article-snippet__title tm-article-snippet__title_h1'})),
        'title_href': str(url),
        'views': extract_views(
            markup('span', {'class': 'tm-icon-counter__value'})),
        'hubs': extract_hub_names(
            markup('div', {'class': 'tm-article-snippet__hubs'})),
        'votes': parse_votes(
            markup('div', {'class': 'tm-votes-meter'})),
        'bookmarks': extract_bookmarks_count(
            markup('span', {'class': 'bookmarks-button__counter'})),
        'comments': extract_comments_count(
            markup('span', {'class': 'tm-article-comments-counter-link__value'})),
        'tags': extract_tags(
            markup('div', {'class': 'tm-article-presenter__meta'})),
        'hrefs': extract_urls(body_markup),
        'text': get_first_n_sentences(extract_text(body_markup)),
    }

In [94]:
# Demo run: fetches one article over the network and prints the parsed dict.
print(parse_hubr_posts('https://habr.com/ru/post/720848/'))

{'author': 'dimitryabramov', 'author_href': 'https://habr.com/ru/users/dimitryabramov/', 'date': 1678135430, 'title': 'Тайны мозга. Анализируем данные MRI с помощью FreeSurfer и Python', 'title_href': 'https://habr.com/ru/post/720848/', 'views': 54, 'hubs': ('Python *', 'Мозг', 'Будущее здесь', 'Научно-популярное', 'Визуализация данных *'), 'votes': None, 'bookmarks': 0, 'comments': 0, 'tags': ['neuroscience', 'нейробиология', 'мозг', 'визуализация данных', 'будущее', 'python', 'исследование'], 'hrefs': ['https://surfer.nmr.mgh.harvard.edu/', 'https://surfer.nmr.mgh.harvard.edu/fswiki/DownloadAndInstall', 'https://openneuro.org/', 'https://www.humanconnectome.org/', 'https://brainminds.jp/en/', 'https://www.cancerimagingarchive.net/'], 'text': ['Визуализация мозга это революционное направление в неврологии, оно позволяет исследователям получать беспрецедентное представление о структуре и функциях человеческого мозга', 'Одной из областей, где визуализация показала особые перспективы, яв