In [7]:
from bs4 import BeautifulSoup
import re
import datetime
from typing import Optional, Tuple, Dict, Union

def extract_username(input_str: str) -> Optional[str]:
    """Pull the user name out of a ``/users/<name>/">`` fragment.

    Args:
        input_str: Markup to scan (the caller passes the string form of the
            author ``<a>`` element).

    Returns:
        The text found between ``/users/`` and the first following ``/">``,
        or ``None`` when the fragment does not occur in ``input_str``.
    """
    # Lazy group: stop at the first `/">` that closes the href attribute.
    found = re.search(r'/users/(.*?)/">', input_str)
    return found.group(1) if found else None

def extract_timestamp(input_str: str) -> Optional[int]:
    """Convert the ``datetime`` attribute of a ``<time>`` tag to whole seconds.

    Args:
        input_str: Markup containing ``<time datetime="..." title="...">``.

    Returns:
        Seconds elapsed between 1970-01-01T00:00:00 and the parsed value,
        truncated to an integer, or ``None`` when no such tag is present.

    Raises:
        ValueError: If the attribute value does not follow
            ``%Y-%m-%dT%H:%M:%S.%fZ``.
    """
    found = re.search(r'<time datetime="(.*?)" title=".*?">', input_str)
    if found is None:
        return None

    # Both datetimes are naive, so the difference is a plain wall-clock delta;
    # it equals Unix time provided the attribute is UTC (the format requires a
    # literal trailing "Z").
    moment = datetime.datetime.strptime(found.group(1), '%Y-%m-%dT%H:%M:%S.%fZ')
    epoch = datetime.datetime(1970, 1, 1)
    elapsed = moment - epoch
    return int(elapsed.total_seconds())

def extract_title(input_str: str) -> Optional[str]:
    """Return the text of the first ``<span>...</span>`` pair in the markup.

    Args:
        input_str: Markup to scan (the caller passes the string form of the
            title link element).

    Returns:
        The span's inner text with surrounding whitespace removed, or ``None``
        when no attribute-less ``<span>`` pair is found on a single line.
    """
    found = re.search(r'<span>(.*?)</span>', input_str)
    if found is None:
        return None
    return found.group(1).strip()

def extract_views(input_str: str) -> Optional[int]:
    """Parse the first abbreviated counter (e.g. ``3.5K``) found in a string.

    Args:
        input_str: Text or markup containing a number optionally followed by
            one of the magnitude suffixes K, M, B or T (case-insensitive).

    Returns:
        The number scaled by its suffix and truncated to an integer, or
        ``None`` when the string contains no number.
    """
    multipliers = {'': 1, 'k': 10**3, 'm': 10**6, 'b': 10**9, 't': 10**12}

    # Match a real decimal literal ("12", "3.5", "12.", ".5"). The previous
    # character class `[\d\.]+` also matched bare runs of dots such as the
    # full stop in "Views. 7", which made float() raise ValueError.
    match = re.search(r'(\d+(?:\.\d*)?|\.\d+)([KkMmBbTt]?)', input_str)
    if match is None:
        return None

    number, suffix = match.groups()
    return int(float(number) * multipliers[suffix.lower()])

def extract_hub_names(html: str) -> Tuple[str, ...]:
    """Collect the hub names listed in an article snippet.

    Args:
        html: Markup expected to contain a
            ``<div class="tm-article-snippet__hubs">`` container.

    Returns:
        The stripped text of every ``tm-article-snippet__hubs-item`` span in
        the container, in document order. An empty tuple is returned when the
        container is absent.
    """
    soup = BeautifulSoup(html, 'html.parser')
    hubs = soup.find('div', {'class': 'tm-article-snippet__hubs'})
    if hubs is None:
        # find() yields None when nothing matches (e.g. the caller passed the
        # string "None" for a missing element); previously this crashed with
        # AttributeError on hubs.find_all.
        return ()
    return tuple(
        hub.text.strip()
        for hub in hubs.find_all('span', {'class': 'tm-article-snippet__hubs-item'})
    )

def parse_votes(s: str) -> Optional[Dict[str, int]]:
    """Read the up/down vote counts written as ``↑N ... ↓M``.

    Args:
        s: Text containing an up-arrow count followed, on the same line, by a
            down-arrow count.

    Returns:
        ``{'upvotes': N, 'downvotes': M}``, or ``None`` when the pattern is
        not found. The gap between the two counts is matched greedily, so if
        several ``↓M`` occur on the line the last one is used.
    """
    found = re.search(r'↑(\d+).*↓(\d+)', s)
    if not found:
        return None
    up, down = (int(digits) for digits in found.groups())
    return {'upvotes': up, 'downvotes': down}

def extract_bookmarks_count(text: str) -> Optional[int]:
    """Read the number inside a ``bookmarks-button__counter`` span.

    Args:
        text: Markup to scan for
            ``<span class="bookmarks-button__counter" ...>N</span>``.

    Returns:
        The counter as an integer, or ``0`` when no such span with a numeric
        body is found (this function never returns ``None``).
    """
    counter_re = re.compile(
        r'<span\s+class="bookmarks-button__counter".*?>\s*(\d+)\s*</span>'
    )
    found = counter_re.search(text)
    return int(found.group(1)) if found is not None else 0

def extract_href(string: str) -> Optional[str]:
    """Return the first ``href`` in the markup as an absolute habr.com URL.

    Args:
        string: Markup containing a double-quoted ``href="..."`` attribute.

    Returns:
        The href resolved against ``https://habr.com``: root-relative paths
        are prefixed with the site origin, while hrefs that are already
        absolute are returned unchanged. ``None`` is returned when there is no
        href or ``string`` is not a ``str``.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    # Explicit checks replace the former bare `except:`, which also hid
    # unrelated failures (and KeyboardInterrupt) behind a None result.
    if not isinstance(string, str):
        return None
    match = re.search(r'href="(.+?)"', string)
    if match is None:
        return None

    # Plain concatenation produced "https://habr.comhttps://..." for absolute
    # links; urljoin handles absolute, protocol-relative and relative hrefs.
    return urljoin('https://habr.com', match.group(1))

def extract_comments_count(text: str) -> Optional[int]:
    """Return the first run of digits in ``text`` as an integer.

    Args:
        text: Text or markup holding the comments counter.

    Returns:
        The first integer found, or ``0`` when ``text`` contains no digits or
        is not a ``str`` (this function never returns ``None``).
    """
    # Explicit not-found handling instead of a bare `except:`, which would
    # also swallow KeyboardInterrupt/SystemExit and mask programming errors.
    if not isinstance(text, str):
        return 0
    match = re.search(r'\d+', text)
    return int(match.group()) if match else 0



In [9]:
import requests
from bs4 import BeautifulSoup

def parse_hubr_posts(url, timeout=10):
    """Download a listing page and extract one record per article snippet.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A list of dicts, one per ``<article class="tm-articles-list__item">``
        element, with the keys ``author``, ``author_href``, ``date``,
        ``title``, ``title_href``, ``views``, ``hubs``, ``votes``,
        ``bookmarks`` and ``comments``. The list is empty when the page
        contains no such elements.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    posts = []
    for post_div in soup.find_all('article', {'class': 'tm-articles-list__item'}):
        # Each element is handed to the extract_* helpers as a string; a
        # missing element becomes the literal string "None".
        author_html = str(post_div.find('a', {'class': 'tm-user-info__username'}))
        date_html = str(post_div.find('span', {'class': 'tm-article-datetime-published'}))
        title_html = str(post_div.find('a', {'class': 'tm-article-snippet__title-link'}))
        views_html = str(post_div.find('span', {'class': 'tm-icon-counter__value'}))
        hubs_html = str(post_div.find('div', {'class': 'tm-article-snippet__hubs'}))
        votes_html = str(post_div.find('div', {'class': 'tm-votes-meter'}))
        bookmarks_html = str(post_div.find('span', {'class': 'bookmarks-button__counter'}))
        comments_html = str(post_div.find('span', {'class': 'tm-article-comments-counter-link__value'}))

        # Key order is kept as before so stored documents have the same layout.
        posts.append({
            'author': extract_username(author_html),
            'author_href': extract_href(author_html),
            'date': extract_timestamp(date_html),
            'title': extract_title(title_html),
            'title_href': extract_href(title_html),
            'views': extract_views(views_html),
            'hubs': extract_hub_names(hubs_html),
            'votes': parse_votes(votes_html),
            'bookmarks': extract_bookmarks_count(bookmarks_html),
            'comments': extract_comments_count(comments_html),
        })

    return posts


In [14]:
import pymongo
# Client for the MongoDB server addressed by the URI below (localhost, port 27017).
cluster = pymongo.MongoClient("mongodb://localhost:27017")
# Handle to the "HabrDB" database, looked up by name on the client.
db = cluster["HabrDB"]
# Handle to the "HabrCl" collection; test_func inserts the scraped posts into it.
collection = db["HabrCl"]


In [18]:
def test_func(n,query = False):
    """Scrape up to ``n`` listing pages and store the posts in ``collection``.

    Args:
        n: Number of pages to request, starting from page 1.
        query: Search text. When truthy, search-result pages for it are
            scraped; otherwise the general "all posts" feed is used.

    Returns:
        None. Scraping stops at the first page that yields no posts or raises
        an error, and the reason is printed.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    # Percent-encode the query once, before the loop. The previous
    # `replace(' ', '%')` produced a malformed escape and was re-applied on
    # every iteration.
    encoded_query = quote(query, safe='') if query else None

    for i in range(1, n + 1):
        if encoded_query:
            url = f'https://habr.com/ru/search/page{i}/?q={encoded_query}&target_type=posts&order=relevance'
        else:
            url = f'https://habr.com/ru/all/page{i}/'
        try:
            posts = parse_hubr_posts(url)
            if not posts:
                # insert_many rejects an empty list; stop explicitly instead of
                # relying on that exception.
                print(f'No posts found on page {i} ({url}); stopping.')
                return None
            collection.insert_many(posts)
        except Exception as exc:
            # Still stop on the first failure, but report it instead of
            # swallowing it (and let KeyboardInterrupt propagate).
            print(f'Stopped at page {i} ({url}): {exc!r}')
            return None

test_func(50,'python')



<pymongo.results.InsertManyResult at 0x1ee951f44c0>