In [1]:
import requests
import logging
from bs4 import BeautifulSoup
from enum import Enum
from time import sleep
from requests import Response
import sys
import math

In [2]:
log = logging.getLogger()
log.setLevel(logging.DEBUG)
if not log.hasHandlers():
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter("%(asctime)s: %(levelname)s - %(message)s")
    handler.setFormatter(formatter)
    log.addHandler(handler)
log.debug("Logger is set up")

# Parser

In [3]:
"""
Aboba
"""


class ParserException(Exception):
    pass


def get_html(url: str) -> Response:
    response = requests.get(url)
    response.raise_for_status()
    return response


class Category(Enum):
    All = "all"
    Invest = "Invest"

    def __str__(self) -> str:
        return self.value


def get_latest_article_urls_from_category(category: Category) -> set[str]:
    """
    Get latest urls of the articles for the provided category
    """
    source_category = f"https://www.rbc.ru/quote/category/{category}/"
    log.info(f"source category is {source_category}")

    category_content = get_html(source_category).content
    soup = BeautifulSoup(category_content, "html.parser")

    found_articles = set()

    for article in soup.find_all("a", class_="q-item__link"):
        log.info(f"fetched link {article} from category {source_category}")
        found_articles.add(article["href"])  # type: ignore

    return found_articles


def parse_article_from_content(article_content: bytes) -> str:
    """
    Parse an article content into plain text
    """
    soup = BeautifulSoup(article_content, "html.parser")

    content_container = soup.find("div", class_="article__text")
    if not content_container:
        raise ParserException("Could not find article text")

    title_element = soup.find("h1")
    title = title_element.get_text(strip=True) if title_element else "No title"

    article_text = " ".join(
        [
            " ".join(
                element.get_text()
                .strip(" \n")
                .replace("\xa0", " ")
                .replace("\n", " ")
                .split()
            )
            for element in content_container.find_all(  # type: ignore
                ["h1", "h2", "h3", "h4", "h5", "h6", "p"]
            )
        ]
    )

    return f"{title} - {article_text}"

In [4]:
articles_urls = get_latest_article_urls_from_category(Category.All)
articles = []
for url in articles_urls:
    log.info(f'fetching and parsing an article at "{url}"...')
    articles.append(parse_article_from_content(get_html(url).content))
    log.info(f'an article at "{url}" was fetched and parsed')
    sleep(1)

2025-09-14 22:52:33,994: INFO - source category is https://www.rbc.ru/quote/category/all/
2025-09-14 22:52:34,895: INFO - fetched link <a class="q-item__link" href="https://www.rbc.ru/quote/news/article/68c12edd9a794795d341f495">
<span class="q-item__title js-rm-central-column-item-text">
                                                            Что такое единый QR-код и как он будет работать в России
                                                </span>
</a> from category https://www.rbc.ru/quote/category/all/
2025-09-14 22:52:34,896: INFO - fetched link <a class="q-item__link q-item__order" href="https://www.rbc.ru/quote/news/article/68c12edd9a794795d341f495">
<span class="q-item__image-block q-item__image-block_ticker">
<img alt="Фото: Shutterstock" class="g-image q-item__image js-rm-central-column-item-image" decoding="async" height="800" loading="lazy" src="https://s0.rbk.ru/v6_top_pics/media/img/5/63/347574971010635.jpeg" srcset="https://s0.rbk.ru/v6_top_pics/resized/810x405_

In [5]:
avg_letter_count = sum(map(len, articles)) / len(articles)
avg_word_count = sum(map(lambda t: t.count(" "), articles)) / len(articles)

print(f"average letter count is {avg_letter_count:.2f}")
print(f"average word count is {avg_word_count:.2f}")
print(
    f"need at least {math.ceil(8 * 2**20 / avg_letter_count)} documents for the best mark"
)

average letter count is 5001.17
average word count is 716.17
need at least 1678 documents for the best mark
