In [None]:
import sys
from pathlib import Path

sys.path.append(str(Path().resolve() / "src"))

from savers.mongo import MongoSaver
from scrapers.rbc import RBCGetter, RBCParser
from scrapers.habr import HabrGetter, HabrParser
from config.config import Config
from log import log

from common import ParsedScrap, Scrap
from config.config import SiteConfig
from savers.interfaces import ISaver
from scrapers.interfaces import IGetter, IParser

import logging

# Set the project logger's threshold to INFO so the log.info(...) progress
# messages used throughout this notebook are emitted.
# NOTE(review): assumes `log` behaves like a standard logging.Logger — confirm.
log.setLevel(logging.INFO)

In [None]:
def log_first_n(set: set, n: int):
    """Log at most ``n`` items of ``set`` at INFO level, one per line.

    Each item is written as a tab character followed by the item's string
    form. Items are taken in the collection's own iteration order. Nothing is
    logged when ``n`` is zero or negative, or when the collection is empty.

    Note: the parameter name shadows the builtin ``set`` inside this function.
    """
    for position, item in enumerate(set):
        # Stop as soon as n items have been written.
        if position >= n:
            break
        log.info(f"\t{item}")

In [None]:
# Number of entries passed to log_first_n when previewing fetched sources
# and parsed scraps in the log.
ENTITIES_LOG_COUNT: int = 10
# Site key -> (getter class, parser class) used to scrape that site.
# The scraping loop indexes this dict with the keys of ``config.sites``,
# so every configured site needs an entry here.
MAPPING: dict[str, tuple[type[IGetter], type[IParser]]] = {
    "rbc": (RBCGetter, RBCParser),
    "habr": (HabrGetter, HabrParser),
}

In [None]:
# Build the Config from scraper.yml located one directory above the current
# working directory (Path().resolve() is the absolute cwd).
config = Config(Path().resolve() / ".." / "scraper.yml")
# NOTE(review): this writes the whole config object to the log / cell output;
# if its string form includes the Mongo settings, credentials may be exposed —
# confirm what Config's repr contains.
log.info(f"got config = {config}")

In [None]:
# Single saver instance, constructed from the config's Mongo settings; the
# scraping loop passes this same object to save() for every site.
saver = MongoSaver(config.mongo_settings)

In [None]:
def get(getter: IGetter, cfg: SiteConfig) -> set[Scrap]:
    """Fetch the list of sources from ``getter``, log a summary, then fetch the scrap.

    Logs, in order: the configured ``cfg.doc_limit``, a preview of the first
    ``ENTITIES_LOG_COUNT`` sources, and the number of sources together with a
    rough estimate of the total fetch time.

    Returns whatever ``getter.fetch_scrap`` returns for the fetched sources.
    """
    log.info(f"going to parse at most {cfg.doc_limit} documents")

    found = getter.fetch_sources()
    log.info(f"first {ENTITIES_LOG_COUNT} sources = ")
    log_first_n(found, ENTITIES_LOG_COUNT)

    # One crawl delay per source, converted to hours by dividing by 60 twice.
    # NOTE(review): presumably crawl_delay is in seconds — confirm.
    eta_hours = len(found) * cfg.crawl_delay / 60 / 60
    log.info(
        f"fetched {len(found)} articles, fetching all of them will take about {eta_hours:.3f} hours"
    )

    return getter.fetch_scrap(found)


def parse(parser: IParser, scrap: set[Scrap], cfg: SiteConfig) -> set[ParsedScrap]:
    """Run ``parser.parse_scrap`` over ``scrap`` and return its result.

    ``cfg`` is accepted but currently unused.
    """
    parsed = parser.parse_scrap(scrap)
    return parsed


def save(saver: ISaver, parsed_scrap: set[ParsedScrap], cfg: SiteConfig) -> None:
    """Hand ``parsed_scrap`` to ``saver.save_parsed_scrap``.

    ``cfg`` is accepted but currently unused. Returns ``None``.
    """
    saver.save_parsed_scrap(parsed_scrap)
    return None


# Scrape every configured site in turn: fetch -> parse -> save.
# A failure on one site is logged and does not stop the remaining sites.
for site, cfg in config.sites.items():
    # Look the site up explicitly so a key missing from MAPPING produces a
    # clear message instead of a bare KeyError text.
    scraper_classes = MAPPING.get(site)
    if scraper_classes is None:
        log.error(
            f"scraping {site} with cfg = {cfg} failed: "
            f"no getter/parser registered in MAPPING (known sites: {sorted(MAPPING)})"
        )
        continue

    try:
        log.info(f"started scraping {site} with cfg = {cfg}")
        Getter, Parser = scraper_classes
        getter = Getter(cfg)  # type: ignore
        parser = Parser()

        scrap = get(getter, cfg)
        parser.info_scrap(scrap)

        parsed_scrap = parse(parser, scrap, cfg)
        # Drop this cell's reference to the raw scrap once it has been parsed.
        del scrap
        parser.info_parsed_scrap(parsed_scrap)
        log.info(f"first {ENTITIES_LOG_COUNT} parsed scraps = ")
        # Preview distinct values only (a set, as before).
        log_first_n({item.value for item in parsed_scrap}, ENTITIES_LOG_COUNT)

        save(saver, parsed_scrap, cfg)

        # Drop the reference before moving on to the next site.
        del parsed_scrap
    except Exception as e:
        # Best-effort per site: record the failure and continue with the next one.
        # log.exception logs at ERROR level and appends the traceback, which a
        # plain log.error of str(e) would lose.
        log.exception(f"scraping {site} with cfg = {cfg} failed: {e}")
