In [2]:
from site_scraper import SiteScraper

# Per-site scraper settings, keyed by site name. Each inner dict is expanded
# into SiteScraper keyword arguments and the key itself is passed as `name`.
# `traffic` is a per-site figure (units are not stated in this notebook); the
# values are summed into `total_traffic` at the end of the cell.
_site_settings = {
    "gsp": dict(
        base_url="https://www.gsp.ro",
        traffic=3.6,
        time_selector="p.data-author",
        block_selector="div.news-item",
        link_selector="h2 a",
        title_strategy="text",
    ),
    "digisport": dict(
        base_url="https://www.digisport.ro",
        traffic=3.3,
        time_selector="cite",
        block_selector="article.article",
        # Two alternative link patterns, combined in one CSS selector list.
        link_selector="a.article-link, h3 a.widget-latest-list-item-link",
        # Presumably the headline is read from the link's `title` attribute
        # rather than its text — confirm against SiteScraper.
        title_strategy="attribute",
        title_attribute="title",
    ),
    "fanatik": dict(
        base_url="https://www.fanatik.ro",
        traffic=2.9,  # estimate; adjust as needed
        time_selector="span.date",  # date element, often found near article blocks
        block_selector="div.article",  # container for each article
        link_selector="h3 a",  # headline links
        title_strategy="text",
    ),
    "prosport": dict(
        base_url="https://www.prosport.ro",
        traffic=3.1,
        time_selector="span.date",
        block_selector="h2.article__title",  # the title heading itself is the block
        link_selector="a",  # the anchor inside that h2
        title_strategy="text",
    ),
}

# Build the scrapers in the order listed above.
_scrapers = {
    site_name: SiteScraper(name=site_name, **settings)
    for site_name, settings in _site_settings.items()
}

gsp_scraper = _scrapers["gsp"]
digisport_scraper = _scrapers["digisport"]
fanatik_scraper = _scrapers["fanatik"]
prosport_scraper = _scrapers["prosport"]

# All scrapers, plus their combined traffic (passed to compute_weight later).
sites = [gsp_scraper, digisport_scraper, fanatik_scraper, prosport_scraper]
total_traffic = sum([scraper.traffic for scraper in sites])

In [3]:
# How far back to look when scraping, in minutes (360 min = 6 h).
RECENT_WINDOW_MINUTES = 360

# For each configured site, in this order: compute its weight from the combined
# traffic, scrape its recent articles, then save the result via save_to_csv().
for site in sites:
    site.compute_weight(total_traffic)
    site.scrape_recent_articles(minutes=RECENT_WINDOW_MINUTES)
    site.save_to_csv()
    # site.short_print()  # debug print, left disabled


In [4]:
from story_clusterer import StoryClusterer

# Build a StoryClusterer over all configured sites and run the clustering.
# NOTE(review): 180, 0.3 and 0.4 are passed positionally and their meaning is
# not visible in this notebook — presumably a time window plus two thresholds.
# Confirm against StoryClusterer.__init__ and consider passing them by keyword.
clusterer = StoryClusterer(sites, 180, 0.3, 0.4)
clusterer.cluster_stories()

# Prints a report; the saved output below shows scored clusters under the
# heading "Matched Clusters Across Multiple Sites".
clusterer.print_matched_clusters()

Shared article https://www.fanatik.ro/cine-va-fi-noul-selectioner-daca-mircea-lucescu-va-pleca-lista-de-3-a-lui-razvan-burleanu-exclusiv-21273199 on digisport - fanatik skipped https://www.fanatik.ro/cine-va-fi-noul-selectioner-daca-mircea-lucescu-va-pleca-lista-de-3-a-lui-razvan-burleanu-exclusiv-21273199 digisport fanatik
Shared article https://www.prosport.ro/fotbal-intern/nationala/mircea-lucescu-e-out-de-la-nationala-romaniei-culisele-finalului-si-raspunsurile-la-10-intrebari-puse-de-milioane-de-romani-intr-un-material-de-gabriel-berceanu-exclusiv-20269549 on digisport - prosport skipped https://www.prosport.ro/fotbal-intern/nationala/mircea-lucescu-e-out-de-la-nationala-romaniei-culisele-finalului-si-raspunsurile-la-10-intrebari-puse-de-milioane-de-romani-intr-un-material-de-gabriel-berceanu-exclusiv-20269549 digisport prosport

🧠 Cluster #1 — Score: 0.496

🧠 Cluster #2 — Score: 0.256

🧠 Cluster #3 — Score: 0.256

🔍 Matched Clusters Across Multiple Sites

🧠 Cluster #3 — Score: 0.

In [7]:

# Score the clusters, then print at most the first five stories, each followed
# by one line per article (site and title).
# NOTE(review): assumes score_clusters() returns the stories best-first — confirm.
top_stories = clusterer.score_clusters()

for rank, cluster in enumerate(top_stories[:5], start=1):
    print(f"\n🏆 Top {rank} — Score: {cluster['score']}")
    for entry in cluster["articles"]:
        print(f"- {entry.site}: {entry.title}")
