In [None]:
import schedule
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from io import StringIO
import os

def scrape_bbc():
    """Scrape up to 10 unique BBC front-page headlines and save them to a CSV.

    Fetches https://www.bbc.com/, collects the text and link of every
    ``<h2 data-testid="card-headline">`` element that is nested inside an
    ``<a>`` tag with an ``href``, pads the list to exactly 10 rows with
    placeholder entries, stamps each row with today's date (DD-MM-YYYY),
    overwrites ``bbc_top10_headlines.csv`` in the practice folder and prints
    the collected rows.

    If the page cannot be fetched (connection error, timeout or an HTTP error
    status), the failure is printed and the function returns early without
    touching the existing CSV, so a scheduler loop calling this function keeps
    running and previously saved data is not replaced by placeholders.

    Returns:
        None
    """
    # Function-scope import: urljoin is not imported at the top of the file.
    from urllib.parse import urljoin

    max_headlines = 10

    # Folder and CSV file
    folder_path = r"C:\Users\vladi\OneDrive\Робочий стіл\DATA SCRAPING PRACTICE"
    os.makedirs(folder_path, exist_ok=True)
    csv_file = os.path.join(folder_path, "bbc_top10_headlines.csv")

    # Get webpage. The timeout prevents a stalled connection from blocking
    # the caller forever; raise_for_status turns 4xx/5xx pages into errors
    # instead of parsing an error page as if it were the front page.
    url = "https://www.bbc.com/"
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Could not fetch {url}: {exc}")
        return
    soup = BeautifulSoup(response.text, "html.parser")

    # Find headline elements
    headlines = soup.find_all("h2", {"data-testid": "card-headline"})

    today = datetime.today().strftime("%d-%m-%Y")

    # Collect up to 10 unique headlines
    articles = []
    seen_titles = set()
    for h in headlines:
        title = h.get_text(strip=True)
        a_tag = h.find_parent("a")
        if a_tag is None or not title or title in seen_titles:
            continue
        href = a_tag.get("href")
        if not href:
            # An anchor without a target gives no usable link; skip it
            # rather than failing with a KeyError.
            continue
        # urljoin leaves absolute URLs untouched and resolves relative and
        # protocol-relative ones against the site root.
        link = urljoin(url, href)
        articles.append({"title": title, "link": link, "date": today})
        seen_titles.add(title)
        if len(articles) >= max_headlines:
            break

    # Fill if fewer than 10 (for later projects)
    articles.extend(
        {"title": "No headline available", "link": "", "date": today}
        for _ in range(max_headlines - len(articles))
    )

    # Convert to DataFrame and save CSV (overwrites the previous file)
    new_df = pd.DataFrame(articles)
    new_df.to_csv(csv_file, index=False)

    # Print confirmation
    print(f"Top {len(new_df)} BBC Headlines Today:")
    for position, row in enumerate(articles, start=1):
        print(f"{position}. {row['title']} -> {row['link']} -> Todays's date is: {row['date']}")

    print(f"Scraped {len(new_df)} headlines today and saved to CSV.")

# Production setting: run the scraper once every day at 8:00 AM.
#schedule.every().day.at("08:00").do(scrape_bbc)
# Test setting: run the scraper every minute so the printed result can be checked quickly.
schedule.every(1).minutes.do(scrape_bbc)

# Keep the scheduler alive for as long as this Jupyter cell is running. The same code can also be run as a standalone Python script from CMD/Terminal, or as a background task with minor changes.
while True:
    schedule.run_pending()
    # time.sleep(60) # original (production) polling interval
    time.sleep(10) # polling interval used while testing
    # Always keep a sleep call in this loop; without it the loop spins continuously and overloads the CPU.

Top 10 BBC Headlines Today:
1. I filmed what it takes to make a family meal in Gaza -> https://www.bbc.co.uk/news/resources/idt-d3d76a1d-f320-4047-8a57-cc6f567f08c0 -> Todays's date is: 06-09-2025
2. Raid on Hyundai plant in US swept up workers on visitor visas -> https://www.bbc.com/news/articles/cy50yge052xo -> Todays's date is: 06-09-2025
3. Trump says Venezuelan jets will be shot down if they endanger US ships -> https://www.bbc.com/news/articles/cr70511v774o -> Todays's date is: 06-09-2025
4. Analysis: What's behind Putin's uncompromising stance? -> https://www.bbc.com/news/articles/c0m40pv44kgo -> Todays's date is: 06-09-2025
5. 'We are the troops': Inside Chicago's split communities as Trump vows to deploy National Guard -> https://www.bbc.com/news/articles/c4gj489q6e0o -> Todays's date is: 06-09-2025
6. Revenge & redemption - Sabalenka v Anisimova in final -> https://www.bbc.com/sport/tennis/articles/cr705zdjm28o -> Todays's date is: 06-09-2025
7. The sunscreen scandal shocking