In [1]:
import os
import time
from datetime import datetime
from io import StringIO
from urllib.parse import urljoin

import pandas as pd
import requests
import schedule
from bs4 import BeautifulSoup

In [2]:
# Output location for the CSV export; the folder is created if it does not exist yet.
# The directory can be overridden with the BBC_HEADLINES_DIR environment variable so the
# notebook runs on another machine without editing this cell. When the variable is unset
# or empty, the original hard-coded folder is used.
folder_path = os.environ.get("BBC_HEADLINES_DIR") or r"C:\Users\vladi\OneDrive\Робочий стіл\DATA SCRAPING PRACTICE"
os.makedirs(folder_path, exist_ok=True)
csv_file = os.path.join(folder_path, "bbc_top10_headlines.csv") # choose your own name for the CSV file


In [3]:
# Load the previously saved headlines when the CSV is present; otherwise start from an
# empty frame that already has the expected columns. The titles found in that frame are
# collected into a set so that later cells can skip headlines that were stored before.
existing_df = (
    pd.read_csv(csv_file)
    if os.path.exists(csv_file)
    else pd.DataFrame(columns=["date", "title", "link"])
)
seen_titles = set(existing_df["title"].tolist())


In [4]:
# Download the BBC front page and parse it into a BeautifulSoup tree.
# A browser-like User-Agent header is sent because, per the original author's note,
# the default one can be answered with a 403.
url = "https://www.bbc.com/"
headers = {"User-Agent": "Mozilla/5.0"} # use your own User-Agent if necessary
# requests applies no timeout by default, so bound the wait explicitly instead of
# letting the cell hang on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)
# Fail loudly on 4xx/5xx rather than parsing an error page into zero headlines.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

In [5]:
# Collect up to 10 unique BBC headlines with absolute links, skipping titles that are
# already stored in the CSV. Each entry is a dict with "title", "link" and "date" keys;
# "date" is left empty in this cell.
# The data-testid="card-headline" attribute is used as the selector because, per the
# original author's note, it is more stable than the generated class names.
headlines = soup.find_all("h2", {"data-testid": "card-headline"})

# Rebuild the seen-title set from the CSV contents: the loop below adds to it, so
# resetting it here keeps this cell repeatable when it is run more than once.
seen_titles = set(existing_df["title"].tolist())

articles = []

for h in headlines:
    if len(articles) >= 10:
        break

    title = h.get_text(strip=True)
    a_tag = h.find_parent("a")
    # .get() returns None for an anchor without an href instead of raising KeyError.
    href = a_tag.get("href") if a_tag else None
    if not href or not title or title in seen_titles:
        continue

    # urljoin leaves absolute URLs untouched and resolves site-relative ("/news/..."),
    # slash-less ("news/...") and protocol-relative ("//host/...") hrefs against the page URL.
    link = urljoin(url, href)
    articles.append({"title": title, "link": link, "date": ""})
    seen_titles.add(title)

In [6]:
# Pad the list with placeholder entries up to 10 items for the case where fewer than
# 10 headlines were collected. Every placeholder is its own dict object.
articles.extend(
    {"title": "No headline available", "link": "", "date": ""}
    for _ in range(10 - len(articles))
)

In [7]:
# Keep at most the first ten entries: a new list that holds the same dict objects as `articles`.
top_10_articles = list(articles[:10])

In [8]:
# Stamp every selected entry with today's date as day-month-year
# (edit the format spec below to choose another date format).
today = f"{datetime.today():%d-%m-%Y}"
for entry in top_10_articles:
    entry.update(date=today)

In [9]:
# Build the export DataFrame from the truncated, dated selection rather than from the raw
# `articles` list, so the frame is limited to the top-10 entries by construction instead of
# relying on `articles` sharing dict objects with `top_10_articles` and never exceeding 10 items.
new_df = pd.DataFrame(top_10_articles)


In [10]:
# Write the freshly collected headlines to the CSV without the index column,
# replacing whatever the file held before.
# NOTE(review): the scrape cell skips titles already present in this file, yet the file is
# overwritten here, so previously stored rows are not kept — confirm whether appending was intended.
new_df.to_csv(path_or_buf=csv_file, index=False)


In [11]:
# Sanity check: print a header, one numbered line per collected headline, and the row count.
# NOTE(review): "Todays's" in the line label is a typo; it is kept unchanged here so the
# printed text stays identical.
print(f"Top {len(new_df)} BBC Headlines Today:")
for position, record in enumerate(new_df.itertuples(index=False), start=1):
    print(f"{position}. {record.title} -> {record.link} -> Todays's date is: {record.date}")

print(f"Total headlines in CSV: {len(new_df)}")


Top 10 BBC Headlines Today:
1. I filmed what it takes to make a family meal in Gaza -> https://www.bbc.co.uk/news/resources/idt-d3d76a1d-f320-4047-8a57-cc6f567f08c0 -> Todays's date is: 06-09-2025
2. Raid on Hyundai plant in US swept up workers on visitor visas -> https://www.bbc.com/news/articles/cy50yge052xo -> Todays's date is: 06-09-2025
3. Analysis: What's behind Putin's uncompromising stance? -> https://www.bbc.com/news/articles/c0m40pv44kgo -> Todays's date is: 06-09-2025
4. 'We are the troops': Inside Chicago's split communities as Trump vows to deploy National Guard -> https://www.bbc.com/news/articles/c4gj489q6e0o -> Todays's date is: 06-09-2025
5. Trump says Venezuelan jets will be shot down if they endanger US ships -> https://www.bbc.com/news/articles/cr70511v774o -> Todays's date is: 06-09-2025
6. Revenge & redemption - Sabalenka v Anisimova in final -> https://www.bbc.com/sport/tennis/articles/cr705zdjm28o -> Todays's date is: 06-09-2025
7. The sunscreen scandal shocking