### Importing the required libraries

In [6]:
import newspaper
import feedparser
import pandas as pd
import os
import time
import logging


In [None]:
# --- Scraper settings: everything a reader might want to tune ---

# Feeds whose entries will be scraped.
RSS_FEEDS = ["https://feeds.bbci.co.uk/news/rss.xml"]

# Upper bound on the number of entries taken from each feed.
MAX_ARTICLES_PER_FEED = 30

# Pause (seconds, passed to time.sleep) between article downloads.
DELAY_BETWEEN_ARTICLES = 0.5


In [7]:
# Output locations, all derived from the current working directory so the
# notebook carries no machine-specific absolute paths.
BASE_DIR = os.getcwd()
OUTPUT_DIR = os.path.join(BASE_DIR, "outputs")
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "news_articles.csv")

# Create the output folder up front; exist_ok makes re-running the cell safe.
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [None]:
# Report progress through the logging module rather than print():
# INFO and above are shown, each record as "timestamp | level | message".
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(message)s",
    level=logging.INFO,
)


In [9]:
# Parse RSS Feed Function
def parse_rss_feed(feed_url):
    """Fetch one RSS feed and return its leading entries.

    Args:
        feed_url: Address of the feed, handed unchanged to ``feedparser.parse``.

    Returns:
        The first ``MAX_ARTICLES_PER_FEED`` entries of the parsed feed, in feed
        order. The list is empty when the feed produced no entries.
    """
    feed = feedparser.parse(feed_url)

    # feedparser signals fetch/parse problems through the ``bozo`` flag rather
    # than raising, so without this check an unreachable or malformed feed
    # would silently contribute zero articles. Entries that were recovered
    # are still returned below.
    if feed.get("bozo"):
        logging.warning(
            "Problem reading feed %s | %s", feed_url, feed.get("bozo_exception")
        )

    # Limit number of articles to avoid overload
    return feed.entries[:MAX_ARTICLES_PER_FEED]


In [10]:
# Scrape Single Article Function
def _fallback_publish_date(fallback_data):
    """Return the RSS entry's publication time, as a datetime when possible.

    Prefers the entry's ``published_parsed`` time tuple (feedparser documents
    it as UTC) so the value has the same type as a date extracted by
    newspaper. Falls back to the raw ``published`` string, or ``""`` when the
    entry has neither.
    """
    from datetime import datetime, timezone  # local import: only this helper needs it

    parsed = fallback_data.get("published_parsed")
    if parsed:
        try:
            return datetime(*parsed[:6], tzinfo=timezone.utc)
        except (TypeError, ValueError):
            # Unusable time tuple: deliberately fall through to the raw string.
            pass
    return fallback_data.get("published", "")


def scrape_article(article_url, fallback_data):
    """Download and parse a single article into a flat record.

    Args:
        article_url: URL of the article to download.
        fallback_data: Mapping (the RSS entry) supplying ``title`` and
            ``published``/``published_parsed`` when extraction finds nothing.

    Returns:
        dict with keys ``title``, ``authors`` (comma-separated string),
        ``publish_date``, ``content`` (whitespace-stripped text) and
        ``source_url``.

    Exceptions raised by ``download()`` / ``parse()`` are not caught here;
    the caller decides how to handle a failed article.
    """
    article = newspaper.Article(article_url)
    # Download article HTML
    article.download()
    # Parse article content
    article.parse()

    return {
        # Prefer newspaper extracted data, fallback to RSS data
        "title": article.title or fallback_data.get("title", ""),
        "authors": ", ".join(article.authors) if article.authors else "",
        # The fallback is converted to a datetime where possible so the
        # column does not mix datetime objects with raw RSS date strings.
        "publish_date": article.publish_date or _fallback_publish_date(fallback_data),
        "content": article.text.strip(),
        "source_url": article_url,
    }


In [11]:
# Main Scraping Function
def scrape_news_from_feeds(feed_urls, min_content_length=200):
    """Scrape every article linked from the given RSS feeds.

    Args:
        feed_urls: Iterable of feed URLs; each is read with ``parse_rss_feed``.
        min_content_length: Articles whose extracted text is shorter than
            this many characters are discarded (default 200).

    Returns:
        List of article dicts as produced by ``scrape_article``, in the order
        they were scraped. Each URL is attempted at most once across all
        feeds; articles that fail to scrape are logged and skipped.
    """
    all_articles = []
    seen_urls = set()

    for feed_url in feed_urls:
        for entry in parse_rss_feed(feed_url):
            url = entry.get("link")
            # Skip invalid or duplicate URLs
            if not url or url in seen_urls:
                continue
            # Mark the URL before fetching: a link that was rejected or failed
            # should not be downloaded again when another feed repeats it.
            seen_urls.add(url)

            try:
                article_data = scrape_article(url, entry)
                # Skip very short or empty articles
                if len(article_data["content"]) < min_content_length:
                    continue

                all_articles.append(article_data)
                logging.info("Scraped: %s...", article_data["title"][:60])

            except Exception as e:
                # Best effort: one bad article must not abort the whole run.
                logging.warning("Failed to scrape %s | %s", url, e)

            finally:
                # Pause after every attempt, including skipped and failed
                # ones, so errors never turn into back-to-back requests.
                time.sleep(DELAY_BETWEEN_ARTICLES)

    return all_articles


In [12]:
# Run Scraper & Save Output

articles = scrape_news_from_feeds(RSS_FEEDS)

# Arrange columns for analytics friendliness.
# Passing `columns=` to the constructor fixes the column order and still gives
# a valid (header-only) frame when nothing was scraped; selecting the columns
# after construction raises KeyError on an empty result.
OUTPUT_COLUMNS = ["title", "authors", "publish_date", "content", "source_url"]
df = pd.DataFrame(articles, columns=OUTPUT_COLUMNS)

df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8")

print(f"Total articles scraped: {len(df)}")
print(f"Saved to: {OUTPUT_FILE}")


2026-02-12 21:39:02,406 | INFO | Scraped: Minneapolis immigration enforcement operation to 'conclude',...
2026-02-12 21:39:03,068 | INFO | Scraped: 'Vast majority' of parents should be involved if children qu...
2026-02-12 21:39:03,718 | INFO | Scraped: Trump case against BBC to go to trial in February 2027...
2026-02-12 21:39:04,389 | INFO | Scraped: Jim Ratcliffe sorry language 'offended some' after immigrati...
2026-02-12 21:39:05,054 | INFO | Scraped: Kim Ju Ae: North Korea leader Kim Jong Un chooses daughter a...
2026-02-12 21:39:05,730 | INFO | Scraped: Robin Windsor: Strictly star took own life after mental heal...
2026-02-12 21:39:06,455 | INFO | Scraped: Reeves says 'more to do' after sluggish GDP growth...
2026-02-12 21:39:07,323 | INFO | Scraped: Alton Towers U-turns on plan to restrict disability pass for...
2026-02-12 21:39:08,269 | INFO | Scraped: 2026 Winter Olympics: Why Vladyslav Heraskevych was banned f...
2026-02-12 21:39:09,162 | INFO | Scraped: Team GB's Matt Westo

Total articles scraped: 26
Saved to: c:\Users\shukl\OneDrive\Desktop\Data-Analytics-Portfolio\Python\Web-Scraping\News-Scraping\outputs\news_articles.csv
