In [1]:
import requests
from newspaper import Article
from bs4 import BeautifulSoup
import csv
import time


In [6]:
def extract_article_from_url(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        return {
            "title": article.title,
            "text": article.text,
            "url": url
        }
    except Exception as e:
        print(f"[ERROR] Failed to extract article from {url}: {e}")
        return None

def get_article_links_from_homepage(homepage_url, limit=10):
    try:
        response = requests.get(homepage_url)
        soup = BeautifulSoup(response.text, "html.parser")
        links = set()

        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            if href.startswith("/") and not href.startswith("//"):
                href = homepage_url.rstrip("/") + href
            if homepage_url in href:
                links.add(href)

        return list(links)[:limit]
    except Exception as e:
        print(f"[ERROR] Failed to get links from {homepage_url}: {e}")
        return []

def scrape_news_site(urls, output_csv, article_limit=10):
    rows = []
    for homepage_url in urls: 
        article_urls = get_article_links_from_homepage(homepage_url, limit=article_limit)
        print(f"Found {len(article_urls)} article links.")

    
        for url in article_urls:
            print(f"[INFO] Scraping article: {url}")
            article = extract_article_from_url(url)
            if article and article["text"].strip():
                rows.append(article)
            time.sleep(1)  # To avoid getting blocked

    with open(output_csv, "w", newline='', encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["title", "text", "url"])
        writer.writeheader()
        writer.writerows(rows)

    print(f"[DONE] Saved {len(rows)} articles to {output_csv}")


In [7]:
urls = [
    "https://www.thesun.co.uk/news/", 
    "https://www.dailymail.co.uk/tvshowbiz", 
    "https://www.dailymail.co.uk/news",
    "https://www.dailymail.co.uk/ushome",
    "https://www.dailymail.co.uk/tv"
]
scrape_news_site(urls, output_csv="July17.csv", article_limit=100)

Found 0 article links.
Found 100 article links.
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14785785/Ana-Armas-Ballerina-action-thriller-BRIAN-VINER.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tv/article-14910969/Loose-Women-Kelle-Bryan-storms-studio-audience-Jane-Street-Porter-ITV.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/article-14912685/The-real-BBC-rich-list-REVEALED-Stacey-Solomon-Richard-Osman-Rylan-Clark-stars-earnings-corporation-dont-publish.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14692421/Kylie-Jenner-fans-convinced-engaged-Timothee-Chalamet-telling-clue-red-carpet-debut.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/news/royals/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/tvshowbiz/news/royals/index.html: Article `download()` failed with 404 Client Error: Not Foun

[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/registration/profile.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14696915/oscar-nominated-actress-justin-timberlake-naked-jessica-biel-upset.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14693255/tlc-divides-fans-1000-lb-spin-off.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14693945/selena-gomez-shuts-benny-blanco-cheating-rumors.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14696203/Al-Pacino-mini-son-wears-rocker-T-shirt.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14893279/Brooklyn-olive-branch-ALISON-BOSHOFF-Beckham-feud-birthday-Harper.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14911353/Ellie-Goulding-shows-sensational-figure-triangle-bikini-Italian-holiday-heading-Tracy-Emin-

[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tv/article-14911679/Love-Island-fakery-row-revealed-couple-knew-hooked.html
Found 100 article links.
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14912755/Lena-Dunham-tattoo-directs-Natalie-Portman-Rashida-Jones-Good-Sex.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14910399/Angela-Rayner-stealth-taxes-Deputy-PM-majors.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14910287/Hosepipe-ban-rules-law-TikTok-videos.html#video
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14910473/Mum-28-punched-policewoman-bragged-deserved-fined-160.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14909663/elmo-x-hacked-sesame-street-responds.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14909979/Elizabeth-Hurley-incredible-fi

[INFO] Scraping article: https://www.dailymail.co.uk/news/health/article-14904343/Plastic-surgeon-spill-secrets-Brad-Pitt-facelift.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14910153/Heartbroken-mother-six-year-old-boy-killed-drug-drink.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tv/article-14908737/CHRISTOPHER-STEVENS-reviews-Mix-Tape-wallow-nostalgic-young-love.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14910585/sussex-stink-plagues-residents-stench-france.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tv/article-14910969/Loose-Women-Kelle-Bryan-storms-studio-audience-Jane-Street-Porter-ITV.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14912041/Duke-Sussex-follows-mother-Princess-Dianas-footsteps-walks-landmines-Angola.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14911657/Julia-Roberts-57-no-longer-looks-like-goes-BLOND

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14903969/Sarah-Jessica-Parker-confirms-dated-actor-Lisa-Marie-Presley.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-14904891/harry-potter-fans-difference-movies.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14902465/Bella-Hadid-bikini-Texas-boyfriend-Adan-Banuelos.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14901907/olivia-culpo-birth-baby-husband-christian-mccaffrey.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/royals/article-14875481/Prince-Philip-believed-Prince-Charles-lacked-dedication-king.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/news/royals/article-14875481/Prince-Philip-believed-Prince-Charles-lacked-dedication-king.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/news/ro

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-14901011/Reclusive-Athina-Onassis-heiress-billion-ball-public.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14905643/jennifer-aniston-boyfriend-jim-curtis-love-mallorca-holiday.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-14907941/motivation-gayle-king-friendship-kris-jenner.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14906909/Jack-Grealish-girlfriend-Sasha-Attwood-toned-figure-bikinis.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14899967/Adele-miss-best-friends-50th-Insiders-KATIE-HIND.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14902953/Spencer-Pratt-reveals-real-reason-didnt-attend-longtime-friend-Brody-Jenners-wedding.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/au/index.html
[ERROR] Failed to extr

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14909153/Lil-shady-Eminems-grandson-Elliot-looks-adorable-dressed-rapper-mom-Hailie-Jade.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14902679/Pregnant-Rihanna-sons-Smurfs-premiere-LA.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/authors
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14907765/Kylie-Jenner-fans-think-beau-Timothee-Chalamet-secretly-yacht.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-14907733/celine-dion-fans-shocked-ellen-degeneres-interview.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14911413/trump-oval-office-crypto-deal-benefit-business.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14912041/Duke-Sussex-follows-mother-Princess-Dianas-footsteps-walks-landmines-Angola.html
[INFO] Scraping article: https://www

[INFO] Scraping article: https://www.dailymail.co.uk/tv/article-14902183/Netflix-bosses-axe-iconic-moment-Pride-Prejudice-avoid-objectifying-men.html
[INFO] Scraping article: https://www.dailymail.co.uk/tv/tv/article-14908963/Love-Island-fans-raging-Tommy-Lucy-dumped.html?ico=livefeed#comments
[INFO] Scraping article: https://www.dailymail.co.uk/tv/news/article-14910345/Emma-Watson-BANNED-driving-caught-doing-38mph-30-zone-fourth-speeding-offence-two-years.html
[INFO] Scraping article: https://www.dailymail.co.uk/tv/article-14904041/The-Couple-Door-star-Annabel-Scholey-reveals-sexy-roles-40s.html
[INFO] Scraping article: https://www.dailymail.co.uk/tv/tv/article-14902183/Netflix-bosses-axe-iconic-moment-Pride-Prejudice-avoid-objectifying-men.html?ico=livefeed#video
[INFO] Scraping article: https://www.dailymail.co.uk/tv/tv/article-14908023/Naga-Munchetty-pay-rise-BBC-Breakfast-host-Charlie-Stayt.html?ico=livefeed#comments
[INFO] Scraping article: https://www.dailymail.co.uk/tv/article-

[INFO] Scraping article: https://www.dailymail.co.uk/tv/tvshowbiz/article-14910215/Cher-79-ageless-visage-Rome-hotel-Dolce-Gabbana-show.html
[INFO] Scraping article: https://www.dailymail.co.uk/tv/tvshowbiz/article-14910597/Dani-Dyer-spotted-mother-Strictly-Come-Dancing.html
[INFO] Scraping article: https://www.dailymail.co.uk/tv/tv/index.html?channelPageNum=6
[INFO] Scraping article: https://www.dailymail.co.uk/tv/podcasts/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/tv/tvshowbiz/article-14912843/rob-kardashian-dating-years-blac-chyna-split.html
[INFO] Scraping article: https://www.dailymail.co.uk/tv/tv/article-14907225/Coronation-Street-legend-returning-cobbles-26-years-dev-alahan.html?ico=livefeed#video
[INFO] Scraping article: https://www.dailymail.co.uk/tv/tvshowbiz/article-14909955/Nicola-Peltz-skimpy-bralette-steamy-Brooklyn-Beckhams-feud-family.html
[INFO] Scraping article: https://www.dailymail.co.uk/tv/tv/index.html?channelPageNum=4
[INFO] Scraping article:

https://www.thesun.co.uk/news/
https://www.thesun.co.uk/tvandshowbiz/
https://www.dailymail.co.uk/tvshowbiz
https://www.dailymail.co.uk/news
