In [1]:
import requests
from newspaper import Article
from bs4 import BeautifulSoup
import csv
import time
import pandas as pd

In [2]:
def extract_article_from_url(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        return {
            "title": article.title,
            "text": article.text,
            "url": url
        }
    except Exception as e:
        print(f"[ERROR] Failed to extract article from {url}: {e}")
        return None

def get_article_links_from_homepage(homepage_url, limit=10):
    try:
        response = requests.get(homepage_url)
        soup = BeautifulSoup(response.text, "html.parser")
        links = set()

        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            if href.startswith("/") and not href.startswith("//"):
                href = homepage_url.rstrip("/") + href
            if homepage_url in href:
                links.add(href)

        return list(links)[:limit]
    except Exception as e:
        print(f"[ERROR] Failed to get links from {homepage_url}: {e}")
        return []

def scrape_news_site(urls, output_csv, article_limit=10):
    rows = []
    for homepage_url in urls: 
        article_urls = get_article_links_from_homepage(homepage_url, limit=article_limit)
        print(f"Found {len(article_urls)} article links.")

    
        for url in article_urls:
            print(f"[INFO] Scraping article: {url}")
            article = extract_article_from_url(url)
            if article and article["text"].strip():
                rows.append(article)
            time.sleep(1)  # To avoid getting blocked

    with open(output_csv, "w", newline='', encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["title", "text", "url"])
        writer.writeheader()
        writer.writerows(rows)

    print(f"[DONE] Saved {len(rows)} articles to {output_csv}")


In [3]:
urls = [
    "https://www.dailymail.co.uk/tvshowbiz", 
    "https://www.dailymail.co.uk/news",
    "https://www.dailymail.co.uk/ushome",
    "https://www.dailymail.co.uk/tv"
    "https://www.dailymail.co.uk/auhome/",
    "https://www.dailymail.co.uk/news/royals/"
    "https://www.dailymail.co.uk/home/"
]
scrape_news_site(urls, output_csv="Today.csv", article_limit=1000)

Found 193 article links.
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14922959/Harper-Beckham-14-takes-leaf-mum-Victorias-beauty-book-leading-make-tutorial.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/additionalcookieinfo
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14923411/Ellen-DeGeneres-confirms-left-US-President-Trump.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14697029/Buffy-Vampire-Slayer-fans-meltdown-possibility-beloved-character-join-reboot.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/topics
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14699803/Kelly-Clarkson-telling-remarks-demanding-talk-gig-mystery-absence.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14693163/Cristiano-Ronaldos-girlfriend-dress-inspired-Princess-Diana.html
[INFO] Scraping a

[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/article-14873949/BRIAN-VINER-reviews-Jurassic-World-Rebirth-Snap-bite-T-Rex-ROARS-again.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/terms
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/article-14912547/80s-movie-icon-Al-Pacino-unrecognizable-rare-outing-guess-who.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/article-14855447/30-carat-ring-Bezos-wedding-story-Boshoff.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14923143/Dr-Ranj-boyfriend-lavish-wedding-hard-launch.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14700523/amber-heard-welcomes-newborn-twins-daughter-son.html
[INFO] Scraping article: https://www.dailymail.co.

[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tv/article-14923561/Love-Island-viewers-slam-disrespectful-Dejon-rude-Yasmin.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14693847/model-irina-shayk-stare-famous-ex-nyc-street.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/textbased/channel-1/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/tvshowbiz/textbased/channel-1/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/tvshowbiz/textbased/channel-1/index.html on URL https://www.dailymail.co.uk/tvshowbiz/textbased/channel-1/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/article-14923331/Ulrika-Jonsson-hits-ageist-comments-trolled.html#video
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14922863/Janet-Jackson-surprise-appearance-rarely-seen-son-Jacksons-gig-Reading.h

[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14923363/Heidi-Klum-bikini-pert-derriere-cheeky-video.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14696915/oscar-nominated-actress-justin-timberlake-naked-jessica-biel-upset.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/article-14923331/Ulrika-Jonsson-hits-ageist-comments-trolled.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/article-14923379/Love-Island-winner-Millie-Court-sends-temperatures-soaring-skimpy-black-bikini-sun-soaked-girls-holiday-Ibiza.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14694537/Brian-Austin-Green-Vanessa-Marcil-Machine-Gun-Kelly-meme-toxic.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14923317/jennifer-garner-reunites-violet-affleck-fight-la-fires.html
[INFO] Scr

[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14921643/Davina-Mccall-six-pack-workout-brain-tumour.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/femail/article-14915623/miley-cyrus-fans-horrified-resurfaced-interviews-reporters-sex-pregnancy.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14922687/Scuffle-breaks-cast-member-raises-Palestinian-flag-Royal-Opera-House-stage.html#video
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14923855/Dame-Joanna-Lumley-speaks-favour-assisted-dying.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14923047/Exodus-gardeners-upset-King-Charles-red-letter-feedback-beloved-Highgrove-garden.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14922809/reason-Epstein-list-never-released-update-Trump.html#video
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14923495/Missing-g

[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14923799/I-cured-one-deadliest-forms-cancer-trialling-new-drug-brain-cancer-gone.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14920469/bravo-star-quits-reality-cheating.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14921889/Gareth-Southgate-KATIE-HIND-Joseph-Fiennes-spitting-image-ex-England-coach-TV-remake-stage-hit-lookalikes.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14922839/Superman-director-condemns-DC-Studios-terrible-handling-Henry-Cavill-announcing-actors-return-Man-Steel-handing-role-David-Corenswet.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14923675/Ian-Wilkinson-Erin-Patterson-church.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14923187/epping-bell-hotel-essex-anti-migrant-protests-ethiopian-man-tried-kiss-14-ye

[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14915425/Violent-cult-mafia-Black-Axe-gang-tortures-recruits-naked-blood-drinking-initiation-British-teens.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14921129/Jamie-Laing-pregnancy-update-wife-Sophie-Habboo-life-baby.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14923531/belgian-football-barber-honeytrap-kidnap-gang-London-500k-cryptocurrency.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/tv/article-14921841/Danny-Dyer-Stephen-Graham-reunite-drama-Netflix-Adolescence.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14920833/Brian-Nolan-given-clear-prostate-cancer-battle-Coleen-Linda.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14921925/Former-child-star-actor-unrecognizable-rare-outing-guess-who.html
[INFO] Scraping article: https://www.dailyma

[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14923211/wendy-williams-public-outing-birthday-guardianship.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14882663/Couple-sue-Waitrose-race-discrimination-sacked-came-unauthorised-holiday-tan.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14922035/Zoe-Kravitz-sparks-concern-shockingly-appearance-NYC-surprise-Emmy.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14922453/wedding-Charli-XCX-husband-George-Daniel-cried.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14923593/Mandy-Moore-Chrissy-Metz-update-house-fire.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14921975/Kimberley-Walsh-Girls-Aloud-different-stage-tribute-late-Sarah-Harding-reunion-kids.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14923209/Huge-pop-st

[INFO] Scraping article: https://www.dailymail.co.uk/news/news/mostread/index.html#news
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/news/mostread/index.html#news: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/news/mostread/index.html#news on URL https://www.dailymail.co.uk/news/news/mostread/index.html#news
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14922553/Ryanair-staff-bonuses-passengers-oversized-cabin-bags.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14919845/denise-richards-aaron-phypers-marriage-toxic-reality-show.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14923525/Taylor-Swift-Selena-Gomez-glamorous-33-birthday-party.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14923665/Hunky-Hollyoaks-star-signs-celebrity-dating-app-Raya-returns-UK-lov

[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/news/royals/article-14907619/Prince-Harry-Meghan-Markle-trip-Colombia-Spare.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/news/royals/article-14907619/Prince-Harry-Meghan-Markle-trip-Colombia-Spare.html on URL https://www.dailymail.co.uk/news/news/royals/article-14907619/Prince-Harry-Meghan-Markle-trip-Colombia-Spare.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14922575/Gino-DAcampo-apology-tour-kisses-employees-uncancelled-ITV.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14923613/Taliban-kills-10-Afghans-helped-West-data-leak-disaster.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14922673/Bitcoin-Rachel-Reeves-UK-finances-seized-criminal-crypto-cash.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/podcasts/soccer-az/inde

[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14922641/Home-Office-social-media-influencer-resigns-Palestine-Action.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14921313/KATIE-HIND-Gallaghers-reunited-Meg-Noels-rock.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/femail/article-14922793/Clarksons-Farm-star-grew-wanting-ballerina-make-fortune-shows-break-star.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/home/sitemap.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/home/sitemap.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/home/sitemap.html on URL https://www.dailymail.co.uk/news/home/sitemap.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14923167/Stonehenge-village-speeding-death-World-Heritage-safety-signs.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/ar

[INFO] Scraping article: https://www.dailymail.co.uk/news/femail/article-14905329/Tragic-spiral-caused-reclusive-heiress-2-7-billion-fortune-retreat-shadows-Athina-Onassis-haunted-childhood-grief-humiliating-heartbreak-slowly-steps-public-life.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14923315/Nigel-Farage-pledges-tackle-lawless-Britain-radical-plan-offenders-sent-jails-El-Salvador.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14922943/barack-obama-gay-role-models-boys-ignorant.html#video
[INFO] Scraping article: https://www.dailymail.co.uk/news/sciencetech/article-14915535/went-prison-genetically-editing-human.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14922747/Health-tourists-cost-NHS-200million.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14919171/Family-fury-12-year-old-girl-stepfather-diversity-day-Union-Jack-dress.html?ico=comment-anchor#com

[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14922673/Bitcoin-Rachel-Reeves-UK-finances-seized-criminal-crypto-cash.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14921655/Tom-Cruises-ex-Rebecca-Mornay-rare-comment-Risky-Business-affair.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14917777/Man-war-council-plans-seven-bedroom-mansion-trees-axed.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/registration/profile.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14921987/Kerry-Katona-dress-daughter-Heidi-charity-event.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14923419/Danny-Dyer-reveals-ran-NAKED-council-estate-filming-new-series-Mr-Bigstuff.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14920855/Larry-Lamb-childhood-abusive-dad-candid.html
[INFO] Scraping ar

[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/femail/au/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/femail/au/index.html on URL https://www.dailymail.co.uk/ushome/femail/au/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14922129/Scott-Wolf-divorce-wife-Kelley-shock-twist.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/media/article-14918391/jimmy-kimmel-blasts-cbs-stephen-colbert-canceled-late-night.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/nfl/article-14921893/Travis-Kelce-drive-LA-Taylor-Swift-Scott-heart-surgery.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/nfl/article-14921893/Travis-Kelce-drive-LA-Taylor-Swift-Scott-heart-surgery.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/sport/nfl/article-

[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/news/royals/article-14907619/Prince-Harry-Meghan-Markle-trip-Colombia-Spare.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/news/royals/article-14907619/Prince-Harry-Meghan-Markle-trip-Colombia-Spare.html on URL https://www.dailymail.co.uk/ushome/news/royals/article-14907619/Prince-Harry-Meghan-Markle-trip-Colombia-Spare.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14920685/Louis-Tomlinson-concerned-spots-large-object-night-sky.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14912765/Donald-Trump-Jr-ex-girlfriend-shock-reveal-past-romance.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-14916719/sza-nicki-minaj-lying-age-feud.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14921313/KATIE-HIND-Gallaghers-reunited-Me

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14917991/White-Lotus-star-new-restaurant-mixed-reviews.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14919845/denise-richards-aaron-phypers-marriage-toxic-reality-show.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14909215/Savannah-Chrisley-liposuction-weight-gain-rhinoplasty-MAGA-Barbie.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14909387/Nick-Cannon-roasted-relationship-advice-podcast-12-children.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/real-estate/article-14918847/ritzy-neighborhood-surge-foreclosures-park-avenue.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14921905/Donald-Trump-sues-Wall-Street-Journal-MoS-reveals-Bill-Clinton-letter-Jeffrey-Epstein-birthday.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-149

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14922453/wedding-Charli-XCX-husband-George-Daniel-cried.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/auhome/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14916613/Keeping-Kardashians-mansion-hits-real-estate-market-9M.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-14917761/coldplay-ceo-couple-press-release.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14910245/Natalie-Portman-leggy-display-Lena-Dunham-Good-Sex-Rashida-Jones.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/textbased/channel-1/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/textbased/channel-1/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/textbased/channel-1/index.html on URL https://www.dail

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/royals/article-14919249/Harry-Meghan-Britain-aides-King-secret-peace-summit.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/news/royals/article-14919249/Harry-Meghan-Britain-aides-King-secret-peace-summit.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/news/royals/article-14919249/Harry-Meghan-Britain-aides-King-secret-peace-summit.html on URL https://www.dailymail.co.uk/ushome/news/royals/article-14919249/Harry-Meghan-Britain-aides-King-secret-peace-summit.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14919811/Hollywood-icon-unrecognizable-mobster-JFK-assassination-film.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/food/article-14923919/Chefs-secret-restaurant-mashed-potato-tastes-better.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/f

[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/nfl/article-14920307/alix-earle-braxton-berrios-charles-leclerc.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/sport/nfl/article-14920307/alix-earle-braxton-berrios-charles-leclerc.html on URL https://www.dailymail.co.uk/ushome/sport/nfl/article-14920307/alix-earle-braxton-berrios-charles-leclerc.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14921175/Russell-Crowe-sons-Los-Angeles.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/nfl/article-14923911/Andy-Reid-devastating-Kansas-City-Chiefs-setback-training-camp.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/nfl/article-14923911/Andy-Reid-devastating-Kansas-City-Chiefs-setback-training-camp.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14909827/menendez-brothers-lyle-erik-ryan-murphy-emmy.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/podcasts/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/meghan-markle/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/tvshowbiz/meghan-markle/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/tvshowbiz/meghan-markle/index.html on URL https://www.dailymail.co.uk/ushome/tvshowbiz/meghan-markle/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14920639/Mariska-Hargitay-peace-biological-dad-Nelson-Sardelli-paternity-bombshell.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/kim_kardashian/index.html
[ERROR] Failed to 

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14921853/Kelly-Clarkson-daughter-River-surprise-duet-Las-Vegas-show.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14915425/Violent-cult-mafia-Black-Axe-gang-tortures-recruits-naked-blood-drinking-initiation-British-teens.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14913757/Kristen-Stewart-wife-Dylan-Meyer-twin-sporty-lunch-married.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14924051/Pregnant-Rihanna-flashes-bare-baby-bump-sheer-bra-shows-necklaces-sons-names.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14917773/Meghan-Markle-Love-Netflix-flop-10.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-14908271/paranormal-investigator-annabelle-haunted-doll-tour-death.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tv/article-1491

[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/college-football/article-14923205/Ole-Miss-freshman-football-player-Corey-Adams-Memphis-shooting.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/sport/college-football/article-14923205/Ole-Miss-freshman-football-player-Corey-Adams-Memphis-shooting.html on URL https://www.dailymail.co.uk/ushome/sport/college-football/article-14923205/Ole-Miss-freshman-football-player-Corey-Adams-Memphis-shooting.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/royals/article-14903883/meghan-markle-plan-outshine-queen-terrified-palace.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/news/royals/article-14903883/meghan-markle-plan-outshine-queen-terrified-palace.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/news/royals/article-14903883/meghan-markle-p

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14917007/lana-del-rey-simple-louisiana-life-husband-jeremy-dufrene.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14923385/Is-CNN-Michael-Smerconish-goes-viral-rejects-WSJs-Trump-Epstein-letter.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14909391/Director-James-Wan-heartbroken-death.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/royals/article-14918309/kate-middleton-approved-items-sale-shoes-sweater-selling-fast.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/news/royals/article-14918309/kate-middleton-approved-items-sale-shoes-sweater-selling-fast.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/news/royals/article-14918309/kate-middleton-approved-items-sale-shoes-sweater-selling-fast.html on URL https://www.dailymail.co.uk/

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14920389/lauren-sanchez-jeff-bezos-list-wedding-guest-details.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14912645/blake-lively-toxic-legal-justin-baldoni-feud.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/media/article-14922823/Stephen-Colbert-late-canceled-Greg-gutfeld-fox.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/terms
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14916169/Love-Island-usa-star-tears-father-abused-beat-huda-mustafa.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-14916703/dylan-dreyer-today-divorce-husband-brian-fichera.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14919023/Gavin-Newsom-trans-children.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/othersports/article-14913087/shane-gillis-jeffr

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14910649/Conor-McGregors-fiancee-Dee-Devlin-sexually-charged-comment-woman-kissing-again.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tv/au/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/tv/au/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/tv/au/index.html on URL https://www.dailymail.co.uk/ushome/tv/au/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14920345/blake-lively-shrugs-justin-baldoni-lawsuit-video.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14912833/Megan-Thee-Stallion-confirms-romance-NBA-star-Klay-Thompson-NYC.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14923613/Taliban-kills-10-Afghans-helped-West-data-leak-disaster.html
[INFO] Scraping article: https://www.d

[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/news/royals/article-14908035/reiss-dress-looks-like-meghan-markles-designer-favourite-sale.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/news/royals/article-14908035/reiss-dress-looks-like-meghan-markles-designer-favourite-sale.html on URL https://www.dailymail.co.uk/ushome/news/royals/article-14908035/reiss-dress-looks-like-meghan-markles-designer-favourite-sale.html
Found 0 article links.
Found 57 article links.
[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/article-14922169/Aussie-Michael-Caola-Thailand-apartment-Swedish-Mika-Huotari.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/article-14922169/Aussie-Michael-Caola-Thailand-apartment-Swedish-Mika-Huotari.html: Article `download()` failed with 404 Client Error: Not F

[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/article-14913749/Teenage-mum-two-accused-recruiting-underage-girls-twisted-bank-boss-reveals-astonishing-sum-just-48-hours-fresh-details-emerge-money-row-hotel-room-later-killed-himself.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/article-14913749/Teenage-mum-two-accused-recruiting-underage-girls-twisted-bank-boss-reveals-astonishing-sum-just-48-hours-fresh-details-emerge-money-row-hotel-room-later-killed-himself.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/article-14913749/Teenage-mum-two-accused-recruiting-underage-girls-twisted-bank-boss-reveals-astonishing-sum-just-48-hours-fresh-details-emerge-money-row-hotel-room-later-killed-himself.html on URL https://www.dailymail.co.uk/news/royals/https://www

[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/article-14914471/Heartbreaking-final-words-young-girl-trapped-mudflow-60-hours.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/article-14914471/Heartbreaking-final-words-young-girl-trapped-mudflow-60-hours.html on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/article-14914471/Heartbreaking-final-words-young-girl-trapped-mudflow-60-hours.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/king-charles-iii/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/king-charles-iii/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://ww

[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/auhome/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/auhome/index.html: Article `download()` failed with 504 Server Error: Gateway Time-out for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/auhome/index.html on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/auhome/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/additionalcookieinfo
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/additionalcookieinfo: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/additionalcookieinfo on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.c

[DONE] Saved 983 articles to July.csv


In [4]:
# processing
data = pd.read_csv('Today.csv', header=0)
data = data[data['url'].str.contains(r'/article-\d+')]

983


Unnamed: 0,title,text,url
0,"Harper Beckham, 14, takes another leaf out of ...",Harper Beckham took another leaf out of mum Vi...,https://www.dailymail.co.uk/tvshowbiz/tvshowbi...
1,"Latest Celebrity News, Gossip & Photos",We shall find out soon enough if the enormousl...,https://www.dailymail.co.uk/tvshowbiz/addition...
2,Ellen DeGeneres confirms she left the US becau...,Ellen DeGeneres has confirmed she left the US ...,https://www.dailymail.co.uk/tvshowbiz/tvshowbi...
3,Buffy The Vampire Slayer fans in meltdown over...,Buffy fans were left overwhelmed with exciteme...,https://www.dailymail.co.uk/tvshowbiz/tvshowbi...
4,"Latest Celebrity News, Gossip & Photos",We shall find out soon enough if the enormousl...,https://www.dailymail.co.uk/tvshowbiz/topics


In [7]:
# merge and de-dup
clean = pd.read_csv("clean.csv", header=0)
merge = pd.concat([data, clean], ignore_index=True)
merge.drop_duplicates(subset='title', inplace=True)
merge.to_csv("clean.csv", index=False)