In [1]:
import requests
from newspaper import Article
from bs4 import BeautifulSoup
import csv
import time
import pandas as pd

In [2]:
def extract_article_from_url(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        return {
            "title": article.title,
            "text": article.text,
            "url": url
        }
    except Exception as e:
        print(f"[ERROR] Failed to extract article from {url}: {e}")
        return None

def get_article_links_from_homepage(homepage_url, limit=10):
    try:
        response = requests.get(homepage_url)
        soup = BeautifulSoup(response.text, "html.parser")
        links = set()

        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            if href.startswith("/") and not href.startswith("//"):
                href = homepage_url.rstrip("/") + href
            if homepage_url in href:
                links.add(href)

        return list(links)[:limit]
    except Exception as e:
        print(f"[ERROR] Failed to get links from {homepage_url}: {e}")
        return []

def scrape_news_site(urls, output_csv, article_limit=10):
    rows = []
    for homepage_url in urls: 
        article_urls = get_article_links_from_homepage(homepage_url, limit=article_limit)
        print(f"Found {len(article_urls)} article links.")

    
        for url in article_urls:
            print(f"[INFO] Scraping article: {url}")
            article = extract_article_from_url(url)
            if article and article["text"].strip():
                rows.append(article)
            time.sleep(1)  # To avoid getting blocked

    with open(output_csv, "w", newline='', encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["title", "text", "url"])
        writer.writeheader()
        writer.writerows(rows)

    print(f"[DONE] Saved {len(rows)} articles to {output_csv}")


In [3]:
urls = [
    "https://www.dailymail.co.uk/tvshowbiz", 
    "https://www.dailymail.co.uk/news",
    "https://www.dailymail.co.uk/ushome",
    "https://www.dailymail.co.uk/tv"
    "https://www.dailymail.co.uk/auhome/",
    "https://www.dailymail.co.uk/news/royals/"
    "https://www.dailymail.co.uk/home/"
]
scrape_news_site(urls, output_csv="Today.csv", article_limit=1000)

Found 191 article links.
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14785785/Ana-Armas-Ballerina-action-thriller-BRIAN-VINER.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-15008823/David-Haye-advanced-talks-join-Im-Celebrity-Stars-decade-appearance-ITV-show.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/kim_kardashian/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/kim_kardashian/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/kim_kardashian/index.html on URL https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/kim_kardashian/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14694421/George-Clooney-grey-roots-Tony-nominees-event-dye-job.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/coffeebrea

[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14956049/Kelly-Osbourne-heartbreaking-final-promise-Ozzy-sharon.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14699763/Meghan-Trainor-boob-job-red-carpet-weight-loss-drug.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14694067/Ben-Affleck-giggles-reveals-women-loves-latinas.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/article-15000573/skinny-shamers-Cheryl-Olivia-Attwood-Myleene-Klass-hit-back.html#video
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/sciencetech/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14693587/Surprising-career-Britney-Spears-astronaut-lover-Oops-Did-video-revealed.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tv/au/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/tvshowbi

[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14695913/Patrick-Schwarzenegger-shirtless-bikini-fiancee-reunion-ex-Miley-Cyrus.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-15008329/Jojo-Siwa-plans-start-family-boyfriend-Chris-Hughes.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14694193/Shia-LaBeouf-reveals-three-surprising-Hollywood-actors-helped-sober.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14696153/Addison-Rae-flashes-bra-skirt-lessons-Britney-Spears.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-15007967/Laura-Jackson-crippling-antenatal-depression.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14696529/Kourtney-Kardashian-shockingly-hires-Kanye-Wests-ex-Julia-Fox-libido-booster-ad.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbi

[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/authors
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/article-15009075/Selena-Gomez-toned-stomach-photo-wedding-Benny-Blanco.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14696705/Bam-Magera-comeback-Jackass-Tony-Hawk-game.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/article-14785785/Ana-Armas-Ballerina-action-thriller-BRIAN-VINER.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-15008225/Strictlys-biggest-scandal-BBC-call-police-drug-use.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14947167/Kathleen-Marshalls-enchanting-new-production-Irving-Berlins-Hat-perfect-summer-tonic-says-Patrick-Marmion.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/article-15008755/David-Victoria-Beckham-fresh-heartac

[INFO] Scraping article: https://www.dailymail.co.uk/news/article-15009195/Killer-nurse-Lucy-Letby-guard-checked-prison.html#video
[INFO] Scraping article: https://www.dailymail.co.uk/news/home/article-10538781/About-MailOnline.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-15003993/migrant-hotel-flashpoint-heart-London-banking.html#video
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-15008401/abu-musa-michigan-detroit-video-ballot-box-trump.html#video
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-15008743/Long-Island-teen-attacked-stanley-cup-mom-Brentwood-school.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/home/article-14996495/Win-luxury-Volkswagen-Tiguan-R-TentBox-5-000-cash-just-10p.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-15009487/Shark-attack-Cabarita-Beach.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15007057/horrible-incid

[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15009221/Pregnant-Vanessa-Hudgens-bares-baby-bump-crop-epic-EDM-rave.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/femail/article-14801585/Pregnancy-nutritionist-reveals-diet-advice-trying-conceive-pregnant-breastfeeding-baby-nutrition.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-15003845/Mrs-Ronaldo-humble-beginnings-Georgina-Rorogiez-dad-jail.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15008755/David-Victoria-Beckham-fresh-heartache-new-details-Brooklyn-vow-renewal-speech-Nicola-Peltz.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-15008965/Euro-leaders-race-stand-Zelensky-Trump.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-15008743/Long-Island-teen-attacked-stanley-cup-mom-Brentwood-school.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.u

[INFO] Scraping article: https://www.dailymail.co.uk/news/article-15008407/Vladimir-Putin-surprising-lunch-menu-meeting-Donald-Trump.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/mostread/index.html#news
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/news/mostread/index.html#news: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/news/mostread/index.html#news on URL https://www.dailymail.co.uk/news/news/mostread/index.html#news
[INFO] Scraping article: https://www.dailymail.co.uk/news/home/article-3633654/CONTRIBUTIONS-STANDARD-TERMS-CONDITIONS.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15005813/Leonardo-DiCaprio-girlfriend-Vittoria-topless-bikini-photo.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-15008569/Moment-flustered-Putin-winces-free-press-Ukraine.html?ico=comment-anchor#comments
[INFO] Scr

[INFO] Scraping article: https://www.dailymail.co.uk/news/article-15008507/Man-lures-Grindr-date-home-massage-gouging-eye-knife.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/tv/article-15008611/Fake-Fortune-drama-art-collector-gamble-trejects-huge-sum-lost-masterpiece-painting.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-15001601/Andy-Warhol-Edie-Sedgwick.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15009175/Jeremy-Clarkson-reveals-right-hand-man-Kaleb-Cooper-told-JD-Vances-security-f-US-vice-presidents-Cotswolds-convoy-disrupted-farming-duties.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-15009065/Skibidi-tradwife-delulu-Cambridge-dictionary-new-words-understand.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15004867/Michael-Barrymore-73-epilepsy-reveals-shock-health-diagnosis-led-mini-strokes-reaches-fans-suppo

[INFO] Scraping article: https://www.dailymail.co.uk/news/tv/article-15006441/A-beloved-soap-returning-screens-22-years-air-set-make-historic-crossover-Hollyoaks.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/royals/article-14993221/Moment-Prince-Harry-signalled-change-breaking-protocol-meghan-markle.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/news/royals/article-14993221/Moment-Prince-Harry-signalled-change-breaking-protocol-meghan-markle.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/news/royals/article-14993221/Moment-Prince-Harry-signalled-change-breaking-protocol-meghan-markle.html on URL https://www.dailymail.co.uk/news/news/royals/article-14993221/Moment-Prince-Harry-signalled-change-breaking-protocol-meghan-markle.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/columnists/article-15008699/ANDREW-PIERCE-Londoners-Tube-Sadiq-Khan-gravy-train.html
[INF

[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15008823/David-Haye-advanced-talks-join-Im-Celebrity-Stars-decade-appearance-ITV-show.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15008409/Rochelle-Humes-sizzles-pink-bikini-enjoys-family-holiday-Marbella-dragged-Myleene-Klass-Frankie-Bridge-feud.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/podcasts/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tv/article-15009105/Jodi-Whittaker-Doctor-train-wreck-One-Night-ITV-drama.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15008539/Woke-Star-Wars-icon-debated-moving-USA-Trump-win.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15008035/70s-film-legend-Golden-Globe-nominee-looks-unrecognisable-announces-US-tour-dates-YOU-guess-is.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-15007741/LA-restauranteur

[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15004995/Gordon-Ramsays-recipe-happy-family-Beckham-kids-relish-limelight-celebrity-chef-wife-Tana-determined-children-forge-career-paths.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15007821/Emmerdale-Rebecca-Sarker-sexy-snaps-Instagram.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15003503/Celebrities-you-no-idea-auditioned-X-Factor-reality-TV-icons-popstars-Olympian-stars-determined-make-no-matter-what.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15007777/Sam-Reid-new-scenes-Interview-Vampire-Canada.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-15008813/markley-fire-california-victor-serriteno-murder-priscilla-castro-arson-sentencing.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-15008951/D-Day-White-House-Starmer-Zelensky-EU-Trump-Ukraine.html?ico=c

[INFO] Scraping article: https://www.dailymail.co.uk/news/femail/article-15008405/Cotswolds-elite-snort-cocaine-DEBBIE-SCOTT.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14999719/stars-turned-God-Matilda-Draper-Roman-Hackett-baptised.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-15008733/Bitchy-catfight-Molly-Mae-Vogue-Williams-Dani-Dyer-GRANT-TUCKER.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15007333/Katie-Price-legal-row-exes-Kieran-Hayler-Alex-Reid-documentary.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-15008123/Newspoll-Australia-Anthony-Albanese-Donald-Trump-Xi-Jinping.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/tv/article-15004953/BBC-Survivor-axed-Joel-Dommett.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/podcasts/soccer-az/index.html
[ERROR] Failed to extract arti

[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15004221/Derry-Girls-Saoirse-Monica-Jackson-incredible-wedding.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-15009487/Shark-attack-Cabarita-Beach.html#video
[INFO] Scraping article: https://www.dailymail.co.uk/news/tv/article-15000009/the-chase-bradley-walsh-true-colours-camera.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14998835/Dubai-sex-trafficking-queen.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-15009069/White-pupils-likely-group-university-British-students.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-15008401/abu-musa-michigan-detroit-video-ballot-box-trump.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15009559/Kathy-Griffin-face-Farmers-Market-LA-undergoing-facelift.html
[INFO] Scraping article: https://www

[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/college-football/article-15008447/College-football-coach-Rhett-Lashlee-SMU-takes-aim-ESPN.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/sport/college-football/article-15008447/College-football-coach-Rhett-Lashlee-SMU-takes-aim-ESPN.html on URL https://www.dailymail.co.uk/ushome/sport/college-football/article-15008447/College-football-coach-Rhett-Lashlee-SMU-takes-aim-ESPN.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/yourmoney/consumer/article-15001373/gen-z-think-war-likely-buying-home.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/yourmoney/consumer/article-15001373/gen-z-think-war-likely-buying-home.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/yourmoney/consumer/article-15001373/gen-z-think-war-likely-buying-home.html on URL 

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15005089/Lily-Collins-films-fifth-season-Emily-Paris-Venice.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15004681/Jessica-Simpson-shares-bikini-flashback-video-announcing-Vegas-concert.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14974033/Tom-Cruise-Ana-Armas-age-gap-concerns-experts.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15004785/Inside-Sydney-Sweeneys-wild-boozy-night-continues-blow-American-Eagle-controversy.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-15003369/The-Biggest-Loser-biggest-scandals-Contestants-noughties-weight-loss-reveal-didnt-eat-10-days-left-blood-urine-collapsed-burning-8-000-calories-day.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/home/article-10538781/About-MailOnline.html
[INFO] Scraping article: https://www.dail

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-14997295/Kate-releases-second-film-highlighting-beauty-Mother-Nature-inspired-solace-took-countryside-cancer-journey.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14995283/Kelly-Clarkson-NOT-mentioned-ex-Brandon-Blackstock-obituary-partner-assistant.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-15008589/disinherited-mother-loved-siblings-DAISY-GOODWIN.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/debate/article-15001019/Brooklyn-Harry-pair-wet-wipes-lack-gumption-stand-families-love-JAN-MOIR.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15004173/Travis-Kelce-Taylor-Swift-sexy-album.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tv/article-15002245/And-Just-Like-series-finale-Sex-City-reboot.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/textbased/chann

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-14993683/dermatologist-brandi-glanville-parasite-mysterious-face-illness-parasite.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-15008401/abu-musa-michigan-detroit-video-ballot-box-trump.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15004551/Jennifer-Lopez-flashes-nude-bra-lace-talks-difficult-time-life.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-14995791/meghan-markle-trailer-love.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/nfl/article-15009093/Cincinnati-Bengals-trade-offers-star-player-Trey-Hendrickson-contract-dispute.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/nfl/article-15009093/Cincinnati-Bengals-trade-offers-star-player-Trey-Hendrickson-contract-dispute.html: Article `download()` failed with 404 Client Error: Not Found for url: https:

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-14997499/kristi-noem-style-experts-ice-barbie-clothes-outfits.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/femail/article-14997499/kristi-noem-style-experts-ice-barbie-clothes-outfits.html: Article `download()` failed with HTTPSConnectionPool(host='www.dailymail.co.uk', port=443): Max retries exceeded with url: /ushome/femail/article-14997499/kristi-noem-style-experts-ice-barbie-clothes-outfits.html (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x167198a90>: Failed to resolve 'www.dailymail.co.uk' ([Errno 8] nodename nor servname provided, or not known)")) on URL https://www.dailymail.co.uk/ushome/femail/article-14997499/kristi-noem-style-experts-ice-barbie-clothes-outfits.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/au/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/au/index.htm

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15007349/eva-longoria-black-string-bikini-secrets-figure.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/tvshowbiz/article-15007349/eva-longoria-black-string-bikini-secrets-figure.html: Article `download()` failed with HTTPSConnectionPool(host='www.dailymail.co.uk', port=443): Max retries exceeded with url: /ushome/tvshowbiz/article-15007349/eva-longoria-black-string-bikini-secrets-figure.html (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x1671994d0>: Failed to resolve 'www.dailymail.co.uk' ([Errno 8] nodename nor servname provided, or not known)")) on URL https://www.dailymail.co.uk/ushome/tvshowbiz/article-15007349/eva-longoria-black-string-bikini-secrets-figure.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/college-football/article-15008767/College-football-star-Ian-Schieffelin-arrested-Clemson-Dabo-Swinney.html
[ERROR]

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/travel/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15006793/Britney-Spears-ex-husband-Sam-Asghari-torches-Kevin-Federline-new-tell-book.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-14997217/kimberly-guilfoyle-young-pictures-don-jr.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/nfl/article-15005577/Travis-Kelce-Taylor-Swift-NFL-arrival-outfit-TS12-orange.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/nfl/article-15005577/Travis-Kelce-Taylor-Swift-NFL-arrival-outfit-TS12-orange.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/sport/nfl/article-15005577/Travis-Kelce-Taylor-Swift-NFL-arrival-outfit-TS12-orange.html on URL https://www.dailymail.co.uk/ushome/sport/nfl/article-15005577/Travis-Kelce-Taylor-Swift-NFL-arrival-outfit-TS1

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14997609/Rosie-ODonnell-size-12-weight-loss-drug-Mounjaro.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/home/article-3633654/CONTRIBUTIONS-STANDARD-TERMS-CONDITIONS.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15008941/Jennifer-Aniston-Courteney-Cox-selfie-Friends.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15008649/Glee-star-Dianna-Agron-talks-death-curse.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/shopping-us/article-14976109/quicken-simplifi-money-management-app.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/auhome/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15008385/Russell-Crowe-phone-throwing-New-York.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/n

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/nfl/article-15004951/myles-garrett-chloe-kim-romance.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/nfl/article-15004951/myles-garrett-chloe-kim-romance.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/sport/nfl/article-15004951/myles-garrett-chloe-kim-romance.html on URL https://www.dailymail.co.uk/ushome/sport/nfl/article-15004951/myles-garrett-chloe-kim-romance.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/home/contactus/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/home/contactus/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/home/contactus/index.html on URL https://www.dailymail.co.uk/ushome/home/contactus/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-1

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15006519/Kate-Gosselin-icy-response-sons-claim-kids-pitted-against-divorce.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15005297/Tristan-Rogers-dead-79-General-Hospital-lung-cancer.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-15008191/Miss-Universe-Kseniya-Alexandrova-killed-crash-Elk-russia.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/yourmoney/consumer/article-15001423/americas-energy-bill-crisis-spirals-georgia-price-hikes.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/yourmoney/consumer/article-15001423/americas-energy-bill-crisis-spirals-georgia-price-hikes.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/yourmoney/consumer/article-15001423/americas-energy-bill-crisis-spirals-georgia-price-hikes.html on URL https://www.dai

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-15005405/carolyn-bessette-friends-daddy-issue-jfk-kennedy-lie-MAUREEN-CALLAHAN.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/nba/article-15008361/Dallas-Mavericks-owner-Mark-Cuban-reveals-run-2028-presidency.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/nba/article-15008361/Dallas-Mavericks-owner-Mark-Cuban-reveals-run-2028-presidency.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/sport/nba/article-15008361/Dallas-Mavericks-owner-Mark-Cuban-reveals-run-2028-presidency.html on URL https://www.dailymail.co.uk/ushome/sport/nba/article-15008361/Dallas-Mavericks-owner-Mark-Cuban-reveals-run-2028-presidency.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/nfl/article-15008851/dennis-rodman-travis-hunter-leanna-lenee.html
[ERROR] Failed to extract article from https://www.d

[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/ushome/index.html: Article `download()` failed with 504 Server Error: Gateway Time-out for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/ushome/index.html on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/ushome/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/home/article-2572160/DMA-Privacy-Policy.html#ccpaExplicitNotice
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/home/article-2572160/DMA-Privacy-Policy.html#ccpaExplicitNotice: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/home/article-2572160/DMA-Privacy-Policy.html#ccpaExplicitNotice on URL https://www.dailymail.co.uk/news/royals/https://ww

[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/femail/au/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/femail/au/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/femail/au/index.html on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/femail/au/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/worldnews/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/worldnews/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/worldnews/index.html on URL https://www.dailymail.co.uk/news/royals/htt

[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/howcomplain
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/howcomplain: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/howcomplain on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/howcomplain
[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/prince-andrew/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/prince-andrew/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/prince-andrew/index.html on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk

[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/privacy
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/privacy: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/privacy on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/privacy
[DONE] Saved 874 articles to Today.csv


In [4]:
# processing
data = pd.read_csv('Today.csv', header=0)
data = data[data['url'].str.contains(r'/article-\d+')]

In [5]:
len(data)

810

In [6]:
# merge and de-dup
clean = pd.read_csv("clean.csv", header=0)
print(len(clean))
merge = pd.concat([data, clean], ignore_index=True)
merge.drop_duplicates(subset='title', inplace=True)
print(len(merge))
merge.to_csv("clean.csv", index=False)

2407
2869
