In [1]:
import requests
from newspaper import Article
from bs4 import BeautifulSoup
import csv
import time
import pandas as pd

In [2]:
def extract_article_from_url(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        return {
            "title": article.title,
            "text": article.text,
            "url": url
        }
    except Exception as e:
        print(f"[ERROR] Failed to extract article from {url}: {e}")
        return None

def get_article_links_from_homepage(homepage_url, limit=10):
    try:
        response = requests.get(homepage_url)
        soup = BeautifulSoup(response.text, "html.parser")
        links = set()

        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            if href.startswith("/") and not href.startswith("//"):
                href = homepage_url.rstrip("/") + href
            if homepage_url in href:
                links.add(href)

        return list(links)[:limit]
    except Exception as e:
        print(f"[ERROR] Failed to get links from {homepage_url}: {e}")
        return []

def scrape_news_site(urls, output_csv, article_limit=10):
    rows = []
    for homepage_url in urls: 
        article_urls = get_article_links_from_homepage(homepage_url, limit=article_limit)
        print(f"Found {len(article_urls)} article links.")

    
        for url in article_urls:
            print(f"[INFO] Scraping article: {url}")
            article = extract_article_from_url(url)
            if article and article["text"].strip():
                rows.append(article)
            time.sleep(1)  # To avoid getting blocked

    with open(output_csv, "w", newline='', encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["title", "text", "url"])
        writer.writeheader()
        writer.writerows(rows)

    print(f"[DONE] Saved {len(rows)} articles to {output_csv}")


In [3]:
urls = [
    "https://www.dailymail.co.uk/tvshowbiz", 
    "https://www.dailymail.co.uk/news",
    "https://www.dailymail.co.uk/ushome",
    "https://www.dailymail.co.uk/tv"
    "https://www.dailymail.co.uk/auhome/",
    "https://www.dailymail.co.uk/news/royals/"
    "https://www.dailymail.co.uk/home/"
]
scrape_news_site(urls, output_csv="Today.csv", article_limit=1000)

Found 191 article links.
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/article-15017435/Ioan-Gruffudd-blow-bitter-divorce-battle-ex-Alice-Evans.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tv/article-15016919/I-Love-Blind-UK-Netflix-romance-screens.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/article-15018931/Zoe-Ball-gardening-help-grief-death-boyfriend-billy-yates.html#video
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14700463/john-travolta-honors-late-wife-emily-ratajkowski-throwback-mothers-day.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/topics
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/article-15019049/Margot-Robbie-corset-mini-dress-Colin-Farrell-Big-Bold-Beautiful-Journey-Los-Angeles.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-15018569/Katie

[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/terms
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14947167/Kathleen-Marshalls-enchanting-new-production-Irving-Berlins-Hat-perfect-summer-tonic-says-Patrick-Marmion.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14696705/Bam-Magera-comeback-Jackass-Tony-Hawk-game.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14693439/influencer-tours-madonna-40m-nyc-home-personal-items.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/home/sitemaparchive/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/tvshowbiz/home/sitemaparchive/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/tvshowbiz/home/sitemaparchive/index.html on URL https://www.dailymail.co.uk/tvshowbiz/home/sitemaparchive/index.html
[INFO] Scraping article: h

[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/kim_kardashian/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/kim_kardashian/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/kim_kardashian/index.html on URL https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/kim_kardashian/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/coffeebreak/puzzles/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/tvshowbiz/coffeebreak/puzzles/index.html: Article `download()` failed with 504 Server Error: Gateway Time-out for url: https://www.dailymail.co.uk/tvshowbiz/coffeebreak/puzzles/index.html on URL https://www.dailymail.co.uk/tvshowbiz/coffeebreak/puzzles/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/article-15000573/skinny-shamers-Cheryl-Olivia-Attwood-Myleene-Klass-hit-ba

[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/article-14785785/Ana-Armas-Ballerina-action-thriller-BRIAN-VINER.html#video
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/registration/profile.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14696131/Teen-Mom-vets-Farrah-Abraham-Jenelle-Evans-shock-fans-ending-nasty-10-year-feud-tacos.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/ushome/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-15019279/Millie-Mackintosh-reality-fun-post-dog-Instagram.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-15017301/Holly-Ramsay-incredible-abs-bridal-era-Adam-Peaty.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14953777/Anne-Hathaway-Meryl-Streeps-Devil-Wears-Prada-2-outfits.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowb

[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15016841/Brooks-Nader-leggy-display-glamorous-three-sisters-promote-new-reality-Love-Thy-Nader.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-15019121/thug-machete-brawl-residential-street.html#video
[INFO] Scraping article: https://www.dailymail.co.uk/news/home/article-2572160/DMA-Privacy-Policy.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-15018043/family-murder-suicide-new-hampshire-toddler-survived.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15017043/Disney-actor-Alyson-Stoner-survived-stalkers-starvation-sexualisation-rape.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15016219/Talking-Heads-David-Byrne-engaged-Mala-Gaonkar.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tv/article-15016981/Loose-Women-pulled-screens-ITV-plea.html
[INFO] Scraping article: https://www.d

[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15019049/Margot-Robbie-corset-mini-dress-Colin-Farrell-Big-Bold-Beautiful-Journey-Los-Angeles.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15016277/Joe-Jonas-reveals-embarrassing-joining-mile-high-club.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/topics
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15016471/Liev-Schreiber-Naomi-Watts-reunite-son-milestone-moment-split.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tv/article-15016139/Brookside-child-star-acting-appearing-iconic-soap.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tv/article-15017643/BBC-presenter-QUITS-29-years-broadcaster-tony-smith.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/royals/article-14979877/Prince-Harry-Meghan-Markle-addiction-drama-self-inflicted-wounds.html
[ERROR] Failed to extract article from 

[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15016773/Barbara-Windsor-widower-Scott-Mitchell-terror-dementia.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15016585/Australian-star-shocks-bizarre-G-string-skirt.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15019445/American-Pie-star-Seann-William-Scott-six-figure-monthly-income-revealed.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15015407/Katie-Price-split-Peter-Andre-feud-Princess-truth-KATIE-HIND.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/terms
[INFO] Scraping article: https://www.dailymail.co.uk/news/home/article-2572160/DMA-Privacy-Policy.html#ccpaExplicitNotice
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-15016695/China-Brisbane-baby-AFP.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-15018287/Grieving-colleague-tragic-Finn

[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15016273/Taylor-Swift-fans-ENRAGED-Margaret-Qualley-interview-jack-antonoff.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/shopping-uk/article-14978919/Raylo-Business-lets-small-businesses-lease-tech-little-2-16-MONTH-no-theres-no-catch.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-15019615/Duolingo-sorry-trans-JK-Rowling-German-lesson.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/video/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-15019833/Nykia-Hamilton-Burger-King-fired.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15014211/Dua-Lipa-details-intensely-falling-fianc-Callum-Turner-lifts-lid-romance-rare-posing-chic-Harpers-Bazaar-shoot.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tv/article-15014685/Harry-Potter-TV-Weasley-siblings-Fred-Ge

[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15013319/Jessica-Hart-Taylor-Swift-feud-Victorias-Secret.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-15016531/tax-plan-productivity-roundtable-albanese.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15016951/Hypocrisy-secret-celebrity-smokers-wellness.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/travel/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-15017655/You-messed-wrong-mom-Daughter-US-tourist-held-ponytail-14-year-old-pickpocket-50-MINUTES-reveals-brilliant-way-tracked-thief-Venice-crowds.html#video
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-15017021/Kneecap-fans-court-support-rapper-27-Hezbollah-flag-gig.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15019205/Jessica-Albas-ex-Cash-Warren-46-confirms-hes-dating-model-20-years-junior-

[INFO] Scraping article: https://www.dailymail.co.uk/news/property/article-15017189/katie-price-mucky-mansion-sold-new-owner.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/femail/article-15019117/kate-middleton-children-sports-games-school.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/debate/article-15019621/HUGO-DUNCAN-genie-burst-bottle-clear-blame.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/real-estate/article-15015213/kanye-west-malibu-mansion-deal-collapsed.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15019765/Kirsten-Dunst-smokes-cigarette-cafe-Los-Angeles.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15015477/Robbie-Williams-reprises-iconic-dungarees-battling-hair-thinning-everyday.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/femail/article-15000099/I-heard-mothers-murderer-smash-home-Ring-doorbell-beating-death.html
[INFO] Scraping article

[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-15017477/Actor-David-Morrissey-banned-driving-caught-speeding-three-times.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15014763/brandon-blackstock-celebration-life-loving-partner-kelly-clarkson.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15014855/Peter-Andre-wife-Emily-hurt-Katie-Price-disgusting-swipe-children-cash-counting-video.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15015383/The-Thursday-Murder-Club-Helen-Mirren-costars-Richard-Osman-photocall-film.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15016789/Stop-right-Richard-E-Grant-thrills-Spice-World-fans-reunites-Victoria-Beckham-Italy-28-years-starring-beloved-film.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-15018735/Sixth-forms-competition-Labours-VAT-private-schools.html
[INFO] Scra

[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15016199/Photo-Elle-Macpherson-confirms-rumour.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tv/article-15016025/Moment-Christine-McGuinness-tears-quitting-Celebs-Dating.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-15012245/real-reason-high-street-falling-disarray.html#video
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15018185/Gigi-Hadid-wears-NO-underwear-bra-raciest-shoot-yet.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15015369/austin-butler-carb-diet-workout.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-15018157/Louisiana-serial-pedophile-surgically-castrated-American-law.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15019495/CHRISTOPHER-STEVENS-Mudtown-killer-magistrate.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/t

[INFO] Scraping article: https://www.dailymail.co.uk/news/article-15018883/Blue-Badge-misuse-dead-relatives-parking-permit.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15017311/Why-Tommy-Fury-told-not-daughter-Bambi-2-Christmas-Molly-Mae-Hague.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/femail/article-15015157/taylor-swift-jewelry-life-showgirl-album-cost.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15015289/royal-family-row-Kings-cousin-attack-mother-lazy-RICHARD-EDENS-DIARY.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-15017655/You-messed-wrong-mom-Daughter-US-tourist-held-ponytail-14-year-old-pickpocket-50-MINUTES-reveals-brilliant-way-tracked-thief-Venice-crowds.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-15018569/Katie-Price-responds-ex-Alex-Reid-cash-video-drama.html
[INFO] 

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15015233/Zoe-Kravitz-halterneck-Austin-Butler-Caught-Stealing-screening-London.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-15018841/laura-caron-teacher-pregnant-boy-cape-may.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/royals/article-15013717/AMANDA-PLATELL-Prince-William-needs-man-new-obsession-suggests-forgotten-late-Queen-stood-fear-future.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/news/royals/article-15013717/AMANDA-PLATELL-Prince-William-needs-man-new-obsession-suggests-forgotten-late-Queen-stood-fear-future.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/news/royals/article-15013717/AMANDA-PLATELL-Prince-William-needs-man-new-obsession-suggests-forgotten-late-Queen-stood-fear-future.html on URL https://www.dailymail.co.uk/ushome/news/royals/article

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/real-estate/article-15015213/kanye-west-malibu-mansion-deal-collapsed.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15014211/Dua-Lipa-details-intensely-falling-fianc-Callum-Turner-lifts-lid-romance-rare-posing-chic-Harpers-Bazaar-shoot.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-15005009/meghan-markle-message-royals-netflix.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sciencetech/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15014973/Teresa-Giudice-bikini-sangria-Mallorca-getaway-RHONJ-salary.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/golf/article-15019855/Tiger-Woods-major-new-golf-role-doubts-hell-play-again.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/golf/article-15019855/Tiger-Woods-major-new-golf-role-doubts-hell-play-ag

[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/wwe/article-15019449/WWE-Trish-Stratus-mom-dead.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/sport/wwe/article-15019449/WWE-Trish-Stratus-mom-dead.html on URL https://www.dailymail.co.uk/ushome/sport/wwe/article-15019449/WWE-Trish-Stratus-mom-dead.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/au/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/news/au/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/news/au/index.html on URL https://www.dailymail.co.uk/ushome/news/au/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-15018859/Harry-Styles-staggering-comeback-plans-revealed-saucy-liaisons-secret-getaways-insiders-tell-JENNIFER-RUBY-exactly-star-bosses-unhappy-him.html
[INFO] Scraping

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15001529/Justin-Trudeau-Katy-Perry-date.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/podcasts/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-15018909/Teacher-school-blackface-arbitrator.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15009627/Shailene-Woodley-boyfriend-Lucas-Bravo-rare-PDA-camping-trip.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-15018925/savannah-guthrie-controversial-parenting-tactic-snooping-phone.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/wnba/article-15019929/Caitlin-Clark-suffers-injury-Indiana-Fever-star-dealing-groin-problem.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/wnba/article-15019929/Caitlin-Clark-suffers-injury-Indiana-Fever-star-dealing-groin-problem.html: Article `download()` failed 

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sciencetech/article-15017759/Mutant-deer-horrifying-flesh-bubbles-US-outbreak.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15016071/Tamar-Braxton-died-horror-accident-left-broken-nose-lost-teeth.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15014471/Kristin-Davis-reveals-Friends-star-set-blind-date-landed-Sex-City.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-15015587/Joanna-Gaines-weird-rule-cake-plate-chip.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/home/videoarchive/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/home/videoarchive/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/home/videoarchive/index.html on URL https://www.dailymail.co.uk/ushome/home/videoarchive/index.html
[INFO] Scraping 

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-14825707/woman-wedding-nanny-charges-80-hour.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/shopping-uk/article-15000681/Parents-love-easy-apply-organic-kids-sunscreen-thats-perfect-sensitive-skin.html?ico=mail_best_commerce_xp_desktop_185
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/shopping-us/article-14997097/hers-weight-loss-drug-health-ozempic.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15015059/Tracee-Ellis-Ross-nude-launch-body-care-line.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15012651/Sharon-Stone-reveals-famous-rapper-date.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15018659/Gisele-Bundchen-baby-son-joaquim-valente-photos.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15013303/American-Pie-star-looks-unrecognisable-a

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15016799/Katy-Perry-puts-illuminous-performance-metallic-outfit-world-tour.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/shopping-us/article-14995197/teens-women-periods-intimates-knix-underwear-bra.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15012491/Joan-Collins-incredible-figure-white-swimsuit-St-Tropez.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/shopping-us/article-15006961/humble-seed-wheat-protein-crackers.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/shopping-uk/article-15010295/Hosting-Bank-Holiday-weekend-Waitrose-Cellars-25-wine-deal-save-white-red-fizz.html?ico=mail_best_commerce_xp_desktop_185
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-15019661/JD-Vance-interview-threat-Zelensky-Elon-Musk.html
[INFO] Scraping ar

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/podcasts/soccer-az/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/podcasts/soccer-az/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/podcasts/soccer-az/index.html on URL https://www.dailymail.co.uk/ushome/podcasts/soccer-az/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15011425/Emily-Paris-issues-casting-extras-catch.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/royals/article-15018423/Careful-Kate-William-Ive-supported-you-behaviours-left-cold-Im-not-LIZ-JONES.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/news/royals/article-15018423/Careful-Kate-William-Ive-supported-you-behaviours-left-cold-Im-not-LIZ-JONES.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/news

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15013823/Austin-Butler-zoe-kravitz-caught-stealing-cosying-london-photocall.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15016651/Star-Trek-William-Shatner-rare-outing-Los-Angeles.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/meghan-markle/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/tvshowbiz/meghan-markle/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/tvshowbiz/meghan-markle/index.html on URL https://www.dailymail.co.uk/ushome/tvshowbiz/meghan-markle/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15011059/diana-vickers-invited-leonardo-dicaprio-house-bizarre-encounter.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/nfl/article-15019571/Bill-Belichicks-ex-Linda-Holliday-

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-15018257/nick-jonas-bed-priyanka-chopra-ick.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/shopping-us/article-15003885/vetnique-pet-joint-health-relief-chews.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-15018773/mass-brawl-carnival-cruise-ship-chicken-tenders.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15019205/Jessica-Albas-ex-Cash-Warren-46-confirms-hes-dating-model-20-years-junior-hold-hands-kiss.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15015409/Ariana-DeBose-mom-dead-cancer-battle-Oscar-winner-tribute-gina.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-15012293/Rod-Stewart-Penny-Lancaster-stroll-los-angeles-PDA.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/au/index.html
[ERROR] Failed to extract article from https://w

[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/article-15016995/Aussie-tourist-dies-Vietnam.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/article-15016995/Aussie-tourist-dies-Vietnam.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/article-15016995/Aussie-tourist-dies-Vietnam.html on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/article-15016995/Aussie-tourist-dies-Vietnam.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/terms
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/terms: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.d

[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/tv/au/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/tv/au/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/tv/au/index.html on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/tv/au/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/registration/profile.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/registration/profile.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/registration/profile.html on URL https://www.dailymail.co.uk/news/royals/https://www.dailyma

[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/article-15019457/Brumbys-Bakery-sale.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/article-15019457/Brumbys-Bakery-sale.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/article-15019457/Brumbys-Bakery-sale.html on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/article-15019457/Brumbys-Bakery-sale.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/home/article-3633654/CONTRIBUTIONS-STANDARD-TERMS-CONDITIONS.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/home/article-3633654/CONTRIBUTIONS-STANDARD-TERMS-CONDITIONS.html: Article `download()` failed with 40

In [4]:
# processing
data = pd.read_csv('Today.csv', header=0)
data = data[data['url'].str.contains(r'/article-\d+')]

In [5]:
len(data)

866

In [6]:
# merge and de-dup
clean = pd.read_csv("clean.csv", header=0)
print(len(clean))
merge = pd.concat([data, clean], ignore_index=True)
merge.drop_duplicates(subset='title', inplace=True)
print(len(merge))
merge.to_csv("clean.csv", index=False)

2869
3354
