In [1]:
import requests
from newspaper import Article
from bs4 import BeautifulSoup
import csv
import time
import pandas as pd

In [2]:
def extract_article_from_url(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        return {
            "title": article.title,
            "text": article.text,
            "url": url
        }
    except Exception as e:
        print(f"[ERROR] Failed to extract article from {url}: {e}")
        return None

def get_article_links_from_homepage(homepage_url, limit=10):
    try:
        response = requests.get(homepage_url)
        soup = BeautifulSoup(response.text, "html.parser")
        links = set()

        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            if href.startswith("/") and not href.startswith("//"):
                href = homepage_url.rstrip("/") + href
            if homepage_url in href:
                links.add(href)

        return list(links)[:limit]
    except Exception as e:
        print(f"[ERROR] Failed to get links from {homepage_url}: {e}")
        return []

def scrape_news_site(urls, output_csv, article_limit=10):
    rows = []
    for homepage_url in urls: 
        article_urls = get_article_links_from_homepage(homepage_url, limit=article_limit)
        print(f"Found {len(article_urls)} article links.")

    
        for url in article_urls:
            print(f"[INFO] Scraping article: {url}")
            article = extract_article_from_url(url)
            if article and article["text"].strip():
                rows.append(article)
            time.sleep(1)  # To avoid getting blocked

    with open(output_csv, "w", newline='', encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["title", "text", "url"])
        writer.writeheader()
        writer.writerows(rows)

    print(f"[DONE] Saved {len(rows)} articles to {output_csv}")


In [3]:
urls = [
    "https://www.dailymail.co.uk/tvshowbiz", 
    "https://www.dailymail.co.uk/news",
    "https://www.dailymail.co.uk/ushome",
    "https://www.dailymail.co.uk/tv"
    "https://www.dailymail.co.uk/auhome/",
    "https://www.dailymail.co.uk/news/royals/"
    "https://www.dailymail.co.uk/home/"
]
scrape_news_site(urls, output_csv="Today.csv", article_limit=1000)

Found 187 article links.
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14696037/Kim-Alexis-64-talks-pressure-staying-beautiful.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14697029/Buffy-Vampire-Slayer-fans-meltdown-possibility-beloved-character-join-reboot.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/coffeebreak/puzzles/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/tvshowbiz/coffeebreak/puzzles/index.html: Article `download()` failed with 504 Server Error: Gateway Time-out for url: https://www.dailymail.co.uk/tvshowbiz/coffeebreak/puzzles/index.html on URL https://www.dailymail.co.uk/tvshowbiz/coffeebreak/puzzles/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14693163/Cristiano-Ronaldos-girlfriend-dress-inspired-Princess-Diana.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/authors
[INFO] Scr

[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/home/sitemap.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/tvshowbiz/home/sitemap.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/tvshowbiz/home/sitemap.html on URL https://www.dailymail.co.uk/tvshowbiz/home/sitemap.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/article-14924887/The-Greek-restaurant-gone-viral-gushing-Kylie-Jenner-post-history-OTT-celeb-tributes-legendary-Magic-Johnson-Demi-Moores-timeless-beauty.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/article-14924497/Has-Liam-Neeson-finally-Love-Actually-73-Perennially-single-star-sparks-fresh-romance-rumours-Pamela-Anderson-twenty-six-years-tragic-death-beloved-wife-Natasha-Richardson.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14694243/Morgan-Wa

[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14695913/Patrick-Schwarzenegger-shirtless-bikini-fiancee-reunion-ex-Miley-Cyrus.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/article-14785785/Ana-Armas-Ballerina-action-thriller-BRIAN-VINER.html#video
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14925693/New-mum-Arabella-Chi-bikini-daughter-gigi.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14696335/Kanye-West-Kim-Kardashian-exploiting-children-legal-attack.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tv/au/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/tvshowbiz/tv/au/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/tvshowbiz/tv/au/index.html on URL https://www.dailymail.co.uk/tvshowbiz/tv/au/index.html
[INFO] Scraping article: https://www.daily

[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/article-14763387/British-comedy-Ballad-Wallis-Island-BRIAN-VINER.html#video
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14924521/Nicole-Scherzinger-waves-kisses-fans-final-Sunset-Boulevard-Broadway.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14888793/What-Nicola-Peltz-thinks-laws-Amid-Beckham-feud-does-Brooklyns-wife-glamorous-partners-brothers.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14785785/Ana-Armas-Ballerina-action-thriller-BRIAN-VINER.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/home_and_away/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/home_and_away/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/home_and_away/index.html on URL ht

[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14924469/Fears-state-pension-age-rise-faster-amid-government-cash-crisis-review-looks-ramping-automatic-contributions.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/tv/article-14925289/Emma-Matt-Willis-QUIT-Love-Blind-UK-replacement-host-new-Netflix-series.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14922959/Harper-Beckham-14-takes-leaf-mum-Victorias-beauty-book-leading-make-tutorial.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/sport/football/article-14926385/Paul-Gascoigne-management-update-England-legend-condition-collapse.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/sport/football/article-14926385/Paul-Gascoigne-management-update-England-legend-condition-collapse.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/sport/football/arti

[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14922991/The-ghost-town-centre-abandoned-shops-nearly-city-UK-locals-blame-drug-addicts-high-rents-OTHER.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14923145/Border-Patrol-agent-shot-illegal-immigrant-New-York-City-park.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14923613/Taliban-kills-10-Afghans-helped-West-data-leak-disaster.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14912547/80s-movie-icon-Al-Pacino-unrecognizable-rare-outing-guess-who.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/home/article-14919331/clarifications-corrections.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14924311/Nicola-Peltz-joint-venture-husband-Brooklyn-Beckham-family-feud.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14923013/Anti-semitism-r

[INFO] Scraping article: https://www.dailymail.co.uk/news/tv/article-14924975/shark-devoured-arm-leg-changed-life-protecting.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14925909/Lindsay-Lohans-legs-look-impossibly-long-glam-shot-fans-scream-Photoshop-fail.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14924469/Fears-state-pension-age-rise-faster-amid-government-cash-crisis-review-looks-ramping-automatic-contributions.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14924463/Meghan-Markle-fraud-milking-fame-Prince-Harry-Ever.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14922691/moment-Delta-Airline-Boeing-flight-engine-fire-Los-Angeles.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14925693/New-mum-Arabella-Chi-bikini-daughter-gigi.html
[INFO] Scraping article: https://www.dailymail.co.

[INFO] Scraping article: https://www.dailymail.co.uk/news/home/article-2572160/DMA-Privacy-Policy.html#ccpaExplicitNotice
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14926037/Katie-Price-47-displays-extreme-weight-loss-tiny-green-thong-bikini-Dubai-holiday-returning-UK-filler-bum.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14925191/Protest-outside-construction-site-controversial-2-5-million-mosque-sees-police-called-rival-factions-clash.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14923303/Katie-Piper-bikini-sizzling-swimwear.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14923863/Liam-Neeson-Naked-Gun-sex-scenes-Pamela-Anderson.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14926137/new-astronomer-ceo-statement-coldplay-concert-cheating-scandal.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-

[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/sport/football/article-14925403/Paul-Gascoigne-hospital-update-England-intensive-care-collapse.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/sport/football/article-14925403/Paul-Gascoigne-hospital-update-England-intensive-care-collapse.html on URL https://www.dailymail.co.uk/news/sport/football/article-14925403/Paul-Gascoigne-hospital-update-England-intensive-care-collapse.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14923633/Water-bills-set-sky-rocket-despite-shake-fix-broken-system.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14923857/Camping-trip-two-children-fiance-killed-crash.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14924689/Jennifer-Lopez-younger-popstars-Sabrina-Carpenter-Kesha-racy-performances.html
[INFO] Scraping articl

[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14921291/Alligator-Alcatraz-cut-crime-boat-migrants-not-come.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14924725/Oasis-fans-wild-Gallagher-brothers-seen-backstage-photo-Liams-new-measure-avoid-feuding-brother-Noel-revealed.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14919171/Family-fury-12-year-old-girl-stepfather-diversity-day-Union-Jack-dress.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14923013/Anti-semitism-rife-schools-pupils-teachers-warn.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14925129/Moment-ex-wife-Tory-MP-breaks-tears-recalls-moment-nearly-strangled-death.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14924953/Child-star-Drake-Bell-fumes-left-struggling-pay-rent-fat-cat-

[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14924919/asylum-seeker-Bibby-Stockholm-sexually-assaulted-woman-beach.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14923031/Love-Islands-Harrison-Solomon-Lauren-Wood-REUNITE.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14923211/wendy-williams-public-outing-birthday-guardianship.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14925667/Kelly-Clarkson-reveals-sworn-dating-five-years-leaving-husband.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14924669/Primary-school-trans-actor-Elliot-Page-lesson-masculinity.html#video
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/headlines/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/news/headlines/index.html: Article `download()` failed with 504 Server Error: Gateway Time-out fo

[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14923465/bombshell-new-book-safety-questions-Elon-Musks-dreams-world-driverless-Teslas-run-road-FREDDY-GRAY.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14926239/dog-bounty-hunter-grandson-death-update-teen-shot-father.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14924225/Dog-Bounty-Hunters-stepson-bragged-using-rifles-son-four-years-accidentally-shooting-death.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14923495/Missing-girl-Melina-Frattolin-9-dead-dad-claimed-snatched-away-white-van.html#video
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14926277/Spencer-Matthews-responds-cruel-comments-Ulrika-Jonssons-appearance-podcast.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14925149/coldplay-concert-ceo-andy-byron-scandal-boston-cabot-family-jfk.html
[INFO] Scrapin

[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14924497/Has-Liam-Neeson-finally-Love-Actually-73-Perennially-single-star-sparks-fresh-romance-rumours-Pamela-Anderson-twenty-six-years-tragic-death-beloved-wife-Natasha-Richardson.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tv/article-14926801/Love-Island-fans-accuse-Dejon-win-50k-Meg-girlfriend.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/femail/food/article-14802247/wines-summer-gatherings-taste-seriously-good-award-winning.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/femail/food/article-14802247/wines-summer-gatherings-taste-seriously-good-award-winning.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/femail/food/article-14802247/wines-summer-gatherings-taste-seriously-good-award-winning.html on URL https://www.dailymail.co.uk/news/femail/food/article-14802247/wines-summer-gatheri

[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/nfl/article-14926877/Kansas-City-Chiefs-sign-wide-receiver-Hal-Presley-ditched-Buffalo-Bills.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/sport/nfl/article-14926877/Kansas-City-Chiefs-sign-wide-receiver-Hal-Presley-ditched-Buffalo-Bills.html on URL https://www.dailymail.co.uk/ushome/sport/nfl/article-14926877/Kansas-City-Chiefs-sign-wide-receiver-Hal-Presley-ditched-Buffalo-Bills.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/meghan-markle/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/tvshowbiz/meghan-markle/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/tvshowbiz/meghan-markle/index.html on URL https://www.dailymail.co.uk/ushome/tvshowbiz/meghan-markle/index.html
[INFO] Scraping article: https://www.dailyma

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14922669/Sia-49-sparks-romance-rumours-reality-star-28-unlikely-pair-step-walk-hand-hand-LA-dinner-date.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14920621/wendy-williams-shock-transformation-guardianship-health-birthday.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/shopping-us/article-14896919/reverse-health-chair-yoga-sale.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14924137/Secret-Lives-Mormon-Wives-birth-fourth-baby-husband-mikayla-matthews.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-14925675/Baba-Vanga-vision-Psychic-trapped-tornado-stranded-visionary-powers.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/registration/profile.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/royals/article-14919249/Harry-Meghan-Britain-aides-King-secret

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/shopping-uk/article-14914423/Wickes-surprising-items-camping-garden-furniture.html?ico=mail_best_commerce_xp_desktop_185#comments
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14926359/Donald-Trump-intervention-Bryan-Kohberger-Idaho-murders.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/shopping-uk/article-14896437/Power-portable-power-station-outdoor-getaways-shop-today-50-plus-save-EXTRA-5-code.html?ico=mail_best_commerce_xp_desktop_185#comments
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14922093/ricki-lake-long-facelift-public.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14923039/Amber-heard-rare-appearance-Johnny-Depp-trial-birth-twins.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14921175/Russell-Crowe-sons-Los-Angeles.html
[INFO] Scraping article: https://www.dailymail

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/shopping-us/article-14916259/cosmedix-beauty-celebrity-lip.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/othersports/article-14920035/cavinder-twins-boob-job-update.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/othersports/article-14920035/cavinder-twins-boob-job-update.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/sport/othersports/article-14920035/cavinder-twins-boob-job-update.html on URL https://www.dailymail.co.uk/ushome/sport/othersports/article-14920035/cavinder-twins-boob-job-update.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14926459/Christina-Haack-hands-new-boyfriend-Chris-Larocca-models-flirty-tennis-skirt.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/health/article-14894679/doctor-lump-son-mouth-tumor-truth.html
[INFO] Scraping ar

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14912429/blake-lively-hiding-life-taylor-swift-justin-baldoni.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14926479/Drew-Barrymores-plain-Jane-apartment-shocks-fans.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14920069/Rebecca-Romijn-Jerry-OConnell-make-shock-financial-admission-18-year-marriage.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/othersports/article-14919659/Livvy-Dunne-feud-New-York-Times-sex-sells.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/othersports/article-14919659/Livvy-Dunne-feud-New-York-Times-sex-sells.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/sport/othersports/article-14919659/Livvy-Dunne-feud-New-York-Times-sex-sells.html on URL https://www.dailymail.co.uk/ushome/sport/othersport

[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/golf/article-14925385/chris-gotterup-doubles-career-earnings-open.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/sport/golf/article-14925385/chris-gotterup-doubles-career-earnings-open.html on URL https://www.dailymail.co.uk/ushome/sport/golf/article-14925385/chris-gotterup-doubles-career-earnings-open.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/media/article-14918391/jimmy-kimmel-blasts-cbs-stephen-colbert-canceled-late-night.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14926403/Etan-Patz-killer-conviction-overturned-pedro-hernandez.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14923803/Jeff-Bezos-Lauren-Sanchez-Europe-weeks-wedding.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/college-football/article-14925845/Deion-Sanders-give

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14920183/Kylie-Jenner-trolled-dead-eyes-unrecognizable-fashion-shoot.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14925077/Billionaire-tycoon-Charles-Cohen-faces-losing-mansions-yachts-25-supercars-embarrassing-debt-drama.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14925555/Paris-Hilton-parties-Megan-Foxs-ex-Machine-Gun-Kelly-romantically-linked-6-years-ago.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14926661/jamie-foxx-hollywood-cosby-malcolm-jamal-warner-death-tributes.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14922819/Jennifer-Lopez-shocks-simulates-sex-raunchy-performance-Tenerife.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14926811/euphoria-cleaning-kitchens-job-instagram-fans.html
[INFO] Scraping article: https://ww

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14926659/30-Rock-John-Lutz-sudden-death-younger-brother.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/home/sitemaparchive/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/home/sitemaparchive/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/home/sitemaparchive/index.html on URL https://www.dailymail.co.uk/ushome/home/sitemaparchive/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/home/contactus/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/home/contactus/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/home/contactus/index.html on URL https://www.dailymail.co.uk/ushome/home/contactus/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14922453/wedding-Charli-XCX-husband-George-Daniel-cried.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14923993/Kylie-Jenners-post-Greek-island-restaurant-goes-viral-not-good-way.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/donald_trump/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/news/donald_trump/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/news/donald_trump/index.html on URL https://www.dailymail.co.uk/ushome/news/donald_trump/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14926129/Linda-Evangelista-60-drinks-beer-son-Augustin-18-hanging-exs-wife-Salma-Hayek.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14921853/Kelly-Clarkson-daughter-River-surprise-duet-Las

[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/ushome/index.html: Article `download()` failed with 504 Server Error: Gateway Time-out for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/ushome/index.html on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/ushome/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/additionalcookieinfo
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/additionalcookieinfo: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/additionalcookieinfo on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/additionalcookieinfo
[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymai

[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/travel/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/travel/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/travel/index.html on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/travel/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/home/article-2572160/DMA-Privacy-Policy.html#ccpaExplicitNotice
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/home/article-2572160/DMA-Privacy-Policy.html#ccpaExplicitNotice: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/home/article

[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/home/sitemaparchive/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/home/sitemaparchive/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/home/sitemaparchive/index.html on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/home/sitemaparchive/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/topics
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/topics: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/topics on URL https://www.dailymail.co.uk/news/royals/https://www.dailymai

[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/shopping-au/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/shopping-au/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/shopping-au/index.html on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/shopping-au/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/prince_harry/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/prince_harry/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/prince_harry/index.html on URL https://www.dailymail.co.u

In [4]:
# processing
data = pd.read_csv('Today.csv', header=0)
data = data[data['url'].str.contains(r'/article-\d+')]

In [5]:
len(data)

818

In [6]:
# merge and de-dup
clean = pd.read_csv("clean.csv", header=0)
print(len(clean))
merge = pd.concat([data, clean], ignore_index=True)
merge.drop_duplicates(subset='title', inplace=True)
print(len(merge))
merge.to_csv("clean.csv", index=False)

1051
1296
