In [1]:
import requests
from newspaper import Article
from bs4 import BeautifulSoup
import csv
import time
import pandas as pd

In [2]:
def extract_article_from_url(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        return {
            "title": article.title,
            "text": article.text,
            "url": url
        }
    except Exception as e:
        print(f"[ERROR] Failed to extract article from {url}: {e}")
        return None

def get_article_links_from_homepage(homepage_url, limit=10):
    try:
        response = requests.get(homepage_url)
        soup = BeautifulSoup(response.text, "html.parser")
        links = set()

        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            if href.startswith("/") and not href.startswith("//"):
                href = homepage_url.rstrip("/") + href
            if homepage_url in href:
                links.add(href)

        return list(links)[:limit]
    except Exception as e:
        print(f"[ERROR] Failed to get links from {homepage_url}: {e}")
        return []

def scrape_news_site(urls, output_csv, article_limit=10):
    rows = []
    for homepage_url in urls: 
        article_urls = get_article_links_from_homepage(homepage_url, limit=article_limit)
        print(f"Found {len(article_urls)} article links.")

    
        for url in article_urls:
            print(f"[INFO] Scraping article: {url}")
            article = extract_article_from_url(url)
            if article and article["text"].strip():
                rows.append(article)
            time.sleep(1)  # To avoid getting blocked

    with open(output_csv, "w", newline='', encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["title", "text", "url"])
        writer.writeheader()
        writer.writerows(rows)

    print(f"[DONE] Saved {len(rows)} articles to {output_csv}")


In [4]:
urls = [
    "https://www.dailymail.co.uk/tvshowbiz", 
    "https://www.dailymail.co.uk/news",
    "https://www.dailymail.co.uk/ushome",
    "https://www.dailymail.co.uk/tv"
    "https://www.dailymail.co.uk/auhome/",
    "https://www.dailymail.co.uk/news/royals/"
    "https://www.dailymail.co.uk/home/"
]
scrape_news_site(urls, output_csv="July.csv", article_limit=1000)

Found 193 article links.
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14916861/Jessie-J-set-huge-TV-comeback-breast-cancer.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14700463/john-travolta-honors-late-wife-emily-ratajkowski-throwback-mothers-day.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/health/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14696037/Kim-Alexis-64-talks-pressure-staying-beautiful.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/news/royals/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/tvshowbiz/news/royals/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/tvshowbiz/news/royals/index.html on URL https://www.dailymail.co.uk/tvshowbiz/news/royals/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbi

[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/textbased/channel-1/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/tvshowbiz/textbased/channel-1/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/tvshowbiz/textbased/channel-1/index.html on URL https://www.dailymail.co.uk/tvshowbiz/textbased/channel-1/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14693305/jennifer-aniston-stalker-glare-shirtless-court-photos-gate-crash.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14916221/Chris-Hemsworth-wife-Elsa-Pataky-supported-brother-Liam-Limitless-Live-Better-premiere-London.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/auhome/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14696203/Al-Pacino-mini-son-wears-rocker-T-shirt.html
[INFO] Scraping a

[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14693713/Ben-Afflecks-Chasing-Amy-costar-worked-Jennifer-Aniston-looks-good.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/home/article-2572160/DMA-Privacy-Policy.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14693163/Cristiano-Ronaldos-girlfriend-dress-inspired-Princess-Diana.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/article-14785785/Ana-Armas-Ballerina-action-thriller-BRIAN-VINER.html#video
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14866497/review-Evita-Rachel-Zegler-knockout.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14763497/PATRICK-MARMION-reviews-Marriage-Material.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14696335/Kanye-West-Kim-Kardashian-exploiting-children-legal-attack.html
[INFO] Scrapin

[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14694131/Gigi-Hadid-solo-Bradley-Cooper-daughter-major-step-romance.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/home_and_away/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/home_and_away/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/home_and_away/index.html on URL https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/home_and_away/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14915597/Liam-Paynes-girlfriend-Kate-Cassidy-breaks-tears-marks-nine-months-One-Direction-stars-tragic-death.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvshowbiz/tvshowbiz/article-14699803/Kelly-Clarkson-telling-remarks-demanding-talk-gig-mystery-absence.html
[INFO] Scraping article: https://www.dailymail.co.uk/tvs

[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14911541/Steven-Gerrard-left-smitten-shared-touching-moment-forget-gangster-links-no-longer-fancy-KATIE-HIND.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14914003/Myleene-Klass-sets-pulses-racing-strips-recreate-Princess-Margarets-famous-steamy-bathtub-snap-commemorate-MBE-honour.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14915183/Denise-Richards-accuses-Aaron-Phypers-physical-abuse.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14916429/Holly-Hagan-blasts-Geordie-Shore-lack-duty-care.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14885121/Britains-lawless-fuel-stations-Motorists-drive-without-paying-one-owner-says-new-scam-organised-crime-gangs-cost-10-000-claims-police-nothing.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14913671/Christine

[INFO] Scraping article: https://www.dailymail.co.uk/news/tv/article-14913889/BBC-look-Call-Midwife-Christmas-special-worrying.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14913841/Vanessa-Feltz-parties-Myleene-Klass-ex-Ben-Ofoedu-married.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14915001/Video-Disgusting-moment-fast-food-worker-SPITS-food-packing-customers-burger-takeaway-order.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/ushome/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14912703/MPs-launch-two-probes-Governments-secret-immigration-scheme-Afghans.html#video
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14913703/British-tourist-drinks-bill-attackers-stop-Thai-bar.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14915403/Police-probing-murder-baby-boy-dumped-canal-four-years-ago-offer-10-000-reward.html
[INFO] Scrap

[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14914485/Moment-French-coastguard-hand-lifejackets-migrants-small-boat-Channel-Britain.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14913925/daughter-bitter-court-battle-siblings-600000-fortune.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14911169/Kinahan-Cartel-infiltrated-world-sport-showbiz-Stephen-Gerrards-daughter-welcomes-baby-jailed-gangsters-son.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14917005/Kate-Beckinsale-heartbreaking-mother-passed-away-immeasurable-suffering.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14914161/Pictured-Two-British-gentle-kind-hearted-souls-drowned-Portuguese-hotel-swimming-pool-friends-launch-fundraiser-bring-bodies-home.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14916815/Bethenny-Frankel-skim

[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14914191/Labour-confirms-16-year-olds-vote-general-election.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14916063/Iconic-80s-star-youthful-60th-birthday-rare-outing.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14913871/Olivia-Attwood-reveals-thieves-tried-break-away-holiday-warns-weve-got-fingerprints.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14912703/MPs-launch-two-probes-Governments-secret-immigration-scheme-Afghans.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/shopping-au/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14914485/Moment-French-coastguard-hand-lifejackets-migrants-small-boat-Channel-Britain.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14913839/Julian-McMahon-Dannii-Minogue-interview-dati

[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14914243/Chloe-Madeley-showcases-incredibly-ripped-legs-tiny-denim-shorts-London-day-out.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14913091/reality-small-town-exposed-Aussies-flee-Sydney-Melbourne-housing-crisis.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14913915/dozens-dead-Iraq-shopping-mall-inferno.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14902643/drug-australia-meth-addict-ice.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/au/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/news/au/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/news/au/index.html on URL https://www.dailymail.co.uk/news/news/au/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tv

[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14912783/Angelina-Jolie-Shiloh-lives-rumored-girlfriend.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14915373/Amy-Childs-displays-extreme-weight-loss-yellow-shorts-set-nips-run-errands-Essex.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14913491/Steve-Miller-Band-CANCELS-entire-2025-tour.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14913991/footage-Tomorrowland-inferno-huge-smoke-cloud-fireworks-houses-stage-destroyed-festival.html?ico=comment-anchor#comments
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14915163/Wynne-Evans-jokes-dark-latest-post-plugs-new-radio-leaving-BBC-following-Strictly-controversy.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14912483/ariana-grande-ex-fiance-pete-davidson-baby-girlfriend-elsie-hewitt.html
[INFO] Scraping articl

[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/sport/mma/article-14914719/Conor-McGregor-fiancee-Dee-Devlin-yacht-nudes-leak.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/sport/mma/article-14914719/Conor-McGregor-fiancee-Dee-Devlin-yacht-nudes-leak.html on URL https://www.dailymail.co.uk/news/sport/mma/article-14914719/Conor-McGregor-fiancee-Dee-Devlin-yacht-nudes-leak.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14912795/Alessandra-Ambrosio-topless-sunbathes-Ibiza-holiday.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14914589/Speeding-man-hit-killed-15-year-old-schoolgirl.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14914177/Katie-Price-reveals-pulled-passport-queue-airport-officials-recognise-extensive-plastic-surgery.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article

[INFO] Scraping article: https://www.dailymail.co.uk/news/article-14914481/Extreme-sports-influencer-famed-dare-devil-stunts-plunges-650ft-death-Dolomites-hours-posting-haunting-Instagram-video-mountain-top.html#video
[INFO] Scraping article: https://www.dailymail.co.uk/news/tv/article-14913805/Good-Morning-Britain-viewers-painful-Kate-Garraway-Richard-Madeley.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14913291/khlo-kardashian-photoshop-admission-cartoon.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/tv/article-14913873/80-year-old-woman-rescue-stranded-mountain.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14860679/There-rituals-wont-budge-TVs-Kate-Lawler-husband-Boj-finding-middle-ground-household-costs.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14914563/Defence-Secretary-Healey-disclosed-details-secret-Afghan-database-Parliament-despite-Mail-blocked-reporting-it.html


[INFO] Scraping article: https://www.dailymail.co.uk/news/tvshowbiz/article-14914293/Chris-Martin-couple-camera-affair-Coldplay-concert-Boston.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/news/article-14911069/Furious-locals-Molly-Mae-influencer-sister-BANNED-returning-Indonesia-Bali-holiday.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/femail/real-life/article-14914397/Like-wild-animals-American-woman-berates-Oasis-fans-behaviour-Heaton-Park-gig.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/femail/real-life/article-14914397/Like-wild-animals-American-woman-berates-Oasis-fans-behaviour-Heaton-Park-gig.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/femail/real-life/article-14914397/Like-wild-animals-American-woman-berates-Oasis-fans-behaviour-Heaton-Park-gig.html on URL https://www.dailymail.co.uk/news/femail/real-life/article-14914397/Like-wild-animals-America

[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/othersports/article-14913087/shane-gillis-jeffrey-epstein-joke-espys.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/sport/othersports/article-14913087/shane-gillis-jeffrey-epstein-joke-espys.html on URL https://www.dailymail.co.uk/ushome/sport/othersports/article-14913087/shane-gillis-jeffrey-epstein-joke-espys.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/mma/article-14910263/Conor-McGregor-Azealia-Banks-nude-cheating-scandal.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/mma/article-14910263/Conor-McGregor-Azealia-Banks-nude-cheating-scandal.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/sport/mma/article-14910263/Conor-McGregor-Azealia-Banks-nude-cheating-scandal.html on URL https://www.dailymail.co.uk/ushome/sport/m

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14916655/hiker-cole-henderson-disappears-spain-pyrenees-mountains.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14911275/Nicole-Scherzingers-plunging-white-linen-maxi-dress-perfect-holiday-wardrobe-stable-weve-high-street.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14907033/Brooklyn-beckham-forced-withdraw-application-trademark-dispute-family-feud.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tv/article-14913861/First-pics-set-Harry-Potter-TV-series-London-Zoo.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14912415/pete-davidson-girlfriend-elsie-hewitt-model-pregnant-child.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/nfl/article-14916971/Travis-Kelce-Jason-future-New-Heights-podcast.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14913881/Reckless-driver-street-brawl-BMW-crash-London.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/shopping-uk/article-14889447/Move-Prosecco-ASDAs-new-multi-award-winning-7-sparkling-wine-set-tipple-summer.html?ico=mail_best_commerce_xp_desktop_185
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/shopping-uk/article-14903465/green-people-soothing-sun-tan-accelerator-discount-code-sale.html?ico=mail_best_commerce_xp_desktop_337
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14906795/Britpop-icon-looks-unrecognisable-pictured-Hollywood-hotel.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14906533/Romeo-Beckhams-ex-Mia-Regan-shows-incredible-figure-daring-backless-yellow-dress-launches-new-H-M-collection.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14915249/Obama-private-Marthas-Vineyar

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14907939/Justin-Timberlake-video-screams-stage-crew-concert-meltdown.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14904077/Macaulay-Culkin-Brenda-Song-family-outing-flowers-shopping.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/othersports/article-14913093/Simone-Biles-fury-gender-2025-ESPYs-speech-Riley-Gaines-feud.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/othersports/article-14913093/Simone-Biles-fury-gender-2025-ESPYs-speech-Riley-Gaines-feud.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/sport/othersports/article-14913093/Simone-Biles-fury-gender-2025-ESPYs-speech-Riley-Gaines-feud.html on URL https://www.dailymail.co.uk/ushome/sport/othersports/article-14913093/Simone-Biles-fury-gender-2025-ESPYs-speech-Riley-Gaines-feud.html
[INFO] Scra

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14915257/jd-vance-epstein-attacks-pressure-defy-trump-expose-truth.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/health/article-14903303/Supernanny-Jo-Frost-reveals-little-known-health-condition-kill-bad-shoving-loaded-gun-face.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14911943/Madonna-Gwyneth-Paltrow-feud-Lourdes.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14916505/peter-schultz-cessna-pilot-landing-san-diego-crash.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14906507/Inside-Drakes-star-studded-Wireless-afterparty-list-celebs-OnlyFans-star-fans-left-disappointed-40-minute-festival-performance.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14908321/Former-Playboy-model-Holly-Madison-steps-bandage-wrapped-head-LA.html
[INFO] Scraping article: https:/

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14914155/Insider-reveals-Brigitte-Macron-turmoil-claims-born-man-battle-against-trolls-conspiracy-theorists.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14915773/Billion-dollar-dynasty-war-heiress-battles-dad-younger-fourth-wife-shocking-sex-allegations.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14912645/blake-lively-toxic-legal-justin-baldoni-feud.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14912483/ariana-grande-ex-fiance-pete-davidson-baby-girlfriend-elsie-hewitt.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/nba/article-14913523/Megan-Thee-Stallion-Klay-Thompson-relationship-milestone.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/nba/article-14913523/Megan-Th

[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/au/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/sport/au/index.html on URL https://www.dailymail.co.uk/ushome/sport/au/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-14911881/sarah-michelle-gellar-skincare-secret-youthful.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14914121/Air-India-plane-crash-investigation-focuses-captain-remained-calm-officer-panicked-fuel-supply-cut-moments-disaster.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14908261/Jennifer-Aniston-break-silence-romance-guru.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14910245/Natalie-Portman-leggy-display-Lena-Dunham-Good-Sex-Rashida-Jones.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/home/article-2572160/DMA-P

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14910057/Elle-Dakota-Fanning-smiles-enjoy-sisters-night.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14905471/Blake-Lively-deposition-location-Justin-Baldoni-legal-battle.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-14908583/rachael-ray-swimming-tide-career-critical-transition.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14915161/dan-rivera-paranormalk-investigator-death-annabelle-doll-tour.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14904993/Angelina-Jolies-mini-Shiloh-proves-shes-no-fashion-snob-recycles-old-50-hoodie-gal-pal.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14916123/Dentist-used-lewd-affair-email-chemical-used-poison-wife-fast-tracked-delivered-murder-trial-hears.html
[INFO] Scraping article: https://www.dailymail.co.uk

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/article-14916413/tina-brown-magazine-editor-jeffrey-epstein-encounter.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14917007/lana-del-rey-simple-louisiana-life-husband-jeremy-dufrene.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/health/article-14904343/Plastic-surgeon-spill-secrets-Brad-Pitt-facelift.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14910033/Pierce-Brosnans-son-Paris-looks-smitten-model-girlfriend-Alex-Lee-Aillon-opening-art-exhibition-Munich.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-14901243/charlene-monaco-prince-albert-gushing-speech.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14916085/Martin-Freeman-legal-row-screaming-children-ruining-life.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14916695/Downton

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14904561/Jessica-Biel-43-admits-peak-shape-not-maintainable-without-strict-workouts.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/femail/article-14910997/taylor-swift-dad-undergoes-major-heart-surgery.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14915279/Catherine-Zeta-Jones-Michael-Douglas-son-Dylan-hits-falsely-accused-scumbag-caused-chaos-Beyonce-concert.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/sport/nfl/article-14904951/joy-taylor-fox-sports-fs1-speak-sex-lawsuit.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/sport/nfl/article-14904951/joy-taylor-fox-sports-fs1-speak-sex-lawsuit.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/ushome/sport/nfl/article-14904951/joy-taylor-fox-sports-fs1-speak-sex-lawsuit.html on URL https://www.dailymail.co.

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14902679/Pregnant-Rihanna-sons-Smurfs-premiere-LA.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14912379/Award-winning-singer-major-role-animated-film.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14905455/rihanna-fathers-death-shaped-powerful-parenting-decision.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14905255/nepo-babies-music-legend-dads-photo-rocks-internet.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/privacy
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tv/article-14911513/Stranger-Things-fans-crying-Netflix-trailer-final-series.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/news/royals/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/ushome/news/royals/index.html: Article `download()` failed with 404 Client Er

[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14905361/justin-timberlake-lytham-music-festival.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14907497/blake-lively-legal-blitz-pro-Baldoni-youtubers-whistleblowers.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/media/article-14917017/CBS-cancels-Late-Stephen-Colbert-blasts-settlement-Trump.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14912825/Blake-Lively-deposition-postponed-Justin-Baldoni-legal-battle.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14909201/Jennifer-Love-Hewitt-secrets-fit-healthy-Hollywoods-Ozempic-craze.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/health/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/ushome/tvshowbiz/article-14904963/Sabrina-Carpenter-goes-bra-free-backless-ruby-red-dress-sipping-wine.html
[INFO] Scraping ar

[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/prince_harry/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/prince_harry/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/prince_harry/index.html on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/prince_harry/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https:/www.dailymail.co.uk/index.html on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/
[INFO] S

[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/howcomplain
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/howcomplain: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/howcomplain on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/howcomplain
[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/royals/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/royals/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/royals/index.html on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/news/royals/ind

[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/authors: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/authors on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/authors
[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/topics
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/topics: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/topics on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/topics
[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/home/article-3633654/CONTRIBUTIONS-STANDARD-TERMS-CONDITIONS.html
[ERROR] Failed

[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/tvshowbiz/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/tvshowbiz/index.html: Article `download()` failed with 504 Server Error: Gateway Time-out for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/tvshowbiz/index.html on URL https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/tvshowbiz/index.html
[INFO] Scraping article: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/home/contactus/index.html
[ERROR] Failed to extract article from https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/home/contactus/index.html: Article `download()` failed with 404 Client Error: Not Found for url: https://www.dailymail.co.uk/news/royals/https://www.dailymail.co.uk/home/home/contactus/index.html on URL https://www.dailymail.co.uk/news/roy

In [1]:
#processing
data = pd.read_csv('July.csv', header=0)
print(len(data))
data.head()

In [13]:
# 1. drop the duplicates by title 
data.drop_duplicates(subset='title', inplace=True)
len(data)

645

In [15]:
# 2. drop the none article row
data = data[data['url'].str.contains(r'/article-\d+')]
len(data)

In [17]:
data.to_csv("clean.csv", index=False)