In [None]:
import requests
from bs4 import BeautifulSoup
import json
import csv
import time
import os
from urllib.parse import urljoin
from datetime import datetime

# ================== CONFIG ==================

# Directory that receives the JSON/CSV exports. The default is the original
# location; set the CRAWL_OUTPUT_DIR environment variable to redirect the
# output without editing the notebook (another machine, another OS, CI, ...).
OUTPUT_DIR = os.environ.get("CRAWL_OUTPUT_DIR") or r"C:\Documents\UEH"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Headers sent with every request (browser-like User-Agent).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

MAX_PAGES = 50       # upper bound on pages per category (the crawl stops earlier on a 404)
SLEEP_TIME = 1       # delay between requests, in seconds

# Category label -> URL of the first listing page of that category.
CATEGORIES = {
    "Ch√≠nh tr·ªã": "https://dantri.com.vn/thoi-su/chinh-tri.htm",
    "Kinh doanh": "https://dantri.com.vn/kinh-doanh.htm",
    "Ph√°p lu·∫≠t": "https://dantri.com.vn/phap-luat.htm",
    "Gi√°o d·ª•c": "https://dantri.com.vn/giao-duc.htm",
    "Gi·∫£i tr√≠": "https://dantri.com.vn/giai-tri.htm",
    "Th·ªÉ thao": "https://dantri.com.vn/the-thao.htm",
    "S·ª©c kh·ªèe": "https://dantri.com.vn/suc-khoe.htm",
    "ƒê·ªùi s·ªëng": "https://dantri.com.vn/doi-song.htm"
}

# ================== FETCH (404-SAFE) ==================

def fetch(url):
    """Download ``url`` and return its body as UTF-8 decoded text.

    Returns ``None`` when the page cannot be used:

    * the server answered 404 (the caller treats this as "no more pages"), or
    * the request failed (timeout, connection error, other HTTP error status).

    Request failures are printed before returning ``None`` so that a network
    problem is not silently mistaken for the end of pagination.
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=15)

        # 404 = ran out of pages -> the caller stops this category
        if r.status_code == 404:
            return None

        r.raise_for_status()
        # Force UTF-8 instead of relying on the charset guessed from headers.
        r.encoding = "utf-8"
        return r.text

    except requests.RequestException as exc:
        # Only network/HTTP failures are expected here; anything else is a
        # programming error and should surface instead of being swallowed.
        print(f"  [fetch error] {url}: {exc}")
        return None

# ================== CRAWL ==================

# Collected link records, and the set of URLs already recorded. The set is
# shared by all categories, so a URL is stored once, under the first category
# in which it is seen.
articles = []
seen_links = set()

for category, base_url in CATEGORIES.items():
    print(f"\nüì∞ D√ÇN TR√ç | {category}")

    # Drop only the trailing ".htm" (str.replace would remove every
    # occurrence) so that page N becomes "<base>/trang-N.htm".
    base = base_url[:-len(".htm")] if base_url.endswith(".htm") else base_url

    # Scheme + host of the category page, e.g. "https://dantri.com.vn/".
    # Used below to reject links that leave the site.
    site_root = urljoin(base_url, "/")

    for page in range(1, MAX_PAGES + 1):
        url = base_url if page == 1 else f"{base}/trang-{page}.htm"
        print(f"  ‚Ü≥ Trang {page}: {url}")

        html = fetch(url)
        if not html:
            # fetch() returns None on a 404 or a failed request.
            print(f"  ‚õî H·∫øt trang t·∫°i page {page} ‚Üí d·ª´ng category")
            break

        soup = BeautifulSoup(html, "html.parser")
        new_count = 0

        for a in soup.find_all("a", href=True):
            # Prefer the anchor's title attribute, fall back to its text.
            title = a.get("title") or a.get_text(strip=True)
            href = a["href"].strip()

            # Need a non-empty title and a ".htm" target
            if not title or not href.endswith(".htm"):
                continue

            # Resolve relative hrefs against the category URL.
            full_url = urljoin(base_url, href)

            # Only keep links that stay on the crawled site; ".htm" links to
            # other hosts are not articles of this source.
            if not full_url.startswith(site_root):
                continue

            if full_url in seen_links:
                continue

            seen_links.add(full_url)
            articles.append({
                "source": "DanTri",
                "category": category,
                "title": title,
                "url": full_url,
                "crawl_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })
            new_count += 1

        print(f"    ‚úì +{new_count} b√†i")

        # A page that yields no new link is treated as the end of the category
        if new_count == 0:
            print("  ‚õî Kh√¥ng c√≤n b√†i ‚Üí d·ª´ng category")
            break

        # Pause between page requests.
        time.sleep(SLEEP_TIME)

# ================== SAVE ==================

# Both exports go to OUTPUT_DIR and contain the same records.
csv_path = os.path.join(OUTPUT_DIR, "dantri_links.csv")
json_path = os.path.join(OUTPUT_DIR, "dantri_links.json")

# JSON export: pretty-printed, non-ASCII characters written as-is.
with open(json_path, "w", encoding="utf-8") as f:
    f.write(json.dumps(articles, ensure_ascii=False, indent=2))

# CSV export: UTF-8 with BOM ("utf-8-sig"), one row per collected link.
with open(csv_path, "w", encoding="utf-8-sig", newline="") as f:
    writer = csv.DictWriter(
        f, fieldnames=["source", "category", "title", "url", "crawl_time"]
    )
    writer.writeheader()
    for record in articles:
        writer.writerow(record)

# Final summary: total number of links and where the files were written.
print("\n‚úÖ HO√ÄN T·∫§T CRAWL D√ÇN TR√ç")
print(f"T·ªîNG LINK: {len(articles)}")
print(f"üíæ JSON: {json_path}")
print(f"üíæ CSV : {csv_path}")



üì∞ D√ÇN TR√ç | Ch√≠nh tr·ªã
  ‚Ü≥ Trang 1: https://dantri.com.vn/thoi-su/chinh-tri.htm
    ‚úì +176 b√†i
  ‚Ü≥ Trang 2: https://dantri.com.vn/thoi-su/chinh-tri/trang-2.htm
    ‚úì +15 b√†i
  ‚Ü≥ Trang 3: https://dantri.com.vn/thoi-su/chinh-tri/trang-3.htm
    ‚úì +20 b√†i
  ‚Ü≥ Trang 4: https://dantri.com.vn/thoi-su/chinh-tri/trang-4.htm
    ‚úì +19 b√†i
  ‚Ü≥ Trang 5: https://dantri.com.vn/thoi-su/chinh-tri/trang-5.htm
    ‚úì +25 b√†i
  ‚Ü≥ Trang 6: https://dantri.com.vn/thoi-su/chinh-tri/trang-6.htm
    ‚úì +20 b√†i
  ‚Ü≥ Trang 7: https://dantri.com.vn/thoi-su/chinh-tri/trang-7.htm
    ‚úì +20 b√†i
  ‚Ü≥ Trang 8: https://dantri.com.vn/thoi-su/chinh-tri/trang-8.htm
    ‚úì +19 b√†i
  ‚Ü≥ Trang 9: https://dantri.com.vn/thoi-su/chinh-tri/trang-9.htm
    ‚úì +20 b√†i
  ‚Ü≥ Trang 10: https://dantri.com.vn/thoi-su/chinh-tri/trang-10.htm
    ‚úì +25 b√†i
  ‚Ü≥ Trang 11: https://dantri.com.vn/thoi-su/chinh-tri/trang-11.htm
    ‚úì +19 b√†i
  ‚Ü≥ Trang 12: https://dantri.com.vn/thoi-su/ch

In [3]:
import requests, csv, json, time, os
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime

# Directory that receives the JSON export. The default is the original
# location; set the CRAWL_OUTPUT_DIR environment variable to redirect the
# output without editing the notebook (another machine, another OS, CI, ...).
OUTPUT_DIR = os.environ.get("CRAWL_OUTPUT_DIR") or r"C:\Documents\UEH"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Category label -> URL of the first listing page of that category.
CATEGORIES = {
    "Ch√≠nh tr·ªã": "https://vnexpress.net/thoi-su/chinh-tri",
    "Kinh doanh": "https://vnexpress.net/kinh-doanh",
    "Ph√°p lu·∫≠t": "https://vnexpress.net/phap-luat",
    "Gi√°o d·ª•c": "https://vnexpress.net/giao-duc",
    "Gi·∫£i tr√≠": "https://vnexpress.net/giai-tri",
    "Th·ªÉ thao": "https://vnexpress.net/the-thao",
    "S·ª©c kh·ªèe": "https://vnexpress.net/suc-khoe",
    "ƒê·ªùi s·ªëng": "https://vnexpress.net/doi-song",
    "C√¥ng ngh·ªá": "https://vnexpress.net/khoa-hoc-cong-nghe"
}

# Headers sent with every request.
HEADERS = {"User-Agent": "Mozilla/5.0"}
# Upper bound on listing pages requested per category.
MAX_PAGES = 50

# Collected link records, and the set of URLs already recorded (shared by all
# categories, so each URL is stored once).
articles, seen = [], set()

def fetch(url):
    """Download ``url`` and return its body as UTF-8 decoded text.

    Raises ``requests.RequestException`` (including ``requests.HTTPError`` for
    4xx/5xx statuses via ``raise_for_status``) when the request fails.
    """
    r = requests.get(url, headers=HEADERS, timeout=15)
    r.raise_for_status()
    # Decode as UTF-8 explicitly: when a text response carries no charset,
    # Requests falls back to ISO-8859-1, which would garble Vietnamese text.
    r.encoding = "utf-8"
    return r.text

for category, base_url in CATEGORIES.items():
    print(f"\nüì∞ VNExpress | {category}")
    for page in range(1, MAX_PAGES + 1):
        # Page 1 is the category URL itself; later pages append "-p<N>".
        url = base_url if page == 1 else f"{base_url}-p{page}"

        # fetch() raises on HTTP/network errors. Without this guard a single
        # failed page would abort the whole cell before anything is saved,
        # losing every link collected so far. Stop this category instead.
        try:
            html = fetch(url)
        except requests.RequestException as exc:
            print(f"  Trang {page}: [fetch error] {exc}")
            break

        soup = BeautifulSoup(html, "html.parser")

        new = 0
        for a in soup.find_all("a", href=True):
            # Prefer the anchor's title attribute, fall back to its text.
            title = a.get("title") or a.get_text(strip=True)
            href = a["href"]

            # Need a non-empty title and a ".html" target.
            if not title or not href.endswith(".html"):
                continue

            # Resolve relative hrefs against the category URL.
            full = urljoin(base_url, href)
            if full in seen:
                continue

            seen.add(full)
            articles.append({
                "source": "VNExpress",
                "category": category,
                "title": title,
                "url": full,
                "crawl_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })
            new += 1

        print(f"  Trang {page}: +{new}")
        # A page that yields no new link ends the category.
        if new == 0:
            break
        # Pause one second between page requests.
        time.sleep(1)

# Write all collected records to OUTPUT_DIR as pretty-printed JSON, keeping
# non-ASCII characters as-is, then report how many links were gathered.
with open(
    os.path.join(OUTPUT_DIR, "vnexpress_links.json"), "w", encoding="utf-8"
) as f:
    f.write(json.dumps(articles, ensure_ascii=False, indent=2))

print(f"\n‚úÖ VNExpress xong: {len(articles)} link")


üì∞ VNExpress | Ch√≠nh tr·ªã
  Trang 1: +30
  Trang 2: +30
  Trang 3: +30
  Trang 4: +30
  Trang 5: +30
  Trang 6: +30
  Trang 7: +30
  Trang 8: +30
  Trang 9: +30
  Trang 10: +30
  Trang 11: +30
  Trang 12: +30
  Trang 13: +30
  Trang 14: +30
  Trang 15: +30
  Trang 16: +30
  Trang 17: +30
  Trang 18: +30
  Trang 19: +30
  Trang 20: +30
  Trang 21: +0

üì∞ VNExpress | Kinh doanh
  Trang 1: +50
  Trang 2: +30
  Trang 3: +30
  Trang 4: +30
  Trang 5: +30
  Trang 6: +30
  Trang 7: +30
  Trang 8: +30
  Trang 9: +30
  Trang 10: +30
  Trang 11: +30
  Trang 12: +30
  Trang 13: +30
  Trang 14: +30
  Trang 15: +30
  Trang 16: +30
  Trang 17: +30
  Trang 18: +30
  Trang 19: +30
  Trang 20: +30
  Trang 21: +0

üì∞ VNExpress | Ph√°p lu·∫≠t
  Trang 1: +47
  Trang 2: +30
  Trang 3: +30
  Trang 4: +30
  Trang 5: +30
  Trang 6: +29
  Trang 7: +29
  Trang 8: +30
  Trang 9: +30
  Trang 10: +30
  Trang 11: +30
  Trang 12: +30
  Trang 13: +30
  Trang 14: +30
  Trang 15: +30
  Trang 16: +29
  Trang 17