# In [None]:
import csv
import json
import os
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# ================== SETUP DRIVER ==================

def setup_driver():
    """Create and return a Chrome WebDriver configured for headless use.

    The browser is started with the headless and disable-gpu flags, a
    1920x1080 window size, and a desktop Windows user-agent string.

    Returns:
        The ``webdriver.Chrome`` instance built from those options.
    """
    chrome_flags = (
        "--headless",
        "--disable-gpu",
        "--window-size=1920,1080",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    )

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    return webdriver.Chrome(options=chrome_options)

# ================== CRAWL ONE ARTICLE BY URL ==================

def _text_or_default(locate, default):
    """Return the stripped text of the element that *locate* yields.

    *locate* is a zero-argument callable returning an object with a ``text``
    attribute. Any ``Exception`` raised while locating or reading the element
    yields *default* instead; ``KeyboardInterrupt`` and ``SystemExit`` are
    deliberately not caught so the crawl can still be interrupted.
    """
    try:
        return locate().text.strip()
    except Exception:
        return default


def crawl_article(driver, url, timeout=10):
    """Load one article page and extract its fields on a best-effort basis.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: Address of the article to load.
        timeout: Seconds to wait for the title element to appear.

    Returns:
        A dict with the keys "url", "title", "author", "publish_date",
        "content" and "status". A field that cannot be located falls back to
        a placeholder ("N/A" for title and publish date, "VnExpress" for the
        author); "content" is an empty string when no paragraph matches.
        If loading the page or collecting the content raises, "status" is
        "failed" and an extra "error" key holds the exception message;
        otherwise "status" is "success".
    """
    data = {
        "url": url,
        "title": "",
        "author": "",
        "publish_date": "",
        "content": "",
        "status": "success"
    }

    try:
        driver.get(url)
        wait = WebDriverWait(driver, timeout)

        # Title: the only lookup that waits; the later lookups query the
        # page immediately after this one has returned.
        data["title"] = _text_or_default(
            lambda: wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "h1.title-detail, h1.title_news_detail, h1")
                )
            ),
            "N/A",
        )

        # Author
        data["author"] = _text_or_default(
            lambda: driver.find_element(
                By.CSS_SELECTOR, "p.author_mail strong, p.author strong"
            ),
            "VnExpress",
        )

        # Publish date
        data["publish_date"] = _text_or_default(
            lambda: driver.find_element(
                By.CSS_SELECTOR, "span.date, span.time"
            ),
            "N/A",
        )

        # Content: join the non-empty paragraphs, one per line. Each
        # paragraph's text is read once rather than twice.
        paragraphs = driver.find_elements(
            By.CSS_SELECTOR, "article.fck_detail p.Normal, div.fck_detail p"
        )
        texts = [p.text.strip() for p in paragraphs]
        data["content"] = "\n".join(t for t in texts if t)

        return data

    except Exception as exc:
        # Report the failure in the record instead of raising, so the caller
        # can keep going with the remaining URLs.
        data["status"] = "failed"
        data["error"] = str(exc)
        return data

# ================== RUN FROM JSON FILE ==================

def _write_results_csv(results, path):
    """Write the crawl records in *results* to *path* as a fully quoted CSV.

    The ``csv`` module escapes embedded double quotes by doubling them, so
    field text is preserved rather than having its quotes removed. Keys
    outside the fixed column list (such as "error" on failed records) are
    left out of the CSV.
    """
    columns = (
        "category", "title", "author", "publish_date",
        "content", "url", "status",
    )
    # newline="" hands line-ending control to the csv module, as its
    # documentation requires; utf-8-sig writes a BOM at the start of the file.
    with open(path, "w", encoding="utf-8-sig", newline="") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=columns,
            quoting=csv.QUOTE_ALL,
            extrasaction="ignore",
        )
        writer.writeheader()
        writer.writerows(results)


def crawl_from_existing_urls(
    input_file=r"C:\Documents\UEH\HKC 2025\vnexpress_articles.json",
    output_dir=r"C:\Documents\UEH\HKC 2025"
):
    """Crawl every article listed in a JSON file and save the results.

    The input file must contain a JSON object that maps each category name
    to a list of objects carrying a "url" key. Every URL is crawled with one
    shared browser session, and the collected records are written to
    ``vnexpress_fulltext_from_urls.json`` and
    ``vnexpress_fulltext_from_urls.csv`` inside *output_dir*.

    Args:
        input_file: Path of the JSON file that lists the article URLs.
        output_dir: Directory for the output files; created if missing.

    Returns:
        The list of article records, each extended with a "category" key.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Read the existing URL list before starting the browser, so a missing
    # or malformed input file does not leave a browser process behind.
    with open(input_file, "r", encoding="utf-8") as f:
        articles_by_category = json.load(f)

    driver = setup_driver()
    results = []

    try:
        for category, articles in articles_by_category.items():
            print(f"\n📰 {category}")

            for i, article in enumerate(articles, 1):
                url = article["url"]
                print(f"  [{i}/{len(articles)}] {url}")

                data = crawl_article(driver, url)
                data["category"] = category
                results.append(data)

                # Pause two seconds after each article (presumably to avoid
                # hammering the site).
                time.sleep(2)

        # Save the results as JSON
        out_json = os.path.join(output_dir, "vnexpress_fulltext_from_urls.json")
        with open(out_json, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

        # Save the results as CSV
        out_csv = os.path.join(output_dir, "vnexpress_fulltext_from_urls.csv")
        _write_results_csv(results, out_csv)

        print("\n✅ HOÀN TẤT")
        print(f"📄 JSON: {out_json}")
        print(f"📊 CSV : {out_csv}")

        return results

    finally:
        # Always shut the browser down, even if the crawl or a write failed.
        driver.quit()

# ================== RUN ==================

# Run the crawl with the default input/output paths when executed as a script.
if __name__ == "__main__":
    crawl_from_existing_urls()