In [1]:
# Đoạn code này có thể bị crash do một số nguyên nhân phổ biến sau:
# 1. Một số link không trả về nội dung (timeout, lỗi mạng, bị chặn bởi bot, v.v.)
# 2. Một số trang không có thẻ <article> nên content = None, dẫn đến file html rỗng hoặc lỗi khi convert PDF.
# 3. Quá nhiều luồng (6 luồng) có thể gây quá tải mạng hoặc bị server chặn.
# 4. Thư viện weasyprint có thể lỗi khi convert HTML không hợp lệ hoặc thiếu resource (CSS, font, v.v.)
# 5. Không log ra lỗi chi tiết nên khó debug.

# Để debug, bạn nên in/log lỗi chi tiết trong except, ví dụ:
import os
import requests
from bs4 import BeautifulSoup
from weasyprint import HTML
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Link lists to crawl: one text file per Healthline category, each named
# "healthline_<category>_links.txt" and holding one URL per line.
URL_files = [
    f"healthline_{category}_links.txt"
    for category in (
        "cancer-care",
        "fitness",
        "mental-health",
        "nutrition",
        "reviews",
    )
]

# Output folders for the extracted HTML fragments and the rendered PDFs;
# exist_ok makes re-running the cell harmless.
os.makedirs("data_html", exist_ok=True)
os.makedirs("data_pdf", exist_ok=True)

def url_to_filename(url):
    """Turn a URL into a flat string usable as a file name.

    Every occurrence of "https://" and "http://" is removed, then each of
    the characters ``/ ? : & = #`` is replaced by an underscore.  No file
    extension is added.
    """
    without_scheme = url.replace("https://", "").replace("http://", "")
    # One translation pass instead of a chain of single-character replaces.
    unsafe_to_underscore = str.maketrans({char: "_" for char in "/?:&=#"})
    return without_scheme.translate(unsafe_to_underscore)

def crawl_and_save(url):
    """Download *url*, save its ``<article>`` element as HTML and render it to PDF.

    The HTML fragment is written to ``data_html/<name>.html`` and the PDF to
    ``data_pdf/<name>.pdf``, where ``<name>`` comes from ``url_to_filename``.

    Returns:
        A ``(url, success)`` tuple.  ``success`` is False when the request
        fails or returns an HTTP error status, when the page has no
        ``<article>`` element, or when the PDF conversion fails.  Failures
        are printed, never raised, so the function is safe to run in a
        worker thread.
    """
    try:
        response = requests.get(url, timeout=20)
        # Without this, 4xx/5xx error pages would be parsed like real
        # content and misreported as "No <article> found".
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")
        content = soup.find("article")
        if content is None:
            print(f"[WARN] No <article> found for {url}")
            return url, False

        # Derive the file name once so both outputs are guaranteed to match.
        filename = url_to_filename(url)
        html_path = f"data_html/{filename}.html"
        pdf_path = f"data_pdf/{filename}.pdf"

        with open(html_path, "w", encoding="utf-8") as f:
            f.write(str(content))

        # Conversion errors get their own message so they can be told apart
        # from download/parsing errors in the output.
        try:
            HTML(html_path).write_pdf(pdf_path)
        except Exception as e:
            print(f"[ERROR] PDF convert failed for {url}: {e}")
            return url, False
        return url, True
    except Exception as e:
        print(f"[ERROR] Exception for {url}: {e}")
        return url, False

# Collect every non-blank line (one URL per line) from the link files.
all_links = []
for file in URL_files:
    with open(file, "r", encoding="utf-8") as f:
        all_links.extend([line.strip() for line in f if line.strip()])

# Fan the downloads out over a small thread pool; each entry of `results`
# is the (url, success) tuple returned by crawl_and_save.
results = []
with ThreadPoolExecutor(max_workers=6) as executor:
    futures = {executor.submit(crawl_and_save, url): url for url in all_links}
    # as_completed yields futures as they finish, so the bar advances once per finished URL.
    for future in tqdm(as_completed(futures), total=len(futures), desc="Crawling & Converting"):
        url, success = future.result()
        results.append((url, success))

Crawling & Converting:   3%|▎         | 3/114 [00:01<00:42,  2.62it/s]

[WARN] No <article> found for https://www.healthline.com/cancer-care/well-being
[WARN] No <article> found for https://www.healthline.com/cancer-care/screenings
[WARN] No <article> found for https://www.healthline.com/cancer-care/caregivers


Crawling & Converting:   5%|▌         | 6/114 [00:01<00:19,  5.60it/s]

[WARN] No <article> found for https://www.healthline.com/cancer-care/treatment
[WARN] No <article> found for https://www.healthline.com/cancer-care/diet
[WARN] No <article> found for https://www.healthline.com/cancer-care/chemotherapy


Crawling & Converting:   8%|▊         | 9/114 [00:02<00:28,  3.66it/s]

[WARN] No <article> found for https://www.healthline.com/fitness/cardio
[WARN] No <article> found for https://www.healthline.com/fitness/strength-training
[WARN] No <article> found for https://www.healthline.com/fitness/exercise


Crawling & Converting:  10%|▉         | 11/114 [00:02<00:21,  4.86it/s]

[WARN] No <article> found for https://www.healthline.com/fitness/holistic-fitness
[WARN] No <article> found for https://www.healthline.com/fitness/products


Crawling & Converting:  11%|█         | 12/114 [00:03<00:20,  4.98it/s]

[WARN] No <article> found for https://www.healthline.com/fitness/yoga


Crawling & Converting:  19%|█▉        | 22/114 [00:08<00:57,  1.59it/s]

[WARN] No <article> found for https://www.healthline.com/health/mental-health/how-to-reach-out


Crawling & Converting:  25%|██▌       | 29/114 [00:11<00:23,  3.61it/s]

[WARN] No <article> found for https://www.healthline.com/mental-health/addiction
[WARN] No <article> found for https://www.healthline.com/mental-health/bipolar
[WARN] No <article> found for https://www.healthline.com/mental-health/apps
[WARN] No <article> found for https://www.healthline.com/mental-health/adhd


Crawling & Converting:  30%|██▉       | 34/114 [00:11<00:12,  6.37it/s]

[WARN] No <article> found for https://www.healthline.com/mental-health/building-relationships
[WARN] No <article> found for https://www.healthline.com/mental-health/eating-disorders
[WARN] No <article> found for https://www.healthline.com/mental-health/crisis-support
[WARN] No <article> found for https://www.healthline.com/mental-health/mind-and-body
[WARN] No <article> found for https://www.healthline.com/mental-health/nutrition-supplements


Crawling & Converting:  33%|███▎      | 38/114 [00:12<00:10,  7.42it/s]

[WARN] No <article> found for https://www.healthline.com/mental-health/ocd
[WARN] No <article> found for https://www.healthline.com/mental-health/schizophrenia
[WARN] No <article> found for https://www.healthline.com/mental-health/treatment
[WARN] No <article> found for https://www.healthline.com/mental-health/sleep


Crawling & Converting:  43%|████▎     | 49/114 [00:17<00:29,  2.21it/s]

[WARN] No <article> found for https://www.healthline.com/nutrition/diets


Crawling & Converting:  48%|████▊     | 55/114 [00:21<00:27,  2.16it/s]

[WARN] No <article> found for https://www.healthline.com/nutrition/feel-good-food


Crawling & Converting:  51%|█████     | 58/114 [00:21<00:17,  3.21it/s]

[WARN] No <article> found for https://www.healthline.com/nutrition/food-and-nutrients


Crawling & Converting:  53%|█████▎    | 60/114 [00:22<00:21,  2.57it/s]

[WARN] No <article> found for https://www.healthline.com/nutrition/food-freedom


Crawling & Converting:  62%|██████▏   | 71/114 [00:25<00:12,  3.39it/s]

[WARN] No <article> found for https://www.healthline.com/nutrition/meal-kits


Crawling & Converting:  66%|██████▌   | 75/114 [00:25<00:07,  4.92it/s]

[WARN] No <article> found for https://www.healthline.com/nutrition/meal-kits/meal-kit-reviews


Crawling & Converting:  67%|██████▋   | 76/114 [00:26<00:10,  3.71it/s]

[WARN] No <article> found for https://www.healthline.com/nutrition/meal-kits/diets


Crawling & Converting:  68%|██████▊   | 78/114 [00:26<00:08,  4.33it/s]

[WARN] No <article> found for https://www.healthline.com/nutrition/meal-kits/grocery-delivery
[WARN] No <article> found for https://www.healthline.com/nutrition/meal-kits/prepared-meal-reviews
[WARN] No <article> found for https://www.healthline.com/nutrition/meal-kits/comparisons


Crawling & Converting:  72%|███████▏  | 82/114 [00:27<00:08,  3.97it/s]

[WARN] No <article> found for https://www.healthline.com/nutrition/nutrition-for-conditions


Crawling & Converting:  82%|████████▏ | 94/114 [00:32<00:07,  2.78it/s]

[WARN] No <article> found for https://www.healthline.com/nutrition/vitamins-and-supplements[WARN] No <article> found for https://www.healthline.com/nutrition/weight-management



Crawling & Converting:  86%|████████▌ | 98/114 [00:33<00:05,  3.07it/s]

[WARN] No <article> found for https://www.healthline.com/reviews/at-home-testing-products


Crawling & Converting:  90%|█████████ | 103/114 [00:34<00:01,  5.60it/s]

[WARN] No <article> found for https://www.healthline.com/reviews/at-home-tests
[WARN] No <article> found for https://www.healthline.com/reviews/at-home-tests/drugs-toxicology


Crawling & Converting:  91%|█████████ | 104/114 [00:34<00:01,  5.15it/s]

[WARN] No <article> found for https://www.healthline.com/reviews/at-home-tests/covid
[WARN] No <article> found for https://www.healthline.com/reviews/at-home-tests/nutrition-and-well-being


Crawling & Converting:  93%|█████████▎| 106/114 [00:35<00:01,  5.12it/s]

[WARN] No <article> found for https://www.healthline.com/reviews/at-home-tests/fertility-family-planning
[WARN] No <article> found for https://www.healthline.com/reviews/mental-health-services-and-products


Crawling & Converting:  96%|█████████▋| 110/114 [00:35<00:00,  7.30it/s]

[WARN] No <article> found for https://www.healthline.com/reviews/at-home-tests/std
[WARN] No <article> found for https://www.healthline.com/reviews/nutrition-products
[WARN] No <article> found for https://www.healthline.com/reviews/mens-health-products


Crawling & Converting: 100%|██████████| 114/114 [00:35<00:00,  3.18it/s]

[WARN] No <article> found for https://www.healthline.com/reviews/sleep-products
[WARN] No <article> found for https://www.healthline.com/reviews/vitamin-and-supplement-products
[WARN] No <article> found for https://www.healthline.com/reviews/womens-health-products





In [None]:
import asyncio
import nest_asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm.notebook import tqdm

nest_asyncio.apply()

# Healthline section slugs to scrape.  Each slug is appended to
# "https://www.healthline.com/" to form the start page and is also used in
# the output file name "healthline_<slug>_links.txt".
base_url_list = [
    "nutrition",
    "sleep",
    "mental-health",
    "fitness",
    "reviews",
    "cancer-care",
    "heart-health"
]

async def scrape_healthline_links(end, pbar=None):
    """Collect the links found under one Healthline section and save them.

    Opens ``https://www.healthline.com/<end>`` in headless Chromium and
    repeatedly clicks a "Load More" button (or scrolls to the bottom when
    there is none), harvesting every anchor whose href contains ``/<end>/``.
    The sorted links are written to ``healthline_<end>_links.txt``.

    Args:
        end: Section slug, e.g. ``"nutrition"``.
        pbar: Optional tqdm bar that is advanced by the number of newly
            discovered links after every pass.

    Returns:
        The set of absolute URLs that were collected.
    """
    base_url = f"https://www.healthline.com/{end}"
    link_selector = f"a[href*='/{end}/']"
    # Give up after this many consecutive "Load More" clicks that reveal no
    # new link; otherwise a button that never disappears loops forever.
    max_stalled_clicks = 3
    all_links = set()

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        # try/finally so the browser process is closed even when navigation
        # or a click raises.
        try:
            page = await browser.new_page()
            await page.goto(base_url, wait_until="networkidle")

            stalled_clicks = 0
            while True:
                content = await page.content()
                soup = BeautifulSoup(content, "html.parser")
                article_links = soup.select(link_selector)
                prev_count = len(all_links)
                for link in article_links:
                    href = link.get("href")
                    full_url = urljoin(base_url, href)
                    if f"/{end}/" in full_url and not full_url.endswith((".pdf", "/category/", "/tag/")):
                        all_links.add(full_url)
                added = len(all_links) - prev_count

                # Compare against None explicitly: bool() of a tqdm bar
                # created with total=0 is False, which silently disabled
                # every progress update.
                if pbar is not None:
                    pbar.set_postfix_str(f"{len(all_links)} links")
                    pbar.update(added)

                # Look for a "Load More" button.
                load_more = await page.query_selector("button[class*='load-more']")
                if load_more is not None:
                    stalled_clicks = 0 if added else stalled_clicks + 1
                    if stalled_clicks >= max_stalled_clicks:
                        break
                    await load_more.click()
                    await page.wait_for_timeout(2000)
                else:
                    # No button: scroll to the bottom and stop once the number
                    # of matching anchors no longer changes.
                    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                    await page.wait_for_timeout(2000)
                    new_content = await page.content()
                    new_soup = BeautifulSoup(new_content, "html.parser")
                    new_links = new_soup.select(link_selector)
                    if len(new_links) == len(article_links):
                        break
        finally:
            await browser.close()

    with open(f"healthline_{end}_links.txt", "w", encoding="utf-8") as f:
        for link in sorted(all_links):
            f.write(f"{link}\n")
    if pbar is not None:
        pbar.set_postfix_str(f"Done: {len(all_links)} links")
    return all_links

async def main_parallel():
    """Scrape every section in ``base_url_list`` concurrently.

    One progress bar is shown per section plus a main bar that advances
    each time a section finishes.

    Returns:
        A list with one set of links per section, in ``base_url_list`` order.
    """
    pbars = []
    with tqdm(total=len(base_url_list), desc="Chuyên mục", unit="cat") as main_pbar:

        async def scrape_category(end, pbar):
            # Tick the main bar as soon as this section is done instead of
            # jumping from 0 to 100% after all of them have finished.
            links = await scrape_healthline_links(end, pbar)
            main_pbar.update(1)
            return links

        tasks = []
        # enumerate gives the bar position directly (main bar is at 0).
        for position, end in enumerate(base_url_list, start=1):
            pbar = tqdm(total=0, desc=f"{end}", position=position, leave=False, unit="link")
            pbars.append(pbar)
            tasks.append(scrape_category(end, pbar))

        try:
            results = await asyncio.gather(*tasks)
        finally:
            # Close the per-section bars even if one scraper raised.
            for pbar in pbars:
                pbar.close()
    return results

# Run all section scrapers to completion.
# NOTE(review): presumably the nest_asyncio.apply() call earlier in this cell
# is what lets asyncio.run() work inside an already-running notebook loop — confirm.
asyncio.run(main_parallel())

In [None]:
import asyncio
import functools
import logging
import os
from pathlib import Path

import numpy as np
import torch
from groq import Groq
from lightrag import LightRAG, QueryParam
from lightrag.utils import EmbeddingFunc
from tika import parser
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# Configure logging: "LEVEL:message" lines at INFO and above.
logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)

# WORKING_DIR is handed to LightRAG as its working directory and is created
# here if missing; DATA_PDF_DIR is the folder the PDFs are read from.
WORKING_DIR = "./rag_project"
DATA_PDF_DIR = "./data_pdf"
os.makedirs(WORKING_DIR, exist_ok=True)

# Local bge-m3 embedding function.
@functools.lru_cache(maxsize=1)
def _load_bge_m3(model_name="BAAI/bge-m3"):
    """Load the tokenizer and model once and keep them for later calls."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.eval()
    return tokenizer, model


def bge_m3_embedding(texts):
    """Embed *texts* with the local BAAI/bge-m3 model.

    Args:
        texts: A string or list of strings; inputs longer than 512 tokens
            are truncated.

    Returns:
        A NumPy array with one row per input text, obtained by averaging
        the last hidden states over the real (non-padding) tokens.
    """
    # Reloading the tokenizer and model on every call was the dominant cost;
    # the cached loader does it only once per process.
    tokenizer, model = _load_bge_m3()

    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state

    # Weight by the attention mask so padding positions do not dilute the
    # mean; a plain .mean(dim=1) makes a text's vector depend on the length
    # of the other texts in the same batch.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    embeddings = (hidden * mask).sum(dim=1) / token_counts
    # NOTE(review): the bge-m3 model card appears to use the normalised
    # [CLS] vector for dense retrieval rather than mean pooling — confirm
    # before changing, since that would invalidate stored embeddings.
    return embeddings.numpy()

# Groq LLM call.
async def groq_llm_func(prompt, system_prompt=None, history_messages=None, **kwargs):
    """Send a chat completion request to Groq and return the answer text.

    Args:
        prompt: The user message.
        system_prompt: Optional system message placed first.
        history_messages: Optional list of earlier chat messages inserted
            between the system prompt and the user message.
        **kwargs: Accepted for compatibility with the caller; ignored.

    Returns:
        The content of the first completion choice, or ``""`` when the
        request fails (the error is logged).

    Raises:
        RuntimeError: If the ``GROQ_API_KEY`` environment variable is not set.
    """
    # The key must come from the environment, never from source control.
    # NOTE(review): the key that used to be hard-coded here should be treated
    # as leaked and revoked.
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise RuntimeError("Thiếu biến môi trường GROQ_API_KEY.")
    client = Groq(api_key=api_key)

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    if history_messages:
        messages.extend(history_messages)
    messages.append({"role": "user", "content": prompt})

    def _request():
        return client.chat.completions.create(
            model="meta-llama/llama-4-scout-17b-16e-instruct",
            messages=messages,
            temperature=1,
            max_completion_tokens=4080,
            top_p=1,
            stream=False
        )

    try:
        # The Groq client call is blocking; run it in a worker thread so the
        # event loop stays free for other coroutines.
        loop = asyncio.get_running_loop()
        completion = await loop.run_in_executor(None, _request)
        return completion.choices[0].message.content
    except Exception as e:
        logging.error(f"Lỗi khi gọi Groq LLM: {e}")
        return ""

# Extract text from PDFs with Tika.
def extract_pdf_texts(pdf_dir):
    """Extract the text of every ``*.pdf`` file directly inside *pdf_dir*.

    Args:
        pdf_dir: Directory to scan (not recursive).

    Returns:
        ``(documents, sources)``: two parallel lists holding the extracted
        text and the file name without extension.  Files that yield no text
        or fail to parse are logged and skipped.
    """
    documents = []
    sources = []
    # Sorted so the processing order is the same on every run.
    pdf_files = sorted(Path(pdf_dir).glob("*.pdf"))
    total_files = len(pdf_files)

    if total_files == 0:
        logging.warning(f"Không tìm thấy file PDF nào trong thư mục {pdf_dir}")
        return documents, sources

    logging.info(f"Bắt đầu trích xuất {total_files} file PDF...")
    for pdf_file in tqdm(pdf_files, desc="Trích xuất PDF", unit="file"):
        try:
            parsed = parser.from_file(str(pdf_file), requestOptions={'timeout': 300})
            # "content" can be present but None (e.g. image-only PDFs);
            # `or ""` keeps that on the warning path below instead of raising
            # AttributeError on .strip().
            text = (parsed.get("content") or "").strip()
            if not text:
                logging.warning(f"Tệp {pdf_file.name} không chứa nội dung văn bản hoặc có thể là PDF quét.")
                continue
            source = pdf_file.stem  # file name without its extension
            documents.append(text)
            sources.append(source)
            logging.info(f"Đã xử lý: {pdf_file.name}")
        except Exception as e:
            logging.error(f"Lỗi khi xử lý {pdf_file}: {e}")
    return documents, sources

# Build an answer with citations.
def format_response_with_citation(response, sources):
    """Append every distinct source to *response* as `` (source)``.

    Args:
        response: The answer text.
        sources: Iterable of source names; duplicates are cited once.

    Returns:
        *response* followed by one `` (name)`` suffix per distinct source,
        in order of first appearance.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order; iterating
    # a set made the citation order change from run to run.
    unique_sources = dict.fromkeys(sources)
    return response + "".join(f" ({source})" for source in unique_sources)

# Initialise LightRAG.
async def initialize_rag():
    """Create the LightRAG instance and initialise its storages.

    Returns:
        The ready-to-use ``LightRAG`` object, or ``None`` when construction
        or storage initialisation fails (the error is logged).
    """

    async def embed(texts):
        # LightRAG's EmbeddingFunc awaits the wrapped function, so it needs
        # a coroutine; handing it the synchronous bge_m3_embedding directly
        # fails when the returned array is awaited.
        return bge_m3_embedding(texts)

    logging.info("Khởi tạo LightRAG...")
    try:
        rag = LightRAG(
            working_dir=WORKING_DIR,
            llm_model_func=groq_llm_func,
            embedding_func=EmbeddingFunc(
                embedding_dim=1024,  # embedding size of bge-m3
                max_token_size=512,
                func=embed
            )
        )
        await rag.initialize_storages()
        logging.info("LightRAG đã được khởi tạo.")
        return rag
    except Exception as e:
        logging.error(f"Lỗi khi khởi tạo LightRAG: {e}")
        return None

# Main routine.
async def main():
    """Build the RAG index from the PDFs and run a sample query in every mode.

    Steps: initialise LightRAG, extract text from ``DATA_PDF_DIR``, insert
    each document (tagged with its source name), then query once per mode
    and print the answer with citations.  Returns early, after logging,
    when initialisation fails or no document could be extracted.
    """
    # Initialise RAG.
    rag = await initialize_rag()
    if rag is None:
        logging.error("Không thể khởi tạo LightRAG. Thoát chương trình.")
        return

    # Extract text from the PDFs.
    logging.info("Bắt đầu trích xuất văn bản từ PDF...")
    documents, sources = extract_pdf_texts(DATA_PDF_DIR)
    if not documents:
        logging.error("Không có tài liệu nào được trích xuất. Thoát chương trình.")
        return

    # Insert the documents into LightRAG, with progress.
    logging.info("Bắt đầu chèn tài liệu vào LightRAG...")
    for doc, source in tqdm(zip(documents, sources), total=len(documents), desc="Chèn tài liệu", unit="document"):
        try:
            # Use the async API: the synchronous insert() drives its own
            # event loop and cannot be called from inside this coroutine.
            await rag.ainsert(f"{doc}\nSource: {source}")
        except Exception as e:
            logging.error(f"Lỗi khi chèn tài liệu {source}: {e}")
    logging.info("Hoàn tất chèn tài liệu.")

    # Sample query.
    query = "How to cope with anxiety?"
    modes = ["naive", "local", "global", "hybrid"]

    logging.info("Bắt đầu truy vấn...")
    for mode in tqdm(modes, desc="Truy vấn các chế độ", unit="mode"):
        logging.info(f"Đang truy vấn với chế độ {mode}...")
        try:
            # aquery() is the awaitable counterpart of query(); awaiting the
            # synchronous query() result raises TypeError.
            result = await rag.aquery(query, param=QueryParam(mode=mode))
            formatted_result = format_response_with_citation(result, sources)
            print(f"\nKết quả với chế độ {mode}:\n{formatted_result}")
        except Exception as e:
            logging.error(f"Lỗi khi truy vấn với chế độ {mode}: {e}")
    logging.info("Hoàn tất truy vấn.")

# Run the program when this code is executed as a script (not imported).
if __name__ == "__main__":
    asyncio.run(main())

ModuleNotFoundError: No module named 'textract'