In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
from concurrent.futures import ThreadPoolExecutor
from time import sleep
import pandas as pd
import os

def init_driver():
    """Start a Chrome browser session and return its WebDriver.

    The chromedriver binary is resolved through ``ChromeDriverManager`` and
    the window is maximized before the driver is handed back. Each worker
    thread is expected to call this to get its own browser.
    """
    chromedriver_path = ChromeDriverManager().install()
    browser = webdriver.Chrome(service=ChromeService(chromedriver_path))
    browser.maximize_window()
    return browser

def scroll_page(driver, scroll_count=10, pause=1.5, offset=25):
    """Scroll the page down in equal steps so that more content can load.

    The page height is read once up front; the distance ``height - offset``
    is then divided into ``scroll_count`` equal steps, and the window is
    scrolled to each step in turn with a ``pause`` between them.

    Args:
        driver: Object exposing ``execute_script(script)`` (a Selenium
            WebDriver in this file).
        scroll_count: Number of scroll steps. Values below 1 make the call
            a no-op instead of dividing by zero.
        pause: Seconds to sleep after every step.
        offset: Number of pixels to stay short of the measured page height.
    """
    if scroll_count <= 0:
        # Nothing to do; also avoids ZeroDivisionError in the step size.
        return

    total_height = driver.execute_script("return document.body.scrollHeight")
    scroll_step = (total_height - offset) / scroll_count
    for i in range(1, scroll_count + 1):
        driver.execute_script(f"window.scrollTo(0, {i * scroll_step});")
        sleep(pause)

def load_bookmark(bookmark_file="bookmark.xlsx"):
    """Return the set of already-scraped links stored in the bookmark workbook.

    An empty set is returned when ``bookmark_file`` does not exist. Otherwise
    the workbook is read with pandas and the values of its ``"Link"`` column
    are returned as a set.
    """
    if not os.path.exists(bookmark_file):
        return set()
    link_column = pd.read_excel(bookmark_file)["Link"]
    return set(link_column.tolist())

def save_bookmark(collected_links, bookmark_file="bookmark.xlsx"):
    """Write the scraped links to the bookmark workbook, replacing its content.

    The links are stored in a single ``"Link"`` column and the DataFrame index
    is not written.
    """
    pd.DataFrame({"Link": list(collected_links)}).to_excel(
        bookmark_file, index=False
    )

def append_to_excel(new_data, file_path):
    """Append ``new_data`` to the workbook at ``file_path`` without losing old rows.

    When the file does not exist yet, ``new_data`` is written as-is. When it
    does, the existing rows are read back, ``new_data`` is concatenated after
    them with a fresh index, and the combined frame overwrites the file.
    """
    if not os.path.exists(file_path):
        # First write: there is nothing to merge with.
        new_data.to_excel(file_path, index=False)
        return

    existing_rows = pd.read_excel(file_path)
    merged = pd.concat([existing_rows, new_data], ignore_index=True)
    merged.to_excel(file_path, index=False)


def collect_titles_and_links(driver, limit=100, bookmark_file="bookmark.xlsx",
                             max_idle_rounds=5):
    """Collect listing-level data for articles that are not bookmarked yet.

    The bookmark file is loaded first; the function then repeatedly scans the
    ``article.item-news`` elements on the current page, records every article
    whose link is not in the bookmark set, and scrolls to load more, until the
    bookmark set holds ``limit`` links or no progress is being made.

    Args:
        driver: Selenium WebDriver already positioned on the listing page.
        limit: Target size of the bookmark set (previously bookmarked links
            count toward it).
        bookmark_file: Excel file holding the links scraped so far.
        max_idle_rounds: Number of consecutive scan passes that may find no
            new article before the function gives up. Prevents an endless
            loop when the page stops yielding new articles.

    Returns:
        A list of dicts with the keys ``"Title"``, ``"Link"``, ``"Author"``,
        ``"Author Link"``, ``"Category"`` and ``"Comments Count"`` for the
        articles discovered during this call only.
    """
    collected_data = []
    collected_links = load_bookmark(bookmark_file)
    print(f"Đã cào {len(collected_links)} bài. Cào thêm đến {limit} bài.")

    idle_rounds = 0
    while len(collected_links) < limit and idle_rounds < max_idle_rounds:
        articles = driver.find_elements(By.CSS_SELECTOR, "article.item-news")
        found_new = False

        for article in articles:
            if len(collected_links) >= limit:
                break
            try:
                title_element = article.find_element(By.CSS_SELECTOR, "h3.title-news a")
                link = title_element.get_attribute("href")
                if not link or link in collected_links:
                    continue
                author_element = article.find_element(
                    By.CSS_SELECTOR, "p.meta-news a.name-author")
                record = {
                    "Title": title_element.text,
                    "Link": link,
                    "Author": author_element.text,
                    "Author Link": author_element.get_attribute("href"),
                    "Category": article.find_element(
                        By.CSS_SELECTOR, "p.meta-news a.cat").text,
                    "Comments Count": article.find_element(
                        By.CSS_SELECTOR,
                        "p.meta-news a.count_cmt span.font_icon").text,
                }
            except NoSuchElementException:
                # Incomplete listing entry: skip it WITHOUT bookmarking, so it
                # is not recorded as scraped while no data was kept for it.
                continue

            # Bookmark only after every field was read successfully.
            collected_links.add(link)
            collected_data.append(record)
            found_new = True
            print(f"{len(collected_links)}/{limit}")  # progress output

        if found_new:
            idle_rounds = 0
            # One workbook rewrite per pass instead of one per article.
            save_bookmark(collected_links, bookmark_file)
        else:
            idle_rounds += 1

        if len(collected_links) < limit:
            # Scroll further when the page showed no articles at all.
            scroll_page(driver, scroll_count=5 if articles else 10)

    return collected_data


def _expand_all_comments(driver, max_clicks):
    """Click the "view more comments" link until it is gone or ``max_clicks`` is hit."""
    for _ in range(max_clicks):
        try:
            view_more_button = driver.find_element(
                By.CSS_SELECTOR, "div.view_more_coment a#show_more_coment")
        except NoSuchElementException:
            return
        # JavaScript click, as in the original, instead of a native click.
        driver.execute_script("arguments[0].click();", view_more_button)
        sleep(2)
        driver.execute_script("window.scrollBy(0, 500);")
        sleep(1)


def _parse_comment(driver, comment):
    """Build the dict for one comment element.

    Raises NoSuchElementException when a required field is missing. Replies
    are optional: a missing reply button or reply nickname just ends reply
    collection for this comment.
    """
    nickname_element = comment.find_element(By.CSS_SELECTOR, "span.txt-name a.nickname")
    parsed = {
        "Nickname": nickname_element.text,
        "Nickname Link": nickname_element.get_attribute("href"),
        "Comment": comment.find_element(By.CSS_SELECTOR, "p.full_content").text.strip(),
        "Likes": comment.find_element(By.CSS_SELECTOR, "div.reactions-total a.number").text,
        "Comment Time": comment.find_element(By.CSS_SELECTOR, "span.time-com").text,
    }

    reply_nicknames, reply_links = [], []
    try:
        reply_button = comment.find_element(By.CSS_SELECTOR, "a.view_all_reply")
        driver.execute_script("arguments[0].click();", reply_button)
        sleep(1)
        for reply in comment.find_elements(By.CSS_SELECTOR, "div.sub_comment div.comment_item"):
            reply_author = reply.find_element(By.CSS_SELECTOR, "span.txt-name a.nickname")
            reply_nicknames.append(reply_author.text)
            reply_links.append(reply_author.get_attribute("href"))
    except NoSuchElementException:
        pass

    parsed["Reply Nicknames"] = reply_nicknames
    parsed["Reply Links"] = reply_links
    return parsed


def extract_detail_info(driver, link, max_expand_clicks=200):
    """Open an article page and collect its details and comments.

    Args:
        driver: Selenium WebDriver used to load the page.
        link: URL of the article.
        max_expand_clicks: Upper bound on how many times the "view more
            comments" link is clicked, so a link that never leaves the DOM
            cannot keep the function looping forever.

    Returns:
        A dict with the keys ``"Date"``, ``"Detailed Title"``,
        ``"Description"``, ``"Content"``, ``"Comments Count"`` (number of
        comments actually parsed) and ``"Comments"`` (list of comment dicts).
        An empty dict is returned when the date, title or description element
        is missing from the page.
    """
    driver.get(link)
    sleep(2)
    try:
        date = driver.find_element(By.CSS_SELECTOR, "span.date").text
        title_detail = driver.find_element(By.CSS_SELECTOR, "h1.title-detail").text
        description = driver.find_element(By.CSS_SELECTOR, "p.description").text
    except NoSuchElementException:
        return {}

    paragraphs = driver.find_elements(By.CSS_SELECTOR, "article.fck_detail p")
    content = "\n".join(p.text for p in paragraphs)

    # Scroll through the page so that the comment section gets loaded.
    scroll_page(driver, scroll_count=10, pause=1.5, offset=50)
    _expand_all_comments(driver, max_expand_clicks)

    comments_data = []
    for comment in driver.find_elements(By.CSS_SELECTOR, "div.comment_item"):
        try:
            comments_data.append(_parse_comment(driver, comment))
        except NoSuchElementException:
            # Comment element lacking a required field: skip it.
            continue

    return {
        "Date": date,
        "Detailed Title": title_detail,
        "Description": description,
        "Content": content,
        "Comments Count": len(comments_data),
        "Comments": comments_data
    }


def process_article(article, idx, total):
    """Fetch the detail data for one article in its own browser session.

    A fresh driver is started for the call and always closed afterwards. The
    detail dict is merged into ``article`` in place, and the same dict is
    returned.

    Args:
        article: Dict with at least a ``"Link"`` key; it is mutated.
        idx: Zero-based position of the article, used for progress output.
        total: Total number of articles, used for progress output.

    Returns:
        ``article``, updated with the detail fields when extraction worked
        and unchanged when the browser raised an error.
    """
    # Local import: only this function needs the base Selenium error class.
    from selenium.common.exceptions import WebDriverException

    print(f"Processing {idx + 1}/{total}: {article['Link']}")
    driver = init_driver()
    try:
        details = extract_detail_info(driver, article["Link"])
        # NOTE(review): the detail dict also carries "Comments Count", so this
        # overwrites the count taken from the listing page -- confirm intended.
        article.update(details)
    except WebDriverException as exc:
        # One failing page (timeout, crashed tab, ...) must not abort the
        # whole batch; keep the listing-level data and report the failure.
        print(f"Failed {idx + 1}/{total}: {article['Link']} ({type(exc).__name__})")
    finally:
        driver.quit()
    return article


def main(
    start_url="https://vnexpress.net/goc-nhin",
    limit=3265,
    bookmark_file="bookmark.xlsx",
    output_path="D:/du lieu o cu/HUTECH Courses/Social Networking Course/SocialNetworkingProject/Project của Đạt/vnexpress_articles_4.xlsx",
    max_workers=5,
):
    """Run the scrape: collect links, fetch details in parallel, save to Excel.

    Args:
        start_url: Listing page to collect article links from.
        limit: Target size of the bookmark set passed to the link collector.
        bookmark_file: Excel file that tracks already-scraped links.
        output_path: Excel file the article rows are appended to.
        max_workers: Number of worker threads (one browser per article call).
    """
    # Step 1: collect titles and links with a single browser, which is closed
    # even when loading or collecting fails.
    driver = init_driver()
    try:
        driver.get(start_url)
        articles = collect_titles_and_links(
            driver, limit=limit, bookmark_file=bookmark_file)
    finally:
        driver.quit()

    # Step 2: fetch the detail data with a thread pool. ``map`` receives one
    # iterable per positional argument of ``process_article``.
    total_articles = len(articles)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(
            process_article,
            articles,
            range(total_articles),
            [total_articles] * total_articles,
        ))

    # Step 3: append the rows to the output workbook.
    df = pd.DataFrame(results)
    print(df)
    append_to_excel(df, output_path)


if __name__ == "__main__":
    main()


Đã cào 1000 bài. Cào thêm đến 3265 bài.
1001/3265
1002/3265
1003/3265
1004/3265
1005/3265
1006/3265
1007/3265
1008/3265
1009/3265
1010/3265
1011/3265
1012/3265
1013/3265
1014/3265
1015/3265
1016/3265
1017/3265
1018/3265
1019/3265
1020/3265
1021/3265
1022/3265
1023/3265
1024/3265
1025/3265
1026/3265
1027/3265
1028/3265
1029/3265
1030/3265
1031/3265
1032/3265
1033/3265
1034/3265
1035/3265
1036/3265
1037/3265
1038/3265
1039/3265
1040/3265
1041/3265
1042/3265
1043/3265
1044/3265
1045/3265
1046/3265
1047/3265
1048/3265
1049/3265
1050/3265
1051/3265
1052/3265
1053/3265
1054/3265
1055/3265
1056/3265
1057/3265
1058/3265
1059/3265
1060/3265
1061/3265
1062/3265
1063/3265
1064/3265
1065/3265
1066/3265
1067/3265
1068/3265
1069/3265
1070/3265
1071/3265
1072/3265
1073/3265
1074/3265
1075/3265
1076/3265
1077/3265
1078/3265
1079/3265
1080/3265
1081/3265
1082/3265
1083/3265
1084/3265
1085/3265
1086/3265
1087/3265
1088/3265
1089/3265
1090/3265
1091/3265
1092/3265
1093/3265
1094/3265
1095/3265
1096/3265
