In [1]:
import json
import os
import re

import requests
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# Chrome is started headless with the GPU and the sandbox disabled.
options = Options()
for chrome_flag in ('--headless=new', '--disable-gpu', '--no-sandbox'):
    options.add_argument(chrome_flag)

# Launch the browser and open the listing page that the later cells scrape.
driver = webdriver.Chrome(options=options)
driver.get("https://tuoitre.vn/giai-tri.htm")

# Notebook-wide explicit wait with a 10 second timeout.
wait = WebDriverWait(driver, 10)

# Upper bound on the number of articles processed by the scraping loop.
k = 5

In [3]:
def scroll_to_bottom(driver, timeout=10):
    """Scroll the page down repeatedly until its height stops growing.

    After every scroll the function waits for ``document.body.scrollHeight``
    to exceed the previously observed height. When that does not happen
    within ``timeout`` seconds the bottom is considered reached.

    Args:
        driver: Selenium WebDriver whose current page is scrolled.
        timeout: Seconds to wait for the page to grow after each scroll.
    """
    height_script = "return document.body.scrollHeight"
    # Bind the wait to the driver that was passed in rather than relying on a
    # notebook-level `wait` object that other cells may rebind.
    height_wait = WebDriverWait(driver, timeout)

    last_height = driver.execute_script(height_script)
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        try:
            height_wait.until(lambda d: d.execute_script(height_script) > last_height)
        except TimeoutException:
            # The page did not grow any further: nothing left to scroll to.
            break
        last_height = driver.execute_script(height_script)

In [4]:
def click_load_more(driver, num_clicks, timeout=10):
    """Click the "box-viewmore" button up to ``num_clicks`` times.

    Before each click the page is scrolled to the bottom so that the button
    is in view. The loop stops early as soon as the button cannot be clicked.

    Args:
        driver: Selenium WebDriver showing the listing page.
        num_clicks: Maximum number of times the button is clicked.
        timeout: Seconds to wait for the button to become clickable.
    """
    # Local wait bound to `driver`, independent of any notebook-level `wait`.
    button_wait = WebDriverWait(driver, timeout)
    for _ in range(num_clicks):
        try:
            scroll_to_bottom(driver)
            load_more_button = button_wait.until(
                EC.element_to_be_clickable((By.CLASS_NAME, "box-viewmore"))
            )
            load_more_button.click()
        except TimeoutException:
            # No clickable button appeared in time: treat as "nothing more to load".
            break
        except Exception as e:
            # Still best-effort, but report the reason instead of hiding it.
            print(f"Stopped loading more items: {e}")
            break

In [5]:
def extract_date(date):
    """Return the first whitespace-delimited token of ``date``.

    For a timestamp such as ``"26/10/2024 06:46 GMT+7"`` this is the date
    part ``"26/10/2024"``. Any run of whitespace (spaces, tabs, newlines)
    counts as a separator and leading whitespace is ignored. An empty or
    whitespace-only string yields ``""``.

    Args:
        date: Timestamp text to take the leading token from.

    Returns:
        The first token, or ``""`` when there is none.
    """
    tokens = date.split()
    return tokens[0] if tokens else ""

In [6]:
# Expand the listing: scroll down and click the "box-viewmore" button up to 2 times.
click_load_more(driver, 2)

In [15]:
# Icon class names of the article-level reaction counters, in output order.
_VOTE_ICONS = ("icostar", "icolikeauthor", "icoheartauthor")


def _download_file(url, dest_path, timeout=10):
    """Stream ``url`` into ``dest_path`` and return the HTTP status code.

    The file (and its parent directory) is only created when the server
    answers 200. The response is used as a context manager so the streamed
    connection is released even if writing fails.
    """
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code == 200:
            os.makedirs(os.path.dirname(dest_path) or '.', exist_ok=True)
            with open(dest_path, 'wb') as f:
                for chunk in response.iter_content(1024):
                    if chunk:
                        f.write(chunk)
        return response.status_code


def _extract_article_votes(driver, cnt):
    """Read the article's reaction counters; every missing counter stays "0"."""
    votes = {icon: "0" for icon in _VOTE_ICONS}
    try:
        react_info = driver.find_element(By.CLASS_NAME, "reactinfo")
    except Exception as e:
        print(f"Không thể trích xuất votes cho bài viết {cnt}: {e}")
        return votes

    for icon in _VOTE_ICONS:
        try:
            counter = react_info.find_element(By.CSS_SELECTOR, f"i.{icon} + span")
            votes[icon] = counter.text.strip()
        except Exception:
            # This counter is not on the page: keep the "0" default.
            pass
    return votes


def _extract_comment(comment_item):
    """Build the dict describing one comment <li>, including its reactions."""
    comment = {
        'commentId': comment_item.get_attribute('data-cmid'),
        'author': comment_item.get_attribute('data-replyname'),
        'text': comment_item.find_element(By.CLASS_NAME, 'contentcomment').text,
        'date': comment_item.find_element(By.CLASS_NAME, 'timeago').get_attribute('title'),
        'votes': [],
    }

    try:
        # Walk down to the list of reaction columns of this comment.
        btnright_hasreaction = comment_item.find_element(By.CSS_SELECTOR, 'div.btnright.hasreaction')
        wrapreact = btnright_hasreaction.find_element(By.CSS_SELECTOR, 'div.wrapreact')
        listreact = wrapreact.find_element(By.CSS_SELECTOR, 'div.listreact')

        for colreact in listreact.find_elements(By.CSS_SELECTOR, 'div.colreact'):
            # The icon's class string (e.g. 'spritecmt icolikereact') is the key.
            span_icon = colreact.find_element(By.CSS_SELECTOR, 'span[class^="spritecmt"]')
            reaction_class = span_icon.get_attribute('class').strip()
            # The count is read from the element's textContent.
            num = colreact.find_element(By.CLASS_NAME, 'num').get_attribute('textContent').strip()
            print(f"Reaction Class: {reaction_class}, Count: {num}")
            comment['votes'].append({reaction_class: num})
    except Exception as e:
        print(f"Error extracting votes for comment {comment['commentId']}: {e}")
        # Discard any partially collected reactions.
        comment['votes'] = []

    return comment


def _scrape_article(driver, box, cnt, comment_timeout=20):
    """Scrape one listing entry and its article page into a dict.

    Reads title, link, category and summary from ``box`` on the listing page,
    opens the article in a new tab (which is left open and focused for the
    caller to close) and collects date, author, images, audio, votes and
    comments. Images go to ``images/<postId>/`` and audio to ``audio/``.

    Any failure that is not handled locally propagates to the caller. In
    particular a missing ``<audio>`` element, or a comment list that does not
    appear within ``comment_timeout`` seconds, raises.
    """
    # Listing-page fields must be read before the driver leaves this tab.
    box_category = box.find_element(By.XPATH, './div[@class="box-category-content"]/a')
    box_category_sapo = box.find_element(By.XPATH, './div[@class="box-category-content"]/p')
    link_and_avatar_box = box.find_element(By.CLASS_NAME, "box-category-link-title")

    title = link_and_avatar_box.get_attribute("title")
    href = link_and_avatar_box.get_attribute("href")
    category = box_category.get_attribute("title")
    content = box_category_sapo.text

    # Open the article in a new tab.
    driver.execute_script("window.open('');")
    driver.switch_to.window(driver.window_handles[-1])
    driver.get(href)

    author_info = driver.find_element(By.CLASS_NAME, "author-info")
    author_name = author_info.find_element(By.CLASS_NAME, "name").text
    date = driver.find_element(By.CLASS_NAME, "detail-time").text

    post_id = f'{cnt:03d}'
    data = {
        'postId': post_id,
        'title': title,
        'link': href,
        'category': category,
        'date': extract_date(date),
        'author': author_name,
        'content': content,
        'images': [],
    }

    # Images: every <img> inside a <figure> of the article body.
    content_div = driver.find_element(
        By.CSS_SELECTOR,
        'div.detail-content.afcbc-body[data-role="content"][itemprop="articleBody"]',
    )
    image_counter = 1
    for figure in content_div.find_elements(By.TAG_NAME, 'figure'):
        for img in figure.find_elements(By.TAG_NAME, 'img'):
            img_url = img.get_attribute('src')
            data['images'].append(img_url)
            if not img_url:
                continue
            image_path = os.path.join(f'images/{post_id}', f'image{image_counter}.jpg')
            try:
                # The counter only advances for images that were actually saved.
                if _download_file(img_url, image_path) == 200:
                    image_counter += 1
            except Exception as e:
                print(f"Error downloading image {img_url}: {e}")

    # Audio: the first <audio> element on the page.
    audio_url = driver.find_element(By.TAG_NAME, 'audio').get_attribute('src')
    data['audio_link'] = audio_url
    audio_path = os.path.join('audio', f'{post_id}.mp3')
    try:
        status_code = _download_file(audio_url, audio_path)
        if status_code == 200:
            print(f"Audio has been successfully downloaded: {audio_path}")
        else:
            print(f"Failed to download audio. Status code: {status_code}")
    except Exception as e:
        print(f"An error occurred: {e}")

    data['votes'] = _extract_article_votes(driver, cnt)

    # Comments: wait for the list to be present, then read every item.
    # A local wait is used so the notebook-level `wait` is left untouched.
    comment_wait = WebDriverWait(driver, comment_timeout)
    comment_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'ul[data-view="listcm"]')))
    comment_items = driver.find_elements(
        By.XPATH, '//ul[@data-view="listcm"]/li[contains(@class, "item-comment")]'
    )
    data['comments'] = [_extract_comment(comment_item) for comment_item in comment_items]

    return data


news_list = driver.find_element(By.ID, "load-list-news")
box_items = news_list.find_elements(By.CLASS_NAME, "box-category-item")

# Remember the listing tab up front so the cleanup below can always return to
# it, even when an article fails before its own tab has been opened.
current_window = driver.current_window_handle

cnt = 1

for box in box_items:
    if cnt > int(k):
        break

    try:
        data = _scrape_article(driver, box, cnt)

        # Save the data to a JSON file, creating the output directory if needed.
        postID = data['postId']
        os.makedirs('data', exist_ok=True)
        with open(f'data/{postID}.json', 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, ensure_ascii=False, indent=4)

        print(data)

        # Only successfully saved articles consume a post id.
        cnt += 1

    except Exception as e:
        print(f"Error processing box {cnt}: {e}")
    finally:
        # Close every tab except the listing tab, then focus the listing tab.
        # Closing by handle means the listing tab itself is never closed,
        # whichever tab happened to be focused when the failure occurred.
        for handle in driver.window_handles:
            if handle != current_window:
                driver.switch_to.window(handle)
                driver.close()
        driver.switch_to.window(current_window)

Audio has been successfully downloaded: audio/001.mp3
{'postId': '001', 'title': 'Nhạc Phú Quang đâu chỉ màu buồn', 'link': 'https://tuoitre.vn/nhac-phu-quang-dau-chi-mau-buon-20241026064630969.htm', 'category': 'Giải trí', 'date': '26/10/2024', 'author': 'THIÊN ĐIỂU', 'content': 'Nghe Hồ Quỳnh Hương, Khánh Linh, Hoàng Hải, Olpus hát Phú Quang trong đêm nhạc ‘Tình yêu ở lại’ tại khu vườn của Nhà hát lớn Hà Nội.', 'images': ['https://cdn.tuoitre.vn/thumb_w/730/471584752817336320/2024/10/26/base64-17298997667731748189732.jpeg', 'https://cdn.tuoitre.vn/thumb_w/730/471584752817336320/2024/10/26/base64-1729899766816765267379.jpeg', 'https://cdn.tuoitre.vn/thumb_w/730/471584752817336320/2024/10/26/base64-1729899766859620236085.jpeg', 'https://cdn.tuoitre.vn/thumb_w/730/471584752817336320/2024/10/26/base64-17298997669001224270323.jpeg', 'https://cdn.tuoitre.vn/thumb_w/730/471584752817336320/2024/10/26/base64-1729899766939140645658.jpeg'], 'audio_link': 'https://tts.mediacdn.vn/2024/10/26/tuoi