In [1]:
!pip install selenium webdriver-manager

Collecting selenium
  Downloading selenium-4.38.0-py3-none-any.whl.metadata (7.5 kB)
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio<1.0,>=0.31.0 (from selenium)
  Downloading trio-0.31.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket<1.0,>=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting sortedcontainers (from trio<1.0,>=0.31.0->selenium)
  Using cached sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio<1.0,>=0.31.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket<1.0,>=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pysocks!=1.5.7,<2.0,>=1.5.6 (from urllib3[socks]<3.0,>=2.5.0->selenium)
  Downloading PySocks-1.7.1-py3-none-any.whl.metadata (13 kB)
Collecting python-dotenv (from web

In [2]:
import concurrent.futures
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from urllib.parse import urljoin, urlparse


In [7]:

def get_links_from_url(url, max_links=50, wait_seconds=5):
    """
    Open a page in headless Chrome and collect the absolute http(s) links on it.

    Args:
        url (str): Page to open.
        max_links (int): Maximum number of unique links to return.
        wait_seconds (float): Pause after navigation, before the anchors are
            read, to give dynamically rendered content time to appear.

    Returns:
        list[str]: At most ``max_links`` unique absolute URLs, in the order in
        which their anchors were returned by the element lookup. An empty list
        is returned when any step fails; the error is printed, not raised.
    """
    print(f"Getting links from {url}...")
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = None  # stays None if the browser never starts, so cleanup is skipped
    try:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=options)
        driver.get(url)
        time.sleep(wait_seconds)  # Wait for dynamic content to load

        # A dict de-duplicates like a set but remembers insertion order, so the
        # returned list is the same on every run instead of following the
        # arbitrary iteration order of a set.
        links = {}
        for element in driver.find_elements(By.TAG_NAME, 'a'):
            if len(links) >= max_links:
                break
            href = element.get_attribute('href')
            if not href:
                continue
            # Resolve relative hrefs against the requested page.
            absolute_url = urljoin(url, href)
            parsed_url = urlparse(absolute_url)
            # Keep web links only; this drops mailto:, javascript:, etc.
            if parsed_url.scheme in ('http', 'https') and parsed_url.netloc:
                links[absolute_url] = None
        return list(links)
    except Exception as e:
        print(f"Error while getting links from {url}: {e}")
        return []
    finally:
        # Always shut the browser down, even when the page failed to load.
        if driver:
            driver.quit()

def crawl_website(url, wait_seconds=3):
    """
    Load one page in headless Chrome and return the text of its <body>.

    Every call starts and quits its own WebDriver, so nothing is shared
    between concurrent calls.

    Args:
        url (str): Page to load.
        wait_seconds (float): Pause after navigation, before the body text is
            read, to give dynamically rendered content time to appear.

    Returns:
        str: ``"--- Content from <url> ---"`` followed by the body text on
        success. On failure the error is not raised; instead a
        ``"--- Could not crawl <url> ---"`` block carrying the error message
        is returned, so callers always get a string.
    """
    print(f"Crawling {url}...")
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = None  # stays None if the browser never starts, so cleanup is skipped
    try:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=options)
        driver.get(url)
        time.sleep(wait_seconds)  # Wait for dynamic content to load
        body = driver.find_element(By.TAG_NAME, 'body')
        text = body.text
        print(f"Finished crawling {url}. Text length: {len(text)}")
        return f"--- Content from {url} ---\n{text}\n\n"
    except Exception as e:
        # Deliberately best-effort: one bad page must not abort a batch crawl.
        print(f"Could not crawl {url}: {e}")
        return f"--- Could not crawl {url} ---\nError: {e}\n\n"
    finally:
        # Always shut the browser down, even when the page failed to load.
        if driver:
            driver.quit()

# --- Main Execution ---
start_time = time.time()
main_url = "https://vneconomy.vn/hon-2-trieu-khach-hang-da-bat-sinh-loi-tu-dong-cung-techcombank.htm"

# How many links to follow from the main page. Raise this for a broader crawl
# (get_links_from_url itself defaults to 50).
MAX_LINKS = 1

# 1. Get up to MAX_LINKS links from the main page.
initial_links = get_links_from_url(main_url, max_links=MAX_LINKS)

if not initial_links:
    print("No links found. Exiting.")
else:
    print(f"Found {len(initial_links)} links to crawl.")

    # Crawl the main URL as well as the links found on it. dict.fromkeys drops
    # duplicates (e.g. the page linking to itself) while keeping order, so no
    # page is fetched twice.
    urls_to_crawl = list(dict.fromkeys([main_url] + initial_links))
    # Cap at MAX_LINKS linked pages + 1 main page.
    urls_to_crawl = urls_to_crawl[:MAX_LINKS + 1]

    # 2. Crawl websites in parallel using a thread pool.
    # The number of workers can be adjusted based on your system's resources.
    text_by_url = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        future_to_url = {executor.submit(crawl_website, url): url for url in urls_to_crawl}

        for future in concurrent.futures.as_completed(future_to_url):
            finished_url = future_to_url[future]
            try:
                text_by_url[finished_url] = future.result()
            except Exception as exc:
                print(f'{finished_url} generated an exception: {exc}')

    # Join in submission order rather than completion order, so the output
    # file has the same layout on every run no matter which page finished first.
    all_text = "".join(text_by_url[url] for url in urls_to_crawl if url in text_by_url)

    # 3. Save the result to a single text file.
    output_filename = "crawled_text.txt"
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(all_text)

    print("\n--- Crawling Complete ---")
    print(f"Data saved to {output_filename}")

    end_time = time.time()
    print(f"Total execution time: {end_time - start_time:.2f} seconds")


Getting links from https://vneconomy.vn/hon-2-trieu-khach-hang-da-bat-sinh-loi-tu-dong-cung-techcombank.htm...


Found 1 links to crawl.
Crawling https://vneconomy.vn/hon-2-trieu-khach-hang-da-bat-sinh-loi-tu-dong-cung-techcombank.htm...
Crawling https://vneconomy.vn/chung-khoan.htm...
Finished crawling https://vneconomy.vn/hon-2-trieu-khach-hang-da-bat-sinh-loi-tu-dong-cung-techcombank.htm. Text length: 7291
Finished crawling https://vneconomy.vn/chung-khoan.htm. Text length: 6312

--- Crawling Complete ---
Data saved to crawled_text.txt
Total execution time: 24.98 seconds


In [10]:
# import time
# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from webdriver_manager.chrome import ChromeDriverManager
# from selenium.webdriver.common.by import By
# from selenium.webdriver.chrome.options import Options

def crawl_and_save(url, output_filename):
    """
    Fetch one page with headless Chrome and write its body text to a file.

    Failures are reported on stdout instead of being raised, and the browser
    is shut down whether or not the crawl succeeded.

    Args:
        url (str): Address of the page to fetch.
        output_filename (str): Path of the UTF-8 text file to write.
    """
    print(f"Starting to crawl {url}...")

    # Chrome command-line flags, added in this order.
    chrome_opts = Options()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_opts.add_argument(flag)

    browser = None  # remains None when startup fails, so cleanup is skipped

    try:
        # The path returned by ChromeDriverManager().install() is handed to Service.
        driver_path = ChromeDriverManager().install()
        browser = webdriver.Chrome(service=Service(driver_path), options=chrome_opts)

        browser.get(url)
        time.sleep(3)  # fixed pause so late-rendered content can show up

        # Text of the page's <body> element.
        page_text = browser.find_element(By.TAG_NAME, 'body').text

        with open(output_filename, "w", encoding="utf-8") as out_file:
            out_file.write(page_text)

        print(f"Successfully crawled and saved content to '{output_filename}'")
        print(f"Total characters saved: {len(page_text)}")

    except Exception as err:
        print(f"An error occurred while crawling {url}: {err}")

    finally:
        # Runs on success and on failure alike.
        if browser:
            browser.quit()
            print("Browser closed.")

# --- Example Usage ---
# Point these two values at whichever page and destination file you need.
output_file = "techcombank_sinh_loi_tu_dong.txt"
target_url = "https://uxfoundation.vn/bai-viet/tinh-nang-sinh-loi-tu-dong-cua-techcombank"

crawl_and_save(url=target_url, output_filename=output_file)

Starting to crawl https://uxfoundation.vn/bai-viet/tinh-nang-sinh-loi-tu-dong-cua-techcombank...
Successfully crawled and saved content to 'techcombank_sinh_loi_tu_dong.txt'
Total characters saved: 7337
Browser closed.
