In [2]:
import os
import time
import requests
import urllib3
import re
import hashlib
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor

# Disable SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Constants
BASE_DIR = r"N:\CS\rohana DS\linksTEXT\tryagain"
FOLDERS = range(1, 17)  # From 7 to 16
MAX_THREADS = 5  # Number of concurrent threads
DELAY = 6  # Delay between requests to avoid being blocked

def is_valid_url(url):
    return True

def read_file(file_path):
    """Reads the content of a file if it exists, otherwise returns None."""
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read().strip()
    print(f"Warning: {file_path} not found.")
    return None

def extract_links_from_page(url, selector):
    """Extracts href links from a webpage based on the given selector."""
    try:
        print(f"Processing: {url}")
        time.sleep(DELAY)  # Delay to avoid request throttling
        response = requests.get(url, verify=False, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        target_element = soup.select_one(selector)
        if not target_element:
            print(f"No matching element found for selector '{selector}' in {url}")
            return []

        links = [urljoin(url, a['href']) for a in target_element.find_all('a', href=True)]
        print(f"Found {len(links)} links in {url}")
        return links
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

def extract_text_from_page(url, selector, text_output_dir):
    """Extracts text content from a webpage and saves it as a file."""
    try:
        print(f"Fetching text from: {url}")
        time.sleep(DELAY - 1)  # Delay for politeness
        response = requests.get(url, verify=False, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        target_element = soup.select_one(selector)
        if not target_element:
            print(f"No matching text found for selector '{selector}' in {url}")
            return
        
        text_content = target_element.get_text(strip=True)
        filename = os.path.join(text_output_dir, f"{hashlib.md5(url.encode()).hexdigest()}.txt")
        
        with open(filename, "w", encoding="utf-8") as text_file:
            text_file.write(text_content)
        
        print(f"Saved text from {url} to {filename}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")

def process_folder(folder_num):
    """Processes each folder, extracts links, and scrapes text."""
    folder_path = os.path.join(BASE_DIR, f"{folder_num}")
    
    # Define file paths
    structured_books_file = os.path.join(folder_path, f"{folder_num}_structured_books.txt")
    summary_structure_file = os.path.join(folder_path, f"{folder_num}_summary_structure.txt")
    books_summary_file = os.path.join(folder_path, f"{folder_num}_BooksSummry.txt")
    body_text_file = os.path.join(folder_path, f"{folder_num}_body_text.txt")
    text_output_dir = os.path.join(folder_path, "ExtractedTexts")

    # Ensure output directory exists
    os.makedirs(text_output_dir, exist_ok=True)

    # Read base links
    base_links = read_file(structured_books_file)
    if not base_links:
        return
    base_links = base_links.splitlines()

    # Read selectors
    summary_selector = read_file(summary_structure_file)
    body_selector = read_file(body_text_file)
    if not summary_selector or not body_selector:
        return

    valid_links = [link for link in base_links if is_valid_url(link)]
    if not valid_links:
        print(f"No valid links found in {structured_books_file}")
        return

    # Extracting all links
    extracted_links = []
    with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        results = executor.map(lambda url: extract_links_from_page(url, summary_selector), valid_links)
        for result in results:
            extracted_links.extend(result)

    # Save extracted links
    with open(books_summary_file, "w", encoding="utf-8") as file:
        file.write("\n".join(extracted_links))
    print(f"Extracted {len(extracted_links)} links and saved to {books_summary_file}")

    # Extract text from extracted links
    with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        executor.map(lambda url: extract_text_from_page(url, body_selector, text_output_dir), extracted_links)

    print(f"Completed processing for folder #{folder_num}")

def main():
    for folder_num in FOLDERS:
        process_folder(folder_num)

    print("Pipeline execution completed.")

if __name__ == "__main__":
    main()


Processing: https://nustat.github.io/intro-stat-ds/
Found 249 links in https://nustat.github.io/intro-stat-ds/
Extracted 249 links and saved to N:\CS\rohana DS\linksTEXT\tryagain\4\4_BooksSummry.txt
Fetching text from: https://nustat.github.io/intro-stat-ds/index.html
Fetching text from: https://nustat.github.io/intro-stat-ds/index.html#introduction-for-students
Fetching text from: https://nustat.github.io/intro-stat-ds/index.html#what-you-will-learn-from-this-book
Fetching text from: https://nustat.github.io/intro-stat-ds/index.html#datascience-pipeline
Fetching text from: https://nustat.github.io/intro-stat-ds/index.html#reproducible-research
No matching text found for selector 'body > div.book.without-animation.with-summary.font-size-2.font-family-1 > div.book-body.fixed' in https://nustat.github.io/intro-stat-ds/index.html#reproducible-research
Fetching text from: https://nustat.github.io/intro-stat-ds/1-getting-started.html
No matching text found for selector 'body > div.book.with