In [1]:
import os
import time
import requests
from bs4 import BeautifulSoup
import urllib3
from IPython.display import display
import re
from urllib.parse import urljoin

# Silence urllib3's InsecureRequestWarning: the requests in this cell are made
# with verify=False, i.e. without TLS certificate verification.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def is_valid_url(url):
    """Check whether ``url`` looks like a simple http(s) URL.

    Accepted shape: an optional ``http://`` or ``https://`` scheme, a host made
    of lowercase letters, digits, dots and hyphens, a 2-6 character suffix of
    lowercase letters/dots, and an optional path made of word characters,
    slashes, spaces, dots and hyphens.  URLs containing a port, a query string,
    a fragment or uppercase host characters are therefore rejected.

    Args:
        url: The string to check.

    Returns:
        The ``re.Match`` object (truthy) when ``url`` matches, otherwise ``None``.
    """
    # The path part is a single character-class repetition.  The previous
    # nested form `([...]*)*` accepted exactly the same strings but backtracked
    # exponentially on a long path followed by one disallowed character.
    return re.match(r'^(https?:\/\/)?([\da-z.-]+)\.([a-z.]{2,6})([\/\w .-]*)\/?$', url)

def extract_links_from_page(url, delay=5, timeout=30):
    """Fetch ``url`` and return the links found in its docs navigation block.

    The page is downloaded, parsed, and every ``<a href=...>`` inside the
    element selected by ``#bd-docs-nav > div`` is resolved against ``url``.

    Args:
        url: Address of the page to scrape.
        delay: Seconds to sleep before issuing the request (default 5).
        timeout: Seconds to wait for the server before giving up (default 30).

    Returns:
        A list of absolute link strings.  An empty list is returned when the
        navigation element is missing or when the request fails; in both cases
        a message is printed instead of raising.
    """
    try:
        print(f"Processing: {url}")
        time.sleep(delay)  # Wait before scraping
        # verify=False skips TLS certificate verification.  The timeout keeps a
        # stalled server from blocking the whole run; a timeout surfaces as a
        # RequestException and is handled below like any other fetch error.
        response = requests.get(url, verify=False, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Selecting the required div
        docs_nav_div = soup.select_one("#bd-docs-nav > div")

        if not docs_nav_div:
            print(f"No matching div found for {url}")
            return []

        # Extracting all href links and making them absolute
        links = [urljoin(url, a['href']) for a in docs_nav_div.find_all('a', href=True)]
        print(f"Found {len(links)} links in {url}")
        return links
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

def main():
    """Collect navigation links from every book listed in the input file.

    Reads one base URL per line from ``input_file``, reports the lines that
    fail ``is_valid_url``, scrapes each valid page with
    ``extract_links_from_page`` and writes all collected links, one per line,
    to ``output_file``.  Returns early with a message if the input file is
    missing.
    """
    # NOTE(review): inside a raw string "\\" is two literal backslashes, so
    # these paths contain doubled separators -- presumably tolerated by
    # Windows; confirm before normalizing them.
    input_file = r"N:\\CS\\rohana DS\\linksTEXT\\5\\fifith_structured_books.txt"
    output_file = r"N:\\CS\\rohana DS\\linksTEXT\\5\\fifithBooksSummry.txt"

    if not os.path.exists(input_file):
        print("Error: The specified input file does not exist.")
        return

    with open(input_file, "r") as file:
        # Strip surrounding whitespace and drop blank lines: the URL pattern
        # allows spaces, so an untrimmed line would be accepted as-is, and an
        # empty line would otherwise be reported as an invalid link.
        base_links = [line.strip() for line in file if line.strip()]

    valid_links = []
    invalid_links = {}  # dict used as an insertion-ordered set (file order, no repeats)
    for link in base_links:
        if is_valid_url(link):
            valid_links.append(link)
        else:
            invalid_links[link] = None

    if invalid_links:
        print("Skipping invalid links:")
        for link in invalid_links:
            print(f" - {link}")

    collected_links = []
    for base_link in valid_links:
        # extract_links_from_page already resolves every href against the page
        # URL, so the results are used directly.
        collected_links.extend(extract_links_from_page(base_link))
        print(f"Total collected links so far: {len(collected_links)}")

    with open(output_file, "w") as file:
        file.write("\n".join(collected_links))

    display(f"Extracted {len(collected_links)} links and saved to {output_file}")

# Entry point: call main() when this cell (or the code saved as a script) is
# executed as the top-level module.
if __name__ == "__main__":
    main()


Processing: https://inferentialthinking.com/chapters/intro.html
Found 97 links in https://inferentialthinking.com/chapters/intro.html
Total collected links so far: 97
Processing: https://fraud-detection-handbook.github.io/fraud-detection-handbook/Foreword.html
Found 40 links in https://fraud-detection-handbook.github.io/fraud-detection-handbook/Foreword.html
Total collected links so far: 137
Processing: https://earth-env-data-science.github.io/intro.html
Found 38 links in https://earth-env-data-science.github.io/intro.html
Total collected links so far: 175
Processing: https://allendowney.github.io/ThinkBayes2/index.html
Found 35 links in https://allendowney.github.io/ThinkBayes2/index.html
Total collected links so far: 210
Processing: https://bdpedigo.github.io/networks-course/landing.html
Found 30 links in https://bdpedigo.github.io/networks-course/landing.html
Total collected links so far: 240
Processing: https://www.textbook.ds100.org/intro.html
Found 149 links in https://www.textbo

'Extracted 420 links and saved to N:\\\\CS\\\\rohana DS\\\\linksTEXT\\\\5\\\\fifithBooksSummry.txt'

In [2]:
import os
import time
import requests
from bs4 import BeautifulSoup
import urllib3
from IPython.display import display
import re

# Silence urllib3's InsecureRequestWarning: the requests in this cell are made
# with verify=False, i.e. without TLS certificate verification.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def extract_text_from_page(url, output_dir, delay=3, timeout=30):
    """Download ``url`` and save the text of its main content to a file.

    The text is taken from the element selected by ``#main-content > div`` and
    written as UTF-8 to ``<output_dir>/<name>.txt``, where ``<name>`` is the
    URL with every non-alphanumeric character replaced by ``_``.

    Args:
        url: Address of the page to scrape.
        output_dir: Directory the text file is written into.
        delay: Seconds to sleep before issuing the request (default 3).
        timeout: Seconds to wait for the server before giving up (default 30).

    Returns:
        None.  A missing content element or a failed request is reported with
        a printed message instead of an exception.
    """
    try:
        print(f"Fetching text from: {url}")
        time.sleep(delay)  # Wait before scraping
        # verify=False skips TLS certificate verification.  The timeout keeps a
        # stalled server from blocking the whole run; a timeout surfaces as a
        # RequestException and is handled below like any other fetch error.
        response = requests.get(url, verify=False, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Selecting the required div
        book_body_div = soup.select_one("#main-content > div")

        if not book_body_div:
            print(f"No matching text found for {url}")
            return None

        # Join the stripped text fragments with a space; with the default empty
        # separator, text from adjacent elements is glued together
        # (e.g. a heading running straight into the following paragraph).
        text_content = book_body_div.get_text(separator=" ", strip=True)

        # Save text to file, using a filesystem-safe name derived from the URL
        safe_name = re.sub(r'[^a-zA-Z0-9]', '_', url)
        filename = os.path.join(output_dir, f"{safe_name}.txt")
        with open(filename, "w", encoding="utf-8") as text_file:
            text_file.write(text_content)

        print(f"Saved text from {url} to {filename}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
    return None

def main():
    """Save the text of every page listed in the links file.

    Reads one URL per line from ``input_file``, creates ``text_output_dir`` if
    needed and calls ``extract_text_from_page`` for each distinct URL.  Returns
    early with a message if the input file is missing.
    """
    input_file = r"N:\\CS\\rohana DS\\linksTEXT\\5\\fifithBooksSummry.txt"
    text_output_dir = r"N:\\CS\\rohana DS\\linksTEXT\\5\\ExtractedTexts"

    if not os.path.exists(input_file):
        print("Error: The specified input file does not exist.")
        return

    os.makedirs(text_output_dir, exist_ok=True)

    with open(input_file, "r") as file:
        stripped_lines = (line.strip() for line in file)
        # Skip blank lines (each would cost a sleep plus a failed request) and
        # drop repeated URLs while keeping first-seen order: the output file
        # name depends only on the URL, so a repeat would just re-download the
        # page and overwrite the same file.
        links = list(dict.fromkeys(line for line in stripped_lines if line))

    for link in links:
        extract_text_from_page(link, text_output_dir)

    display("Text extraction completed.")

# Entry point: call main() when this cell (or the code saved as a script) is
# executed as the top-level module.
if __name__ == "__main__":
    main()


Fetching text from: https://inferentialthinking.com/chapters/01/what-is-data-science.html
Saved text from https://inferentialthinking.com/chapters/01/what-is-data-science.html to N:\\CS\\rohana DS\\linksTEXT\\5\\ExtractedTexts\https___inferentialthinking_com_chapters_01_what_is_data_science_html.txt
Fetching text from: https://inferentialthinking.com/chapters/01/1/intro.html
Saved text from https://inferentialthinking.com/chapters/01/1/intro.html to N:\\CS\\rohana DS\\linksTEXT\\5\\ExtractedTexts\https___inferentialthinking_com_chapters_01_1_intro_html.txt
Fetching text from: https://inferentialthinking.com/chapters/01/1/1/computational-tools.html
Saved text from https://inferentialthinking.com/chapters/01/1/1/computational-tools.html to N:\\CS\\rohana DS\\linksTEXT\\5\\ExtractedTexts\https___inferentialthinking_com_chapters_01_1_1_computational_tools_html.txt
Fetching text from: https://inferentialthinking.com/chapters/01/1/2/statistical-techniques.html
Saved text from https://inferen

'Text extraction completed.'