In [1]:
import os
import time
import requests
from bs4 import BeautifulSoup
import urllib3
from IPython.display import display
import re
from urllib.parse import urljoin

# Silence urllib3's InsecureRequestWarning. The request in this cell is made
# with verify=False (certificate checking turned off), so the warning is
# suppressed here to keep the cell output readable.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Compiled once at definition time. The path part is a single `[...]*` group
# rather than the nested `([...]*)*`: both accept the same strings, but the
# nested form backtracks exponentially on a long path followed by a character
# outside the class (e.g. "?" or "!"), which can freeze the whole run.
_URL_PATTERN = re.compile(
    r'^(https?:\/\/)?([\da-z.-]+)\.([a-z.]{2,6})([\/\w .-]*)\/?$'
)


def is_valid_url(url):
    """Check whether ``url`` looks like a plain http(s) URL.

    The check is purely syntactic: an optional ``http://``/``https://``
    scheme, a lowercase host, a 2-6 character suffix, and a path made of
    word characters, ``/``, ``.``, ``-`` and spaces. Query strings,
    fragments, ports and uppercase hosts are rejected.

    Args:
        url: The candidate string.

    Returns:
        A ``re.Match`` object (truthy) when the string matches, otherwise
        ``None``. Callers are expected to use the result as a boolean.
    """
    return _URL_PATTERN.match(url)

def extract_links_from_page(url, delay=5, timeout=30):
    """Fetch ``url`` and return the links found in its Quarto sidebar menu.

    The page is downloaded, the element matching
    ``#quarto-sidebar > div.sidebar-menu-container > ul`` is located, and the
    ``href`` of every anchor inside it is resolved against ``url``.

    Args:
        url: Address of the page to scrape.
        delay: Seconds to sleep before issuing the request (throttling).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the run forever.

    Returns:
        A list of absolute URLs in document order (duplicates are kept).
        An empty list is returned when the sidebar is missing or when the
        request fails; failures are reported on stdout, not raised.
    """
    try:
        print(f"Processing: {url}")
        time.sleep(delay)  # Throttle: pause before every request
        # NOTE(review): verify=False turns off TLS certificate validation.
        # Kept to preserve existing behaviour; confirm it is really needed.
        response = requests.get(url, verify=False, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # The selector targets the top-level <ul> of the sidebar menu.
        sidebar_list = soup.select_one("#quarto-sidebar > div.sidebar-menu-container > ul")

        if sidebar_list is None:
            print(f"No matching div found for {url}")
            return []

        # Resolve every href against the page URL so relative links become absolute.
        links = [urljoin(url, a['href']) for a in sidebar_list.find_all('a', href=True)]
        print(f"Found {len(links)} links in {url}")
        return links
    except requests.exceptions.RequestException as e:
        # Covers connection errors, HTTP error statuses and timeouts alike.
        print(f"Error fetching {url}: {e}")
        return []

def main(
    input_file=r"N:\\CS\\rohana DS\\linksTEXT\\3\\therid_structured_books.txt",
    output_file=r"N:\\CS\\rohana DS\\linksTEXT\\3\\theridBooksSummry.txt",
):
    """Collect the sidebar links of every book listed in ``input_file``.

    Each non-blank line of ``input_file`` is treated as a book's base URL.
    Lines that fail ``is_valid_url`` are listed and skipped; every remaining
    URL is scraped with ``extract_links_from_page`` and all links found are
    written to ``output_file``, one per line.

    Args:
        input_file: Text file with one base URL per line.
        output_file: Destination file for the collected links (overwritten).

    Returns:
        None. Progress and the final summary are printed/displayed.
    """
    # NOTE(review): the default paths are raw strings that also double their
    # backslashes, so the doubled separators are part of the actual value.
    # Left unchanged so the same files keep being used.
    if not os.path.exists(input_file):
        print("Error: The specified input file does not exist.")
        return

    with open(input_file, "r") as file:
        raw_lines = file.read().splitlines()

    # Trim stray whitespace and ignore empty lines so they are neither
    # reported as "invalid links" nor passed on with trailing spaces.
    base_links = [line.strip() for line in raw_lines if line.strip()]

    # Classify in a single pass; lists keep the order of the input file.
    valid_links = []
    invalid_links = []
    for link in base_links:
        if is_valid_url(link):
            valid_links.append(link)
        else:
            invalid_links.append(link)

    if invalid_links:
        print("Skipping invalid links:")
        # dict.fromkeys drops repeats while preserving first-seen order.
        for link in dict.fromkeys(invalid_links):
            print(f" - {link}")

    theridBooksSummry = []
    for base_link in valid_links:
        # extract_links_from_page already returns absolute URLs, so no
        # further joining against base_link is needed.
        theridBooksSummry.extend(extract_links_from_page(base_link))
        print(f"Total collected links so far: {len(theridBooksSummry)}")

    with open(output_file, "w") as file:
        file.write("\n".join(theridBooksSummry))

    display(f"Extracted {len(theridBooksSummry)} links and saved to {output_file}")

# Entry point: calls main() when this code runs as the top-level module
# (__name__ == "__main__"), so executing the notebook cell starts the scrape.
if __name__ == "__main__":
    main()


Processing: https://mdsr-book.github.io/mdsr3e/
Found 30 links in https://mdsr-book.github.io/mdsr3e/
Total collected links so far: 30
Processing: https://r4ds.hadley.nz/
Found 38 links in https://r4ds.hadley.nz/
Total collected links so far: 68
Processing: https://online.stat.psu.edu/stat508/
Found 14 links in https://online.stat.psu.edu/stat508/
Total collected links so far: 82
Processing: http://rafalab.dfci.harvard.edu/dsbook-part-2/
Found 40 links in http://rafalab.dfci.harvard.edu/dsbook-part-2/
Total collected links so far: 122
Processing: https://wesmckinney.com/book/
Found 17 links in https://wesmckinney.com/book/
Total collected links so far: 139
Processing: https://www.crumplab.com/statistics/
Found 15 links in https://www.crumplab.com/statistics/
Total collected links so far: 154
Processing: https://mlr3book.mlr-org.com/
Found 21 links in https://mlr3book.mlr-org.com/
Total collected links so far: 175
Processing: https://ggplot2-book.org/
Found 29 links in https://ggplot2-b

'Extracted 378 links and saved to N:\\\\CS\\\\rohana DS\\\\linksTEXT\\\\3\\\\theridBooksSummry.txt'

In [None]:
import os
import time
import requests
from bs4 import BeautifulSoup
import urllib3
from IPython.display import display
import re

# Silence urllib3's InsecureRequestWarning. The request in this cell is made
# with verify=False (certificate checking turned off), so the warning is
# suppressed here to keep the cell output readable.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def extract_text_from_page(url, output_dir, delay=3, timeout=30):
    """Download ``url`` and save the text of its main Quarto content block.

    The element with id ``quarto-document-content`` is located and its text
    is written as UTF-8 to ``<output_dir>/<sanitised url>.txt``, where every
    non-alphanumeric character of the URL is replaced by ``_``.

    Args:
        url: Address of the page to scrape.
        output_dir: Existing directory that receives the ``.txt`` file.
        delay: Seconds to sleep before issuing the request (throttling).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the run forever.

    Returns:
        The path of the written file, or ``None`` when the content element
        is missing or the request fails. Request failures are reported on
        stdout, not raised.
    """
    try:
        print(f"Fetching text from: {url}")
        time.sleep(delay)  # Throttle: pause before every request
        # NOTE(review): verify=False turns off TLS certificate validation.
        # Kept to preserve existing behaviour; confirm it is really needed.
        response = requests.get(url, verify=False, timeout=timeout)
        response.raise_for_status()
        # Hand the raw bytes to BeautifulSoup so it can pick the encoding from
        # the document itself; response.text falls back to ISO-8859-1 when the
        # server sends no charset, which garbles UTF-8 pages.
        soup = BeautifulSoup(response.content, 'html.parser')

        # Main content container of the page.
        book_body_div = soup.select_one("#quarto-document-content")

        if book_body_div is None:
            print(f"No matching text found for {url}")
            return None

        # Join text nodes with a space: with strip=True and no separator,
        # text from adjacent elements is fused ("TitleBody", "Helloworld").
        text_content = book_body_div.get_text(separator=" ", strip=True)

        # Build a filesystem-safe name from the URL.
        safe_name = re.sub(r'[^a-zA-Z0-9]', '_', url)
        filename = os.path.join(output_dir, f"{safe_name}.txt")
        with open(filename, "w", encoding="utf-8") as text_file:
            text_file.write(text_content)

        print(f"Saved text from {url} to {filename}")
        return filename
    except requests.exceptions.RequestException as e:
        # Covers connection errors, HTTP error statuses and timeouts alike.
        print(f"Error fetching {url}: {e}")
        return None

def main(
    input_file=r"N:\\CS\\rohana DS\\linksTEXT\\3\\theridBooksSummry.txt",
    text_output_dir=r"N:\\CS\\rohana DS\\linksTEXT\\3\\ExtractedTexts",
):
    """Save the text of every page listed in ``input_file``.

    Each non-blank line of ``input_file`` is treated as a page URL and
    passed to ``extract_text_from_page``, which writes one text file per
    page into ``text_output_dir`` (created if missing).

    Args:
        input_file: Text file with one page URL per line.
        text_output_dir: Directory that receives the extracted text files.

    Returns:
        None. Progress and the final message are printed/displayed.
    """
    # NOTE(review): the default paths are raw strings that also double their
    # backslashes, so the doubled separators are part of the actual value.
    # Left unchanged so the same files keep being used.
    if not os.path.exists(input_file):
        print("Error: The specified input file does not exist.")
        return

    os.makedirs(text_output_dir, exist_ok=True)

    with open(input_file, "r") as file:
        raw_lines = file.read().splitlines()

    # Drop blank lines (they would only produce a request error) and repeated
    # URLs (the output filename is derived from the URL, so a repeat would
    # just re-download and overwrite the same file). Order is preserved.
    links = list(dict.fromkeys(line.strip() for line in raw_lines if line.strip()))

    for link in links:
        extract_text_from_page(link, text_output_dir)

    display("Text extraction completed.")

# Entry point: calls main() when this code runs as the top-level module
# (__name__ == "__main__"), so executing the notebook cell starts the extraction.
if __name__ == "__main__":
    main()


Fetching text from: https://mdsr-book.github.io/mdsr3e/index.html
Saved text from https://mdsr-book.github.io/mdsr3e/index.html to N:\\CS\\rohana DS\\linksTEXT\\3\\ExtractedTexts2\https___mdsr_book_github_io_mdsr3e_index_html.txt
Fetching text from: https://mdsr-book.github.io/mdsr3e/00-authors.html
Saved text from https://mdsr-book.github.io/mdsr3e/00-authors.html to N:\\CS\\rohana DS\\linksTEXT\\3\\ExtractedTexts2\https___mdsr_book_github_io_mdsr3e_00_authors_html.txt
Fetching text from: https://mdsr-book.github.io/mdsr3e/01-intro.html
Saved text from https://mdsr-book.github.io/mdsr3e/01-intro.html to N:\\CS\\rohana DS\\linksTEXT\\3\\ExtractedTexts2\https___mdsr_book_github_io_mdsr3e_01_intro_html.txt
Fetching text from: https://mdsr-book.github.io/mdsr3e/02-datavizI.html
Saved text from https://mdsr-book.github.io/mdsr3e/02-datavizI.html to N:\\CS\\rohana DS\\linksTEXT\\3\\ExtractedTexts2\https___mdsr_book_github_io_mdsr3e_02_datavizI_html.txt
Fetching text from: https://mdsr-book.

'Text extraction completed.'