In [1]:
import os
import time
import requests
from bs4 import BeautifulSoup
import urllib3
from IPython.display import display
import re
from urllib.parse import urljoin

# Disable SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def is_valid_url(url):
    return re.match(r'^(https?:\/\/)?([\da-z.-]+)\.([a-z.]{2,6})([\/\w .-]*)*\/?$', url)

def extract_links_from_page(url):
    try:
        print(f"Processing: {url}")
        time.sleep(5)  # Wait for 5 seconds before scraping
        response = requests.get(url, verify=False)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Selecting the required div
        main_nav_div = soup.select_one("#main-nav > nav > ul")
        
        if not main_nav_div:
            print(f"No matching div found for {url}")
            return []
        
        # Extracting all href links and making them absolute
        links = [urljoin(url, a['href']) for a in main_nav_div.find_all('a', href=True)]
        print(f"Found {len(links)} links in {url}")
        return links
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

def main():
    input_file = r"N:\\CS\\rohana DS\\linksTEXT\\4\\fourth_structured_books.txt"
    output_file = r"N:\\CS\\rohana DS\\linksTEXT\\4\\fourthBooksSummry.txt"
    
    if not os.path.exists(input_file):
        print("Error: The specified input file does not exist.")
        return

    theridBooksSummry = []
    
    with open(input_file, "r") as file:
        base_links = file.read().splitlines()
    
    valid_links = [link for link in base_links if is_valid_url(link)]
    invalid_links = set(base_links) - set(valid_links)
    
    if invalid_links:
        print("Skipping invalid links:")
        for link in invalid_links:
            print(f" - {link}")
    
    for base_link in valid_links:
        extracted_links = extract_links_from_page(base_link)
        full_links = [urljoin(base_link, link) for link in extracted_links]
        theridBooksSummry.extend(full_links)
        print(f"Total collected links so far: {len(theridBooksSummry)}")
    
    with open(output_file, "w") as file:
        file.write("\n".join(theridBooksSummry))
    
    display(f"Extracted {len(theridBooksSummry)} links and saved to {output_file}")

# Run the main function in Jupyter Notebook
if __name__ == "__main__":
    main()


Processing: https://walker-data.com/census-r/
Found 16 links in https://walker-data.com/census-r/
Total collected links so far: 16
Processing: https://sdesabbata.github.io/r-for-geographic-data-science/
Found 19 links in https://sdesabbata.github.io/r-for-geographic-data-science/
Total collected links so far: 35
Processing: https://www.tidytextmining.com/
Found 12 links in https://www.tidytextmining.com/
Total collected links so far: 47
Processing: https://bookdown.org/marc_trussler/IIS/
Found 17 links in https://bookdown.org/marc_trussler/IIS/
Total collected links so far: 64
Processing: https://www.paulamoraga.com/book-spatial/index.html
Found 28 links in https://www.paulamoraga.com/book-spatial/index.html
Total collected links so far: 92
Processing: https://mastering-shiny.org/index.html
Found 29 links in https://mastering-shiny.org/index.html
Total collected links so far: 121
Processing: https://www.paulamoraga.com/book-geospatial/
Found 21 links in https://www.paulamoraga.com/book

'Extracted 274 links and saved to N:\\\\CS\\\\rohana DS\\\\linksTEXT\\\\4\\\\fourthBooksSummry.txt'

In [2]:
import os
import time
import requests
from bs4 import BeautifulSoup
import urllib3
from IPython.display import display
import re

# Disable SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def extract_text_from_page(url, output_dir):
    try:
        print(f"Fetching text from: {url}")
        time.sleep(3)  # Wait for 3 seconds before scraping
        response = requests.get(url, verify=False)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Selecting the required div
        book_body_div = soup.select_one("#content")
        
        if not book_body_div:
            print(f"No matching text found for {url}")
            return None
        
        text_content = book_body_div.get_text(strip=True)
        
        # Save text to file
        filename = os.path.join(output_dir, f"{re.sub(r'[^a-zA-Z0-9]', '_', url)}.txt")
        with open(filename, "w", encoding="utf-8") as text_file:
            text_file.write(text_content)
        
        print(f"Saved text from {url} to {filename}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")

def main():
    input_file = r"N:\\CS\\rohana DS\\linksTEXT\\4\\fourthBooksSummry.txt"
    text_output_dir = r"N:\\CS\\rohana DS\\linksTEXT\\4\\ExtractedTexts"
    
    if not os.path.exists(input_file):
        print("Error: The specified input file does not exist.")
        return
    
    os.makedirs(text_output_dir, exist_ok=True)
    
    with open(input_file, "r") as file:
        links = file.read().splitlines()
    
    for link in links:
        extract_text_from_page(link, text_output_dir)
    
    display("Text extraction completed.")

# Run the main function in Jupyter Notebook
if __name__ == "__main__":
    main()


Fetching text from: https://walker-data.com/census-r/index.html
Saved text from https://walker-data.com/census-r/index.html to N:\\CS\\rohana DS\\linksTEXT\\4\\ExtractedTexts\https___walker_data_com_census_r_index_html.txt
Fetching text from: https://walker-data.com/census-r/the-united-states-census-and-the-r-programming-language.html
Saved text from https://walker-data.com/census-r/the-united-states-census-and-the-r-programming-language.html to N:\\CS\\rohana DS\\linksTEXT\\4\\ExtractedTexts\https___walker_data_com_census_r_the_united_states_census_and_the_r_programming_language_html.txt
Fetching text from: https://walker-data.com/census-r/an-introduction-to-tidycensus.html
Saved text from https://walker-data.com/census-r/an-introduction-to-tidycensus.html to N:\\CS\\rohana DS\\linksTEXT\\4\\ExtractedTexts\https___walker_data_com_census_r_an_introduction_to_tidycensus_html.txt
Fetching text from: https://walker-data.com/census-r/wrangling-census-data-with-tidyverse-tools.html
Saved te

'Text extraction completed.'