In [1]:
import os
import time
import requests
from bs4 import BeautifulSoup
import urllib3
from IPython.display import display
import re
from urllib.parse import urljoin

# Suppress urllib3's InsecureRequestWarning: the requests.get call in this cell
# passes verify=False (TLS certificate verification is turned off).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def is_valid_url(url):
    """Return True when ``url`` is an absolute http(s) URL that names a host.

    The check is structural (scheme + host) rather than regex based.  The
    previous pattern contained a nested quantifier that could backtrack
    exponentially on non-matching input, and it rejected ordinary URLs that
    carry a port, query string, fragment, percent-escape or uppercase letter.

    Args:
        url: Candidate URL.  Surrounding whitespace is ignored; any non-string
            value is treated as invalid.

    Returns:
        bool: True for ``http://`` / ``https://`` URLs with a non-empty host,
        False otherwise (including scheme-less strings, which ``requests``
        cannot fetch).
    """
    # Local import: this cell's import block only brings in ``urljoin``.
    from urllib.parse import urlparse

    if not isinstance(url, str):
        return False
    try:
        parts = urlparse(url.strip())
        host = parts.hostname
    except ValueError:
        # urlparse rejects some malformed network locations (e.g. a broken
        # IPv6 literal) by raising ValueError; treat those as invalid.
        return False
    return parts.scheme in ("http", "https") and bool(host)

def extract_links_from_page(url, delay=5, timeout=30):
    """Fetch ``url`` and return the absolute links found in its summary block.

    Args:
        url: Page to download.
        delay: Seconds to sleep before issuing the request (default 5, the
            wait this function has always used).
        timeout: Seconds handed to ``requests.get``.  Without a timeout a
            server that stops responding would block the crawl indefinitely;
            a timeout surfaces as a ``RequestException`` and is reported like
            any other fetch error.

    Returns:
        list[str]: Every ``href`` inside the selected ``div``, resolved against
        ``url``.  An empty list is returned when the request fails or when the
        selector matches nothing.
    """
    print(f"Processing: {url}")
    time.sleep(delay)  # pause before each request

    try:
        # NOTE(review): verify=False turns off TLS certificate verification.
        # It is kept because the notebook silences the matching warning on
        # purpose, but confirm it is still required.
        response = requests.get(url, verify=False, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Selecting the required div (position-based selector tied to the page layout)
    book_summary_div = soup.select_one("#wrap > div > article > div > div:nth-child(7) > div.inner_cell > div")

    if not book_summary_div:
        print(f"No matching div found for {url}")
        return []

    # Extracting all href links and making them absolute
    links = [urljoin(url, a['href']) for a in book_summary_div.find_all('a', href=True)]
    print(f"Found {len(links)} links in {url}")
    return links

def main(
    input_file=r"N:\CS\rohana DS\linksTEXT\2\second_structured_books.txt",
    output_file=r"N:\CS\rohana DS\linksTEXT\2\secondtBooksSummry.txt",
):
    """Collect chapter links from every base page listed in ``input_file``.

    Reads one URL per line, skips blank lines, reports entries rejected by
    ``is_valid_url``, scrapes each remaining page with
    ``extract_links_from_page`` and writes all collected links to
    ``output_file`` (one per line).

    Args:
        input_file: Text file holding the base URLs.  The default is the same
            location as before, written with single backslashes (the raw
            string previously contained doubled ones).
        output_file: Destination text file for the collected links.

    Returns:
        None.  Progress and the final summary are printed.
    """
    if not os.path.exists(input_file):
        print("Error: The specified input file does not exist.")
        return

    with open(input_file, "r") as file:
        # Strip padding and drop blank lines so they are neither scraped nor
        # listed as "invalid links".
        base_links = [line.strip() for line in file if line.strip()]

    # Partition in input order (a set difference would lose the ordering).
    valid_links = []
    invalid_links = []
    for link in base_links:
        (valid_links if is_valid_url(link) else invalid_links).append(link)

    if invalid_links:
        print("Skipping invalid links:")
        for link in dict.fromkeys(invalid_links):  # report each bad entry once
            print(f" - {link}")

    collected_links = []
    for base_link in valid_links:
        # extract_links_from_page already resolves every href against the
        # page URL, so no second urljoin pass is needed here.
        collected_links.extend(extract_links_from_page(base_link))
        print(f"Total collected links so far: {len(collected_links)}")

    with open(output_file, "w") as file:
        file.write("\n".join(collected_links))

    # print() rather than display(): display() on a str shows its repr
    # (quotes and escaped backslashes), as in the recorded output.
    print(f"Extracted {len(collected_links)} links and saved to {output_file}")

# Entry point for this cell: run the link-collection main() defined above when
# the code executes as the top-level module (the recorded output below shows
# it running in the notebook).
if __name__ == "__main__":
    main()


Processing: https://jakevdp.github.io/WhirlwindTourOfPython/
Found 19 links in https://jakevdp.github.io/WhirlwindTourOfPython/
Total collected links so far: 19


'Extracted 19 links and saved to N:\\\\CS\\\\rohana DS\\\\linksTEXT\\\\2\\\\secondtBooksSummry.txt'

In [2]:
import os
import time
import requests
from bs4 import BeautifulSoup
import urllib3
from IPython.display import display
import re

# Suppress urllib3's InsecureRequestWarning: the requests.get call in this cell
# passes verify=False (TLS certificate verification is turned off).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def _url_to_filename(url, max_stem=150):
    """Map ``url`` to a filesystem-safe ``.txt`` file name.

    Every non-alphanumeric character becomes ``_`` (unchanged naming scheme).
    Stems longer than ``max_stem`` characters are truncated and suffixed with
    a short hash of the full URL so they stay unique and within typical
    file-name length limits.
    """
    import hashlib  # local import: not part of this cell's import block

    stem = re.sub(r'[^a-zA-Z0-9]', '_', url)
    if len(stem) > max_stem:
        digest = hashlib.sha256(url.encode("utf-8")).hexdigest()[:10]
        stem = f"{stem[:max_stem]}_{digest}"
    return f"{stem}.txt"


def extract_text_from_page(url, output_dir, delay=3, timeout=30, separator=" "):
    """Download ``url`` and save the text of its main container to a file.

    Args:
        url: Page to download.
        output_dir: Existing directory that receives the ``.txt`` file.
        delay: Seconds to sleep before the request (default 3, as before).
        timeout: Seconds handed to ``requests.get`` so a stalled server cannot
            block the run indefinitely; a timeout is reported like any other
            fetch error.
        separator: String placed between adjacent text fragments.  With
            ``strip=True`` and no separator, Beautiful Soup joins the stripped
            fragments directly, fusing words from neighbouring tags; pass
            ``""`` to get that former behaviour.

    Returns:
        str | None: Path of the written file, or None when the request fails,
        the container is not found, or the file cannot be written.
    """
    print(f"Fetching text from: {url}")
    time.sleep(delay)  # pause before each request

    try:
        # NOTE(review): verify=False turns off TLS certificate verification.
        # It is kept because the notebook silences the matching warning on
        # purpose, but confirm it is still required.
        response = requests.get(url, verify=False, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Selecting the required div
    book_body_div = soup.select_one("#wrap > div.container.post")

    if not book_body_div:
        print(f"No matching text found for {url}")
        return None

    text_content = book_body_div.get_text(separator=separator, strip=True)

    # Save text to file
    filename = os.path.join(output_dir, _url_to_filename(url))
    try:
        with open(filename, "w", encoding="utf-8") as text_file:
            text_file.write(text_content)
    except OSError as e:
        # A single unwritable file should not abort the remaining downloads.
        print(f"Error saving {url} to {filename}: {e}")
        return None

    print(f"Saved text from {url} to {filename}")
    return filename

def main(
    input_file=r"N:\CS\rohana DS\linksTEXT\2\secondtBooksSummry.txt",
    text_output_dir=r"N:\CS\rohana DS\linksTEXT\2\ExtractedTexts",
):
    """Download the text of every page listed in ``input_file``.

    Reads one URL per line, skips blank lines and repeated URLs, creates
    ``text_output_dir`` if needed and calls ``extract_text_from_page`` for
    each remaining URL.

    Args:
        input_file: Text file with one URL per line.  The default is the same
            location as before, written with single backslashes (the raw
            string previously contained doubled ones).
        text_output_dir: Directory that receives one ``.txt`` file per page.

    Returns:
        None.  Progress and the completion message are printed.
    """
    if not os.path.exists(input_file):
        print("Error: The specified input file does not exist.")
        return

    os.makedirs(text_output_dir, exist_ok=True)

    with open(input_file, "r") as file:
        stripped_lines = (line.strip() for line in file)
        # Skip blank lines (each one would cost a sleep plus a failed request)
        # and fetch each URL once: a repeated URL maps to the same output
        # file, so downloading it again only rewrites that file.
        links = list(dict.fromkeys(line for line in stripped_lines if line))

    for link in links:
        extract_text_from_page(link, text_output_dir)

    # print() rather than display(): display() on a str shows its quoted repr.
    print("Text extraction completed.")

# Entry point for this cell: run the text-extraction main() defined above when
# the code executes as the top-level module (the recorded output below shows
# it running in the notebook).
if __name__ == "__main__":
    main()


Fetching text from: https://jakevdp.github.io/WhirlwindTourOfPython/#Index
Saved text from https://jakevdp.github.io/WhirlwindTourOfPython/#Index to N:\\CS\\rohana DS\\linksTEXT\\2\\ExtractedTexts\https___jakevdp_github_io_WhirlwindTourOfPython__Index.txt
Fetching text from: https://jakevdp.github.io/WhirlwindTourOfPython/00-introduction.html
Saved text from https://jakevdp.github.io/WhirlwindTourOfPython/00-introduction.html to N:\\CS\\rohana DS\\linksTEXT\\2\\ExtractedTexts\https___jakevdp_github_io_WhirlwindTourOfPython_00_introduction_html.txt
Fetching text from: https://jakevdp.github.io/WhirlwindTourOfPython/01-how-to-run-python-code.html
Saved text from https://jakevdp.github.io/WhirlwindTourOfPython/01-how-to-run-python-code.html to N:\\CS\\rohana DS\\linksTEXT\\2\\ExtractedTexts\https___jakevdp_github_io_WhirlwindTourOfPython_01_how_to_run_python_code_html.txt
Fetching text from: https://jakevdp.github.io/WhirlwindTourOfPython/02-basic-python-syntax.html
Saved text from https:

'Text extraction completed.'