In [9]:
import os
import time
import requests
from bs4 import BeautifulSoup
import urllib3

# Silence urllib3's InsecureRequestWarning, which would otherwise be emitted
# for every request made with verify=False in check_book_structure().
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# 📌 Paths
# LINKS_FILE is read by process_links() and is treated as one URL per line.
# OUTPUT_FILE and BOOK_NAMES_FILE are overwritten by process_links() with the
# matching URLs and their page titles respectively, one entry per line.
LINKS_FILE = r"N:\CS\rohana DS\extracted_links.txt"
OUTPUT_FILE = r"N:\CS\rohana DS\first_structured_books.txt"
BOOK_NAMES_FILE = r"N:\CS\rohana DS\book_names.txt"

# 📌 Lists to store results
# Module-level accumulators filled by check_book_structure(). Both lists are
# appended to together, so entries at the same index describe the same page.
first_struc_books = []  # URLs of pages that matched the expected structure
book_names = []  # Page titles of those same pages, in the same order

def check_book_structure(url):
    """Record *url* as a structured book if its page has the expected layout.

    The page is fetched after a fixed 5-second pause. It counts as structured
    when it contains at least one ``<div class="book-summary">`` and at least
    one ``<div class="book-body">``. For such pages the URL is appended to the
    module-level ``first_struc_books`` list and the page title to
    ``book_names``, keeping both lists index-aligned.

    Args:
        url: Address of the page to inspect.

    Returns:
        None. Results are reported via ``print`` and the module-level lists.
        Request failures (connection errors, timeouts, HTTP error statuses,
        malformed URLs) are printed and swallowed so a crawl can continue.
    """
    print(f"🔍 Checking: {url}")
    time.sleep(5)  # Wait before scraping

    # Only the network call can raise RequestException, so keep the try small.
    try:
        # NOTE(review): verify=False disables TLS certificate validation, so
        # the response could come from an impostor host — confirm this is
        # acceptable for the sites being crawled.
        response = requests.get(url, timeout=10, verify=False)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ Error checking {url}: {e}")
        return

    soup = BeautifulSoup(response.text, "html.parser")

    # Only existence matters, so stop at the first match of each kind.
    has_summary = soup.find("div", class_="book-summary") is not None
    has_body = soup.find("div", class_="book-body") is not None

    if not (has_summary and has_body):
        print(f"⚠ Structure not fully found for: {url}")
        return

    # `soup.title.string` is None for an empty <title> or one with nested
    # tags, which used to crash on `.strip()`. get_text() always returns a
    # string. Whitespace runs (including newlines) are collapsed so every name
    # occupies exactly one line in the names file and stays aligned with the
    # URL file.
    title_tag = soup.title
    raw_title = title_tag.get_text() if title_tag is not None else ""
    book_name = " ".join(raw_title.split()) or f"Unknown Book ({url})"

    first_struc_books.append(url)
    book_names.append(book_name)

    print(f"✅ Structured book found: {book_name} ({url})")

def process_links():
    """Check every URL listed in ``LINKS_FILE`` and save the matches.

    Each non-blank line of ``LINKS_FILE`` is passed to
    ``check_book_structure``. Afterwards the accumulated URLs are written to
    ``OUTPUT_FILE`` and the accumulated titles to ``BOOK_NAMES_FILE``, one
    entry per line, overwriting any previous contents.

    Returns:
        None. If ``LINKS_FILE`` does not exist, an error is printed and
        nothing is checked or written.
    """
    try:
        with open(LINKS_FILE, "r", encoding="utf-8") as f:
            raw_lines = f.read().splitlines()
    except FileNotFoundError:
        print(f"❌ ERROR: {LINKS_FILE} not found!")
        return

    # Drop surrounding whitespace and skip blank lines: each bogus entry would
    # otherwise cost a full politeness delay and produce a spurious request
    # error.
    stripped = (line.strip() for line in raw_lines)
    links = [url for url in stripped if url]

    for url in links:
        check_book_structure(url)

    # 📌 Save structured book links
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        f.writelines(f"{book_url}\n" for book_url in first_struc_books)

    # 📌 Save book names
    with open(BOOK_NAMES_FILE, "w", encoding="utf-8") as f:
        f.writelines(f"{name}\n" for name in book_names)

    print(f"\n📌 Saved {len(first_struc_books)} structured books to: {OUTPUT_FILE}")
    print(f"📌 Saved {len(book_names)} book names to: {BOOK_NAMES_FILE}")

# 🚀 Run the process
# Executes immediately when this cell/module runs: reads LINKS_FILE, fetches
# every listed page, and overwrites OUTPUT_FILE and BOOK_NAMES_FILE.
process_links()


🔍 Checking: http://linear.ups.edu/html/fcla.html
⚠ Structure not fully found for: http://linear.ups.edu/html/fcla.html
🔍 Checking: https://brouwern.github.io/lbrb/
✅ Structured book found: A Little Book of R for Bioinformatics 2.0 (https://brouwern.github.io/lbrb/)
🔍 Checking: https://jakevdp.github.io/WhirlwindTourOfPython/
⚠ Structure not fully found for: https://jakevdp.github.io/WhirlwindTourOfPython/
🔍 Checking: https://adv-r.hadley.nz/
⚠ Structure not fully found for: https://adv-r.hadley.nz/
🔍 Checking: https://edwinth.github.io/ADSwR/
✅ Structured book found: Agile Data Science with R (https://edwinth.github.io/ADSwR/)
🔍 Checking: https://michael-franke.github.io/intro-data-analysis/index.html
✅ Structured book found: An Introduction to Data Analysis (https://michael-franke.github.io/intro-data-analysis/index.html)
🔍 Checking: https://drfloreiche.github.io/index.html
⚠ Structure not fully found for: https://drfloreiche.github.io/index.html
🔍 Checking: https://bookdown.org/pinga