In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import xml.etree.ElementTree as ET
import os
import concurrent.futures
import random
from datetime import datetime
from threading import Lock
import re

# Output locations: store everything on Google Drive when running inside
# Colab, otherwise fall back to files in the current working directory.
try:
    from google.colab import drive
    drive.mount('/content/drive')
except ImportError:
    # google.colab is unavailable, so we are not in Colab: write locally.
    SAVE_PATH_XLSX, FAILED_LINKS_FILE, PROGRESS_FILE, LOG_FILE = (
        "Blogger_Novels.xlsx",
        "failed_links.txt",
        "progress.txt",
        "scrape_log.txt",
    )
else:
    # Drive is mounted: keep the workbook, retry list, resume marker and log
    # there.
    SAVE_PATH_XLSX, FAILED_LINKS_FILE, PROGRESS_FILE, LOG_FILE = (
        "/content/drive/My Drive/Blogger_Novels.xlsx",
        "/content/drive/My Drive/failed_links.txt",
        "/content/drive/My Drive/progress.txt",
        "/content/drive/My Drive/scrape_log.txt",
    )

# Default request headers and the shared HTTP session.
# HEADERS is installed on `session`, so every request made through it (the
# sitemap fetch, redirect resolution and post scraping) sends these values.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive"
}
# One module-level session; the scraping functions called from the worker
# threads use this same object.
session = requests.Session()
session.headers.update(HEADERS)

# Lock held while the progress file is rewritten in the batch-save step.
progress_lock = Lock()

# Step 1: Extract all post URLs from the sitemap.
#
# On any failure the run is aborted with `raise SystemExit(1)`. The bare
# `exit()` helper is meant for interactive shells and does not reliably stop a
# notebook cell, which would let the rest of the script run against a missing
# or stale `post_urls`.
SITEMAP_URL = "https://urdureadings.com/post-sitemap1.xml"
try:
    response = session.get(SITEMAP_URL, timeout=30)
    if response.status_code != 200:
        print(f"‚ùå Failed to fetch sitemap: HTTP {response.status_code}")
        raise SystemExit(1)

    root = ET.fromstring(response.content)
    # <loc> elements live in the sitemap namespace. Skip empty ones and trim
    # the whitespace that pretty-printed sitemaps put around the URL.
    post_urls = [
        elem.text.strip()
        for elem in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc")
        if elem.text and elem.text.strip()
    ]
    print(f"‚úÖ Found {len(post_urls)} posts to scrape.")
except ET.ParseError:
    print("‚ùå Failed to parse sitemap XML.")
    raise SystemExit(1) from None
except requests.exceptions.RequestException as e:
    print(f"‚ùå Sitemap request failed: {e}")
    raise SystemExit(1) from None

# Resume support: start from the index stored by a previous run, if any.
last_index = 0
if os.path.exists(PROGRESS_FILE):
    try:
        with open(PROGRESS_FILE, "r") as progress_fh:
            saved_value = progress_fh.read()
        # The file holds a single integer; anything else falls back to 0.
        last_index = int(saved_value.strip())
    except (ValueError, IOError):
        print("‚ö† Could not read progress file. Starting from index 0.")

# Estimate the remaining run time.
def estimate_time(start_time, processed, total):
    """Return the estimated time left as an ``HH:MM:SS`` string.

    The estimate is the average wall-clock time per processed item so far
    multiplied by the number of items still to do.

    Args:
        start_time: ``time.time()`` value captured when processing started.
        processed: Number of items finished so far.
        total: Total number of items in this run.

    Returns:
        A zero-padded ``HH:MM:SS`` string. The hours field is not wrapped at
        24, so long estimates such as ``"55:33:20"`` are reported correctly.
        ``"00:00:00"`` is returned when nothing has been processed yet or
        when the computed remainder is negative.
    """
    elapsed_time = time.time() - start_time
    avg_time_per_post = elapsed_time / processed if processed > 0 else 0
    # Clamp at zero so clock skew or processed > total never yields a negative
    # duration. Fractional seconds are truncated.
    remaining_seconds = max(0, int((total - processed) * avg_time_per_post))
    hours, leftover = divmod(remaining_seconds, 3600)
    minutes, seconds = divmod(leftover, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

# Follow redirects to find where a link finally points.
def resolve_redirect(url):
    """Return the final URL reached from *url* after following redirects.

    A HEAD request is sent through the shared session with a 10 second
    timeout. If the request raises a ``requests`` exception, *url* is
    returned unchanged.
    """
    try:
        final_url = session.head(url, allow_redirects=True, timeout=10).url
    except requests.exceptions.RequestException:
        # Best effort: an unreachable or malformed link is kept as-is.
        return url
    return final_url

# Step 2: Scrape each novel post with automatic retries.

# CSS selectors tried in order when looking for the post title.
_TITLE_SELECTORS = (
    "h1.entry-title",
    "h2.entry-title",
    "h1.post-title",
    "h2.post-title",
    "h3.entry-title",
    "h3.post-title",
    "div.post-title",
    "div.entry-title",
    "h1",
    "h2",
    "h3",
)
# Containers that may hold the post body, tried in order.
_CONTENT_SELECTORS = ("div.post-body", "div.entry-content", "div.post-content", "article", "div.post")
# Substrings that identify the file hosts whose links are collected.
_DOWNLOAD_DOMAINS = ("drive.google", "mediafire", "dropbox", "mega.nz")
# Plain-text URLs appearing in the post body.
_RAW_URL_RE = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')
# Quoted target of `location = '...'`, `location.href = '...'` or
# `window.open('...')` inside an onclick handler.
_ONCLICK_URL_RE = re.compile(r"""(?:location(?:\.href)?\s*=\s*|window\.open\(\s*)(['"])(.+?)\1""")
# scrape_post runs on several worker threads; this serialises their appends
# to the shared log and failed-links files so entries do not interleave.
_file_write_lock = Lock()


def _url_from_onclick(onclick):
    """Return the navigation target embedded in an ``onclick`` attribute."""
    match = _ONCLICK_URL_RE.search(onclick)
    if match:
        return match.group(2).strip()
    # Unquoted form: drop the assignment prefix and any stray quotes or
    # semicolons around what is left.
    return onclick.replace("window.location.href=", "").strip().strip("'\";").strip()


def _links_matching(links, domains):
    """Return the links containing any of *domains*, ignoring letter case."""
    return [link for link in links if any(domain in link.lower() for domain in domains)]


def _extract_title(soup, post_url):
    """Return ``(title, element)`` for the post; element is None if unknown."""
    title = None
    title_element = None
    for selector in _TITLE_SELECTORS:
        candidate = soup.select_one(selector)
        if candidate:
            title = candidate.text.strip()
            title_element = candidate
            print(f"üìå Found title with selector '{selector}': '{title}' for URL: {post_url}")
            break

    # The site-wide "Digest Library" heading is not a post title: look for
    # the first sufficiently long heading that is not navigation.
    if not title or "digest library" in title.lower():
        for header in soup.find_all(["h1", "h2", "h3"]):
            text = header.text.strip()
            if "digest library" not in text.lower() and len(text) > 10 and not text.lower().startswith("home"):
                title = text
                title_element = header
                print(f"üìå Fallback title found: '{title}' for URL: {post_url}")
                break

    # Last resort: the document <title>, with the site name removed.
    if not title or "digest library" in title.lower():
        title_tag = soup.find("title")
        if title_tag:
            title = title_tag.text.strip()
            if "digest library" in title.lower():
                title = title.replace("Digest Library", "").replace("|", "").strip()
            print(f"üìå Title from <title> tag: '{title}' for URL: {post_url}")

    title = title if title and "digest library" not in title.lower() else "No Title Found"
    print(f"üìå Final title: '{title}' for URL: {post_url}")
    return title, title_element


def _append_log_entry(post_url, title, title_element, all_links, button_links,
                      raw_urls, google_drive_links, mediafire_links, other_links, text):
    """Append one debugging record for a scraped post to LOG_FILE."""
    lines = [
        f"{datetime.now()}: URL: {post_url}, Title: {title}\n",
        f"  All <a> hrefs: {all_links}\n",
        f"  Button links: {button_links}\n",
        f"  Raw URLs from content: {raw_urls}\n",
        f"  Google Drive Links: {google_drive_links}\n",
        f"  Mediafire Links: {mediafire_links}\n",
        f"  Other Links: {other_links}\n",
    ]
    if title_element is not None:
        parent = title_element.find_parent()
        context = str(parent)[:200] if parent is not None else "No parent"
        lines.append(f"  Title Context: {context}\n")
    content_snippet = text[:200].replace('\n', ' ')
    lines.append(f"  Content Snippet: {content_snippet}\n")

    # Write the whole record at once under the lock. UTF-8 keeps non-ASCII
    # titles from raising on platforms with a narrower default encoding.
    with _file_write_lock:
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write("".join(lines))


def scrape_post(post_url):
    """Fetch one post and extract its title and download links.

    The page is requested up to five times. HTTP 429 responses back off
    exponentially (10s, 20s, 40s, ...); other non-200 responses and request
    errors wait 5 seconds. No wait is performed after the final attempt.

    Links are gathered from ``<a href>`` attributes, ``<button onclick>``
    handlers and plain-text URLs in the post body. HTTP(S) links are passed
    through ``resolve_redirect``; duplicates are removed, keeping first-seen
    order.

    Args:
        post_url: URL of the post to scrape.

    Returns:
        A dict with the keys ``"Title"``, ``"Google Drive Links"``,
        ``"Mediafire Links"`` and ``"Other Links"`` (links joined with
        ``", "`` or a "No ... Link" placeholder), or ``None`` when every
        attempt failed. In that case *post_url* is appended to
        FAILED_LINKS_FILE.
    """
    retries = 5
    for attempt in range(retries):
        is_last_attempt = attempt == retries - 1
        try:
            response = session.get(post_url, timeout=60)
        except requests.exceptions.RequestException as e:
            print(f"‚ö† Attempt {attempt+1} failed for {post_url}: {e}")
            if not is_last_attempt:
                time.sleep(5)
            continue

        if response.status_code == 429:
            print(f"‚ö† Rate limit hit for {post_url}. Waiting...")
            if not is_last_attempt:
                time.sleep(2 ** attempt * 10)
            continue
        if response.status_code != 200:
            print(f"‚ö† Retrying ({attempt+1}/{retries}) for {post_url}: HTTP {response.status_code}")
            if not is_last_attempt:
                time.sleep(5)
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        title, title_element = _extract_title(soup, post_url)

        # 1. Targets of <a href> and of <button onclick> handlers.
        anchor_links = [a["href"] for a in soup.find_all("a", href=True)]
        button_links = [
            _url_from_onclick(button["onclick"])
            for button in soup.find_all("button")
            if button.get("onclick")
        ]
        # 2. Resolve redirects once per distinct link. Only http(s) links can
        #    be requested; anything else (anchors, mailto:, relative paths)
        #    is kept unchanged.
        candidates = dict.fromkeys(link for link in anchor_links + button_links if link)
        all_links = [
            resolve_redirect(link) if link.lower().startswith(("http://", "https://")) else link
            for link in candidates
        ]

        # 3. Plain-text URLs in the post body (whole page as a fallback).
        content = None
        for selector in _CONTENT_SELECTORS:
            content = soup.select_one(selector)
            if content:
                break
        if not content:
            content = soup
        text = content.get_text()
        raw_urls = _links_matching(_RAW_URL_RE.findall(text), _DOWNLOAD_DOMAINS)

        # 4. Merge both sources without duplicates and split by host.
        found_links = list(dict.fromkeys(all_links + raw_urls))
        google_drive_links = _links_matching(found_links, ("drive.google",))
        mediafire_links = _links_matching(found_links, ("mediafire",))
        other_links = _links_matching(found_links, ("dropbox", "mega.nz"))

        _append_log_entry(
            post_url, title, title_element, all_links, button_links,
            raw_urls, google_drive_links, mediafire_links, other_links, text,
        )

        return {
            "Title": title,
            "Google Drive Links": ", ".join(google_drive_links) if google_drive_links else "No Google Drive Link",
            "Mediafire Links": ", ".join(mediafire_links) if mediafire_links else "No Mediafire Link",
            "Other Links": ", ".join(other_links) if other_links else "No Other Links"
        }

    # Every attempt failed: record the URL so it can be retried later.
    with _file_write_lock:
        with open(FAILED_LINKS_FILE, "a", encoding="utf-8") as f:
            f.write(post_url + "\n")
    return None

# Step 3: Scrape every post and append the results to the workbook in batches.
novels_data = []
BATCH_SIZE = 20
start_time = time.time()


def _append_rows_to_workbook(rows):
    """Append *rows* (a list of result dicts) to the workbook at SAVE_PATH_XLSX.

    Existing rows are read back and the combined table is rewritten. A write
    failure is reported and aborts the run with ``SystemExit(1)``; the
    interactive ``exit()`` helper is not used because it does not reliably
    stop a notebook cell.
    """
    df = pd.DataFrame(rows)
    try:
        if os.path.exists(SAVE_PATH_XLSX):
            existing_df = pd.read_excel(SAVE_PATH_XLSX, engine='openpyxl')
            df = pd.concat([existing_df, df], ignore_index=True)
        df.to_excel(SAVE_PATH_XLSX, index=False, engine='openpyxl')
    except (PermissionError, IOError) as e:
        print(f"‚ùå Cannot write to {SAVE_PATH_XLSX}: {e}")
        raise SystemExit(1) from e


# Futures complete out of order, so the index of the most recently finished
# post is not a safe resume point. Track finished indices and advance
# `resume_index` only past a contiguous run of them: every post below
# `resume_index` is done, and a resumed run can never skip an unfinished post.
# A few posts finished ahead of the watermark may be scraped again.
finished_indices = set()
resume_index = last_index

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    future_to_url = {executor.submit(scrape_post, post_urls[idx]): idx for idx in range(last_index, len(post_urls))}

    try:
        for count, future in enumerate(concurrent.futures.as_completed(future_to_url), start=1):
            idx = future_to_url[future]
            result = future.result()
            if result:
                novels_data.append(result)

            finished_indices.add(idx)
            while resume_index in finished_indices:
                finished_indices.remove(resume_index)
                resume_index += 1

            # Flush to disk once BATCH_SIZE results have accumulated, then
            # record where a resumed run should start.
            if len(novels_data) >= BATCH_SIZE:
                _append_rows_to_workbook(novels_data)
                novels_data = []
                with progress_lock:
                    with open(PROGRESS_FILE, "w") as f:
                        f.write(str(resume_index))

            # Show the estimated remaining time every 50 completed posts.
            if count % 50 == 0:
                remaining_time = estimate_time(start_time, count, len(post_urls) - last_index)
                print(f"‚è≥ Estimated time remaining: {remaining_time}")

            # NOTE(review): this pause runs in the consumer thread, so it only
            # delays result handling; the worker threads are not slowed by it.
            time.sleep(random.uniform(1, 2))
    except BaseException:
        # On abort (save failure, interrupt, worker error) drop the queued
        # posts so the executor only waits for requests already in flight.
        for pending in future_to_url:
            pending.cancel()
        raise

# Step 4: Final save of whatever is left over from the last partial batch.
if novels_data:
    _append_rows_to_workbook(novels_data)

# The run finished, so the resume marker is no longer needed.
if os.path.exists(PROGRESS_FILE):
    try:
        os.unlink(PROGRESS_FILE)
    except OSError:
        # Failing to delete the marker is not fatal; just report it.
        print("‚ö† Could not delete progress file.")

print(f"‚úÖ Scraping complete! Data saved in '{SAVE_PATH_XLSX}'")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚ùå Failed to fetch sitemap: HTTP 429
‚ö† Rate limit hit for https://urdureadings.com/ishq-javdani-novel-by-gul-arbab-pdf/. Waiting...
‚ö† Rate limit hit for https://urdureadings.com/ru-sayah-novel-by-aatir-shaheen-pdf/. Waiting...
‚ö† Rate limit hit for https://urdureadings.com/taash-ghar-novel-by-aymal-raza-pdf/. Waiting...
üìå Found title with selector 'h2.entry-title': 'Mere Jeene Ki Wajah Novel by Tania Hashmi' for URL: https://urdureadings.com/
üìå Final title: 'Mere Jeene Ki Wajah Novel by Tania Hashmi' for URL: https://urdureadings.com/
üìå Found title with selector 'h1.entry-title': 'Mala Novel By Nimra Ahmed' for URL: https://urdureadings.com/mala-novel-by-nimra-ahmed-pdf/
üìå Final title: 'Mala Novel By Nimra Ahmed' for URL: https://urdureadings.com/mala-novel-by-nimra-ahmed-pdf/
üìå Found title with selector 'h1.entry-title': 'Donwload Comple