In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import xml.etree.ElementTree as ET
import os
import concurrent.futures
import random
from datetime import datetime

# üöÄ Google Drive Mount (for saving file)
# Google Drive mount (the scraped workbook and bookkeeping files live on Drive).
from google.colab import drive
drive.mount('/content/drive')

# File paths
SITEMAP_URL = "https://digestlibrary.com/post-sitemap7.xml"
SAVE_PATH_XLSX = "/content/drive/My Drive/Blogger_Novels.xlsx"
FAILED_LINKS_FILE = "/content/drive/My Drive/failed_links.txt"
PROGRESS_FILE = "/content/drive/My Drive/progress.txt"

# One shared session so every request carries the same browser-like headers.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"}
session = requests.Session()
session.headers.update(HEADERS)

# Resume point: index into post_urls stored by a previous, interrupted run.
last_index = 0
if os.path.exists(PROGRESS_FILE):
    try:
        with open(PROGRESS_FILE, "r") as f:
            # A negative index would silently wrap around post_urls, so clamp it.
            last_index = max(0, int(f.read().strip()))
    except (ValueError, OSError):
        # An empty or corrupt progress file must not abort the run.
        print("Could not read progress file. Starting from index 0.")
        last_index = 0

# Step 1: extract all post URLs from the sitemap.
# `exit()` does not stop a notebook cell (the kernel only shuts down after the
# cell finishes), so failures raise SystemExit to halt execution immediately.
try:
    # Without a timeout a stalled connection would hang the cell forever.
    response = session.get(SITEMAP_URL, timeout=30)
except requests.exceptions.RequestException as e:
    print("‚ùå Failed to fetch sitemap.")
    raise SystemExit(1) from e

if response.status_code != 200:
    print("‚ùå Failed to fetch sitemap.")
    raise SystemExit(1)

root = ET.fromstring(response.content)
post_urls = [elem.text for elem in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc")]
print(f"‚úÖ Found {len(post_urls)} posts to scrape.")

# ‚è≥ Function to estimate remaining time
def estimate_time(start_time, processed, total):
    """Return the estimated remaining time as an ``HH:MM:SS`` string.

    Args:
        start_time: ``time.time()`` timestamp taken when processing began.
        processed: Number of items finished so far.
        total: Total number of items in this run.

    Returns:
        The projected remaining duration. Hours are not wrapped at 24
        (30 hours is reported as ``30:00:00``), and the result is
        ``00:00:00`` when nothing has been processed yet or nothing is left.
    """
    elapsed_time = time.time() - start_time
    avg_time_per_post = elapsed_time / processed if processed > 0 else 0
    remaining_posts = total - processed
    # Clamp at zero so an over-count can never yield a negative duration.
    remaining_seconds = max(0, int(remaining_posts * avg_time_per_post))
    # divmod keeps hours unbounded; strftime('%H') would wrap at 24 hours.
    hours, leftover = divmod(remaining_seconds, 3600)
    minutes, seconds = divmod(leftover, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

# üîç Step 2: Scrape Each Novel Post with Auto-Retry
def scrape_post(post_url):
    """Fetch one post and extract its title and download links.

    The request is attempted up to five times, pausing five seconds between
    attempts (but not after the last one). A URL that never succeeds is
    appended to ``FAILED_LINKS_FILE``.

    Args:
        post_url: Absolute URL of the post to scrape.

    Returns:
        A dict with the keys ``"Title"``, ``"Google Drive Links"`` and
        ``"Mediafire Links"`` (links joined with ``", "``), or ``None`` when
        every attempt failed.
    """
    retries = 5
    for attempt in range(retries):
        try:
            response = session.get(post_url, timeout=30)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")

                # First <h1>, falling back to the first <h2>.
                # NOTE(review): on some themes the first <h1> is the site
                # header rather than the post title - verify against the site.
                heading = soup.find("h1") or soup.find("h2")
                title = heading.text.strip() if heading else "No Title Found"

                # dict.fromkeys drops repeated hrefs while keeping page order,
                # so a link that appears twice is reported once.
                all_links = list(dict.fromkeys(a["href"] for a in soup.find_all("a", href=True)))
                google_drive_links = [link for link in all_links if "drive.google" in link]
                mediafire_links = [link for link in all_links if "mediafire" in link]

                return {
                    "Title": title,
                    "Google Drive Links": ", ".join(google_drive_links) if google_drive_links else "No Google Drive Link",
                    "Mediafire Links": ", ".join(mediafire_links) if mediafire_links else "No Mediafire Link"
                }

            print(f"‚ö† Retrying ({attempt+1}/{retries}) for: {post_url}")
        except requests.exceptions.RequestException as e:
            print(f"‚ö† Attempt {attempt+1} failed for {post_url}: {e}")

        # Back off before the next attempt; sleeping after the final attempt
        # would only delay reporting the failure.
        if attempt < retries - 1:
            time.sleep(5)

    # Record the URL so it can be retried in a later run.
    with open(FAILED_LINKS_FILE, "a") as f:
        f.write(post_url + "\n")

    return None

# üìå Step 3: Scrape Each Post & Save Data in Batches
novels_data = []
BATCH_SIZE = 100
start_time = time.time()


def _append_rows_to_excel(rows, path):
    """Append ``rows`` (a list of dicts) to the workbook at ``path``.

    Creates the workbook when it does not exist yet and returns the complete
    DataFrame that was written.
    """
    frame = pd.DataFrame(rows)
    if os.path.exists(path):
        existing_df = pd.read_excel(path, engine='openpyxl')
        frame = pd.concat([existing_df, frame], ignore_index=True)
    frame.to_excel(path, index=False, engine='openpyxl')
    return frame


# Step 3: scrape the posts concurrently and save in batches.
pending_indices = range(last_index, len(post_urls))

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(scrape_post, post_urls[idx]) for idx in pending_indices]

    # Results are consumed in submission order (the workers still run
    # concurrently). With as_completed() the saved index was that of whichever
    # post happened to finish last, so a resume could skip unfinished posts and
    # re-scrape finished ones. In order, "idx + 1" is an exact resume point.
    for count, (idx, future) in enumerate(zip(pending_indices, futures), start=1):
        result = future.result()
        if result:
            novels_data.append(result)

        # Flush a full batch to the workbook and record the resume point.
        if len(novels_data) >= BATCH_SIZE:
            df = _append_rows_to_excel(novels_data, SAVE_PATH_XLSX)
            novels_data = []  # Clear batch
            with open(PROGRESS_FILE, "w") as f:
                f.write(str(idx + 1))

        # Show the estimated remaining time every 50 posts.
        if count % 50 == 0:
            remaining_time = estimate_time(start_time, count, len(post_urls) - last_index)
            print(f"‚è≥ Estimated time remaining: {remaining_time}")

        # NOTE(review): this delay only paces the result loop; the requests
        # themselves are issued by the worker pool and are not throttled by it.
        time.sleep(random.uniform(1, 2))

# Step 4: save whatever is left from the last, partial batch.
if novels_data:
    df = _append_rows_to_excel(novels_data, SAVE_PATH_XLSX)

# The run finished, so the resume marker is no longer needed.
if os.path.exists(PROGRESS_FILE):
    os.remove(PROGRESS_FILE)

print(f"‚úÖ Scraping complete! Data saved in '{SAVE_PATH_XLSX}' (Google Drive)")


I can see the issue you're facing. The Excel output from your script shows "Best Readers Library" in the "Title" column for every entry, which is the name of the blog rather than the title of each novel. This happens because the script takes the first `<h1>` on the page, which is most likely the blog header, instead of the heading of the individual post. Let's analyze the problem and fix it.

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import xml.etree.ElementTree as ET
import os
import concurrent.futures
import random
from datetime import datetime

# üöÄ Google Drive Mount (for saving file)
# Google Drive mount (the scraped workbook and bookkeeping files live on Drive).
from google.colab import drive
drive.mount('/content/drive')

# File paths
SITEMAP_URL = "https://bestreaderslibrary.blogspot.com/sitemap.xml?page=2"
SAVE_PATH_XLSX = "/content/drive/My Drive/Blogger_Novels.xlsx"
FAILED_LINKS_FILE = "/content/drive/My Drive/failed_links.txt"
PROGRESS_FILE = "/content/drive/My Drive/progress.txt"

# One shared session so every request carries the same browser-like headers.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"}
session = requests.Session()
session.headers.update(HEADERS)

# Resume point: index into post_urls stored by a previous, interrupted run.
last_index = 0
if os.path.exists(PROGRESS_FILE):
    try:
        with open(PROGRESS_FILE, "r") as f:
            # A negative index would silently wrap around post_urls, so clamp it.
            last_index = max(0, int(f.read().strip()))
    except (ValueError, OSError):
        # An empty or corrupt progress file must not abort the run.
        print("Could not read progress file. Starting from index 0.")
        last_index = 0

# Step 1: extract all post URLs from the sitemap.
# `exit()` does not stop a notebook cell (the kernel only shuts down after the
# cell finishes), so failures raise SystemExit to halt execution immediately.
try:
    # Without a timeout a stalled connection would hang the cell forever.
    response = session.get(SITEMAP_URL, timeout=30)
except requests.exceptions.RequestException as e:
    print("‚ùå Failed to fetch sitemap.")
    raise SystemExit(1) from e

if response.status_code != 200:
    print("‚ùå Failed to fetch sitemap.")
    raise SystemExit(1)

root = ET.fromstring(response.content)
post_urls = [elem.text for elem in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc")]
print(f"‚úÖ Found {len(post_urls)} posts to scrape.")

# ‚è≥ Function to estimate remaining time
def estimate_time(start_time, processed, total):
    """Return the estimated remaining time as an ``HH:MM:SS`` string.

    Args:
        start_time: ``time.time()`` timestamp taken when processing began.
        processed: Number of items finished so far.
        total: Total number of items in this run.

    Returns:
        The projected remaining duration. Hours are not wrapped at 24
        (30 hours is reported as ``30:00:00``), and the result is
        ``00:00:00`` when nothing has been processed yet or nothing is left.
    """
    elapsed_time = time.time() - start_time
    avg_time_per_post = elapsed_time / processed if processed > 0 else 0
    remaining_posts = total - processed
    # Clamp at zero so an over-count can never yield a negative duration.
    remaining_seconds = max(0, int(remaining_posts * avg_time_per_post))
    # divmod keeps hours unbounded; strftime('%H') would wrap at 24 hours.
    hours, leftover = divmod(remaining_seconds, 3600)
    minutes, seconds = divmod(leftover, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

# üîç Step 2: Scrape Each Novel Post with Auto-Retry
def scrape_post(post_url):
    """Fetch one post and extract its title and download links.

    The request is attempted up to five times, pausing five seconds between
    attempts (but not after the last one). A URL that never succeeds is
    appended to ``FAILED_LINKS_FILE``.

    Args:
        post_url: Absolute URL of the post to scrape.

    Returns:
        A dict with the keys ``"Title"``, ``"Google Drive Links"`` and
        ``"Mediafire Links"`` (links joined with ``", "``), or ``None`` when
        every attempt failed.
    """
    retries = 5
    for attempt in range(retries):
        try:
            response = session.get(post_url, timeout=30)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")

                # Prefer an <h1> carrying a post-title/entry-title class, then
                # fall back to the first <h2> and finally the first <h3>.
                heading = (
                    soup.find("h1", class_=["post-title", "entry-title"])
                    or soup.find("h2")
                    or soup.find("h3")
                )
                title = heading.text.strip() if heading else "No Title Found"
                print(f"üìù Extracted title for {post_url}: {title}")  # Debug output to verify titles

                # dict.fromkeys drops repeated hrefs while keeping page order,
                # so a link that appears twice is reported once.
                all_links = list(dict.fromkeys(a["href"] for a in soup.find_all("a", href=True)))
                google_drive_links = [link for link in all_links if "drive.google" in link]
                mediafire_links = [link for link in all_links if "mediafire" in link]

                return {
                    "Title": title,
                    "Google Drive Links": ", ".join(google_drive_links) if google_drive_links else "No Google Drive Link",
                    "Mediafire Links": ", ".join(mediafire_links) if mediafire_links else "No Mediafire Link"
                }

            print(f"‚ö† Retrying ({attempt+1}/{retries}) for: {post_url}")
        except requests.exceptions.RequestException as e:
            print(f"‚ö† Attempt {attempt+1} failed for {post_url}: {e}")

        # Back off before the next attempt; sleeping after the final attempt
        # would only delay reporting the failure.
        if attempt < retries - 1:
            time.sleep(5)

    # Record the URL so it can be retried in a later run.
    with open(FAILED_LINKS_FILE, "a") as f:
        f.write(post_url + "\n")

    return None

# üìå Step 3: Scrape Each Post & Save Data in Batches
novels_data = []
BATCH_SIZE = 100
start_time = time.time()


def _append_rows_to_excel(rows, path):
    """Append ``rows`` (a list of dicts) to the workbook at ``path``.

    Creates the workbook when it does not exist yet and returns the complete
    DataFrame that was written.
    """
    frame = pd.DataFrame(rows)
    if os.path.exists(path):
        existing_df = pd.read_excel(path, engine='openpyxl')
        frame = pd.concat([existing_df, frame], ignore_index=True)
    frame.to_excel(path, index=False, engine='openpyxl')
    return frame


# Step 3: scrape the posts concurrently and save in batches.
pending_indices = range(last_index, len(post_urls))

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(scrape_post, post_urls[idx]) for idx in pending_indices]

    # Results are consumed in submission order (the workers still run
    # concurrently). With as_completed() the saved index was that of whichever
    # post happened to finish last, so a resume could skip unfinished posts and
    # re-scrape finished ones. In order, "idx + 1" is an exact resume point.
    for count, (idx, future) in enumerate(zip(pending_indices, futures), start=1):
        result = future.result()
        if result:
            novels_data.append(result)

        # Flush a full batch to the workbook and record the resume point.
        if len(novels_data) >= BATCH_SIZE:
            df = _append_rows_to_excel(novels_data, SAVE_PATH_XLSX)
            novels_data = []  # Clear batch
            with open(PROGRESS_FILE, "w") as f:
                f.write(str(idx + 1))

        # Show the estimated remaining time every 50 posts.
        if count % 50 == 0:
            remaining_time = estimate_time(start_time, count, len(post_urls) - last_index)
            print(f"‚è≥ Estimated time remaining: {remaining_time}")

        # NOTE(review): this delay only paces the result loop; the requests
        # themselves are issued by the worker pool and are not throttled by it.
        time.sleep(random.uniform(1, 2))

# Step 4: save whatever is left from the last, partial batch.
if novels_data:
    df = _append_rows_to_excel(novels_data, SAVE_PATH_XLSX)

# The run finished, so the resume marker is no longer needed.
if os.path.exists(PROGRESS_FILE):
    os.remove(PROGRESS_FILE)

print(f"‚úÖ Scraping complete! Data saved in '{SAVE_PATH_XLSX}' (Google Drive)")

Use the version below.

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import xml.etree.ElementTree as ET
import os
import concurrent.futures
import random
from datetime import datetime
from threading import Lock
import re

# üöÄ Google Drive Mount (Optional)
# Output locations: Google Drive when running in Colab, otherwise the current
# working directory.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    SAVE_PATH_XLSX = "/content/drive/My Drive/Blogger_Novels.xlsx"
    FAILED_LINKS_FILE = "/content/drive/My Drive/failed_links.txt"
    PROGRESS_FILE = "/content/drive/My Drive/progress.txt"
    LOG_FILE = "/content/drive/My Drive/scrape_log.txt"
except ImportError:
    SAVE_PATH_XLSX = "Blogger_Novels.xlsx"
    FAILED_LINKS_FILE = "failed_links.txt"
    PROGRESS_FILE = "progress.txt"
    LOG_FILE = "scrape_log.txt"

# One shared session so every request carries the same browser-like headers.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive"
}
session = requests.Session()
session.headers.update(HEADERS)

# Lock guarding writes to the progress file.
progress_lock = Lock()

# Step 1: extract all post URLs from the sitemap.
# `exit()` does not stop a notebook cell (the kernel only shuts down after the
# cell finishes, so scraping would continue with stale state); failures raise
# SystemExit to halt execution immediately.
SITEMAP_URL = "https://digestlibrary.com/post-sitemap6.xml"
try:
    response = session.get(SITEMAP_URL, timeout=30)
    if response.status_code != 200:
        print(f"‚ùå Failed to fetch sitemap: HTTP {response.status_code}")
        raise SystemExit(1)
    root = ET.fromstring(response.content)
    post_urls = [elem.text for elem in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc")]
    print(f"‚úÖ Found {len(post_urls)} posts to scrape.")
except ET.ParseError:
    print("‚ùå Failed to parse sitemap XML.")
    raise SystemExit(1)
except requests.exceptions.RequestException as e:
    print(f"‚ùå Sitemap request failed: {e}")
    raise SystemExit(1)

# Resume point: index into post_urls stored by a previous, interrupted run.
last_index = 0
if os.path.exists(PROGRESS_FILE):
    try:
        with open(PROGRESS_FILE, "r") as f:
            last_index = int(f.read().strip())
    except (ValueError, IOError):
        print("‚ö† Could not read progress file. Starting from index 0.")
# Keep the index inside the sitemap: a negative value would wrap around
# post_urls and a stale value from a longer sitemap would point past its end.
last_index = max(0, min(last_index, len(post_urls)))

# ‚è≥ Function to estimate remaining time
def estimate_time(start_time, processed, total):
    """Return the estimated remaining time as an ``HH:MM:SS`` string.

    Args:
        start_time: ``time.time()`` timestamp taken when processing began.
        processed: Number of items finished so far.
        total: Total number of items in this run.

    Returns:
        The projected remaining duration. Hours are not wrapped at 24
        (30 hours is reported as ``30:00:00``), and the result is
        ``00:00:00`` when nothing has been processed yet or nothing is left.
    """
    elapsed_time = time.time() - start_time
    avg_time_per_post = elapsed_time / processed if processed > 0 else 0
    remaining_posts = total - processed
    # Clamp at zero so an over-count can never yield a negative duration.
    remaining_seconds = max(0, int(remaining_posts * avg_time_per_post))
    # divmod keeps hours unbounded; strftime('%H') would wrap at 24 hours.
    hours, leftover = divmod(remaining_seconds, 3600)
    minutes, seconds = divmod(leftover, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

# üîç Function to resolve redirects
def resolve_redirect(url):
    """Follow HTTP redirects for ``url`` and return the final URL.

    Args:
        url: The link to resolve.

    Returns:
        The URL reported by the final response of a HEAD request, or ``url``
        unchanged when it is not an http(s) URL or the request fails.
    """
    # Fragments, relative paths, mailto:/javascript: links and the like cannot
    # be requested; return them as-is instead of paying for a failing request.
    if not isinstance(url, str) or not url.lstrip().lower().startswith(("http://", "https://")):
        return url
    try:
        response = session.head(url, allow_redirects=True, timeout=10)
        return response.url
    except requests.exceptions.RequestException:
        # Best effort: an unreachable link is reported unresolved.
        return url

# üîç Step 2: Scrape Each Novel Post with Auto-Retry
# Substrings identifying the file hosts whose links are collected.
_DOWNLOAD_DOMAINS = ("drive.google", "mediafire", "dropbox", "mega.nz")
# Hosts reported in the "Other Links" column.
_OTHER_DOMAINS = ("dropbox", "mega.nz")
# Bare URLs appearing in the visible text of a post.
_RAW_URL_PATTERN = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')
# URL assigned to (window.)location(.href) inside a button's onclick handler.
_ONCLICK_URL_PATTERN = re.compile(r"""location(?:\.href)?\s*=\s*['"]([^'"]+)['"]""")
# Serialises log records, which are written from several worker threads.
_log_lock = Lock()


def _extract_title(soup, post_url):
    """Return ``(title, element)`` for the post; element may be ``None``."""
    title_selectors = [
        "h1.entry-title",
        "h2.entry-title",
        "h1.post-title",
        "h2.post-title",
        "h3.entry-title",
        "h3.post-title",
        "div.post-title",
        "div.entry-title",
        "h1",
        "h2",
        "h3"
    ]
    title = None
    title_element = None
    for selector in title_selectors:
        title_elem = soup.select_one(selector)
        if title_elem:
            title = title_elem.text.strip()
            title_element = title_elem
            print(f"üìå Found title with selector '{selector}': '{title}' for URL: {post_url}")
            break

    # The first match is often the site header; look for a real post heading.
    if not title or "digest library" in title.lower():
        for header in soup.find_all(["h1", "h2", "h3"]):
            text = header.text.strip()
            if "digest library" not in text.lower() and len(text) > 10 and not text.lower().startswith("home"):
                title = text
                title_element = header
                print(f"üìå Fallback title found: '{title}' for URL: {post_url}")
                break

    # Last resort: the document <title> with the site name removed.
    if not title or "digest library" in title.lower():
        title_tag = soup.find("title")
        if title_tag:
            title = title_tag.text.strip()
            if "digest library" in title.lower():
                # Case-insensitive, to match the case-insensitive checks here.
                title = re.sub(r"digest library", "", title, flags=re.IGNORECASE)
                title = title.replace("|", "").strip(" \t\r\n-")
            print(f"üìå Title from <title> tag: '{title}' for URL: {post_url}")

    title = title if title and "digest library" not in title.lower() else "No Title Found"
    print(f"üìå Final title: '{title}' for URL: {post_url}")
    return title, title_element


def _onclick_urls(soup):
    """Return the URLs that <button> onclick handlers navigate to."""
    urls = []
    for button in soup.find_all("button"):
        match = _ONCLICK_URL_PATTERN.search(button.get("onclick") or "")
        if match:
            urls.append(match.group(1))
    return urls


def _resolve_links(links):
    """Resolve redirects for ``links``, de-duplicated in page order."""
    resolved = []
    for link in dict.fromkeys(link for link in links if link):
        # A link already on a download host needs no lookup, and resolving it
        # could replace it with a login or interstitial URL.
        if any(domain in link.lower() for domain in _DOWNLOAD_DOMAINS):
            resolved.append(link)
        else:
            resolved.append(resolve_redirect(link))
    return list(dict.fromkeys(resolved))


def _raw_download_urls(soup):
    """Return ``(urls, text)``: download URLs written as plain text, and that text."""
    content_selectors = ["div.post-body", "div.entry-content", "div.post-content", "article", "div.post"]
    content = None
    for selector in content_selectors:
        content = soup.select_one(selector)
        if content:
            break
    if not content:
        content = soup  # Fallback to entire page

    text = content.get_text()
    # Trailing sentence punctuation is not part of the URL.
    candidates = (url.rstrip(".,;:!?)") for url in _RAW_URL_PATTERN.findall(text))
    urls = [url for url in candidates if any(domain in url.lower() for domain in _DOWNLOAD_DOMAINS)]
    return urls, text


def _write_debug_log(post_url, title, title_element, all_links, button_links,
                     raw_urls, google_drive_links, mediafire_links, other_links, text):
    """Append one multi-line debug record for the post to ``LOG_FILE``."""
    lines = [
        f"{datetime.now()}: URL: {post_url}, Title: {title}\n",
        f"  All <a> hrefs: {all_links}\n",
        f"  Button links: {button_links}\n",
        f"  Raw URLs from content: {raw_urls}\n",
        f"  Google Drive Links: {google_drive_links}\n",
        f"  Mediafire Links: {mediafire_links}\n",
        f"  Other Links: {other_links}\n",
    ]
    if title_element is not None:
        parent = title_element.find_parent()
        context = str(parent)[:200] if parent else "No parent"
        lines.append(f"  Title Context: {context}\n")
    content_snippet = text[:200].replace('\n', ' ')
    lines.append(f"  Content Snippet: {content_snippet}\n")

    # One locked write per record keeps records from different worker threads
    # from interleaving; UTF-8 so non-ASCII titles cannot fail to encode.
    with _log_lock:
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write("".join(lines))


def scrape_post(post_url):
    """Fetch one post and extract its title and download links.

    The request is attempted up to five times. HTTP 429 backs off
    exponentially (10s, 20s, ...); other failures wait five seconds. No wait
    happens after the final attempt. A URL that never succeeds is appended to
    ``FAILED_LINKS_FILE``.

    Args:
        post_url: Absolute URL of the post to scrape.

    Returns:
        A dict with the keys ``"Title"``, ``"Google Drive Links"``,
        ``"Mediafire Links"`` and ``"Other Links"`` (links joined with
        ``", "``), or ``None`` when every attempt failed.
    """
    retries = 5
    for attempt in range(retries):
        is_last_attempt = attempt == retries - 1
        try:
            response = session.get(post_url, timeout=60)
            if response.status_code == 429:
                print(f"‚ö† Rate limit hit for {post_url}. Waiting...")
                if not is_last_attempt:
                    time.sleep(2 ** attempt * 10)
                continue
            if response.status_code != 200:
                print(f"‚ö† Retrying ({attempt+1}/{retries}) for {post_url}: HTTP {response.status_code}")
                if not is_last_attempt:
                    time.sleep(5)
                continue

            soup = BeautifulSoup(response.text, "html.parser")
            title, title_element = _extract_title(soup, post_url)

            # Links from anchors and button handlers, redirects resolved.
            anchor_links = [a["href"] for a in soup.find_all("a", href=True)]
            button_links = _onclick_urls(soup)
            all_links = _resolve_links(anchor_links + button_links)

            # Links that only appear as plain text in the post body.
            raw_urls, text = _raw_download_urls(soup)

            # Classify case-insensitively; raw URLs are added only when new.
            candidates = list(dict.fromkeys(all_links + raw_urls))
            google_drive_links = [link for link in candidates if "drive.google" in link.lower()]
            mediafire_links = [link for link in candidates if "mediafire" in link.lower()]
            other_links = [link for link in candidates
                           if any(domain in link.lower() for domain in _OTHER_DOMAINS)]

            _write_debug_log(post_url, title, title_element, all_links, button_links,
                             raw_urls, google_drive_links, mediafire_links, other_links, text)

            return {
                "Title": title,
                "Google Drive Links": ", ".join(google_drive_links) if google_drive_links else "No Google Drive Link",
                "Mediafire Links": ", ".join(mediafire_links) if mediafire_links else "No Mediafire Link",
                "Other Links": ", ".join(other_links) if other_links else "No Other Links"
            }
        except requests.exceptions.RequestException as e:
            print(f"‚ö† Attempt {attempt+1} failed for {post_url}: {e}")
            if not is_last_attempt:
                time.sleep(5)

    # Record the URL so it can be retried in a later run.
    with open(FAILED_LINKS_FILE, "a") as f:
        f.write(post_url + "\n")
    return None

# üìå Step 3: Scrape Each Post & Save Data in Batches
novels_data = []
BATCH_SIZE = 20
start_time = time.time()


def _append_rows_to_excel(rows, path):
    """Append ``rows`` (a list of dicts) to the workbook at ``path``.

    Creates the workbook when it does not exist yet and returns the complete
    DataFrame that was written.
    """
    frame = pd.DataFrame(rows)
    if os.path.exists(path):
        existing_df = pd.read_excel(path, engine='openpyxl')
        frame = pd.concat([existing_df, frame], ignore_index=True)
    frame.to_excel(path, index=False, engine='openpyxl')
    return frame


# Step 3: scrape the posts concurrently and save in batches.
pending_indices = range(last_index, len(post_urls))

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(scrape_post, post_urls[idx]) for idx in pending_indices]

    # Results are consumed in submission order (the workers still run
    # concurrently). With as_completed() the saved index was that of whichever
    # post happened to finish last, so a resume could skip unfinished posts and
    # re-scrape finished ones. In order, "idx + 1" is an exact resume point.
    for count, (idx, future) in enumerate(zip(pending_indices, futures), start=1):
        result = future.result()
        if result:
            novels_data.append(result)

        # Flush every BATCH_SIZE rows and record the resume point.
        if len(novels_data) >= BATCH_SIZE:
            try:
                df = _append_rows_to_excel(novels_data, SAVE_PATH_XLSX)
                novels_data = []
                with progress_lock:
                    with open(PROGRESS_FILE, "w") as f:
                        f.write(str(idx + 1))
            except (PermissionError, IOError) as e:
                print(f"‚ùå Cannot write to {SAVE_PATH_XLSX}: {e}")
                # Leaving the executor context waits for queued work, so drop
                # everything that has not started before stopping. `exit()`
                # would not halt a notebook cell; SystemExit does.
                for pending in futures:
                    pending.cancel()
                raise SystemExit(1)

        # Show the estimated remaining time every 50 posts.
        if count % 50 == 0:
            remaining_time = estimate_time(start_time, count, len(post_urls) - last_index)
            print(f"‚è≥ Estimated time remaining: {remaining_time}")

        # NOTE(review): this delay only paces the result loop; the requests
        # themselves are issued by the worker pool and are not throttled by it.
        time.sleep(random.uniform(1, 2))

# Step 4: save whatever is left from the last, partial batch.
if novels_data:
    try:
        df = _append_rows_to_excel(novels_data, SAVE_PATH_XLSX)
    except (PermissionError, IOError) as e:
        print(f"‚ùå Cannot write to {SAVE_PATH_XLSX}: {e}")
        raise SystemExit(1)

# The run finished, so the resume marker is no longer needed.
if os.path.exists(PROGRESS_FILE):
    try:
        os.remove(PROGRESS_FILE)
    except OSError:
        print("‚ö† Could not delete progress file.")

print(f"‚úÖ Scraping complete! Data saved in '{SAVE_PATH_XLSX}'")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚ùå Sitemap request failed: HTTPSConnectionPool(host='digestlibrary.com', port=443): Read timed out. (read timeout=30)
üìå Found title with selector 'h1': 'Digest Library' for URL: https://digestlibrary.com/2024/11/11/tu-hi-ishq-tu-hi-janoon-complete-by-saheba-firdous/
üìå Fallback title found: 'Tu hi ishq tu hi janoon complete by Saheba Firdous' for URL: https://digestlibrary.com/2024/11/11/tu-hi-ishq-tu-hi-janoon-complete-by-saheba-firdous/
üìå Final title: 'Tu hi ishq tu hi janoon complete by Saheba Firdous' for URL: https://digestlibrary.com/2024/11/11/tu-hi-ishq-tu-hi-janoon-complete-by-saheba-firdous/
üìå Found title with selector 'h1': 'Digest Library' for URL: https://digestlibrary.com/2023/03/27/hasil-e-tamanna-tum-hi-ho-by-shazia-mustafa/
üìå Fallback title found: 'Hasil e tamanna tum hi ho by Shazia Mustafa' for URL: https://digestlibrary.com/

# New Section

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import xml.etree.ElementTree as ET
import os
import concurrent.futures
import random
from datetime import datetime
from threading import Lock
import re
from functools import lru_cache  # For caching redirects
import logging  # For efficient logging

# üöÄ Google Drive Mount (Optional)
# Output locations: Google Drive when running in Colab, otherwise the current
# working directory.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    SAVE_PATH_XLSX = "/content/drive/My Drive/Blogger_Novels.xlsx"
    FAILED_LINKS_FILE = "/content/drive/My Drive/failed_links.txt"
    PROGRESS_FILE = "/content/drive/My Drive/progress.txt"
    LOG_FILE = "/content/drive/My Drive/scrape_log.txt"
except ImportError:
    SAVE_PATH_XLSX = "Blogger_Novels.xlsx"
    FAILED_LINKS_FILE = "failed_links.txt"
    PROGRESS_FILE = "progress.txt"
    LOG_FILE = "scrape_log.txt"

# Logging setup.
# logging.basicConfig() does nothing when the root logger already has
# handlers, which is common inside notebook kernels, so the file handler is
# attached explicitly. The check keeps a re-run of this cell from adding a
# second handler (and duplicating every log line).
logger = logging.getLogger()
logger.setLevel(logging.INFO)
_log_path = os.path.abspath(LOG_FILE)
if not any(
    isinstance(handler, logging.FileHandler) and handler.baseFilename == _log_path
    for handler in logger.handlers
):
    _file_handler = logging.FileHandler(LOG_FILE, mode="a")
    _file_handler.setFormatter(logging.Formatter("%(asctime)s: %(message)s"))
    logger.addHandler(_file_handler)

# One shared session so every request carries the same browser-like headers.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive"
}
session = requests.Session()
session.headers.update(HEADERS)

# Lock guarding writes to the progress file.
progress_lock = Lock()

# Step 1: extract all post URLs from the sitemap.
# `exit()` does not stop a notebook cell (the kernel only shuts down after the
# cell finishes, so scraping would continue with stale state); failures raise
# SystemExit to halt execution immediately.
SITEMAP_URL = "https://digestlibrary.com/post-sitemap7.xml"
try:
    response = session.get(SITEMAP_URL, timeout=30)
    if response.status_code != 200:
        print(f"‚ùå Failed to fetch sitemap: HTTP {response.status_code}")
        raise SystemExit(1)
    root = ET.fromstring(response.content)
    post_urls = [elem.text for elem in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc")]
    print(f"‚úÖ Found {len(post_urls)} posts to scrape.")
except ET.ParseError:
    print("‚ùå Failed to parse sitemap XML.")
    raise SystemExit(1)
except requests.exceptions.RequestException as e:
    print(f"‚ùå Sitemap request failed: {e}")
    raise SystemExit(1)

# Resume point: index into post_urls stored by a previous, interrupted run.
last_index = 0
if os.path.exists(PROGRESS_FILE):
    try:
        with open(PROGRESS_FILE, "r") as f:
            last_index = int(f.read().strip())
    except (ValueError, IOError):
        print("‚ö† Could not read progress file. Starting from index 0.")
# Keep the index inside the sitemap: a negative value would wrap around
# post_urls and a stale value from a longer sitemap would point past its end.
last_index = max(0, min(last_index, len(post_urls)))

# ‚è≥ Function to estimate remaining time
def estimate_time(start_time, processed, total):
    """Return the estimated remaining time as an ``HH:MM:SS`` string.

    Args:
        start_time: ``time.time()`` timestamp taken when processing began.
        processed: Number of items finished so far.
        total: Total number of items in this run.

    Returns:
        The projected remaining duration. Hours are not wrapped at 24
        (30 hours is reported as ``30:00:00``), and the result is
        ``00:00:00`` when nothing has been processed yet or nothing is left.
    """
    elapsed_time = time.time() - start_time
    avg_time_per_post = elapsed_time / processed if processed > 0 else 0
    remaining_posts = total - processed
    # Clamp at zero so an over-count can never yield a negative duration.
    remaining_seconds = max(0, int(remaining_posts * avg_time_per_post))
    # divmod keeps hours unbounded; strftime('%H') would wrap at 24 hours.
    hours, leftover = divmod(remaining_seconds, 3600)
    minutes, seconds = divmod(leftover, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

# üîç Cache redirect resolutions
@lru_cache(maxsize=1000)
def resolve_redirect(url):
    """Follow HTTP redirects for ``url`` and return the final URL.

    Results are memoised (up to 1000 distinct URLs), so a link repeated across
    posts is only requested once. Note that a failed lookup is memoised too.

    Args:
        url: The link to resolve.

    Returns:
        The URL reported by the final response of a HEAD request, or ``url``
        unchanged when it is not an http(s) URL or the request fails.
    """
    # Fragments, relative paths, mailto:/javascript: links and the like cannot
    # be requested; return them as-is instead of paying for a failing request.
    if not isinstance(url, str) or not url.lstrip().lower().startswith(("http://", "https://")):
        return url
    try:
        response = session.head(url, allow_redirects=True, timeout=5)  # Faster timeout
        return response.url
    except requests.exceptions.RequestException:
        # Best effort: an unreachable link is reported unresolved.
        return url

# üîç Step 2: Scrape Each Novel Post with Auto-Retry
def scrape_post(post_url):
    """Scrape one post page and return its title and download links.

    The page is requested up to three times. HTTP 429 triggers an
    exponential back-off, 404/410 are treated as permanent and skipped,
    and any other non-200 status or ``requests`` exception is retried
    after a short pause.

    Args:
        post_url: URL of the post to scrape.

    Returns:
        A dict with the keys ``"Title"``, ``"Google Drive Links"``,
        ``"Mediafire Links"`` and ``"Other Links"``; each link field is a
        comma-separated string or a "No ..." placeholder. Returns ``None``
        when the post is skipped (404/410) or every attempt failed; in the
        latter case the URL is appended to ``FAILED_LINKS_FILE``.
    """
    retries = 3  # Reduced retries for speed
    for attempt in range(retries):
        try:
            response = session.get(post_url, timeout=30)
            if response.status_code == 429:
                print(f"‚ö† Rate limit hit for {post_url}. Waiting...")
                time.sleep(2 ** attempt * 5)  # exponential back-off: 5s, 10s, 20s
                continue
            if response.status_code in (404, 410):  # non-recoverable, do not retry
                logger.info(f"Skipped {post_url}: HTTP {response.status_code}")
                return None
            if response.status_code != 200:
                print(f"‚ö† Retrying ({attempt+1}/{retries}) for {post_url}: HTTP {response.status_code}")
                time.sleep(3)
                continue

            # Use lxml parser for speed
            soup = BeautifulSoup(response.text, "lxml")

            # --- Title: try specific selectors first, generic headings last ---
            title_selectors = [
                "h1.entry-title",
                "h2.entry-title",
                "h1.post-title",
                "h2.post-title",
                "h3.entry-title",
                "h3.post-title",
                "div.post-title",
                "div.entry-title",
                "h1",
                "h2",
                "h3"
            ]
            title = None
            for selector in title_selectors:
                title_elem = soup.select_one(selector)
                if title_elem:
                    title = title_elem.text.strip()
                    print(f"üìå Found title with selector '{selector}': '{title}' for URL: {post_url}")
                    break

            # Fallback 1: the first match was missing or just the site name,
            # so look at the first few headings for something post-specific.
            if not title or "digest library" in title.lower():
                headers = soup.find_all(["h1", "h2", "h3"], limit=5)  # Limit search
                for header in headers:
                    text = header.text.strip()
                    if "digest library" not in text.lower() and len(text) > 10 and not text.lower().startswith("home"):
                        title = text
                        print(f"üìå Fallback title found: '{title}' for URL: {post_url}")
                        break

            # Fallback 2: use the <title> tag with the site name removed.
            if not title or "digest library" in title.lower():
                title_tag = soup.find("title")
                if title_tag:
                    title = title_tag.text.strip()
                    if "digest library" in title.lower():
                        # Case-insensitive removal so it matches the check above.
                        title = re.sub(r"digest library", "", title, flags=re.IGNORECASE).replace("|", "").strip()
                    print(f"üìå Title from <title> tag: '{title}' for URL: {post_url}")

            # Keep the title only when it is non-empty and is NOT the site name.
            # (The previous check was inverted and discarded every real title.)
            title = title if title and "digest library" not in title.lower() else "No Title Found"
            print(f"üìå Final title: '{title}' for URL: {post_url}")

            # --- Download links: restrict the search to the post content ---
            content = soup.select_one("div.post-body, div.entry-content, div.post-content, article, div.post")
            if not content:
                content = soup

            # Get <a> tags (limit to 50 to avoid navigation links)
            all_links = [a["href"] for a in content.find_all("a", href=True, limit=50)]

            # Get <button> targets. Remove the JS prefix first and only then
            # strip quotes/semicolons; stripping quotes first leaves a stray
            # leading quote on the URL.
            button_links = []
            for button in content.find_all("button"):
                onclick = button.get("onclick")
                if not onclick:
                    continue
                target = onclick.replace("window.location.href=", "").strip().rstrip(";").strip("'\" ")
                button_links.append(target)

            # Resolve redirects
            all_links = [resolve_redirect(link) for link in all_links + button_links if link]

            # Filter download links
            download_domains = ["drive.google", "mediafire", "dropbox", "mega.nz"]
            google_drive_links = [link for link in all_links if "drive.google" in link]
            mediafire_links = [link for link in all_links if "mediafire" in link]
            other_links = [link for link in all_links if any(domain in link for domain in ["dropbox", "mega.nz"])]

            # Search the visible text for bare URLs only if no links were found
            if not (google_drive_links or mediafire_links or other_links):
                text = content.get_text(separator=" ", strip=True)
                url_pattern = r'https?://[^\s<>"]{10,500}'
                raw_urls = re.findall(url_pattern, text, re.IGNORECASE)
                raw_urls = [url for url in raw_urls if any(domain in url.lower() for domain in download_domains)]
                google_drive_links.extend([url for url in raw_urls if "drive.google" in url and url not in google_drive_links])
                mediafire_links.extend([url for url in raw_urls if "mediafire" in url and url not in mediafire_links])
                other_links.extend([url for url in raw_urls if any(domain in url for domain in ["dropbox", "mega.nz"]) and url not in other_links])

            # One log record per post
            log_entry = (
                f"URL: {post_url}, Title: {title}\n"
                f"  Google Drive Links: {google_drive_links}\n"
                f"  Mediafire Links: {mediafire_links}\n"
                f"  Other Links: {other_links}\n"
            )
            logger.info(log_entry)

            return {
                "Title": title,
                "Google Drive Links": ", ".join(google_drive_links) if google_drive_links else "No Google Drive Link",
                "Mediafire Links": ", ".join(mediafire_links) if mediafire_links else "No Mediafire Link",
                "Other Links": ", ".join(other_links) if other_links else "No Other Links"
            }
        except requests.exceptions.RequestException as e:
            print(f"‚ö† Attempt {attempt+1} failed for {post_url}: {e}")
            time.sleep(3)

    # All attempts exhausted: record the URL so it can be retried later.
    with open(FAILED_LINKS_FILE, "a") as f:
        f.write(post_url + "\n")
    return None

# üìå Step 3: Scrape Each Post & Save Data in Batches
# Step 3: Scrape each post concurrently and save the results in batches.
novels_data = []
BATCH_SIZE = 500  # Increased for fewer writes
start_time = time.time()
total_to_process = len(post_urls) - last_index


def _append_rows_to_excel(rows):
    """Append ``rows`` (a list of result dicts) to SAVE_PATH_XLSX.

    Existing rows in the workbook are read back and kept in front of the
    new ones. Returns the combined DataFrame that was written.
    """
    combined = pd.DataFrame(rows)
    if os.path.exists(SAVE_PATH_XLSX):
        existing_df = pd.read_excel(SAVE_PATH_XLSX, engine='openpyxl')
        combined = pd.concat([existing_df, combined], ignore_index=True)
    combined.to_excel(SAVE_PATH_XLSX, index=False, engine='openpyxl')
    return combined


# Resume bookkeeping. as_completed() yields futures in completion order, not
# submission order, so the index of the latest finished future is not a safe
# resume point. next_unfinished is the lowest index that has not completed
# yet; everything below it is done. Posts above it that already finished are
# scraped again after a resume (possible duplicate rows, but no skipped posts).
finished_indices = set()
next_unfinished = last_index

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:  # Increased workers
    future_to_url = {
        executor.submit(scrape_post, post_url): idx
        for idx, post_url in enumerate(post_urls[last_index:], last_index)
    }

    for count, future in enumerate(concurrent.futures.as_completed(future_to_url), start=1):
        idx = future_to_url[future]
        result = future.result()
        if result:
            novels_data.append(result)

        # Advance the resume point across the contiguous run of finished posts.
        finished_indices.add(idx)
        while next_unfinished in finished_indices:
            finished_indices.remove(next_unfinished)
            next_unfinished += 1

        # Save every batch
        if len(novels_data) >= BATCH_SIZE:
            try:
                df = _append_rows_to_excel(novels_data)
                novels_data = []
                with progress_lock:
                    with open(PROGRESS_FILE, "w") as f:
                        f.write(str(next_unfinished))
            except (PermissionError, IOError) as e:
                print(f"‚ùå Cannot write to {SAVE_PATH_XLSX}: {e}")
                # Drop work that has not started so the pool can shut down,
                # then stop the cell. exit() would not stop a notebook cell.
                for pending in future_to_url:
                    pending.cancel()
                raise SystemExit(1)

        # Show progress
        if count % 50 == 0:
            remaining_time = estimate_time(start_time, count, total_to_process)
            print(f"‚è≥ Processed {count}/{total_to_process} posts, Estimated time remaining: {remaining_time}")

# Step 4: Final save of whatever is left from the last partial batch.
if novels_data:
    try:
        df = _append_rows_to_excel(novels_data)
    except (PermissionError, IOError) as e:
        print(f"‚ùå Cannot write to {SAVE_PATH_XLSX}: {e}")
        raise SystemExit(1)

# Every post was processed and saved, so the resume marker is obsolete.
if os.path.exists(PROGRESS_FILE):
    try:
        os.remove(PROGRESS_FILE)
    except OSError:
        print("‚ö† Could not delete progress file.")

print(f"‚úÖ Scraping complete! Data saved in '{SAVE_PATH_XLSX}'")

üìå Found title with selector 'h1': 'Digest Library' for URL: https://digestlibrary.com/2024/10/12/jasoosi-digest-october-2024-complete-pdf/
üìå Fallback title found: 'Jasoosi digest October 2024 complete pdf' for URL: https://digestlibrary.com/2024/10/12/jasoosi-digest-october-2024-complete-pdf/
üìå Final title: 'No Title Found' for URL: https://digestlibrary.com/2024/10/12/jasoosi-digest-october-2024-complete-pdf/
‚ö† Attempt 1 failed for https://digestlibrary.com/2025/03/03/shafa-epi_2-nd-last-by-eesha-hussain/: HTTPSConnectionPool(host='digestlibrary.com', port=443): Max retries exceeded with url: /2025/03/03/shafa-epi_2-nd-last-by-eesha-hussain/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7dd54c2ff3d0>, 'Connection to digestlibrary.com timed out. (connect timeout=30)'))
‚ö† Attempt 1 failed for https://digestlibrary.com/2025/03/04/shafa-epi-_1-by-eesha-hussain/: HTTPSConnectionPool(host='digestlibrary.com', port=443): Max retries exceeded with



üìå Found title with selector 'h1': 'Digest Library' for URL: https://digestlibrary.com/2022/04/06/shua-digest-january-2000-complete-pdf/
üìå Fallback title found: 'Shua digest January 2000 complete pdf' for URL: https://digestlibrary.com/2022/04/06/shua-digest-january-2000-complete-pdf/
üìå Final title: 'No Title Found' for URL: https://digestlibrary.com/2022/04/06/shua-digest-january-2000-complete-pdf/
‚ö† Attempt 1 failed for https://digestlibrary.com/2025/03/06/mohabbat-khuwab-rang-epi_1-by-farhat-ansari/: HTTPSConnectionPool(host='digestlibrary.com', port=443): Max retries exceeded with url: /2025/03/06/mohabbat-khuwab-rang-epi_1-by-farhat-ansari/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7dd544a298d0>, 'Connection to digestlibrary.com timed out. (connect timeout=30)'))
‚ö† Attempt 1 failed for https://digestlibrary.com/2025/03/01/hum-tum-aur-chand-by-humaira-shafi/: HTTPSConnectionPool(host='digestlibrary.com', port=443): Max retries exceed



‚ö† Attempt 1 failed for https://digestlibrary.com/2023/01/17/jeena-esi-ka-naam-hai-by-nuzhat-jabeen-zia/: HTTPSConnectionPool(host='digestlibrary.com', port=443): Max retries exceeded with url: /2023/01/17/jeena-esi-ka-naam-hai-by-nuzhat-jabeen-zia/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7dd544a20ad0>, 'Connection to digestlibrary.com timed out. (connect timeout=30)'))




‚ö† Attempt 3 failed for https://digestlibrary.com/2025/03/04/shanakhat-by-mahwish-talib/: HTTPSConnectionPool(host='digestlibrary.com', port=443): Max retries exceeded with url: /2025/03/04/shanakhat-by-mahwish-talib/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7dd5449a5c90>, 'Connection to digestlibrary.com timed out. (connect timeout=30)'))
‚ö† Attempt 1 failed for https://digestlibrary.com/2023/02/02/kundan-by-nadia-fatima-rizvi/: HTTPSConnectionPool(host='digestlibrary.com', port=443): Max retries exceeded with url: /2023/02/02/kundan-by-nadia-fatima-rizvi/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7dd544109690>, 'Connection to digestlibrary.com timed out. (connect timeout=30)'))
üìå Found title with selector 'h1': 'Digest Library' for URL: https://digestlibrary.com/2025/03/07/shua-digest-september-2021-complete-pdf/
üìå Fallback title found: 'Shua digest September 2021 complete pdf' for URL: https://digestl



üìå Found title with selector 'h1': 'Digest Library' for URL: https://digestlibrary.com/2021/02/17/ishq-safar-ki-dhool-by-arshia-rajpoot/
üìå Fallback title found: 'Ishq safar ki dhool by Arshia Rajpoot' for URL: https://digestlibrary.com/2021/02/17/ishq-safar-ki-dhool-by-arshia-rajpoot/
üìå Final title: 'No Title Found' for URL: https://digestlibrary.com/2021/02/17/ishq-safar-ki-dhool-by-arshia-rajpoot/
üìå Found title with selector 'h1': 'Digest Library' for URL: https://digestlibrary.com/2022/05/29/shua-digest-february-2000-complete-pdf/
üìå Fallback title found: 'Shua digest February 2000 complete pdf' for URL: https://digestlibrary.com/2022/05/29/shua-digest-february-2000-complete-pdf/
üìå Final title: 'No Title Found' for URL: https://digestlibrary.com/2022/05/29/shua-digest-february-2000-complete-pdf/
üìå Found title with selector 'h1': 'Digest Library' for URL: https://digestlibrary.com/2025/03/07/sahil-epi_6_7-by-asma-ashraf/
üìå Fallback title found: 'Sahil epi_6_7 by



‚ö† Attempt 1 failed for https://digestlibrary.com/2025/03/10/shua-digest-october-2021-complete-pdf/: HTTPSConnectionPool(host='digestlibrary.com', port=443): Max retries exceeded with url: /2025/03/10/shua-digest-october-2021-complete-pdf/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7dd53ef86b50>, 'Connection to digestlibrary.com timed out. (connect timeout=30)'))
üìå Found title with selector 'h1': 'Digest Library' for URL: https://digestlibrary.com/2025/03/09/pardes-epi_9-by-sumaira-hameed/
üìå Fallback title found: 'Pardes epi_9 by Sumaira Hameed' for URL: https://digestlibrary.com/2025/03/09/pardes-epi_9-by-sumaira-hameed/
üìå Final title: 'No Title Found' for URL: https://digestlibrary.com/2025/03/09/pardes-epi_9-by-sumaira-hameed/
üìå Found title with selector 'h1': 'Digest Library' for URL: https://digestlibrary.com/2023/01/25/urti-tittlion-ke-sang-by-fatima-ambreen/
üìå Fallback title found: 'Urti tittlion ke sang by Fatima Ambreen' for 

KeyboardInterrupt: 

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import xml.etree.ElementTree as ET
import os
import concurrent.futures
import random
from datetime import datetime
from threading import Lock
import re
import logging
from urllib.parse import urlparse

# üöÄ Setup Logging
# Log to a file in the working directory.
logging.basicConfig(
    filename="scrape_log.txt",
    level=logging.INFO,
    format="%(asctime)s: %(levelname)s: %(message)s"
)

# Google Drive mount (optional): fall back to local files outside Colab.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    SAVE_PATH_XLSX = "/content/drive/My Drive/Blogger_Novels.xlsx"
    FAILED_LINKS_FILE = "/content/drive/My Drive/failed_links.txt"
    PROGRESS_FILE = "/content/drive/My Drive/progress.txt"
except ImportError:
    SAVE_PATH_XLSX = "Blogger_Novels.xlsx"
    FAILED_LINKS_FILE = "failed_links.txt"
    PROGRESS_FILE = "progress.txt"

# Headers & shared HTTP session
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive"
}
session = requests.Session()
session.headers.update(HEADERS)

# Lock guarding writes to the progress file.
progress_lock = Lock()

# Step 1: Extract all post URLs from the sitemap.
SITEMAP_URL = "https://digestlibrary.com/post-sitemap7.xml"
try:
    response = session.get(SITEMAP_URL, timeout=30)
    if response.status_code == 200:
        root = ET.fromstring(response.content)
        # Skip empty <loc> elements and trim surrounding whitespace so that
        # every entry is a usable URL string.
        post_urls = [
            elem.text.strip()
            for elem in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc")
            if elem.text and elem.text.strip()
        ]
        logging.info(f"Found {len(post_urls)} posts to scrape.")
    else:
        logging.error(f"Failed to fetch sitemap: HTTP {response.status_code}")
        # Raise instead of calling exit(): inside a notebook kernel exit()
        # does not stop the running cell, so later code would still execute
        # with post_urls undefined.
        raise SystemExit(1)
except ET.ParseError:
    logging.error("Failed to parse sitemap XML.")
    raise SystemExit(1)
except requests.exceptions.RequestException as e:
    logging.error(f"Sitemap request failed: {e}")
    raise SystemExit(1)

# Resume from the index stored by a previous run, if any.
last_index = 0
if os.path.exists(PROGRESS_FILE):
    try:
        with open(PROGRESS_FILE, "r") as f:
            # Clamp so a stale or hand-edited file cannot yield an index
            # outside post_urls.
            last_index = max(0, min(int(f.read().strip()), len(post_urls)))
        logging.info(f"Resuming from index {last_index}")
    except (ValueError, IOError) as e:
        logging.warning(f"Could not read progress file: {e}. Starting from index 0.")

# ‚è≥ Function to estimate remaining time
def estimate_time(start_time, processed, total):
    """Estimate the remaining processing time as an ``HH:MM:SS`` string.

    The estimate is the average time per processed item multiplied by the
    number of items still to go.

    Args:
        start_time: ``time.time()`` timestamp taken when processing started.
        processed: Number of items completed so far.
        total: Total number of items to process.

    Returns:
        The remaining time formatted as ``HH:MM:SS``. The hour field keeps
        counting past 23 instead of wrapping around, and the result is
        ``"00:00:00"`` when nothing has been processed yet or nothing remains.
    """
    elapsed_time = time.time() - start_time
    avg_time_per_post = elapsed_time / processed if processed > 0 else 0
    # Never report a negative remainder if processed overshoots total.
    remaining_posts = max(total - processed, 0)
    remaining_seconds = max(int(remaining_posts * avg_time_per_post), 0)
    # Format manually: time.gmtime()/%H would wrap estimates of 24h or more.
    hours, leftover = divmod(remaining_seconds, 3600)
    minutes, seconds = divmod(leftover, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

# üîç Function to resolve redirects
def resolve_redirect(url):
    """Return the final URL reached after following redirects from ``url``.

    A HEAD request with a 10 second timeout is sent through the shared
    session. If the request raises a ``requests`` exception, the original
    ``url`` is returned unchanged.
    """
    try:
        head_response = session.head(url, allow_redirects=True, timeout=10)
    except requests.exceptions.RequestException:
        # Best effort: keep the unresolved link when the lookup fails.
        return url
    else:
        return head_response.url

# üîç Step 2: Scrape Each Novel Post with Auto-Retry
def scrape_post(post_url):
    """Scrape one post page and return its title and download links.

    The page is requested up to five times. HTTP 429 triggers an
    exponential back-off; any other non-200 status or ``requests``
    exception is retried after a five second pause.

    Args:
        post_url: URL of the post to scrape.

    Returns:
        A dict with the keys ``"Title"``, ``"Google Drive Links"``,
        ``"Mediafire Links"`` and ``"Other Links"``; each link field is a
        comma-separated string or a "No ..." placeholder. Returns ``None``
        when every attempt failed, after appending the URL to
        ``FAILED_LINKS_FILE``.
    """
    retries = 5
    for attempt in range(retries):
        try:
            response = session.get(post_url, timeout=60)
            if response.status_code == 429:
                logging.warning(f"Rate limit hit for {post_url}. Waiting...")
                time.sleep(2 ** attempt * 10)  # exponential back-off: 10s, 20s, 40s, ...
                continue
            if response.status_code != 200:
                logging.warning(f"Retrying ({attempt+1}/{retries}) for {post_url}: HTTP {response.status_code}")
                time.sleep(5)
                continue

            soup = BeautifulSoup(response.text, "html.parser")

            # --- Title: try specific selectors first, generic headings last ---
            title_selectors = [
                "h1.entry-title", "h2.entry-title", "h1.post-title", "h2.post-title",
                "h3.entry-title", "h3.post-title", "div.post-title", "div.entry-title",
                "h1", "h2", "h3"
            ]
            title = None
            title_element = None
            for selector in title_selectors:
                title_elem = soup.select_one(selector)
                if title_elem:
                    title = title_elem.text.strip()
                    title_element = title_elem
                    logging.info(f"Found title with selector '{selector}': '{title}' for URL: {post_url}")
                    break

            # Fallback 1: the first match was missing or just the site name,
            # so look through the headings for something post-specific.
            if not title or "digest library" in title.lower():
                headers = soup.find_all(["h1", "h2", "h3"])
                for header in headers:
                    text = header.text.strip()
                    if "digest library" not in text.lower() and len(text) > 10 and not text.lower().startswith("home"):
                        title = text
                        title_element = header
                        logging.info(f"Fallback title found: '{title}' for URL: {post_url}")
                        break

            # Fallback 2: use the <title> tag with the site name removed.
            if not title or "digest library" in title.lower():
                title_tag = soup.find("title")
                if title_tag:
                    title = title_tag.text.strip().replace("Digest Library", "").replace("|", "").strip()
                    logging.info(f"Title from <title> tag: '{title}' for URL: {post_url}")

            title = title if title and "digest library" not in title.lower() else "No Title Found"
            logging.info(f"Final title: '{title}' for URL: {post_url}")

            # --- Download links from <a> and <button> elements ---
            all_links = [a["href"] for a in soup.find_all("a", href=True)]

            # Remove the JS prefix first and only then strip quotes and
            # semicolons; stripping quotes first leaves a stray leading quote
            # on the URL.
            button_links = []
            for button in soup.find_all("button"):
                onclick = button.get("onclick")
                if not onclick:
                    continue
                target = onclick.replace("window.location.href=", "").strip().rstrip(";").strip("'\" ")
                button_links.append(target)

            all_links = [resolve_redirect(link) for link in all_links + button_links if link]

            download_domains = ["drive.google", "mediafire", "dropbox", "mega.nz"]
            google_drive_links = [link for link in all_links if "drive.google" in link]
            mediafire_links = [link for link in all_links if "mediafire" in link]
            other_links = [link for link in all_links if any(domain in link for domain in ["dropbox", "mega.nz"])]

            # Locate the post body; fall back to the whole document.
            content_selectors = ["div.post-body", "div.entry-content", "div.post-content", "article", "div.post"]
            content = None
            for selector in content_selectors:
                content = soup.select_one(selector)
                if content:
                    break
            if not content:
                content = soup

            # Pick up bare URLs written in the visible text of the post.
            text = content.get_text()
            url_pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
            raw_urls = re.findall(url_pattern, text)
            raw_urls = [url for url in raw_urls if any(domain in url.lower() for domain in download_domains)]

            google_drive_links.extend([url for url in raw_urls if "drive.google" in url and url not in google_drive_links])
            mediafire_links.extend([url for url in raw_urls if "mediafire" in url and url not in mediafire_links])
            other_links.extend([url for url in raw_urls if any(domain in url for domain in ["dropbox", "mega.nz"]) and url not in other_links])

            # Diagnostic detail, emitted only when the log level is DEBUG.
            logging.debug(f"URL: {post_url}, Title: {title}")
            logging.debug(f"All <a> hrefs: {all_links}")
            logging.debug(f"Button links: {button_links}")
            logging.debug(f"Raw URLs from content: {raw_urls}")
            logging.debug(f"Google Drive Links: {google_drive_links}")
            logging.debug(f"Mediafire Links: {mediafire_links}")
            logging.debug(f"Other Links: {other_links}")
            if title_element:
                parent = title_element.find_parent()
                context = str(parent)[:200] if parent else "No parent"
                logging.debug(f"Title Context: {context}")
            content_snippet = text[:200].replace('\n', ' ')
            logging.debug(f"Content Snippet: {content_snippet}")

            return {
                "Title": title,
                "Google Drive Links": ", ".join(google_drive_links) if google_drive_links else "No Google Drive Link",
                "Mediafire Links": ", ".join(mediafire_links) if mediafire_links else "No Mediafire Link",
                "Other Links": ", ".join(other_links) if other_links else "No Other Links"
            }
        except requests.exceptions.RequestException as e:
            logging.warning(f"Attempt {attempt+1} failed for {post_url}: {e}")
            time.sleep(5)

    # All attempts exhausted: record the URL so it can be retried later.
    logging.error(f"Failed to scrape {post_url} after {retries} attempts")
    with open(FAILED_LINKS_FILE, "a") as f:
        f.write(post_url + "\n")
    return None

# üìå Step 3: Scrape Each Post & Save Data in Batches
# Step 3: Scrape each post concurrently and save the results in batches.
novels_data = []
BATCH_SIZE = 150
start_time = time.time()


def _append_rows_to_excel(rows):
    """Append ``rows`` (a list of result dicts) to SAVE_PATH_XLSX.

    Existing rows are read back and rewritten together with the new ones.
    Opening an ExcelWriter in append mode and writing to the default sheet
    fails once that sheet exists, so the workbook is rebuilt instead.
    Returns the combined DataFrame that was written.
    """
    combined = pd.DataFrame(rows)
    if os.path.exists(SAVE_PATH_XLSX):
        existing_df = pd.read_excel(SAVE_PATH_XLSX, engine='openpyxl')
        combined = pd.concat([existing_df, combined], ignore_index=True)
    combined.to_excel(SAVE_PATH_XLSX, index=False, engine='openpyxl')
    return combined


# Resume bookkeeping. as_completed() yields futures in completion order, not
# submission order, so the index of the latest finished future is not a safe
# resume point. next_unfinished is the lowest index that has not completed
# yet; everything below it is done. Posts above it that already finished are
# scraped again after a resume (possible duplicate rows, but no skipped posts).
finished_indices = set()
next_unfinished = last_index

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:  # Reduced workers
    future_to_url = {executor.submit(scrape_post, post_urls[idx]): idx for idx in range(last_index, len(post_urls))}

    for count, future in enumerate(concurrent.futures.as_completed(future_to_url), start=1):
        idx = future_to_url[future]
        result = future.result()
        if result:
            novels_data.append(result)

        # Advance the resume point across the contiguous run of finished posts.
        finished_indices.add(idx)
        while next_unfinished in finished_indices:
            finished_indices.remove(next_unfinished)
            next_unfinished += 1

        if len(novels_data) >= BATCH_SIZE:
            try:
                df = _append_rows_to_excel(novels_data)
                novels_data = []
                with progress_lock:
                    with open(PROGRESS_FILE, "w") as f:
                        f.write(str(next_unfinished))
                logging.info(f"Saved batch at index {idx}")
            except (PermissionError, IOError) as e:
                logging.error(f"Cannot write to {SAVE_PATH_XLSX}: {e}. Saving to temp file.")
                # Keep the unsaved rows in novels_data so the next iteration
                # retries the real save; the temp file is only a safety copy.
                pd.DataFrame(novels_data).to_excel("temp_novels.xlsx", index=False, engine='openpyxl')

        if count % 50 == 0:
            remaining_time = estimate_time(start_time, count, len(post_urls) - last_index)
            logging.info(f"Processed {count} posts. Estimated time remaining: {remaining_time}")

        # NOTE(review): every request was already submitted to the pool above,
        # so this pause only slows result handling; it does not throttle the
        # worker threads.
        time.sleep(random.uniform(2, 5))

# Step 4: Final save of whatever is left from the last partial batch.
if novels_data:
    try:
        df = _append_rows_to_excel(novels_data)
        logging.info("Final data saved.")
    except (PermissionError, IOError) as e:
        logging.error(f"Cannot write to {SAVE_PATH_XLSX}: {e}. Saving to temp file.")
        pd.DataFrame(novels_data).to_excel("temp_novels.xlsx", index=False, engine='openpyxl')

# Delete the progress file only if every post was processed. The previous
# check (last_index + len(post_urls) >= len(post_urls)) was always true.
if next_unfinished >= len(post_urls):
    if os.path.exists(PROGRESS_FILE):
        try:
            os.remove(PROGRESS_FILE)
            logging.info("Progress file deleted.")
        except OSError as e:
            logging.warning(f"Could not delete progress file: {e}")

logging.info(f"Scraping complete! Data saved in '{SAVE_PATH_XLSX}'")
print(f"‚úÖ Scraping complete! Data saved in '{SAVE_PATH_XLSX}'")

Mounted at /content/drive


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import xml.etree.ElementTree as ET
import os
import concurrent.futures
import random
import logging
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import backoff
from datetime import datetime

# üöÄ Google Drive Mount (for saving file)
# Google Drive mount (for saving files); fall back to the working directory
# when not running in Colab.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    BASE_PATH = "/content/drive/My Drive"
except ImportError:
    BASE_PATH = os.getcwd()

# File paths
SITEMAP_URL = "https://digestlibrary.com/post-sitemap2.xml"
SAVE_PATH_XLSX = os.path.join(BASE_PATH, "Blogger_Novels.xlsx")
FAILED_LINKS_FILE = os.path.join(BASE_PATH, "failed_links.txt")
PROGRESS_FILE = os.path.join(BASE_PATH, "progress.txt")

# Headers & shared HTTP session
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
}
session = requests.Session()
adapter = HTTPAdapter(pool_connections=20, pool_maxsize=20)  # Increase connection pool size
session.mount("http://", adapter)
session.mount("https://", adapter)
session.headers.update(HEADERS)

# Logging: write to a file under BASE_PATH and echo to the notebook output.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler(os.path.join(BASE_PATH, "scrape_log.txt")),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Resume from the index stored by a previous run, if any. An unreadable or
# corrupt progress file restarts from 0 instead of aborting the cell.
last_index = 0
if os.path.exists(PROGRESS_FILE):
    try:
        with open(PROGRESS_FILE, "r") as f:
            last_index = max(int(f.read().strip()), 0)
        logger.info(f"Resuming from index {last_index}")
    except (ValueError, OSError) as e:
        logger.warning(f"Could not read progress file: {e}. Starting from index 0.")

# Step 1: Extract all post URLs from the sitemap.
try:
    response = session.get(SITEMAP_URL, timeout=30)
    response.raise_for_status()
    root = ET.fromstring(response.content)
    # Skip empty <loc> elements and trim surrounding whitespace so that every
    # entry is a usable URL string.
    post_urls = [
        elem.text.strip()
        for elem in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc")
        if elem.text and elem.text.strip()
    ]
    logger.info(f"‚úÖ Found {len(post_urls)} posts to scrape.")
except (requests.exceptions.RequestException, ET.ParseError) as e:
    logger.error(f"‚ùå Failed to fetch or parse sitemap: {e}")
    # Raise instead of calling exit(): inside a notebook kernel exit() does
    # not stop the running cell, so later code would still execute with
    # post_urls undefined.
    raise SystemExit(1)

# A stored index beyond the current sitemap would leave nothing to scrape.
last_index = min(last_index, len(post_urls))

# ‚è≥ Function to estimate remaining time
def estimate_time(start_time, processed, total):
    """Estimate the remaining processing time as an ``HH:MM:SS`` string.

    The estimate is the average time per processed item multiplied by the
    number of items still to go.

    Args:
        start_time: ``time.time()`` timestamp taken when processing started.
        processed: Number of items completed so far.
        total: Total number of items to process.

    Returns:
        The remaining time formatted as ``HH:MM:SS``. The hour field keeps
        counting past 23 instead of wrapping around, and the result is
        ``"00:00:00"`` when nothing has been processed yet or nothing remains.
    """
    elapsed_time = time.time() - start_time
    avg_time_per_post = elapsed_time / processed if processed > 0 else 0
    # Never report a negative remainder if processed overshoots total.
    remaining_posts = max(total - processed, 0)
    remaining_seconds = max(int(remaining_posts * avg_time_per_post), 0)
    # Format manually: time.gmtime()/%H would wrap estimates of 24h or more.
    hours, leftover = divmod(remaining_seconds, 3600)
    minutes, seconds = divmod(leftover, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

# üîç Step 2: Scrape Each Novel Post with Auto-Retry
def _is_permanent_http_error(exc):
    """Return True for HTTP 4xx errors (except 408/429) that a retry cannot fix."""
    response = getattr(exc, "response", None)
    if response is None:
        # Connection errors, timeouts and the manual 429/503 raises carry no response.
        return False
    return 400 <= response.status_code < 500 and response.status_code not in (408, 429)


@backoff.on_exception(
    backoff.expo,
    requests.exceptions.RequestException,
    max_tries=5,
    max_time=300,
    giveup=_is_permanent_http_error,
)
def _fetch_post_html(post_url):
    """Download one post and return its HTML text.

    Any ``requests.exceptions.RequestException`` propagates out of this
    function so the backoff decorator can retry it; nothing is swallowed here.
    """
    response = session.get(post_url, timeout=30)
    if response.status_code == 429:
        logger.warning(f"‚ö† Rate limit hit for {post_url}. Pausing for 60 seconds.")
        time.sleep(60)
        raise requests.exceptions.RequestException("Rate limit")
    elif response.status_code == 503:
        logger.warning(f"‚ö† Service Unavailable for {post_url}. Pausing for 120 seconds.")
        time.sleep(120)
        raise requests.exceptions.RequestException("Service Unavailable")
    response.raise_for_status()
    return response.text


def scrape_post(post_url):
    """Extracts title and ALL download links from a Blogger post.

    The download is retried with exponential backoff by ``_fetch_post_html``.
    The retry decorator must not wrap this function itself: the ``except``
    clauses below turn every failure into ``None``, so a decorator placed here
    would never see an exception and would never retry.

    Args:
        post_url: URL of the post to scrape.

    Returns:
        A dict with the keys ``"Title"``, ``"Google Drive Links"``,
        ``"Mediafire Links"`` and ``"URL"``, or ``None`` when the download
        (after retries) or the parsing failed. Failures are logged and
        appended to ``FAILED_LINKS_FILE``.
    """
    try:
        html = _fetch_post_html(post_url)
        soup = BeautifulSoup(html, "html.parser")

        # Extract Title: first heading found (h2, h1, h3), else the og:title meta tag.
        title = (soup.find("h2") or soup.find("h1") or soup.find("h3") or
                 soup.find("meta", property="og:title"))
        title = title.get("content") if title and title.name == "meta" else title.text.strip() if title else "No Title Found"

        # Extract Download Links: every href on the page, filtered by host substring.
        all_links = [a["href"] for a in soup.find_all("a", href=True)]
        google_drive_links = [link for link in all_links if "drive.google" in link]
        mediafire_links = [link for link in all_links if "mediafire" in link]

        logger.info(f"‚úÖ Successfully scraped: {post_url}")
        return {
            "Title": title,
            "Google Drive Links": ", ".join(google_drive_links) if google_drive_links else "No Google Drive Link",
            "Mediafire Links": ", ".join(mediafire_links) if mediafire_links else "No Mediafire Link",
            "URL": post_url
        }
    except requests.exceptions.RequestException as e:
        # Reached only after the retries in _fetch_post_html are exhausted
        # (or skipped for a permanent 4xx error).
        logger.error(f"‚ùå Failed for {post_url}: {e}")
        with open(FAILED_LINKS_FILE, "a") as f:
            f.write(f"{post_url} | Failed: {str(e)}\n")
        return None
    except Exception as e:
        logger.error(f"‚ùå Parsing error for {post_url}: {e}")
        with open(FAILED_LINKS_FILE, "a") as f:
            f.write(f"{post_url} | Parsing error: {str(e)}\n")
        return None

# üìå Step 3: Scrape Each Post & Save Data in Batches
novels_data = []      # successful results not yet written to the Excel file
BATCH_SIZE = 50
start_time = time.time()
failed_urls = []      # URLs whose scrape returned None or raised

# as_completed() yields futures in completion order, not submission order, so
# the index of the future that happened to finish last is not a safe resume
# point. Track every finished index and persist only the contiguous frontier:
# the first index that has NOT finished yet.
completed_indices = set()
resume_index = last_index

with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:  # Reduced to 3
    future_to_url = {executor.submit(scrape_post, post_urls[idx]): idx for idx in range(last_index, len(post_urls))}

    for count, future in enumerate(concurrent.futures.as_completed(future_to_url), start=1):
        idx = future_to_url[future]
        try:
            result = future.result()
            if result:
                novels_data.append(result)
            else:
                failed_urls.append(post_urls[idx])
        except Exception as e:
            logger.error(f"‚ùå Error processing {post_urls[idx]}: {e}")
            failed_urls.append(post_urls[idx])
        completed_indices.add(idx)

        # Save every batch of BATCH_SIZE successful posts
        if len(novels_data) >= BATCH_SIZE:
            df = pd.DataFrame(novels_data)
            try:
                if os.path.exists(SAVE_PATH_XLSX):
                    existing_df = pd.read_excel(SAVE_PATH_XLSX, engine='openpyxl')
                    if set(existing_df.columns) != set(df.columns):
                        # Existing file is replaced by the current batch only.
                        logger.error("‚ö† Column mismatch in Excel file. Creating new file.")
                    else:
                        df = pd.concat([existing_df, df], ignore_index=True)
                        # A resumed run can re-scrape posts that finished ahead
                        # of the saved frontier; keep one row per URL (newest).
                        df = df.drop_duplicates(subset="URL", keep="last")
                df.to_excel(SAVE_PATH_XLSX, index=False, engine='openpyxl')
                logger.info(f"‚úÖ Saved batch of {len(novels_data)} posts to {SAVE_PATH_XLSX}")

                # Everything finished so far is now persisted (rows in the
                # Excel file, failures in failed_urls / FAILED_LINKS_FILE), so
                # the frontier may move past every contiguous finished index.
                while resume_index in completed_indices:
                    completed_indices.remove(resume_index)
                    resume_index += 1
                with open(PROGRESS_FILE, "w") as f:
                    f.write(str(resume_index))
                novels_data = []
            except Exception as e:
                # novels_data is kept, so the save is attempted again on the
                # next iteration.
                logger.error(f"‚ùå Failed to save batch: {e}")

        # Show estimated remaining time
        if count % 5 == 0:  # More frequent updates
            remaining_time = estimate_time(start_time, count, len(post_urls) - last_index)
            logger.info(f"‚è≥ Processed {count} posts. Estimated time remaining: {remaining_time}")

        # NOTE(review): this pause runs in the thread that consumes results;
        # the worker threads keep issuing requests while it sleeps.
        time.sleep(random.uniform(0.5, 3))  # Increased max delay

# üì• Step 4: Final Save
if novels_data:
    # Flush whatever is left over after the last full batch.
    df = pd.DataFrame(novels_data)
    try:
        if os.path.exists(SAVE_PATH_XLSX):
            existing_df = pd.read_excel(SAVE_PATH_XLSX, engine='openpyxl')
            if set(existing_df.columns) == set(df.columns):
                # Same schema: append the new rows to the rows already on disk.
                df = pd.concat([existing_df, df], ignore_index=True)
            else:
                # Different schema: the file is rewritten with the new rows only.
                logger.error("‚ö† Column mismatch in Excel file. Creating new file.")
        df.to_excel(SAVE_PATH_XLSX, index=False, engine='openpyxl')
        logger.info(f"‚úÖ Saved final batch of {len(novels_data)} posts to {SAVE_PATH_XLSX}")
    except Exception as e:
        logger.error(f"‚ùå Failed to save final batch: {e}")

# üìå Step 5: Retry Failed URLs
if failed_urls:
    # Second pass over every URL that produced no result in the main loop,
    # with a smaller worker pool.
    logger.info(f"üîÑ Retrying {len(failed_urls)} failed URLs")
    novels_data = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        future_to_url = {}
        for url in failed_urls:
            future_to_url[executor.submit(scrape_post, url)] = url

        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                result = future.result()
            except Exception as e:
                logger.error(f"‚ùå Retry failed for {url}: {e}")
            else:
                # scrape_post returns None on failure; keep successes only.
                if result:
                    novels_data.append(result)

    if novels_data:
        df = pd.DataFrame(novels_data)
        try:
            # Append to the existing workbook when there is one; otherwise the
            # retried rows become the whole file.
            if os.path.exists(SAVE_PATH_XLSX):
                existing_df = pd.read_excel(SAVE_PATH_XLSX, engine='openpyxl')
                df = pd.concat([existing_df, df], ignore_index=True)
            df.to_excel(SAVE_PATH_XLSX, index=False, engine='openpyxl')
            logger.info(f"‚úÖ Saved {len(novels_data)} retried posts to {SAVE_PATH_XLSX}")
        except Exception as e:
            logger.error(f"‚ùå Failed to save retried batch: {e}")

# ‚úÖ Delete progress file
if os.path.exists(PROGRESS_FILE):
    os.remove(PROGRESS_FILE)
    logger.info("‚úÖ Progress file deleted")

# Report failed links: one line in FAILED_LINKS_FILE per recorded failure.
if not os.path.exists(FAILED_LINKS_FILE):
    logger.info("‚úÖ No failed URLs.")
else:
    with open(FAILED_LINKS_FILE, "r") as f:
        failed_entries = f.read().splitlines()
    failed_count = len(failed_entries)
    logger.info(f"‚ö† {failed_count} URLs failed. Check {FAILED_LINKS_FILE} for details.")

logger.info(f"‚úÖ Scraping complete! Data saved in '{SAVE_PATH_XLSX}'")

Mounted at /content/drive


ERROR:__main__:‚ùå Failed for https://digestlibrary.com/2023/04/03/khoi-hui-bat-by-alia-bukhari/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
ERROR:__main__:‚ùå Failed for https://digestlibrary.com/2023/02/20/kaisar-by-sehar-sajid/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
ERROR:__main__:‚ùå Failed for https://digestlibrary.com/2023/02/01/shua-digest-may-1998-complete-pdf/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
ERROR:__main__:‚ùå Failed for https://digestlibrary.com/2023/01/25/hajje-akbar-by-aleem-ul-haq-haqi/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
ERROR:__main__:‚ùå Failed for https://digestlibrary.com/2022/10/31/ujale-bikharte-rahe-by-asia-razzaqi/: HTTPSConnectionPool(host='digestlibrary.com', port=443): Read timed out. (read timeout=30)
ERROR:__main__:‚ùå Failed for https://

In [None]:
pip install backoff

Collecting backoff
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Installing collected packages: backoff
Successfully installed backoff-2.2.1
