In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import random
from datetime import datetime
from google.colab import drive

# Mount Google Drive (force remount if already mounted)
drive.mount('/content/drive', force_remount=True)

# Output locations on the mounted Drive.
SAVE_PATH_XLSX = "/content/drive/My Drive/Blogger_Novels_Latest.xlsx"
FAILED_LINKS_FILE = "/content/drive/My Drive/failed_links.txt"

# Browser-like User-Agent sent with every request made through `session`.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"}
session = requests.Session()
session.headers.update(HEADERS)

# Step 0: Build the archive URL for the current calendar month.
today = datetime.today()
YEAR = today.year
MONTH = today.month
BLOG_ARCHIVE_URL = f"https://digestlibrary.com/{YEAR}/{MONTH:02d}/"

print(f"Fetching posts from: {BLOG_ARCHIVE_URL}")

# Step 1: Get all post URLs for the latest month.
# The timeout keeps the cell from hanging forever on a stalled connection.
response = session.get(BLOG_ARCHIVE_URL, timeout=30)
if response.status_code != 200:
    print("‚ùå Failed to fetch archive page.")
    # `exit()` is meant for the interactive interpreter; in a notebook kernel
    # it requests a kernel shutdown rather than simply aborting this cell.
    raise SystemExit(1)

soup = BeautifulSoup(response.text, "html.parser")

# Collect every distinct link whose URL contains this month's /YYYY/MM/ path,
# preserving the order in which the links appear on the page.
month_path = f"/{YEAR}/{MONTH:02d}/"
archive_root = BLOG_ARCHIVE_URL.rstrip("/")
pagination_prefix = BLOG_ARCHIVE_URL + "page/"
post_links = []
seen_links = set()
for a_tag in soup.find_all("a", href=True):
    # Drop "#comments"-style fragments so the same post is not queued twice.
    href = a_tag['href'].split("#", 1)[0]
    if month_path not in href or href in seen_links:
        continue
    # The archive listing itself and its ".../page/N/" pagination pages also
    # contain the month path, but they are not posts.
    if href.rstrip("/") == archive_root or href.startswith(pagination_prefix):
        continue
    seen_links.add(href)
    post_links.append(href)

print(f"‚úÖ Found {len(post_links)} posts for latest month.")

# Step 2: Function to scrape each post
def scrape_post(post_url, retries=3, retry_delay=3):
    """Fetch one post page and extract its title and download links.

    Uses the module-level `session` for the request. If every attempt fails,
    the URL is appended to `FAILED_LINKS_FILE`.

    Args:
        post_url: URL of the post page to fetch.
        retries: Maximum number of fetch attempts (default 3).
        retry_delay: Seconds to wait between attempts (default 3).

    Returns:
        A dict with the keys "Title", "URL", "Google Drive Links" and
        "Mediafire Links", or None when no attempt returned HTTP 200.
    """
    for attempt in range(1, retries + 1):
        try:
            response = session.get(post_url, timeout=30)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")

                # Title: first heading found, preferring h1 over h2 over h3.
                title_tag = soup.find("h1") or soup.find("h2") or soup.find("h3")
                title = title_tag.text.strip() if title_tag else "No Title Found"

                # Download links: classify every href by the host it mentions.
                all_links = [a["href"] for a in soup.find_all("a", href=True)]
                google_drive_links = [l for l in all_links if "drive.google" in l]
                mediafire_links = [l for l in all_links if "mediafire" in l]

                return {
                    "Title": title,
                    "URL": post_url,
                    "Google Drive Links": ", ".join(google_drive_links) if google_drive_links else "No Google Drive Link",
                    "Mediafire Links": ", ".join(mediafire_links) if mediafire_links else "No Mediafire Link"
                }

            # Report non-200 responses too, so a failed post is traceable.
            print(f"‚ö† Attempt {attempt} got HTTP {response.status_code} for {post_url}")

        except Exception as e:
            print(f"‚ö† Attempt {attempt} failed for {post_url}: {e}")

        # Only wait when another attempt follows; sleeping after the final
        # attempt would just delay giving up.
        if attempt < retries:
            time.sleep(retry_delay)

    # Save failed link so it can be retried later.
    with open(FAILED_LINKS_FILE, "a", encoding="utf-8") as f:
        f.write(post_url + "\n")
    return None

# Step 3: Scrape all posts
novels_data = []
for idx, link in enumerate(post_links, start=1):
    result = scrape_post(link)
    if result:
        novels_data.append(result)
        print(f"Scraped {idx}/{len(post_links)}: {link}")
    else:
        # Do not report a post as scraped when scrape_post gave up on it.
        print(f"Failed {idx}/{len(post_links)}: {link}")
    time.sleep(random.uniform(1, 2))  # Random delay between posts

# Step 4: Save to Excel, merging with rows saved by earlier runs.
if novels_data:
    df = pd.DataFrame(novels_data)
    if os.path.exists(SAVE_PATH_XLSX):
        existing_df = pd.read_excel(SAVE_PATH_XLSX, engine='openpyxl')
        df = pd.concat([existing_df, df], ignore_index=True)

    # Re-running the cell would otherwise append the same posts again.
    # Keep only the newest row per URL; rows without a URL are left alone.
    is_stale = df["URL"].notna() & df["URL"].duplicated(keep="last")
    df = df[~is_stale].reset_index(drop=True)

    df.to_excel(SAVE_PATH_XLSX, index=False, engine='openpyxl')
    print(f"‚úÖ Scraping complete! Data saved in '{SAVE_PATH_XLSX}'")
else:
    # Nothing was written, so do not claim that data was saved.
    print("No posts were scraped; nothing was written.")


Mounted at /content/drive
Fetching posts from: https://digestlibrary.com/2025/12/
‚úÖ Found 53 posts for latest month.
Scraped 1/53: https://digestlibrary.com/2025/12/23/barkha-bahar-by-uzma-bukhari/
Scraped 2/53: https://digestlibrary.com/2025/12/23/poonam-rat-aur-chokor-by-asia-mirza/
Scraped 3/53: https://digestlibrary.com/2025/12/23/aey-wattan-tere-leye-by-haya-bukhari/
Scraped 4/53: https://digestlibrary.com/2025/12/23/hina-digest-october-2018-complete-pdf/
Scraped 5/53: https://digestlibrary.com/2025/12/23/shua-digest-april-2016-complete-pdf/
Scraped 6/53: https://digestlibrary.com/2025/12/23/bus-kuch-be-khabar-they-by-nosheen-naz-akhtar/
Scraped 7/53: https://digestlibrary.com/2025/12/22/naama-bar-hai-bahar-ka-by-sadaf-asif/
Scraped 8/53: https://digestlibrary.com/2025/12/22/tishnagi-by-lubna-tahir/
Scraped 9/53: https://digestlibrary.com/2025/12/22/tere-ishq-nachaya-complete-by-sidra-ijaz/
Scraped 10/53: https://digestlibrary.com/2025/12/22/meri-maa-by-sadia-hameed-chaudhary/
S

Alternative approach: discover post URLs from the site's XML sitemap instead of the monthly archive page.

In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import xml.etree.ElementTree as ET
import os
import concurrent.futures
import random
import re

from google.colab import drive
drive.mount('/content/drive')

# Site root; the sitemap URL is derived from it.
SITE_URL = "https://digestlibrary.com/"
SITEMAP_URL = SITE_URL.rstrip("/") + "/post-sitemap.xml"

# Output locations on the mounted Drive.
SAVE_PATH_XLSX = "/content/drive/My Drive/DigestLibrary_LatestMonth.xlsx"
FAILED_LINKS_FILE = "/content/drive/My Drive/failed_links.txt"

# User-Agent sent with every request made through `session`.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}
session = requests.Session()
session.headers.update(HEADERS)

# Load the sitemap. The timeout keeps the cell from hanging on a stalled
# connection.
response = session.get(SITEMAP_URL, timeout=30)
if response.status_code != 200:
    print("‚ùå Sitemap load failed")
    # `exit()` is meant for the interactive interpreter; in a notebook kernel
    # it requests a kernel shutdown rather than simply aborting this cell.
    raise SystemExit(1)

root = ET.fromstring(response.content)
# Keep only <loc> elements that actually carry a URL.
all_urls = [
    elem.text.strip()
    for elem in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc")
    if elem.text and elem.text.strip()
]

print(f"üîó Total posts found: {len(all_urls)}")

# Detect the latest month from the dates embedded in the URLs.
# The position of an entry in the sitemap is not a reliable indicator of
# recency, so take the maximum /YYYY/MM/ over all URLs instead of trusting
# the last entry. Zero-padded year/month strings compare correctly as text.
# NOTE(review): a recorded run detected 2023/01 from this sitemap while the
# archive cell scraped 2025/12 posts, so newer posts may live in additional
# sitemap files that this cell does not read - confirm.
DATE_IN_URL = re.compile(r"/(\d{4})/(\d{2})/")
dated_urls = []
for url in all_urls:
    match = DATE_IN_URL.search(url)
    if match:
        dated_urls.append((match.group(1), match.group(2), url))

if not dated_urls:
    # Also covers an empty sitemap, which previously raised IndexError.
    print("‚ùå Date not found in any sitemap URL")
    raise SystemExit(1)

YEAR, MONTH, last_post_url = max(dated_urls, key=lambda item: (item[0], item[1]))
print("üß† Last Post URL:", last_post_url)
print(f"üìÖ Latest month detected: {YEAR}/{MONTH}")

# Filter the posts that belong to the detected month.
post_urls = [
    url for url in all_urls
    if f"/{YEAR}/{MONTH}/" in url
]

print(f"‚úÖ Posts in latest month: {len(post_urls)}")

# Robust title extractor
def extract_title(soup):
    """Return the post title found in a parsed page.

    CSS selectors are tried in order, from the most specific heading classes
    to any heading inside <article>. A heading whose text is empty or is just
    the site name is skipped. If no heading qualifies, the og:title meta tag
    is used with the site name and surrounding separators removed.

    Args:
        soup: Parsed page exposing `select_one(selector)` and
            `find(name, property=...)`, e.g. a BeautifulSoup object.

    Returns:
        The title string, or "Title Not Found" when nothing usable exists.
    """
    site_name = "Digest Library"
    selectors = [
        "h1.post-title",
        "h1.entry-title",
        "h1.post-title.entry-title",

        "h2.post-title",
        "h2.entry-title",

        "h3.post-title",
        "h3.entry-title",

        "article h1",
        "article h2",
        "article h3"
    ]

    for selector in selectors:
        tag = soup.select_one(selector)
        if tag:
            title = tag.get_text(strip=True)
            # A heading that only repeats the site name is the page banner,
            # not the post title.
            if title and title.lower() != site_name.lower():
                return title

    meta = soup.find("meta", property="og:title")
    if meta and meta.get("content"):
        # Strip the site name plus the separators commonly placed around it
        # (space, hyphen, pipe, en dash, em dash).
        clean = meta["content"].replace(site_name, "").strip(" -|\u2013\u2014")
        # An og:title holding only the site name leaves nothing behind; fall
        # through to the sentinel instead of returning an empty title.
        if clean:
            return clean

    return "Title Not Found"

# Scrape function
def scrape_post(post_url):
    """Fetch one post page and extract its title and download links.

    Uses the module-level `session` for the request and `extract_title` for
    the title. Any failure (a non-200 response or an exception) is reported
    and the URL is appended to `FAILED_LINKS_FILE`.

    Args:
        post_url: URL of the post page to fetch.

    Returns:
        A dict with the keys "Title", "Post URL", "Google Drive Links" and
        "Mediafire Links" (link fields are comma-joined, empty when none),
        or None when the post could not be scraped.
    """
    try:
        r = session.get(post_url, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, "html.parser")

            title = extract_title(soup)

            # Classify every href on the page by the host it mentions.
            links = [a["href"] for a in soup.find_all("a", href=True)]
            drive_links = [l for l in links if "drive.google" in l]
            mediafire_links = [l for l in links if "mediafire" in l]

            return {
                "Title": title,
                "Post URL": post_url,
                "Google Drive Links": ", ".join(drive_links),
                "Mediafire Links": ", ".join(mediafire_links)
            }

        # A bad status is a failure too; record it like an exception so the
        # post does not silently vanish from the results.
        failure = f"HTTP {r.status_code}"

    except Exception as e:
        failure = e

    print(f"Failed to scrape {post_url}: {failure}")
    with open(FAILED_LINKS_FILE, "a", encoding="utf-8") as f:
        f.write(post_url + "\n")
    return None

# Start scraping
novels_data = []


def _throttled_scrape(post_url):
    """Wait 1-2 seconds inside the worker thread, then scrape `post_url`."""
    time.sleep(random.uniform(1, 2))
    return scrape_post(post_url)


# Executor.map submits every URL to the pool up front, so sleeping while
# consuming the results only stalls the main thread. The delay has to run
# inside the worker to actually space out the requests.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    for result in executor.map(_throttled_scrape, post_urls):
        if result:
            novels_data.append(result)

print(f"Scraped {len(novels_data)}/{len(post_urls)} posts")

# Save to Excel. Skip the write when nothing was scraped so a failed run does
# not overwrite an earlier workbook with an empty sheet.
df = pd.DataFrame(novels_data)
if not df.empty:
    df.to_excel(SAVE_PATH_XLSX, index=False)
    print("‚úÖ Scraping completed successfully!")
    print(f"üìÅ File saved at: {SAVE_PATH_XLSX}")
else:
    print("No posts were scraped; nothing was saved.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
üîó Total posts found: 1001
üß† Last Post URL: https://digestlibrary.com/2023/01/14/piyar-ki-khushboo-by-huma-kokab-bukhari/
üìÖ Latest month detected: 2023/01
‚úÖ Posts in latest month: 55
‚úÖ Scraping completed successfully!
üìÅ File saved at: /content/drive/My Drive/DigestLibrary_LatestMonth.xlsx
