In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import random
from datetime import datetime
from google.colab import drive

# Mount Drive
drive.mount('/content/drive', force_remount=True)

SAVE_PATH_XLSX = "/content/drive/My Drive/UrduNovelBank_Latest.xlsx"
FAILED_LINKS_FILE = "/content/drive/My Drive/failed_links.txt"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
}
session = requests.Session()
session.headers.update(HEADERS)

today = datetime.today()
YEAR = today.year
MONTH = today.month
BASE_URL = f"https://www.urdunovelbanks.com/{YEAR}/{MONTH:02d}/"

print(f"Fetching from: {BASE_URL}")

def get_all_post_links(base_url):
    post_links = set()  # Use set to avoid duplicates
    next_page = base_url

    while next_page:
        response = session.get(next_page, timeout=30)
        if response.status_code != 200:
            print(f"Failed to fetch {next_page}")
            break

        soup = BeautifulSoup(response.text, "html.parser")

        # Better: target actual post title links
        for a_tag in soup.select(".post-title a, h3 a, .entry-title a"):
            href = a_tag.get("href")
            if href and href not in post_links:
                post_links.add(href)

        # Next page
        next_link = soup.find("a", string="Older Posts")
        if next_link and next_link.get("href"):
            next_url = next_link["href"]
            if f"/{YEAR}/{MONTH:02d}/" in next_url:
                next_page = next_url
                time.sleep(random.uniform(1, 3))
            else:
                next_page = None
        else:
            next_page = None

    return list(post_links)

post_links = get_all_post_links(BASE_URL)
print(f"Found {len(post_links)} unique posts.")

def scrape_post(post_url):
    for attempt in range(3):
        try:
            response = session.get(post_url, timeout=30)
            if response.status_code != 200:
                time.sleep(3)
                continue

            soup = BeautifulSoup(response.text, "html.parser")

            # Better title
            title_tag = soup.find("h3", class_="post-title") or soup.find("h1")
            title = title_tag.get_text(strip=True) if title_tag else "No Title"

            all_links = [a["href"] for a in soup.find_all("a", href=True)]
            gdrive = [l for l in all_links if "drive.google.com" in l or "googleusercontent.com" in l]
            mediafire = [l for l in all_links if "mediafire.com" in l]

            return {
                "Title": title,
                "URL": post_url,
                "Google Drive Links": ", ".join(gdrive) if gdrive else "None",
                "Mediafire Links": ", ".join(mediafire) if mediafire else "None",
                "Scraped Date": today.strftime("%Y-%m-%d")
            }
        except Exception as e:
            print(f"Attempt {attempt+1} failed for {post_url}: {e}")
            time.sleep(5)

    with open(FAILED_LINKS_FILE, "a") as f:
        f.write(post_url + "\n")
    return None

novels_data = []
for idx, link in enumerate(post_links, 1):
    data = scrape_post(link)
    if data:
        novels_data.append(data)
    print(f"{idx}/{len(post_links)} scraped")
    time.sleep(random.uniform(1.5, 3))

if novels_data:
    df_new = pd.DataFrame(novels_data)

    if os.path.exists(SAVE_PATH_XLSX):
        df_old = pd.read_excel(SAVE_PATH_XLSX)
        df = pd.concat([df_old, df_new], ignore_index=True)
    else:
        df = df_new

    df = df.drop_duplicates(subset=["URL"])  # Critical!
    df.to_excel(SAVE_PATH_XLSX, index=False, engine="openpyxl")
    print(f"Saved {len(df)} total entries to Excel.")
else:
    print("No new data scraped.")

Mounted at /content/drive
Fetching from: https://www.urdunovelbanks.com/2025/12/
Found 17 unique posts.
1/17 scraped
2/17 scraped
3/17 scraped
4/17 scraped


KeyboardInterrupt: 