In [1]:
# GOOGLE DRIVE PDF TO EXCEL (CLEAN TITLES VERSION)
# Target folder: set by TARGET_FOLDER_ID in the configuration section below

import pandas as pd
import re  # Text safayi ke liye library
from googleapiclient.discovery import build
from google.colab import auth, drive, files
from google.auth import default

# --- CONFIGURATION ---
# Drive ID of the root folder the scan starts from.
TARGET_FOLDER_ID = "1NwVjSAo5OFvuK-29RNUXdSGp9L-dVBs6"
# File name of the generated workbook; it is saved under /content/drive/MyDrive/.
OUTPUT_FILENAME = "Clean_Novel_List.xlsx"

# Step 1: Authentication
print("Authenticating...")
# Mount Google Drive into the Colab filesystem so the workbook can be written to it.
drive.mount('/content/drive')
# Authenticate the Colab user, then pick up the resulting default credentials.
auth.authenticate_user()
creds, _ = default()
# Drive API v3 client; the folder scan below issues all of its list calls through it.
service = build('drive', 'v3', credentials=creds)

print(f"Target ID: {TARGET_FOLDER_ID}")
print("Scanning start... (Sirf PDFs collect ho rahi hain)")

# Step 2: Scanning Logic
def scan_folder_recursive(folder_id):
    """Collect every PDF stored under a Drive folder, including subfolders.

    The folder tree is walked breadth-first using the module-level Drive v3
    ``service`` client. Each folder's listing is followed page by page via
    ``nextPageToken`` until it is exhausted.

    Args:
        folder_id: Drive ID of the folder to start scanning from.

    Returns:
        A list of file resource dicts (``id``, ``name``, ``mimeType``), one
        per PDF found, in the order they were discovered.

    Note:
        If listing a folder raises, the error is printed and the rest of that
        folder is skipped (items from pages already fetched are kept), so a
        single failure does not abort the whole scan.
    """
    # Local import keeps this function self-contained.
    from collections import deque

    found_items = []
    # deque gives O(1) pops from the left; list.pop(0) is O(n) per pop.
    folders_to_process = deque([folder_id])
    # Guard against scanning the same folder twice if it is reachable by more
    # than one path; this also guarantees the walk terminates.
    seen_folder_ids = {folder_id}
    processed_count = 0

    while folders_to_process:
        current_id = folders_to_process.popleft()

        # The query only depends on the folder, so build it once per folder
        # rather than once per page.
        query = f"'{current_id}' in parents and trashed = false and (mimeType = 'application/vnd.google-apps.folder' or mimeType = 'application/pdf')"

        page_token = None
        while True:
            try:
                results = service.files().list(
                    q=query,
                    fields="nextPageToken, files(id, name, mimeType)",
                    includeItemsFromAllDrives=True,
                    supportsAllDrives=True,
                    pageSize=1000,
                    pageToken=page_token
                ).execute()

                for item in results.get('files', []):
                    if item['mimeType'] == 'application/pdf':
                        found_items.append(item)
                    elif item['mimeType'] == 'application/vnd.google-apps.folder':
                        if item['id'] not in seen_folder_ids:
                            seen_folder_ids.add(item['id'])
                            folders_to_process.append(item['id'])

                page_token = results.get('nextPageToken')
                if not page_token:
                    break
            except Exception as e:
                # Deliberately broad: this is a best-effort scan, so any
                # failure is reported and the walk moves on to the next folder.
                print(f"Skipping folder due to error: {e}")
                break

        processed_count += 1
        # '\r' rewrites the same console line to act as a live progress counter.
        print(f"Folders Scanned: {processed_count} | PDFs Found: {len(found_items)}", end='\r')

    return found_items

# --- EXECUTION ---
raw_items = scan_folder_recursive(TARGET_FOLDER_ID)

print(f"\nScanning Complete! Total PDFs found: {len(raw_items)}")
print("Excel file taiyar kar raha hun...")


# Step 3: Data Cleaning & Excel Prep
def _build_row(pdf_entry):
    """Turn one Drive file entry into a spreadsheet row (title + view link)."""
    # 1. Drop a trailing ".pdf" extension, regardless of letter case.
    title = re.sub(r'\.pdf$', '', pdf_entry['name'], flags=re.IGNORECASE)

    # 2. Remove this one site's branding tag, brackets included.
    title = title.replace('(www.urdunovelbanks.com)', '')

    # Optional: to strip ANY bracketed "www." website tag instead of just the
    # one above, uncomment the next line:
    # title = re.sub(r'\(www\..*?\)', '', title)

    # 3. Trim whitespace left at the start and end of the title.
    title = title.strip()

    return {
        "Titles": title,
        "Links": f"https://drive.google.com/file/d/{pdf_entry['id']}/view",
    }


data = [_build_row(entry) for entry in raw_items]

# Step 4: Save to Excel
if not data:
    print("\nKoi PDF nahi mili.")
else:
    output_path = f"/content/drive/MyDrive/{OUTPUT_FILENAME}"
    df = pd.DataFrame(data)
    df.to_excel(output_path, index=False)

    print(f"\nSUCCESS! Excel file ban gayi hai.")
    print(f"Location: {output_path}")
    # Also hand the saved workbook to the browser as a download.
    files.download(output_path)

Authenticating...
Mounted at /content/drive




Target ID: 1NwVjSAo5OFvuK-29RNUXdSGp9L-dVBs6
Scanning start... (Sirf PDFs collect ho rahi hain)
Folders Scanned: 41 | PDFs Found: 219
Scanning Complete! Total PDFs found: 219
Excel file taiyar kar raha hun...

SUCCESS! Excel file ban gayi hai.
Location: /content/drive/MyDrive/Clean_Novel_List.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>