In [None]:
import requests
from bs4 import BeautifulSoup

BASE_URL = "https://www.advocatekhoj.com"
ANNOUNCEMENT_URL = BASE_URL + "/library/judgments/announcement.php"
HEADERS = { 'User-Agent': 'Mozilla/5.0' }

resp = requests.get(ANNOUNCEMENT_URL, headers=HEADERS)
resp.raise_for_status()

soup = BeautifulSoup(resp.text, 'html.parser')

judgment_table = soup.find('table', attrs={'cellpadding': '7', 'cellspacing': '2'})

judgments = []
for tr in judgment_table.find_all('tr'):
    tds = tr.find_all('td')
    if len(tds) < 2:
        continue

    date_font = tds[0].find('font')
    date = date_font.get_text(strip=True) if date_font else None

    link_tag = tds[1].find('a')
    if link_tag:
        title = link_tag.get_text(strip=True)
        href = link_tag['href']
        full_link = BASE_URL + href if href.startswith('/') else href
    else:
        title = None
        full_link = None

    judgments.append({
        'date': date,
        'case_title': title,
        'judgment_link': full_link
    })

for j in judgments:
    print(f"Date: {j['date']}")
    print(f"Case Title: {j['case_title']}")
    print(f"Link: {j['judgment_link']}")
    print('-' * 80)


Date: 09/06/25
Case Title: Amlesh Kumar Vs. State of Bihar
Link: https://www.advocatekhoj.com/library/judgments/announcement.php?WID=19117
--------------------------------------------------------------------------------
Date: 06/06/25
Case Title: Dhanya M Vs. State of Kerala
Link: https://www.advocatekhoj.com/library/judgments/announcement.php?WID=19112
--------------------------------------------------------------------------------
Date: 05/06/25
Case Title: M/s. Balaji Traders Vs. State of Uttar Pradesh
Link: https://www.advocatekhoj.com/library/judgments/announcement.php?WID=19111
--------------------------------------------------------------------------------
Date: 05/06/25
Case Title: Abhishek Singh Vs. Ajay Kumar
Link: https://www.advocatekhoj.com/library/judgments/announcement.php?WID=19110
--------------------------------------------------------------------------------
Date: 05/06/25
Case Title: Union of India Vs. M/s. Kamakhya Transport Pvt. Ltd.
Link: https://www.advocatekhoj

In [31]:
import requests
from bs4 import BeautifulSoup

BASE_URL = "https://www.sci.gov.in"
JUDGMENTS_URL = BASE_URL + "/#1697446384453-9aeef8cc-5f35"
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

resp = requests.get(JUDGMENTS_URL, headers=HEADERS, timeout=10)
resp.raise_for_status()

soup = BeautifulSoup(resp.text, 'html.parser')

judgments_div = soup.find('div', class_='gen-list over-y-scroll no-border no-bg padding-0 border-radius-medium arrow-list')

judgments = []

if judgments_div:
    for li in judgments_div.find_all('li'):
        a_tag = li.find('a')
        if not a_tag:
            continue

        href = a_tag['href']
        full_link = BASE_URL + href if href.startswith('/') else href

        date_span = a_tag.find('span', style="color:red;")
        date = date_span.get_text(strip=True) if date_span else None

        if date_span:
            date_span.decompose()
        upload_div = a_tag.find('div', style="color:#5959dd;")
        if upload_div:
            upload_div.decompose()

        case_text = a_tag.get_text(separator=" ", strip=True)
        parts = case_text.split(' - ')
        if len(parts) >= 3:
            case_title = parts[0].strip()
            case_number = parts[1].strip()
            diary_number = parts[2].strip()
        else:
            case_title = case_number = diary_number = None

        judgments.append({
            'date': date,
            'case_title': case_title,
            'case_number': case_number,
            'diary_number': diary_number,
            'judgment_link': full_link
        })

for index, j in enumerate(judgments, start=1):
    print(f"Judgment #{index}")
    print(f"Date: {j['date']}")
    print(f"Case Title: {j['case_title']}")
    print(f"Case Number: {j['case_number']}")
    print(f"Diary Number: {j['diary_number']}")
    print(f"Link: {j['judgment_link']}")
    print('-' * 80)


Judgment #1
Date: 09-Jun-2025
Case Title: AMLESH KUMAR VS. THE STATE OF BIHAR
Case Number: Crl.A. No. 2901/2025
Diary Number: Diary Number 9701 / 2024 -
Link: https://www.sci.gov.in/view-pdf/?diary_no=97012024&type=j&order_date=2025-06-09&from=latest_judgements_order
--------------------------------------------------------------------------------
Judgment #2
Date: 06-Jun-2025
Case Title: DHANYA M VS. THE STATE OF KERALA
Case Number: Crl.A. No. 2897/2025
Diary Number: Diary Number 47305 / 2024 -
Link: https://www.sci.gov.in/view-pdf/?diary_no=473052024&type=j&order_date=2025-06-06&from=latest_judgements_order
--------------------------------------------------------------------------------
Judgment #3
Date: 05-Jun-2025
Case Title: ABHISHEK SINGH VS. AJAY KUMAR
Case Number: Crl.A. No. 2900/2025
Diary Number: Diary Number 1444 / 2025 -
Link: https://www.sci.gov.in/view-pdf/?diary_no=14442025&type=j&order_date=2025-06-05&from=latest_judgements_order
-----------------------------------------

In [15]:
import os
import requests
from bs4 import BeautifulSoup

BASE_URL = "https://www.sci.gov.in"
JUDGMENTS_URL = BASE_URL + "/#1697446384453-9aeef8cc-5f35"
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Create folder to store PDFs
os.makedirs("pdfs", exist_ok=True)

# Step 1: Scrape judgment links
resp = requests.get(JUDGMENTS_URL, headers=HEADERS, timeout=10)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, 'html.parser')

judgments_div = soup.find('div', class_='gen-list over-y-scroll no-border no-bg padding-0 border-radius-medium arrow-list')
judgments = []

if judgments_div:
    for li in judgments_div.find_all('li'):
        a_tag = li.find('a')
        if not a_tag:
            continue

        href = a_tag['href']
        full_link = BASE_URL + href if href.startswith('/') else href

        case_title = a_tag.text.strip().split(' - ')[0].replace(' ', '_').replace('/', '_')
        judgments.append({
            'url': full_link,
            'filename': f"{case_title[:100]}.pdf"  # Limit to 100 characters
        })

# Step 2: Download each PDF even if URL doesn’t end in .pdf
for j in judgments:
    try:
        print(f"Trying: {j['url']}")
        resp = requests.get(j['url'], headers=HEADERS, timeout=10)
        resp.raise_for_status()

        content_type = resp.headers.get('Content-Type', '')
        if 'application/pdf' in content_type:
            file_path = os.path.join("pdfs", j['filename'])
            with open(file_path, 'wb') as f:
                f.write(resp.content)
            print(f"✅ Downloaded: {j['filename']}")
        else:
            print(f"⚠️ Not a PDF (Content-Type={content_type}): {j['url']}")

    except Exception as e:
        print(f"❌ Failed to download {j['url']}: {e}")


Trying: https://www.sci.gov.in/view-pdf/?diary_no=97012024&type=j&order_date=2025-06-09&from=latest_judgements_order
⚠️ Not a PDF (Content-Type=text/html; charset=UTF-8): https://www.sci.gov.in/view-pdf/?diary_no=97012024&type=j&order_date=2025-06-09&from=latest_judgements_order
Trying: https://www.sci.gov.in/view-pdf/?diary_no=473052024&type=j&order_date=2025-06-06&from=latest_judgements_order
⚠️ Not a PDF (Content-Type=text/html; charset=UTF-8): https://www.sci.gov.in/view-pdf/?diary_no=473052024&type=j&order_date=2025-06-06&from=latest_judgements_order
Trying: https://www.sci.gov.in/view-pdf/?diary_no=14442025&type=j&order_date=2025-06-05&from=latest_judgements_order
⚠️ Not a PDF (Content-Type=text/html; charset=UTF-8): https://www.sci.gov.in/view-pdf/?diary_no=14442025&type=j&order_date=2025-06-05&from=latest_judgements_order
Trying: https://www.sci.gov.in/view-pdf/?diary_no=571552024&type=j&order_date=2025-06-05&from=latest_judgements_order
⚠️ Not a PDF (Content-Type=text/html; ch

In [25]:
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Target PDF view URL
url = "https://www.sci.gov.in/view-pdf/?diary_no=97012024&type=j&order_date=2025-06-09&from=latest_judgements_order"

# Directory to save PDFs
download_dir = os.path.abspath("pdfs")
os.makedirs(download_dir, exist_ok=True)

# Set up Chrome options for PDF downloading
chrome_options = Options()
# chrome_options.add_argument('--headless')  # Optional: use without headless to see browser
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": download_dir,
    "plugins.always_open_pdf_externally": True,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True
})

# Start Chrome using Service wrapper
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=chrome_options
)

driver.get(url)

# Give Chrome time to download the PDF
time.sleep(10)

driver.quit()
print(f"✅ PDF should be downloaded to: {download_dir}")


✅ PDF should be downloaded to: c:\Users\ASUS\Documents\ITProfound\dev\Backend\pdfs


In [29]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import requests
import os

url = "https://www.sci.gov.in/view-pdf/?diary_no=97012024&type=j&order_date=2025-06-09&from=latest_judgements_order"
download_dir = os.path.abspath("pdfs")
os.makedirs(download_dir, exist_ok=True)

chrome_options = Options()
# Not headless to avoid being blocked
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=chrome_options
)

driver.get(url)
time.sleep(3)

# Try to locate an iframe or embed containing PDF src
iframe = driver.find_element("tag name", "iframe")
pdf_src = iframe.get_attribute("src")

print(f"Found PDF link: {pdf_src}")

# Download the PDF using requests
r = requests.get(pdf_src, stream=True)
filename = os.path.join(download_dir, "Judgment_97012024.pdf")
with open(filename, 'wb') as f:
    for chunk in r.iter_content(chunk_size=1024):
        f.write(chunk)

print(f"✅ PDF downloaded: {filename}")
driver.quit()


Found PDF link: https://www.sci.gov.in/sci-get-pdf/?diary_no=97012024&type=j&order_date=2025-06-09&from=latest_judgements_order
✅ PDF downloaded: c:\Users\ASUS\Documents\ITProfound\dev\Backend\pdfs\Judgment_97012024.pdf


In [30]:
import os
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Constants
BASE_URL = "https://www.sci.gov.in"
JUDGMENTS_URL = BASE_URL + "/#1697446384453-9aeef8cc-5f35"
DOWNLOAD_DIR = os.path.abspath("pdfs")
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# Set up visible Chrome (not headless)
chrome_options = Options()
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')


def scrape_judgment_links():
    """Scrape all Supreme Court judgment view links."""
    resp = requests.get(JUDGMENTS_URL, headers={"User-Agent": "Mozilla/5.0"})
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    judgments_div = soup.find('div', class_='gen-list over-y-scroll no-border no-bg padding-0 border-radius-medium arrow-list')
    judgment_links = []

    if judgments_div:
        for li in judgments_div.find_all('li'):
            a_tag = li.find('a')
            if not a_tag:
                continue

            href = a_tag['href']
            full_link = BASE_URL + href if href.startswith('/') else href

            # Generate a clean filename from case title or diary number
            case_text = a_tag.get_text(separator=" ", strip=True)
            file_tag = case_text.split(' - ')[0].strip().replace(' ', '_').replace('/', '_')
            filename = f"{file_tag[:100]}.pdf"  # Limit to 100 chars

            judgment_links.append({
                'url': full_link,
                'filename': filename
            })
    return judgment_links


def download_pdf_from_iframe(driver, view_url, filename):
    """Extract the iframe PDF URL and download the file using requests."""
    try:
        driver.get(view_url)
        time.sleep(3)  # wait for iframe to load

        iframe = driver.find_element("tag name", "iframe")
        pdf_url = iframe.get_attribute("src")

        if not pdf_url:
            print(f"[!] No iframe PDF found for: {view_url}")
            return

        # Fetch and save the actual PDF
        resp = requests.get(pdf_url, stream=True, timeout=15)
        if resp.status_code == 200:
            filepath = os.path.join(DOWNLOAD_DIR, filename)
            with open(filepath, 'wb') as f:
                for chunk in resp.iter_content(chunk_size=1024):
                    f.write(chunk)
            print(f"✅ Downloaded: {filename}")
        else:
            print(f"[!] PDF download failed ({resp.status_code}): {pdf_url}")

    except Exception as e:
        print(f"[X] Error downloading from {view_url}: {e}")


def download_all_supreme_court_pdfs():
    # Step 1: Scrape links
    judgment_links = scrape_judgment_links()
    print(f"Found {len(judgment_links)} judgments.")

    # Step 2: Set up Chrome
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options
    )

    # Step 3: Visit and download PDFs
    for item in judgment_links:
        print(f"Processing: {item['url']}")
        download_pdf_from_iframe(driver, item['url'], item['filename'])

    driver.quit()
    print("✅ All downloads complete.")


# Run the full process
if __name__ == "__main__":
    download_all_supreme_court_pdfs()


Found 25 judgments.
Processing: https://www.sci.gov.in/view-pdf/?diary_no=97012024&type=j&order_date=2025-06-09&from=latest_judgements_order
✅ Downloaded: AMLESH_KUMAR_VS._THE_STATE_OF_BIHAR.pdf
Processing: https://www.sci.gov.in/view-pdf/?diary_no=473052024&type=j&order_date=2025-06-06&from=latest_judgements_order
✅ Downloaded: DHANYA_M_VS._THE_STATE_OF_KERALA.pdf
Processing: https://www.sci.gov.in/view-pdf/?diary_no=14442025&type=j&order_date=2025-06-05&from=latest_judgements_order
✅ Downloaded: ABHISHEK_SINGH_VS._AJAY_KUMAR.pdf
Processing: https://www.sci.gov.in/view-pdf/?diary_no=571552024&type=j&order_date=2025-06-05&from=latest_judgements_order
✅ Downloaded: M_S_BALAJI_TRADERS_VS._THE_STATE_OF_U.P..pdf
Processing: https://www.sci.gov.in/view-pdf/?diary_no=189052022&type=j&order_date=2025-06-05&from=latest_judgements_order
✅ Downloaded: UNION_OF_INDIA_VS._M_S_KAMAKHYA_TRANSPORT_PVT._LTD..pdf
Processing: https://www.sci.gov.in/view-pdf/?diary_no=365262019&type=j&order_date=2025-06-