In [1]:
# Importing required libraries

import re
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Landing page of the judgment browser (the page that holds the per-year links)
base_url = "https://dhcappl.nic.in/FreeText/launchbrowsejud.do#"

In [3]:
# Initializing WebDriver: start a Chrome session and open the landing page (base_url)
driver = webdriver.Chrome()
driver.get(base_url)

In [4]:
# Finding the element with class 'free-text-form' (the div containing the links for each year)
link_element = driver.find_element(By.CLASS_NAME, 'free-text-form')
# Bare expression: the notebook shows the WebElement repr as this cell's output
link_element

<selenium.webdriver.remote.webelement.WebElement (session="f571367892b4f123bf8ae0deea0394a2", element="F1774AC2DD14833B6F7162836D8190CD_element_9")>

In [5]:
# Finding all 'a' elements nested inside link_element
links = link_element.find_elements(By.TAG_NAME, 'a')
# Bare expression: the notebook shows the list of WebElements as this cell's output
links

[<selenium.webdriver.remote.webelement.WebElement (session="f571367892b4f123bf8ae0deea0394a2", element="F1774AC2DD14833B6F7162836D8190CD_element_10")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f571367892b4f123bf8ae0deea0394a2", element="F1774AC2DD14833B6F7162836D8190CD_element_11")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f571367892b4f123bf8ae0deea0394a2", element="F1774AC2DD14833B6F7162836D8190CD_element_12")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f571367892b4f123bf8ae0deea0394a2", element="F1774AC2DD14833B6F7162836D8190CD_element_13")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f571367892b4f123bf8ae0deea0394a2", element="F1774AC2DD14833B6F7162836D8190CD_element_14")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f571367892b4f123bf8ae0deea0394a2", element="F1774AC2DD14833B6F7162836D8190CD_element_15")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f571367892b4f123bf8ae0deea03

In [6]:
# Build a {link text: page URL} mapping from the anchors in `links`.
# The page name is taken from the first single-quoted argument of each
# anchor's onclick handler (not from its href attribute).
year_links = {}
for link in links:
    onclick = link.get_attribute('onclick')
    # get_attribute returns None when the attribute is absent, and a handler
    # without a quoted argument has no second piece after splitting on "'".
    # Such anchors are skipped instead of raising AttributeError / IndexError.
    pieces = onclick.split("'") if onclick else []
    if len(pieces) < 2:
        continue
    year = link.text
    url = pieces[1]
    year_links[year] = f"https://dhcappl.nic.in/FreeText/{url}"

# Printing the extracted links for each year
for year, url in year_links.items():
    print(f"Year: {year}, Link: {url}")

Year: 2023, Link: https://dhcappl.nic.in/FreeText/2023.html
Year: 2022, Link: https://dhcappl.nic.in/FreeText/2022.html
Year: 2021, Link: https://dhcappl.nic.in/FreeText/2021.html
Year: 2020, Link: https://dhcappl.nic.in/FreeText/2020.html
Year: 2019, Link: https://dhcappl.nic.in/FreeText/2019.html
Year: 2018, Link: https://dhcappl.nic.in/FreeText/2018.html
Year: 2017, Link: https://dhcappl.nic.in/FreeText/2017.html
Year: 2016, Link: https://dhcappl.nic.in/FreeText/2016.html
Year: 2015, Link: https://dhcappl.nic.in/FreeText/2015.html
Year: 2014, Link: https://dhcappl.nic.in/FreeText/2014.html
Year: 2013, Link: https://dhcappl.nic.in/FreeText/2013.html
Year: 2012, Link: https://dhcappl.nic.in/FreeText/2012.html
Year: 2011, Link: https://dhcappl.nic.in/FreeText/2011.html
Year: 2010, Link: https://dhcappl.nic.in/FreeText/2010.html
Year: 2009, Link: https://dhcappl.nic.in/FreeText/2009.html
Year: 2008, Link: https://dhcappl.nic.in/FreeText/2008.html
Year: 2007, Link: https://dhcappl.nic.in

In [7]:
# Close the browser session that was used to collect the year links
driver.quit()

In [None]:
# Scrape each year's judgment table and write it to COURT_DATA_<year>.csv.
# This cell makes a single attempt per year and does not wait for the table.


def extract_year(case_number):
    """Return the four digits after the '/' that ends a case number.

    Returns None when the case number does not end in '/<four digits>'.
    """
    match = re.search(r'/(\d{4})$', case_number)
    return match.group(1) if match else None


def split_parties(party):
    """Split a party string on ' VS ' into (petitioner, respondent).

    When the string does not split into exactly two parts, the whole
    stripped string is returned in both positions.
    """
    parts = party.split(' VS ')
    if len(parts) == 2:
        return parts[0].strip(), parts[1].strip()
    return party.strip(), party.strip()


# Column layout of every CSV that is written
column_order = [
    'S.NO',
    'DIARY NUMBER',
    'CASE NUMBER',
    'DATE OF FILING',
    'PARTY NAME',
    'STATUS/STAGE',
    'CATEGORY',
    'PETITIONER(S)',
    'RESPONDENT(S)',
    'JUDGEMENT/ORDER',
    'OFFICE REPORT',
    'Download Link'
]

# Loop through each year's link
for year, url in year_links.items():
    case_nos = []
    categories = []
    parties = []
    judgment_dates = []
    download_links = []

    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # Locating the data table
        data_element = driver.find_element(By.ID, 'data')
        rows = data_element.find_elements(By.TAG_NAME, 'tr')

        # Iterate through rows to extract data
        for row in rows:
            columns = row.find_elements(By.TAG_NAME, 'td')
            # Rows with fewer than five <td> cells (including rows with none)
            # cannot supply all five fields, so they are skipped.
            if len(columns) < 5:
                continue
            anchors = columns[4].find_elements(By.TAG_NAME, 'a')
            case_nos.append(columns[0].text)
            categories.append(columns[1].text)
            parties.append(columns[2].text)
            judgment_dates.append(columns[3].text)
            # A fifth cell without an anchor yields None instead of raising
            download_links.append(anchors[0].get_attribute('href') if anchors else None)
    finally:
        # Always release the browser, even when scraping raises
        driver.quit()

    # (petitioner, respondent) pair for every scraped row
    party_pairs = [split_parties(party) for party in parties]

    # Creating DataFrame (all columns are built as lists of equal length)
    data = {
        'S.NO': list(range(1, len(case_nos) + 1)),
        # Nothing is scraped for this column; it is kept as a NaN placeholder
        'DIARY NUMBER': [np.nan] * len(case_nos),
        'CASE NUMBER': case_nos,
        # Holds only the year parsed from the end of the case number
        'DATE OF FILING': [extract_year(case_no) for case_no in case_nos],
        'PARTY NAME': parties,
        # Nothing is scraped for this column; it is kept as a NaN placeholder
        'STATUS/STAGE': [np.nan] * len(case_nos),
        'CATEGORY': categories,
        'PETITIONER(S)': [petitioner for petitioner, _ in party_pairs],
        'RESPONDENT(S)': [respondent for _, respondent in party_pairs],
        'JUDGEMENT/ORDER': judgment_dates,
        # NOTE(review): repeats the judgment date values — confirm this is intended
        'OFFICE REPORT': judgment_dates,
        'Download Link': download_links
    }
    df = pd.DataFrame(data, columns=column_order)

    # Save DataFrame to a CSV file named after the year
    df.to_csv(f'COURT_DATA_{year}.csv', index=False)

In [8]:
# Scrape each year's judgment table and write it to COURT_DATA_<year>.csv,
# retrying a year when Selenium raises TimeoutException.


def extract_year(case_number):
    """Return the four digits after the '/' that ends a case number.

    Returns None when the case number does not end in '/<four digits>'.
    """
    match = re.search(r'/(\d{4})$', case_number)
    return match.group(1) if match else None


def split_parties(party):
    """Split a party string on ' VS ' into (petitioner, respondent).

    When the string does not split into exactly two parts, the whole
    stripped string is returned in both positions.
    """
    parts = party.split(' VS ')
    if len(parts) == 2:
        return parts[0].strip(), parts[1].strip()
    return party.strip(), party.strip()


# Column layout of every CSV that is written
column_order = [
    'S.NO',
    'DIARY NUMBER',
    'CASE NUMBER',
    'DATE OF FILING',
    'PARTY NAME',
    'STATUS/STAGE',
    'CATEGORY',
    'PETITIONER(S)',
    'RESPONDENT(S)',
    'JUDGEMENT/ORDER',
    'OFFICE REPORT',
    'Download Link'
]

# Loop through each year's link
for year, url in year_links.items():
    attempts = 3
    for attempt in range(attempts):
        # Fresh lists per attempt so a failed attempt leaves no partial rows
        case_nos = []
        categories = []
        parties = []
        judgment_dates = []
        download_links = []

        driver = None
        try:
            driver = webdriver.Chrome()
            driver.set_page_load_timeout(800)  # Set a longer timeout
            driver.get(url)

            # Wait for the data table to be present; until() returns the
            # located element, so no second lookup is needed.
            data_element = WebDriverWait(driver, 900).until(
                EC.presence_of_element_located((By.ID, 'data'))
            )
            rows = data_element.find_elements(By.TAG_NAME, 'tr')

            # Iterate through rows to extract data
            for row in rows:
                columns = row.find_elements(By.TAG_NAME, 'td')
                # Rows with fewer than five <td> cells (including rows with
                # none) cannot supply all five fields, so they are skipped.
                if len(columns) < 5:
                    continue
                anchors = columns[4].find_elements(By.TAG_NAME, 'a')
                case_nos.append(columns[0].text)
                categories.append(columns[1].text)
                parties.append(columns[2].text)
                judgment_dates.append(columns[3].text)
                # A fifth cell without an anchor yields None instead of raising
                download_links.append(anchors[0].get_attribute('href') if anchors else None)
        except TimeoutException as ex:
            print(f"Attempt {attempt + 1} failed. Retrying... Error: {ex}")
            continue
        finally:
            # Runs on success, on timeout and on any other error, so no
            # Chrome session is left behind by a failed attempt.
            if driver is not None:
                driver.quit()

        # (petitioner, respondent) pair for every scraped row
        party_pairs = [split_parties(party) for party in parties]

        # Creating DataFrame (all columns are built as lists of equal length)
        data = {
            'S.NO': list(range(1, len(case_nos) + 1)),
            # Nothing is scraped for this column; it is kept as a NaN placeholder
            'DIARY NUMBER': [np.nan] * len(case_nos),
            'CASE NUMBER': case_nos,
            # Holds only the year parsed from the end of the case number
            'DATE OF FILING': [extract_year(case_no) for case_no in case_nos],
            'PARTY NAME': parties,
            # Nothing is scraped for this column; it is kept as a NaN placeholder
            'STATUS/STAGE': [np.nan] * len(case_nos),
            'CATEGORY': categories,
            'PETITIONER(S)': [petitioner for petitioner, _ in party_pairs],
            'RESPONDENT(S)': [respondent for _, respondent in party_pairs],
            'JUDGEMENT/ORDER': judgment_dates,
            # NOTE(review): repeats the judgment date values — confirm this is intended
            'OFFICE REPORT': judgment_dates,
            'Download Link': download_links
        }
        df = pd.DataFrame(data, columns=column_order)

        # Save DataFrame to a CSV file named after the year
        df.to_csv(f'COURT_DATA_{year}.csv', index=False)
        break  # Break the loop if successful
    else:
        # Reached only when every attempt timed out (the loop never hit break)
        print(f"Year {year}: no data saved after {attempts} attempts.")