In [1]:
# Importing required libraries

import math
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the previously scraped list of cases from the CSV file in the working directory
raw_df = pd.read_csv('Raw_SC_Civil_Cases_Scraped_Data.csv')

In [3]:
# Report the dimensions of the raw data, then preview its first two rows
separator = "--------------------------------------"
print(f"Shape of the dataframe is: {raw_df.shape}")
print(separator)
raw_df.head(2)

Shape of the dataframe is: (3188, 4)
--------------------------------------


Unnamed: 0,Diary Number,Case Number,Date of Filing,Party Name
0,41091/2023,Unregistered,04-10-2023,UPENDRA NATH DALAI Vs ALL INDIA PRESIDENT BHAR...
1,40882/2023,Unregistered,03-10-2023,ROHAN CHATTERJEE Vs THE STATE OF WEST BENGAL


In [4]:
# Break 'Diary Number' (e.g. "41091/2023") at the slash: the left part becomes
# 'Diary Sequence' and the right part becomes 'Year'
diary_parts = raw_df['Diary Number'].str.split('/', expand=True)
raw_df[['Diary Sequence', 'Year']] = diary_parts

In [5]:
# Preview the first two rows to confirm the 'Diary Sequence' and 'Year' columns were added
raw_df.head(2)

Unnamed: 0,Diary Number,Case Number,Date of Filing,Party Name,Diary Sequence,Year
0,41091/2023,Unregistered,04-10-2023,UPENDRA NATH DALAI Vs ALL INDIA PRESIDENT BHAR...,41091,2023
1,40882/2023,Unregistered,03-10-2023,ROHAN CHATTERJEE Vs THE STATE OF WEST BENGAL,40882,2023


### Scraping sample data

In [6]:
# Create a sample DataFrame from the rows labelled 0 through 50.
# .loc slices by label and includes the end label, so this yields 51 rows (not 50 or 100).
df = raw_df.loc[:50]
df.shape

(51, 6)

In [7]:
# Preview the first ten rows of the sample DataFrame
df.head(10)

Unnamed: 0,Diary Number,Case Number,Date of Filing,Party Name,Diary Sequence,Year
0,41091/2023,Unregistered,04-10-2023,UPENDRA NATH DALAI Vs ALL INDIA PRESIDENT BHAR...,41091,2023
1,40882/2023,Unregistered,03-10-2023,ROHAN CHATTERJEE Vs THE STATE OF WEST BENGAL,40882,2023
2,41501/2023,Unregistered,06-10-2023,SOMAKKA DECEASED BY LRS Vs ANAND @ ADIVEPPA,41501,2023
3,41949/2023,Unregistered,09-10-2023,RAKESH KAUSHIK Vs THE STATE OF HARYANA,41949,2023
4,40644/2023,SPECIAL LEAVE PETITION (CIVIL) / 22561 / 2023,02-10-2023,SUDEEPTA BISWAJIT GANGULY Vs BISWAJIT BIJANKUM...,40644,2023
5,42113/2023,Unregistered,10-10-2023,SADANAND KUMAR Vs THE STATE OF BIHAR,42113,2023
6,42435/2023,Unregistered,11-10-2023,MD. ISMAIL Vs SALEHJEE MUSAFIR KHANA WAKF,42435,2023
7,42500/2023,Unregistered,12-10-2023,THAKURDAS ROY Vs GOUTAM DUTTA,42500,2023
8,41489/2023,TRANSFER PETITION (CIVIL) / 2784 / 2023,06-10-2023,SONAL GARG Vs ANKIT GARG,41489,2023
9,41494/2023,SPECIAL LEAVE PETITION (CIVIL) / 23086 / 2023,06-10-2023,MALTI PRAJAPATI Vs UMA BAI AHIRWAR,41494,2023


#### Scraping all of the data takes a long time, so a single run may end with a timeout error.
#### To avoid this, the data is divided into separate DataFrame variables of 400 rows each; the last variable holds the remaining rows.

In [8]:
# Count the rows of the main dataframe and work out how many dataframes are
# created when each one holds 400 rows
total_rows = len(raw_df.index)
num_parts = math.ceil(total_rows / 400)
print(f"Total rows in the dataframe: {total_rows}")
print(f"Total parts can be formed while each part have 400 rows: {num_parts}")

Total rows in the dataframe: 3188
Total parts can be formed while each part have 400 rows: 8


In [9]:
# Slice the DataFrame into consecutive parts of at most 400 rows each
dataframes_list = [
    raw_df.iloc[part_no * 400:min((part_no + 1) * 400, total_rows)]
    for part_no in range(num_parts)
]

# Expose every part as a notebook-level variable: df_part1, df_part2, ...
globals().update(
    {f"df_part{position}": part for position, part in enumerate(dataframes_list, start=1)}
)

In [10]:
# Names of the per-part variables (df_part1 ... df_part<num_parts>)
variable_names = []
for part_no in range(1, num_parts + 1):
    variable_names.append(f"df_part{part_no}")

# Report how many variables there are and what they are called
print(f"Total variables formed: {len(variable_names)}")
print("Variable names:", variable_names)

Total variables formed: 8
Variable names: ['df_part1', 'df_part2', 'df_part3', 'df_part4', 'df_part5', 'df_part6', 'df_part7', 'df_part8']


In [11]:
# Sanity-check the first part: its dimensions and its column labels
part_shape = df_part1.shape
part_columns = df_part1.columns
print(f"Shape of dataframe: {part_shape}")
print(f"Columns in dataframe: {part_columns}")

Shape of dataframe: (400, 6)
Columns in dataframe: Index(['Diary Number', 'Case Number', 'Date of Filing', 'Party Name',
       'Diary Sequence', 'Year'],
      dtype='object')


### Creating a loop to get data for each diary.

In [12]:
# Initializing the WebDriver
driver = webdriver.Edge()
driver.get('https://main.sci.gov.in/case-status')

# Locator of the status/stage cell; its presence is used as the signal that the
# result panel has loaded after a search is submitted.
STATUS_XPATH = '//*[@id="collapse1"]/div/table/tbody/tr[1]/td[2]/div/div'
RESULT_TIMEOUT = 20  # seconds to wait for the result panel


def _labelled_cell_text(driver, label, value_xpath='./following-sibling::td'):
    """Return the stripped text of the value cell that belongs to a label cell.

    Parameters:
        driver: the active Selenium WebDriver.
        label: exact text of the <td> that names the field (e.g. "Category").
        value_xpath: XPath, relative to the label cell, of the cell holding the value.

    Returns:
        The value text, or pd.NA when the label or value cell cannot be read
        (the field is treated as optional).
    """
    try:
        label_cell = driver.find_element(By.XPATH, f'//td[text()="{label}"]')
        return label_cell.find_element(By.XPATH, value_xpath).text.strip()
    except Exception:  # best effort, but do not swallow KeyboardInterrupt/SystemExit
        return pd.NA


def _panel_text(driver, link_xpath, content_by, content_locator, strip=False):
    """Click an accordion link, pause, and return the text of an element inside it.

    Parameters:
        driver: the active Selenium WebDriver.
        link_xpath: XPath of the link that expands the panel.
        content_by, content_locator: Selenium locator of the element to read.
        strip: when True, surrounding whitespace is removed from the text.

    Returns:
        The element text, or pd.NA when the link or the content is not available.
    """
    try:
        driver.find_element(By.XPATH, link_xpath).click()
        time.sleep(5)  # give the panel time to expand and load its content
        text = driver.find_element(content_by, content_locator).text
        return text.strip() if strip else text
    except Exception:  # best effort, but do not swallow KeyboardInterrupt/SystemExit
        return pd.NA


result_data = []
# Rows that could not be scraped, kept so they can be retried later instead of
# having to be recovered from the printed log.
failed_rows = []

# Iterating through each row in the DataFrame
for index, row in df_part1.iterrows():
    diary_sequence = row['Diary Sequence']
    year_value = row['Year']
    diary_number = row['Diary Number']
    case_number = row['Case Number']
    filling_date = row['Date of Filing']
    party_name = row['Party Name']

    try:
        time.sleep(5)

        # Filling the details
        diary_sequence_field = driver.find_element(By.ID, 'CaseDiaryNumber')
        diary_sequence_field.clear()
        diary_sequence_field.send_keys(diary_sequence)

        year_dropdown = Select(driver.find_element(By.ID, 'CaseDiaryYear'))
        year_dropdown.select_by_visible_text(year_value)

        captcha_td = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//p[@id="cap"]/font'))
        )
        captcha_code = captcha_td.text

        captcha_input = driver.find_element(By.ID, 'ansCaptcha')
        # Clear first, as for the diary field, so text left over from a previous
        # attempt cannot be concatenated with the new captcha answer.
        captcha_input.clear()
        captcha_input.send_keys(captcha_code)

        submit_button = driver.find_element(By.ID, 'getCaseDiary')
        submit_button.click()

        # Wait for the result panel; a TimeoutException here is handled below.
        # NOTE(review): if the site updates results in place, this presence check
        # could match the previous row's panel — confirm a fresh panel is rendered
        # for every search.
        status_element = WebDriverWait(driver, RESULT_TIMEOUT).until(
            EC.presence_of_element_located((By.XPATH, STATUS_XPATH))
        )
        try:
            status = status_element.text
        except Exception:
            status = pd.NA

        # Petitioner uses the original '../td[2]' lookup; the others read the
        # following sibling cell.
        petitioner_text = _labelled_cell_text(driver, 'Petitioner(s)', '../td[2]')
        respondent_text = _labelled_cell_text(driver, 'Respondent(s)')
        category_text = _labelled_cell_text(driver, 'Category')

        # Clicking on 'Judgement/Orders' link and extracting text
        judgement_orders_text = _panel_text(
            driver,
            '//a[contains(text(), "Judgement/Orders")]',
            By.CSS_SELECTOR,
            '#collapse10 div.panel-body table tr:nth-of-type(2) td',
            strip=True,
        )

        # Extracting 'Office Report' information
        office_table_text = _panel_text(
            driver,
            '//a[@data-toggle="collapse" and @data-parent="#accordion" and contains(text(), "Office Report")]',
            By.XPATH,
            '//*[@id="result16"]//table',
        )

        # Storing data into a dictionary
        result_data.append({
            'S.NO': index + 1,
            'DIARY NUMBER': diary_number,
            'DIARY SEQUENCE': diary_sequence,
            'YEAR': year_value,
            'CASE NUMBER': case_number,
            'DATE OF FILING': filling_date,
            'PARTY NAME': party_name,
            'STATUS/STAGE': status,
            'CATEGORY': category_text,
            'PETITIONER(S)': petitioner_text,
            'RESPONDENT(S)': respondent_text,
            'JUDGEMENT/ORDER': judgement_orders_text,
            'OFFICE REPORT': office_table_text
        })

        time.sleep(4)  # Optional delay after each iteration

    except TimeoutException:
        print(f"Timeout: Data not loaded for row {index}. Refreshing the page.")
        failed_rows.append({'index': index, 'DIARY NUMBER': diary_number, 'REASON': 'TimeoutException'})
        driver.refresh()

    except Exception as e:
        # Handle other exceptions
        print(f"Failed to process row {index}: {e}")
        failed_rows.append({'index': index, 'DIARY NUMBER': diary_number, 'REASON': type(e).__name__})

# Creating a DataFrame
result_df = pd.DataFrame(result_data)
result_df.head()

Timeout: Data not loaded for row 43. Refreshing the page.
Timeout: Data not loaded for row 134. Refreshing the page.
Failed to process row 191: Alert Text: ERROR, Please Contact Server Room
Message: unexpected alert open: {Alert text : ERROR, Please Contact Server Room}
  (Session info: MicrosoftEdge=119.0.2151.58)
Stacktrace:
	GetHandleVerifier [0x00007FF717DF25B2+60402]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF717D77302+253666]
	(No symbol) [0x00007FF717B49C99]
	(No symbol) [0x00007FF717BD5B45]
	(No symbol) [0x00007FF717BBB863]
	(No symbol) [0x00007FF717B8C585]
	(No symbol) [0x00007FF717B8B993]
	(No symbol) [0x00007FF717B8CD14]
	Microsoft::Applications::Events::EventProperty::to_string [0x00007FF717FD3454+1161924]
	(No symbol) [0x00007FF717C0E5F6]
	Microsoft::Applications::Events::EventProperty::~EventProperty [0x00007FF717CCB083+37459]
	Microsoft::Applications::Events::EventProperty::~EventProperty [0x00007FF717CC2C4D+3613]
	Microsoft::Applications::E

Unnamed: 0,S.NO,DIARY NUMBER,DIARY SEQUENCE,YEAR,CASE NUMBER,DATE OF FILING,PARTY NAME,STATUS/STAGE,CATEGORY,PETITIONER(S),RESPONDENT(S),JUDGEMENT/ORDER,OFFICE REPORT
0,1,41091/2023,41091,2023,Unregistered,04-10-2023,UPENDRA NATH DALAI Vs ALL INDIA PRESIDENT BHAR...,PENDING,,1 UPENDRA NATH DALAI S/D/W/Thru:- LATE KSHETRA...,1 ALL INDIA PRESIDENT BHARATIYA JANATA PARTY\n...,,
1,2,40882/2023,40882,2023,Unregistered,03-10-2023,ROHAN CHATTERJEE Vs THE STATE OF WEST BENGAL,Cases under Defect List valid for 90 Days,,1 ROHAN CHATTERJEE S/D/W/Thru:- PANKAJ KUMAR C...,1 THE STATE OF WEST BENGAL\n THROUGH THE SECR...,,
2,3,41501/2023,41501,2023,Unregistered,06-10-2023,SOMAKKA DECEASED BY LRS Vs ANAND @ ADIVEPPA,PENDING,,1 SOMAKKA DECEASED BY LRS S/D/W/Thru:- HUVANNA...,1 ANAND @ ADIVEPPA S/D/W/Thru:- NINGAPPA BALAG...,,
3,4,41949/2023,41949,2023,Unregistered,09-10-2023,RAKESH KAUSHIK Vs THE STATE OF HARYANA,Cases under Defect List valid for 90 Days,,1 RAKESH KAUSHIK S/D/W/Thru:- DHARAM SINGH KAU...,1 THE STATE OF HARYANA\n SECRETARY URBAN ESTA...,,
4,5,40644/2023,40644,2023,SPECIAL LEAVE PETITION (CIVIL) / 22561 / 2023,02-10-2023,SUDEEPTA BISWAJIT GANGULY Vs BISWAJIT BIJANKUM...,DISPOSED,1703-Contempt Of Court Matters : Other civil c...,1 SUDEEPTA BISWAJIT GANGULY S/D/W/Thru:- BISWA...,1 BISWAJIT BIJANKUMAR GANGULY S/D/W/Thru:- BIJ...,10-10-2023 [ROP]-of Main Case,


#### However, fluctuations in Internet speed and server response can interrupt scraping, so some rows are not scraped. We will scrape those rows later, after comparing both files.

In [13]:
# Shut down the WebDriver session now that scraping of this part is finished
driver.quit()

In [14]:
# Inspect the column names of the scraped result
result_df.columns

Index(['S.NO', 'DIARY NUMBER', 'DIARY SEQUENCE', 'YEAR', 'CASE NUMBER',
       'DATE OF FILING', 'PARTY NAME', 'STATUS/STAGE', 'CATEGORY',
       'PETITIONER(S)', 'RESPONDENT(S)', 'JUDGEMENT/ORDER', 'OFFICE REPORT'],
      dtype='object')

In [15]:
# Load the reference workbook and list its column names.
# NOTE(review): presumably this workbook defines the required output layout — confirm.
formate_df = pd.read_excel('COURT DATA CSV.xlsx')
formate_df.columns

Index(['S.NO', 'DIARY NUMBER', 'CASE NUMBER', 'DATE OF FILING', 'PARTY NAME',
       'STATUS/STAGE', 'CATEGORY', 'PETITIONER(S)', 'RESPONDENT(S)',
       'JUDGEMENT/ORDER', 'OFFICE REPORT'],
      dtype='object')

In [16]:
# Keep only the columns listed below, in this order
# ('DIARY SEQUENCE' and 'YEAR' are dropped)
column_order = [
    'S.NO',
    'DIARY NUMBER',
    'CASE NUMBER',
    'DATE OF FILING',
    'PARTY NAME',
    'STATUS/STAGE',
    'CATEGORY',
    'PETITIONER(S)',
    'RESPONDENT(S)',
    'JUDGEMENT/ORDER',
    'OFFICE REPORT',
]
result_df = result_df.loc[:, column_order]

In [17]:
# Preview the result after the columns were reordered
result_df.head()

Unnamed: 0,S.NO,DIARY NUMBER,CASE NUMBER,DATE OF FILING,PARTY NAME,STATUS/STAGE,CATEGORY,PETITIONER(S),RESPONDENT(S),JUDGEMENT/ORDER,OFFICE REPORT
0,1,41091/2023,Unregistered,04-10-2023,UPENDRA NATH DALAI Vs ALL INDIA PRESIDENT BHAR...,PENDING,,1 UPENDRA NATH DALAI S/D/W/Thru:- LATE KSHETRA...,1 ALL INDIA PRESIDENT BHARATIYA JANATA PARTY\n...,,
1,2,40882/2023,Unregistered,03-10-2023,ROHAN CHATTERJEE Vs THE STATE OF WEST BENGAL,Cases under Defect List valid for 90 Days,,1 ROHAN CHATTERJEE S/D/W/Thru:- PANKAJ KUMAR C...,1 THE STATE OF WEST BENGAL\n THROUGH THE SECR...,,
2,3,41501/2023,Unregistered,06-10-2023,SOMAKKA DECEASED BY LRS Vs ANAND @ ADIVEPPA,PENDING,,1 SOMAKKA DECEASED BY LRS S/D/W/Thru:- HUVANNA...,1 ANAND @ ADIVEPPA S/D/W/Thru:- NINGAPPA BALAG...,,
3,4,41949/2023,Unregistered,09-10-2023,RAKESH KAUSHIK Vs THE STATE OF HARYANA,Cases under Defect List valid for 90 Days,,1 RAKESH KAUSHIK S/D/W/Thru:- DHARAM SINGH KAU...,1 THE STATE OF HARYANA\n SECRETARY URBAN ESTA...,,
4,5,40644/2023,SPECIAL LEAVE PETITION (CIVIL) / 22561 / 2023,02-10-2023,SUDEEPTA BISWAJIT GANGULY Vs BISWAJIT BIJANKUM...,DISPOSED,1703-Contempt Of Court Matters : Other civil c...,1 SUDEEPTA BISWAJIT GANGULY S/D/W/Thru:- BISWA...,1 BISWAJIT BIJANKUMAR GANGULY S/D/W/Thru:- BIJ...,10-10-2023 [ROP]-of Main Case,


In [18]:
# Save the part-1 result to CSV; index=False keeps the DataFrame index out of the file
result_df.to_csv('Scrap_Data_part1.csv', index= False)