In [9]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

We need to fetch the third table from the page.

Let's write a function to scrape tables for both the years:

In [26]:
def scrape_election_data(url, csv_filename):
    # Set up the Selenium WebDriver
    driver = webdriver.Chrome()

    # Navigate to the page
    driver.get(url)

    # Use WebDriverWait to wait for all tables to load
    try:
        # Wait until the number of <table> tags is at least 3
        WebDriverWait(driver, 20).until(
            lambda d: len(d.find_elements(By.TAG_NAME, 'table')) >= 3  # Wait until at least 3 <table> tags are present
        )
        
        # Get the page source after JavaScript has run
        html = driver.page_source

        # Use pandas to read the HTML
        dfs = pd.read_html(html)

        # Print the number of tables found
        print(f"Number of tables found: {len(dfs)}")

        # Check if at least one table is found
        if len(dfs) > 2:
            # Extract the table of interest (index 2 as per your original code)
            election_data = dfs[2]

            # Save the DataFrame to a CSV file
            election_data.to_csv(csv_filename, index=False)
            print(f"Data saved to {csv_filename}")
            return election_data
        else:
            print("Not enough tables found to extract data.")
            
    except Exception as e:
        print(f"An error occurred: {e}")
    
    finally:
        # Close the WebDriver
        driver.quit()
    

In [27]:

# Example usage
url_2024 = "https://www.indiavotes.com/lok-sabha/2024/all-states/18/0"
csv_filename_2024 = '../data/raw/turnout_data_2024.csv'

df_2024 = scrape_election_data(url_2024, csv_filename_2024)
df_2024



  dfs = pd.read_html(html)


Number of tables found: 3
Data saved to ../data/raw/turnout_data_2024.csv


Unnamed: 0_level_0,#,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0
Unnamed: 0_level_1,#,PC Name,No,Type,State,Winning Candidate,Party,Electors,Votes,Turnout,Margin,Margin %
0,1,Adilabad,1,ST,Telangana,Godam Nagesh,Bharatiya Janta Party,1650175,1235597,74.9 %,90652.0,7.3%
1,2,Agra,18,SC,Uttar Pradesh [2000 Onwards],Prof S P Singh Baghel,Bharatiya Janta Party,2072685,1123779,54.2 %,271294.0,24.1%
2,3,Ahmadnagar,37,GEN,Maharashtra,Nilesh Dnyandev Lanke,Nationalist Congress Party – Sharadchandra Pawar,1981866,1325477,66.9 %,28929.0,2.2%
3,4,Ahmedabad East,7,GEN,Gujarat,Hasmukhbhai Patel (H.S.PATEL),Bharatiya Janta Party,2038162,1128339,55.4 %,461755.0,40.9%
4,5,Ahmedabad West,8,SC,Gujarat,Dineshbhai Makwana (ADVOCATE),Bharatiya Janta Party,1726987,966646,56.0 %,286437.0,29.6%
...,...,...,...,...,...,...,...,...,...,...,...,...
538,539,Wardha,8,GEN,Maharashtra,Amar Sharadrao Kale,Nationalist Congress Party – Sharadchandra Pawar,1682771,1095012,65.1 %,81648.0,7.5%
539,540,Wayanad,4,GEN,Kerala,Rahul Gandhi,Indian National Congress,1462423,1084653,74.2 %,364422.0,33.6%
540,541,West Delhi,6,GEN,Delhi [1977 Onwards],Kamaljeet Sehrawat,Bharatiya Janta Party,2587977,1524494,58.9 %,199013.0,13.1%
541,542,Yavatmal-Washi,14,GEN,Maharashtra,Sanjay Uttamrao Deshmukh,Shiv Sena (Uddhav Balasaheb Thackrey),1940916,1225530,63.1 %,94473.0,7.7%


In [28]:
# You can use the same function for another URL
url_2019 = "https://www.indiavotes.com/lok-sabha/2019/all-states/17/0"
csv_filename_2019 = '../data/raw/turnout_data_2019.csv'

df_2019 = scrape_election_data(url_2019, csv_filename_2019)
df_2019

  dfs = pd.read_html(html)


Number of tables found: 3
Data saved to ../data/raw/turnout_data_2019.csv


Unnamed: 0_level_0,#,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0
Unnamed: 0_level_1,#,PC Name,No,Type,State,Winning Candidate,Party,Electors,Votes,Turnout,Margin,Margin %
0,1,Adilabad,1,ST,Telangana,Soyam Bapu Rao,Bharatiya Janta Party,1382837,1063730,77.9 %,58560,5.5%
1,2,Agra,18,SC,Uttar Pradesh [2000 Onwards],Satyapal Singh Baghel,Bharatiya Janta Party,1866262,1145323,61.7 %,211546,18.5%
2,3,Ahmadnagar,37,GEN,Maharashtra,Dr. Sujay Radhakrishna Vikhepatil,Bharatiya Janta Party,1793677,1203797,67.3 %,281474,23.4%
3,4,Ahmedabad East,7,GEN,Gujarat,Patel Hasmukhbhai Somabhai,Bharatiya Janta Party,1713598,1116367,65.7 %,434330,38.9%
4,5,Ahmedabad West,8,SC,Gujarat,Dr. Kirit P. Solanki,Bharatiya Janta Party,1580673,997024,64.0 %,321546,32.3%
...,...,...,...,...,...,...,...,...,...,...,...,...
536,537,Wardha,8,GEN,Maharashtra,Ramdas Chandrabhanji Tadas,Bharatiya Janta Party,1679788,1072570,64.2 %,187191,17.5%
537,538,Wayanad,4,GEN,Kerala,Rahul Gandhi,Indian National Congress,1306141,1092197,83.8 %,431770,39.5%
538,539,West Delhi,6,GEN,Delhi [1977 Onwards],Sant Prasad Sinha,Bharatiya Janta Party,2039410,1441601,71.1 %,578486,40.1%
539,540,Yavatmal-Washi,14,GEN,Maharashtra,Bhavana Pundlikrao Gawali,Shiv Sena,1812059,1174220,65.0 %,117939,10.0%


Religion and Caste Census Data:

In [10]:
def scrape_census_data(url, csv_filename):
    # Set up the Selenium WebDriver
    driver = webdriver.Chrome()

    # Navigate to the page
    driver.get(url)

    # Use WebDriverWait to wait for all tables to load
    try:
        # Get the page source after JavaScript has run
        html = driver.page_source

        # Use pandas to read the HTML
        dfs = pd.read_html(html)

        for df in dfs:
            print(df.head())
        
        # Close the WebDriver
        driver.quit()

        # Print the number of tables found
        print(f"Number of tables found: {len(dfs)}")

        index = input("Which table do you wish to scrape?: ")

        # Extract the table of interest 
        census_df = dfs[int(index)]

        # Save the DataFrame to a CSV file
        census_df.to_csv(csv_filename, index=False)
        print(f"Data saved to {csv_filename}")
        return census_df
    
    except Exception as e:
        print(f"An error occurred: {e}")

In [11]:
url_religion = "https://www.census2011.co.in/religion.php"
religion_csv = "../data/raw/religion.csv"
religion_df = scrape_census_data(url_religion, religion_csv)
print(religion_df)

  dfs = pd.read_html(html)


       Religion   Percent     Estimated State  Majority
0  All Religion  100.00 %    121 Crores              35
1         Hindu   79.80 %  96.62 Crores              28
2        Muslim   14.23 %  17.22 Crores               2
3     Christian    2.30 %   2.78 Crores               4
4          Sikh    1.72 %   2.08 Crores               1
            State Majority Religion   Hindu  Muslim
0   Uttar Pradesh             Hindu  79.73%  19.26%
1     Maharashtra             Hindu  79.83%  11.54%
2           Bihar             Hindu  82.69%  16.87%
3     West Bengal             Hindu  70.54%  27.01%
4  Andhra Pradesh             Hindu  88.46%   9.56%
       Religion Percentage     Estimated
0  All Religion   100.00 %  37.71 Crores
1         Hindu    74.82 %  28.22 Crores
2        Muslim    18.23 %   6.87 Crores
3     Christian     2.96 %   1.12 Crores
4          Sikh     1.57 %   59.02 Lakhs
       Religion Percentage     Estimated
0  All Religion   100.00 %  83.38 Crores
1         Hindu    82.05

In [12]:
url_literacy = "https://www.census2011.co.in/literacy.php"
literacy_csv = "../data/raw/literacy.csv"
religion_df = scrape_census_data(url_literacy, literacy_csv)
print(religion_df)

  dfs = pd.read_html(html)


   #        State  Literacy   Male  Female  % Change
0  -        India     74.04  82.14   65.46      8.66
1  1       Kerala     94.00  96.11   92.07      3.14
2  2  Lakshadweep     91.85  95.56   87.95      5.19
3  3      Mizoram     91.33  93.35   89.27      2.53
4  4          Goa     88.70  92.65   84.66      6.69
Number of tables found: 1
Data saved to ../data/raw/literacy.csv
     #                        State  Literacy   Male  Female  % Change
0    -                        India     74.04  82.14   65.46      8.66
1    1                       Kerala     94.00  96.11   92.07      3.14
2    2                  Lakshadweep     91.85  95.56   87.95      5.19
3    3                      Mizoram     91.33  93.35   89.27      2.53
4    4                          Goa     88.70  92.65   84.66      6.69
5    5                      Tripura     87.22  91.53   82.73     14.03
6    6                Daman and Diu     87.10  91.54   79.55      8.92
7    7  Andaman and Nicobar Islands     86.63  90

In [13]:
url_sc = "https://www.census2011.co.in/scheduled-castes.php"
sc_csv = "../data/raw/sc.csv"
religion_df = scrape_census_data(url_sc, sc_csv)
print(religion_df)

  dfs = pd.read_html(html)


   #           State     No_HH Population      Males    Female Child(0-6)  \
0  -           India  41694863  201378372  103535314  97843058    14.50 %   
1  1   Uttar Pradesh   7375437   41357608   21676975  19680633    16.81 %   
2  2     West Bengal   4861303   21463270   11003304  10459966    11.91 %   
3  3      Tamil Nadu   3591953   14438445    7204687   7233758    11.02 %   
4  4  Andhra Pradesh   3429973   13878078    6913047   6965031    11.17 %   

  Literacy Sex-Ratio  
0  66.07 %       945  
1  60.89 %       908  
2  69.43 %       951  
3  73.26 %      1004  
4  62.28 %      1008  
Number of tables found: 1
Data saved to ../data/raw/sc.csv
     #                        State     No_HH Population      Males    Female  \
0    -                        India  41694863  201378372  103535314  97843058   
1    1                Uttar Pradesh   7375437   41357608   21676975  19680633   
2    2                  West Bengal   4861303   21463270   11003304  10459966   
3    3          

In [14]:
url_st = "https://www.census2011.co.in/scheduled-tribes.php"
st_csv = "../data/raw/st.csv"
religion_df = scrape_census_data(url_st, st_csv)
print(religion_df)

  dfs = pd.read_html(html)


   #           State     No_HH Population     Males    Female Child(0-6)  \
0  -           India  21511528  104545716  52547215  51998501    16.01 %   
1  1  Madhya Pradesh   3122061   15316784   7719404   7597380    18.46 %   
2  2          Orissa   2163110    9590756   4727732   4863024    15.86 %   
3  3     Maharashtra   2156957   10510213   5315025   5195188    14.78 %   
4  4       Rajasthan   1787715    9238534   4742943   4495591    18.40 %   

  Literacy Sex-Ratio  
0  58.95 %       990  
1  50.55 %       984  
2  52.24 %      1029  
3  65.73 %       977  
4  52.80 %       948  
Number of tables found: 1
Data saved to ../data/raw/st.csv
     #                        State     No_HH Population     Males    Female  \
0    -                        India  21511528  104545716  52547215  51998501   
1    1               Madhya Pradesh   3122061   15316784   7719404   7597380   
2    2                       Orissa   2163110    9590756   4727732   4863024   
3    3                  Ma

In [15]:
url_slums = "https://www.census2011.co.in/slums.php"
slums_csv = "../data/raw/slums.csv"
religion_df = scrape_census_data(url_slums, slums_csv)
print(religion_df)

  dfs = pd.read_html(html)


   #           State   Slums   Slum HH  Slum Pop      Male    Female  \
0  -           India   5.41%  13920191  65494604  33968203  31526401   
1  1     Maharashtra  10.54%   2499948  11848423   6328217   5520206   
2  2  Andhra Pradesh  12.04%   2431474  10186934   5103377   5083557   
3  3      Tamil Nadu   8.04%   1463689   5798459   2886993   2911466   
4  4     West Bengal   7.03%   1391756   6418594   3321700   3096894   

   Child (0-6)        SC       ST Literacy  
0      8082743  13354080  2216533  77.72 %  
1      1428850   1863882   364254  84.55 %  
2      1149779   1428212   270556  75.32 %  
3       614969   1853315    30996  82.06 %  
4       656780   1060811   106373  81.38 %  
Number of tables found: 1
Data saved to ../data/raw/slums.csv
     #                        State   Slums   Slum HH  Slum Pop      Male  \
0    -                        India   5.41%  13920191  65494604  33968203   
1    1                  Maharashtra  10.54%   2499948  11848423   6328217   
2   