In [49]:
import time
from io import StringIO

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Scraping data from dynamic webpages:

The `scrape_dynamic_data` function is written to scrape dynamic content from a webpage using Selenium WebDriver. Unlike static webpages, which contain all necessary HTML elements and data at the time of the initial request, dynamic webpages load additional content using JavaScript after the initial HTML is loaded. This content can include tables, images, or any data rendered after the page has loaded in the user's browser.

**Why Use Selenium Instead of Requests and BeautifulSoup?**

1. **Handling Dynamic Content**: The primary reason for using Selenium in this function is its ability to handle dynamic content. Many modern websites use JavaScript to fetch and display data asynchronously (i.e., after the page initially loads). Traditional web scraping libraries like `requests` and `BeautifulSoup` only capture the static HTML content returned in the initial server response. They cannot execute JavaScript, which means any data rendered or modified after the page load will not be captured. Selenium, however, operates a real web browser, allowing it to execute JavaScript and thus access dynamic content.

2. **Simulating User Interactions**: Selenium is also beneficial when user interaction (such as scrolling, clicking buttons, or filling out forms) is required to load additional content on a webpage. For instance, some pages load more data as you scroll down or click a "Load More" button. Selenium can simulate these actions, ensuring that all necessary data is retrieved. This is not possible with `requests` and `BeautifulSoup` alone, as they cannot interact with the webpage in this manner.

3. **Ensuring Fully Loaded Content**: In some cases, even if data is not loaded dynamically with JavaScript, it may take time for all elements on a page to render. Selenium allows for the use of WebDriverWait, which waits for specific conditions (like the presence of particular elements) to be met before proceeding. This ensures that the HTML content is fully loaded before scraping begins. In contrast, `requests` fetches the HTML immediately without waiting, potentially missing content that loads more slowly.

**Function Workflow**:

- **Set Up Selenium WebDriver**: The function initializes the Selenium WebDriver to open a Chrome browser instance.
- **Navigate and Wait for Content**: It navigates to the given URL and waits for the page to load fully, including all JavaScript-rendered content.
- **Extract HTML and Read Tables**: Once the page is fully loaded, it retrieves the complete HTML content and uses `pandas` to read all tables present on the page.
- **User Interaction and Data Saving**: The function then prompts the user to select which table to save as a CSV, based on their inspection of the table columns. The selected table is saved to a CSV file, allowing for further analysis or processing.

Using Selenium in this context ensures that all dynamic content is accurately captured, providing a more comprehensive dataset than would be possible with traditional static scraping techniques.

In [50]:
def scrape_dynamic_data(url, csv_filename, timeout=10):
    """Scrape the HTML tables of a rendered web page and save one of them as CSV.

    Opens ``url`` in a Selenium-driven Chrome browser, waits until at least one
    ``<table>`` element is present in the DOM, parses every table on the page
    with pandas, prints the columns of each table, and asks the user which one
    to keep.

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    csv_filename : str
        Path the selected table is written to (the index is not written).
    timeout : float, optional
        Maximum number of seconds to wait for a ``<table>`` element to appear.
        Defaults to 10.

    Returns
    -------
    pandas.DataFrame or None
        The selected table, or ``None`` when navigation, parsing, the table
        selection or the CSV write failed. In that case the error is printed
        instead of raised.

    Notes
    -----
    Errors raised while starting the browser (``webdriver.Chrome()``) are not
    caught and propagate to the caller.
    """
    # Set up the Selenium WebDriver
    driver = webdriver.Chrome()

    try:
        try:
            # Navigate to the page
            driver.get(url)

            # Block until a table exists so the snapshot below is not taken
            # before the tabular content has been rendered.
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "table")),
                message=f"no <table> element appeared within {timeout} seconds",
            )

            # Get the page source after JavaScript has run
            html = driver.page_source
        finally:
            # Always release the browser, even when navigation or the wait
            # fails, and before the interactive prompt below.
            driver.quit()

        # Passing literal HTML to read_html is deprecated; wrap it in a
        # file-like object instead.
        dfs = pd.read_html(StringIO(html))

        # Print the number of tables found
        print(f"Number of tables found: {len(dfs)}")

        for position, df in enumerate(dfs, start=1):
            print(f"Table {position}: {df.columns}")

        index = input("Which table do you wish to scrape?: ")

        # Tables are listed 1-based. Reject 0 and negatives explicitly, since
        # Python's negative indexing would otherwise silently pick a table
        # counted from the end of the list.
        choice = int(index)
        if not 1 <= choice <= len(dfs):
            raise ValueError(
                f"table number must be between 1 and {len(dfs)}, got {choice}"
            )

        # Extract the table of interest
        census_df = dfs[choice - 1]

        # Save the DataFrame to a CSV file
        census_df.to_csv(csv_filename, index=False)
        print(f"Data saved to {csv_filename}")
        return census_df

    except Exception as e:
        # Best-effort contract: report the problem and hand back None.
        print(f"An error occurred: {e}")
        return None

### Winning Candidates from 2019 and 2024 elections:

In [51]:
# MyNeta "show_winners" listing for Lok Sabha 2019 and the raw CSV it is saved to.
winning_2019_csv = "../data/raw/winning_2019.csv"
winning_2019_url = "https://myneta.info/LokSabha2019/index.php?action=show_winners&sort=default"

scrape_dynamic_data(url=winning_2019_url, csv_filename=winning_2019_csv)

  dfs = pd.read_html(html)


Number of tables found: 9
Table 1: Index([0, 1], dtype='int64')
Table 2: Index([0, 1], dtype='int64')
Table 3: MultiIndex([('HIGHLIGHTS OF CANDIDATES', ...),
            (      'Unnamed: 1_level_0', ...)],
           )
Table 4: Index(['HIGHLIGHTS OF WINNERS', 'Unnamed: 1'], dtype='object')
Table 5: Index(['Sno', 'Candidate', 'Constituency ∇', 'Party', 'Criminal Case',
       'Education', 'Total Assets', 'Liabilities'],
      dtype='object')
Table 6: Index(['Sno', 'Candidate', 'Constituency ∇', 'Party', 'Criminal Case',
       'Education', 'Total Assets', 'Liabilities'],
      dtype='object')
Table 7: Index([0, 1, 2, 3, 4, 5, 6, 7], dtype='int64')
Table 8: Index([0, 1, 2, 3, 4], dtype='int64')
Table 9: Index([0, 1, 2, 3, 4, 5, 6, 7], dtype='int64')
Data saved to ../data/raw/winning_2019.csv


Unnamed: 0,Sno,Candidate,Constituency ∇,Party,Criminal Case,Education,Total Assets,Liabilities
0,1,Bapu Rao Soyam,ADILABAD,BJP,52,12th Pass,"Rs 30,99,414 ~ 30 Lacs+","Rs 2,31,450 ~ 2 Lacs+"
1,2,Satyapal Singh Baghel,AGRA,BJP,5,Doctorate,"Rs 7,42,74,036 ~ 7 Crore+","Rs 86,06,522 ~ 86 Lacs+"
2,3,Patel Hasmukhbhai Somabhai,AHMEDABAD EAST,BJP,0,Others,"Rs 7,46,99,690 ~ 7 Crore+","Rs 62,52,577 ~ 62 Lacs+"
3,4,Dr. Solanki Kiritbhai,AHMEDABAD WEST,BJP,0,Post Graduate,"Rs 8,94,74,039 ~ 8 Crore+",Rs 0 ~
4,5,Sujay Radhakrishna Vikhe,AHMEDNAGAR,BJP,0,Doctorate,,
...,...,...,...,...,...,...,...,...
537,538,Ramdas Chandrabhanji Tadas,WARDHA,BJP,0,10th Pass,"Rs 6,58,07,822 ~ 6 Crore+","Rs 89,61,387 ~ 89 Lacs+"
538,539,Rahul Gandhi,WAYANAD,INC,5,Post Graduate,"Rs 15,88,77,063 ~ 15 Crore+","Rs 72,01,904 ~ 72 Lacs+"
539,540,Parvesh Sahib Singh,WEST DELHI,BJP,0,Post Graduate,,
540,541,Bhavana Pundlikrao Gawali,YAVATMAL WASHIM,SHS,3,Graduate,"Rs 9,68,73,189 ~ 9 Crore+","Rs 73,96,250 ~ 73 Lacs+"


In [52]:
# MyNeta "show_winners" listing for Lok Sabha 2024 and the raw CSV it is saved to.
winning_2024_csv = "../data/raw/winning_2024.csv"
winning_2024_url = "https://myneta.info/LokSabha2024/index.php?action=show_winners&sort=default"

scrape_dynamic_data(url=winning_2024_url, csv_filename=winning_2024_csv)

  dfs = pd.read_html(html)


Number of tables found: 8
Table 1: Index([0, 1], dtype='int64')
Table 2: Index([0, 1], dtype='int64')
Table 3: MultiIndex([('HIGHLIGHTS OF CANDIDATES', ...),
            (      'Unnamed: 1_level_0', ...)],
           )
Table 4: Index(['HIGHLIGHTS OF WINNERS', 'Unnamed: 1'], dtype='object')
Table 5: Index(['Sno', 'Candidate', 'Constituency ∇', 'Party', 'Criminal Case',
       'Education', 'Total Assets', 'Liabilities'],
      dtype='object')
Table 6: Index([0, 1, 2, 3, 4, 5, 6, 7], dtype='int64')
Table 7: Index([0, 1, 2, 3, 4], dtype='int64')
Table 8: Index([0, 1, 2, 3, 4, 5, 6, 7], dtype='int64')
Data saved to ../data/raw/winning_2024.csv


Unnamed: 0,Sno,Candidate,Constituency ∇,Party,Criminal Case,Education,Total Assets,Liabilities
0,1,Godam Nagesh,ADILABAD (ST),BJP,1,Post Graduate,"Rs 3,09,16,833 ~ 3 Crore+","Rs 29,01,575 ~ 29 Lacs+"
1,2,Prof S P Singh Baghel,AGRA (SC),BJP,0,Doctorate,"Rs 10,43,58,691 ~ 10 Crore+","Rs 51,70,325 ~ 51 Lacs+"
2,3,Hasmukhbhai Patel (H.S.Patel),AHMEDABAD EAST,BJP,0,12th Pass,"Rs 8,64,70,298 ~ 8 Crore+","Rs 1,35,15,602 ~ 1 Crore+"
3,4,Dineshbhai Makwana (Advocate),AHMEDABAD WEST (SC),BJP,0,Graduate Professional,"Rs 14,17,69,586 ~ 14 Crore+","Rs 1,32,22,046 ~ 1 Crore+"
4,5,Nilesh Dnyandev Lanke,AHMEDNAGAR,Nationalist Congress Party – Sharadchandra Pawar,2,Graduate,,
...,...,...,...,...,...,...,...,...
538,539,Amar Sharadrao Kale,WARDHA,Nationalist Congress Party – Sharadchandra Pawar,0,12th Pass,"Rs 10,61,75,044 ~ 10 Crore+","Rs 2,95,04,353 ~ 2 Crore+"
539,540,Rahul Gandhi,WAYANAD,INC,18,Post Graduate,,
540,541,Kamaljeet Sehrawat,WEST DELHI,BJP,0,Post Graduate,"Rs 14,74,36,650 ~ 14 Crore+","Rs 1,18,57,294 ~ 1 Crore+"
541,542,Sanjay Uttamrao Deshmukh,YAVATMAL - WASHIM,ShivSena (Uddhav Balasaheb Thackeray),0,Graduate,"Rs 28,44,22,940 ~ 28 Crore+","Rs 9,48,74,636 ~ 9 Crore+"


### Voter Turnout for 2019 and 2024 elections:

In [53]:
# IndiaVotes all-states page for Lok Sabha 2024 and the raw CSV it is saved to.
csv_filename_2024 = '../data/raw/turnout_data_2024.csv'
turnout_2024_url = "https://www.indiavotes.com/lok-sabha/2024/all-states/18/0"

scrape_dynamic_data(url=turnout_2024_url, csv_filename=csv_filename_2024)

  dfs = pd.read_html(html)


Number of tables found: 2
Table 1: Index(['Party', 'Seats', 'Votes %', 'Unnamed: 3'], dtype='object')
Table 2: MultiIndex([(                  '#',                 '#'),
            ( 'Unnamed: 1_level_0',           'PC Name'),
            ( 'Unnamed: 2_level_0',                'No'),
            ( 'Unnamed: 3_level_0',              'Type'),
            ( 'Unnamed: 4_level_0',             'State'),
            ( 'Unnamed: 5_level_0', 'Winning Candidate'),
            ( 'Unnamed: 6_level_0',             'Party'),
            ( 'Unnamed: 7_level_0',          'Electors'),
            ( 'Unnamed: 8_level_0',             'Votes'),
            ( 'Unnamed: 9_level_0',           'Turnout'),
            ('Unnamed: 10_level_0',            'Margin'),
            ('Unnamed: 11_level_0',          'Margin %')],
           )
Data saved to ../data/raw/turnout_data_2024.csv


Unnamed: 0_level_0,#,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0
Unnamed: 0_level_1,#,PC Name,No,Type,State,Winning Candidate,Party,Electors,Votes,Turnout,Margin,Margin %
0,1,Adilabad,1,ST,Telangana,Godam Nagesh,Bharatiya Janta Party,1650175,1235597,74.9 %,90652.0,7.3%
1,2,Agra,18,SC,Uttar Pradesh [2000 Onwards],Prof S P Singh Baghel,Bharatiya Janta Party,2072685,1123779,54.2 %,271294.0,24.1%
2,3,Ahmadnagar,37,GEN,Maharashtra,Nilesh Dnyandev Lanke,Nationalist Congress Party – Sharadchandra Pawar,1981866,1325477,66.9 %,28929.0,2.2%
3,4,Ahmedabad East,7,GEN,Gujarat,Hasmukhbhai Patel (H.S.PATEL),Bharatiya Janta Party,2038162,1128339,55.4 %,461755.0,40.9%
4,5,Ahmedabad West,8,SC,Gujarat,Dineshbhai Makwana (ADVOCATE),Bharatiya Janta Party,1726987,966646,56.0 %,286437.0,29.6%
...,...,...,...,...,...,...,...,...,...,...,...,...
538,539,Wardha,8,GEN,Maharashtra,Amar Sharadrao Kale,Nationalist Congress Party – Sharadchandra Pawar,1682771,1095012,65.1 %,81648.0,7.5%
539,540,Wayanad,4,GEN,Kerala,Rahul Gandhi,Indian National Congress,1462423,1084653,74.2 %,364422.0,33.6%
540,541,West Delhi,6,GEN,Delhi [1977 Onwards],Kamaljeet Sehrawat,Bharatiya Janta Party,2587977,1524494,58.9 %,199013.0,13.1%
541,542,Yavatmal-Washi,14,GEN,Maharashtra,Sanjay Uttamrao Deshmukh,Shiv Sena (Uddhav Balasaheb Thackrey),1940916,1225530,63.1 %,94473.0,7.7%


In [54]:
# IndiaVotes all-states page for Lok Sabha 2019 and the raw CSV it is saved to.
csv_filename_2019 = '../data/raw/turnout_data_2019.csv'
turnout_2019_url = "https://www.indiavotes.com/lok-sabha/2019/all-states/17/0"
scrape_dynamic_data(url=turnout_2019_url, csv_filename=csv_filename_2019)

  dfs = pd.read_html(html)


Number of tables found: 2
Table 1: Index(['Party', 'Seats', 'Votes %', 'Unnamed: 3'], dtype='object')
Table 2: MultiIndex([(                  '#',                 '#'),
            ( 'Unnamed: 1_level_0',           'PC Name'),
            ( 'Unnamed: 2_level_0',                'No'),
            ( 'Unnamed: 3_level_0',              'Type'),
            ( 'Unnamed: 4_level_0',             'State'),
            ( 'Unnamed: 5_level_0', 'Winning Candidate'),
            ( 'Unnamed: 6_level_0',             'Party'),
            ( 'Unnamed: 7_level_0',          'Electors'),
            ( 'Unnamed: 8_level_0',             'Votes'),
            ( 'Unnamed: 9_level_0',           'Turnout'),
            ('Unnamed: 10_level_0',            'Margin'),
            ('Unnamed: 11_level_0',          'Margin %')],
           )
Data saved to ../data/raw/turnout_data_2019.csv


Unnamed: 0_level_0,#,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0
Unnamed: 0_level_1,#,PC Name,No,Type,State,Winning Candidate,Party,Electors,Votes,Turnout,Margin,Margin %
0,1,Adilabad,1,ST,Telangana,Soyam Bapu Rao,Bharatiya Janta Party,1382837,1063730,77.9 %,58560,5.5%
1,2,Agra,18,SC,Uttar Pradesh [2000 Onwards],Satyapal Singh Baghel,Bharatiya Janta Party,1866262,1145323,61.7 %,211546,18.5%
2,3,Ahmadnagar,37,GEN,Maharashtra,Dr. Sujay Radhakrishna Vikhepatil,Bharatiya Janta Party,1793677,1203797,67.3 %,281474,23.4%
3,4,Ahmedabad East,7,GEN,Gujarat,Patel Hasmukhbhai Somabhai,Bharatiya Janta Party,1713598,1116367,65.7 %,434330,38.9%
4,5,Ahmedabad West,8,SC,Gujarat,Dr. Kirit P. Solanki,Bharatiya Janta Party,1580673,997024,64.0 %,321546,32.3%
...,...,...,...,...,...,...,...,...,...,...,...,...
536,537,Wardha,8,GEN,Maharashtra,Ramdas Chandrabhanji Tadas,Bharatiya Janta Party,1679788,1072570,64.2 %,187191,17.5%
537,538,Wayanad,4,GEN,Kerala,Rahul Gandhi,Indian National Congress,1306141,1092197,83.8 %,431770,39.5%
538,539,West Delhi,6,GEN,Delhi [1977 Onwards],Sant Prasad Sinha,Bharatiya Janta Party,2039410,1441601,71.1 %,578486,40.1%
539,540,Yavatmal-Washi,14,GEN,Maharashtra,Bhavana Pundlikrao Gawali,Shiv Sena,1812059,1174220,65.0 %,117939,10.0%


### Religion and Caste Census Data:

In [55]:
# census2011.co.in religion page and the raw CSV the chosen table is saved to.
religion_csv = "../data/raw/religion.csv"
religion_url = "https://www.census2011.co.in/religion.php"
scrape_dynamic_data(url=religion_url, csv_filename=religion_csv)

  dfs = pd.read_html(html)


Number of tables found: 4
Table 1: Index(['Religion', 'Percent', 'Estimated', 'State  Majority'], dtype='object')
Table 2: Index(['State', 'Majority Religion', 'Hindu', 'Muslim'], dtype='object')
Table 3: Index(['Religion', 'Percentage', 'Estimated'], dtype='object')
Table 4: Index(['Religion', 'Percentage', 'Estimated'], dtype='object')
Data saved to ../data/raw/religion.csv


Unnamed: 0,State,Majority Religion,Hindu,Muslim
0,Uttar Pradesh,Hindu,79.73%,19.26%
1,Maharashtra,Hindu,79.83%,11.54%
2,Bihar,Hindu,82.69%,16.87%
3,West Bengal,Hindu,70.54%,27.01%
4,Andhra Pradesh,Hindu,88.46%,9.56%
5,Madhya Pradesh,Hindu,90.89%,6.57%
6,Tamil Nadu,Hindu,87.58%,5.86%
7,Rajasthan,Hindu,88.49%,9.07%
8,Karnataka,Hindu,84.00%,12.92%
9,Gujarat,Hindu,88.57%,9.67%


In [56]:
# census2011.co.in literacy page and the raw CSV the chosen table is saved to.
literacy_csv = "../data/raw/literacy.csv"
literacy_url = "https://www.census2011.co.in/literacy.php"
scrape_dynamic_data(url=literacy_url, csv_filename=literacy_csv)

  dfs = pd.read_html(html)


Number of tables found: 1
Table 1: Index(['#', 'State', 'Literacy', 'Male', 'Female', '% Change'], dtype='object')
Data saved to ../data/raw/literacy.csv


Unnamed: 0,#,State,Literacy,Male,Female,% Change
0,-,India,74.04,82.14,65.46,8.66
1,1,Kerala,94.0,96.11,92.07,3.14
2,2,Lakshadweep,91.85,95.56,87.95,5.19
3,3,Mizoram,91.33,93.35,89.27,2.53
4,4,Goa,88.7,92.65,84.66,6.69
5,5,Tripura,87.22,91.53,82.73,14.03
6,6,Daman and Diu,87.1,91.54,79.55,8.92
7,7,Andaman and Nicobar Islands,86.63,90.27,82.43,5.33
8,8,Delhi,86.21,90.94,80.76,4.54
9,9,Chandigarh,86.05,89.99,81.19,4.11


In [57]:
# census2011.co.in scheduled-castes page and the raw CSV the chosen table is saved to.
sc_csv = "../data/raw/sc.csv"
sc_url = "https://www.census2011.co.in/scheduled-castes.php"
scrape_dynamic_data(url=sc_url, csv_filename=sc_csv)

  dfs = pd.read_html(html)


Number of tables found: 1
Table 1: Index(['#', 'State', 'No_HH', 'Population', 'Males', 'Female', 'Child(0-6)',
       'Literacy', 'Sex-Ratio'],
      dtype='object')
Data saved to ../data/raw/sc.csv


Unnamed: 0,#,State,No_HH,Population,Males,Female,Child(0-6),Literacy,Sex-Ratio
0,-,India,41694863,201378372,103535314,97843058,14.50 %,66.07 %,945
1,1,Uttar Pradesh,7375437,41357608,21676975,19680633,16.81 %,60.89 %,908
2,2,West Bengal,4861303,21463270,11003304,10459966,11.91 %,69.43 %,951
3,3,Tamil Nadu,3591953,14438445,7204687,7233758,11.02 %,73.26 %,1004
4,4,Andhra Pradesh,3429973,13878078,6913047,6965031,11.17 %,62.28 %,1008
5,5,Bihar,3073109,16567325,8606253,7961072,21.02 %,48.65 %,925
6,6,Maharashtra,2898245,13275898,6767759,6508139,12.20 %,79.66 %,962
7,7,Madhya Pradesh,2402342,11342320,5908638,5433682,15.51 %,66.16 %,920
8,8,Rajasthan,2279366,12221593,6355564,5866029,16.60 %,59.75 %,923
9,9,Karnataka,2178501,10474992,5264545,5210447,12.97 %,65.33 %,990


In [58]:
# census2011.co.in scheduled-tribes page and the raw CSV the chosen table is saved to.
st_csv = "../data/raw/st.csv"
st_url = "https://www.census2011.co.in/scheduled-tribes.php"
scrape_dynamic_data(url=st_url, csv_filename=st_csv)


  dfs = pd.read_html(html)


Number of tables found: 1
Table 1: Index(['#', 'State', 'No_HH', 'Population', 'Males', 'Female', 'Child(0-6)',
       'Literacy', 'Sex-Ratio'],
      dtype='object')
Data saved to ../data/raw/st.csv


Unnamed: 0,#,State,No_HH,Population,Males,Female,Child(0-6),Literacy,Sex-Ratio
0,-,India,21511528,104545716,52547215,51998501,16.01 %,58.95 %,990
1,1,Madhya Pradesh,3122061,15316784,7719404,7597380,18.46 %,50.55 %,984
2,2,Orissa,2163110,9590756,4727732,4863024,15.86 %,52.24 %,1029
3,3,Maharashtra,2156957,10510213,5315025,5195188,14.78 %,65.73 %,977
4,4,Rajasthan,1787715,9238534,4742943,4495591,18.40 %,52.80 %,948
5,5,Chhattisgarh,1743277,7822902,3873191,3949711,15.33 %,59.09 %,1020
6,6,Gujarat,1699510,8917174,4501389,4415785,15.85 %,62.48 %,981
7,7,Jharkhand,1699215,8645042,4315407,4329635,16.97 %,57.13 %,1003
8,8,Andhra Pradesh,1417289,5918073,2969362,2948711,13.03 %,49.21 %,993
9,9,West Bengal,1160069,5296953,2649974,2646979,13.17 %,57.93 %,999


In [59]:
# census2011.co.in slums page and the raw CSV the chosen table is saved to.
slums_csv = "../data/raw/slums.csv"
slums_url = "https://www.census2011.co.in/slums.php"
scrape_dynamic_data(url=slums_url, csv_filename=slums_csv)

  dfs = pd.read_html(html)


Number of tables found: 1
Table 1: Index(['#', 'State', 'Slums', 'Slum HH', 'Slum Pop', 'Male', 'Female',
       'Child (0-6)', 'SC', 'ST', 'Literacy'],
      dtype='object')
Data saved to ../data/raw/slums.csv


Unnamed: 0,#,State,Slums,Slum HH,Slum Pop,Male,Female,Child (0-6),SC,ST,Literacy
0,-,India,5.41%,13920191,65494604,33968203,31526401,8082743,13354080,2216533,77.72 %
1,1,Maharashtra,10.54%,2499948,11848423,6328217,5520206,1428850,1863882,364254,84.55 %
2,2,Andhra Pradesh,12.04%,2431474,10186934,5103377,5083557,1149779,1428212,270556,75.32 %
3,3,Tamil Nadu,8.04%,1463689,5798459,2886993,2911466,614969,1853315,30996,82.06 %
4,4,West Bengal,7.03%,1391756,6418594,3321700,3096894,656780,1060811,106373,81.38 %
5,5,Madhya Pradesh,7.83%,1117764,5688993,2957524,2731469,771999,1251713,356481,77.25 %
6,6,Uttar Pradesh,3.12%,1066363,6239965,3298339,2941626,863392,1373211,19186,68.98 %
7,7,Karnataka,5.39%,707662,3291434,1650724,1640710,418295,922589,172129,75.63 %
8,8,Chhattisgarh,7.43%,413831,1898931,966623,932308,254080,338098,174050,80.36 %
9,9,Rajasthan,3.02%,394391,2068000,1078991,989009,307035,582562,100675,69.79 %
