In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import os
from io import StringIO

In [2]:
# Input locations of the scraped candidate-background pages, one folder per election year
html_files_dir_2019, html_files_dir_2024 = (
    "../data/html/candidate-background-2019",
    "../data/html/candidate-background-2024",
)

In [3]:
# Function to process HTML content and extract data
def process_html(file_path):
    """Parse the first ``<table>`` of an HTML file into a DataFrame.

    Parameters
    ----------
    file_path : str or os.PathLike
        Path of the HTML file; it is opened as UTF-8 text.

    Returns
    -------
    pandas.DataFrame
        The first table of the document as parsed by ``pd.read_html``.

    Raises
    ------
    ValueError
        If the document does not contain any ``<table>`` element.
    """
    # BeautifulSoup consumes the whole file in its constructor, so the handle
    # can be released before any table parsing happens.
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    table = soup.find('table')
    if table is None:
        # Without this guard the literal text "None" would be handed to pandas,
        # and the resulting error would not say which file was at fault.
        raise ValueError(f"No <table> element found in {file_path}")

    # Wrap the markup in StringIO: read_html is given a file-like object rather
    # than a raw HTML string (raw strings are deprecated in recent pandas).
    return pd.read_html(StringIO(str(table)))[0]

In [4]:

# Function to process all HTML files in a directory and return a DataFrame
def process_html_directory(directory_path):
    """Combine the tables of every ``.html`` file in a directory.

    Each file is parsed with ``process_html`` and gets a ``State`` column whose
    value is the part of the file name before the first underscore.

    Parameters
    ----------
    directory_path : str or os.PathLike
        Directory whose direct entries ending in ``.html`` are processed.

    Returns
    -------
    pandas.DataFrame
        The per-file tables stacked under a fresh integer index, with the
        ``'Sno .'`` column removed when present. An empty DataFrame is
        returned when the directory holds no ``.html`` file.
    """
    # os.listdir yields entries in arbitrary order; sorting makes the row order
    # of the combined table reproducible across runs and machines.
    html_files = sorted(
        file for file in os.listdir(directory_path) if file.endswith('.html')
    )

    frames = []
    for html_file in html_files:
        # The state name is encoded as the file-name prefix before the first '_'
        state = html_file.split('_')[0]

        df = process_html(os.path.join(directory_path, html_file))
        df['State'] = state
        frames.append(df)

    # Concatenate once at the end: growing a DataFrame inside the loop copies
    # all previously accumulated rows on every iteration, and concatenating
    # onto an empty frame is deprecated in recent pandas.
    if frames:
        all_data = pd.concat(frames, ignore_index=True)
    else:
        all_data = pd.DataFrame()

    # The serial-number column carries no information once files are merged
    return all_data.drop(columns=['Sno .'], errors='ignore')

In [5]:
# Build the combined candidate table for each election year (2019 first, then 2024)
data_2019, data_2024 = (
    process_html_directory(source_dir)
    for source_dir in (html_files_dir_2019, html_files_dir_2024)
)

# Persist both tables as raw CSVs only after both years were parsed successfully
for year_frame, csv_path in (
    (data_2019, '../data/raw/candidate_background_2019.csv'),
    (data_2024, '../data/raw/candidate_background_2024.csv'),
):
    year_frame.to_csv(csv_path, index=False)

print("Data saved to CSV files successfully.")

Data saved to CSV files successfully.


In [3]:

# Directory containing the per-state census HTML files
html_dir = "../data/html/state-census/"

# One DataFrame per state file that was parsed successfully
dfs = []

# os.listdir returns entries in arbitrary order; sort so that the row order of
# the combined table (and of the CSV written below) is the same on every run.
for filename in sorted(os.listdir(html_dir)):
    if not filename.endswith("_CensusData.html"):
        continue

    # The state name is the file name without the '_CensusData.html' suffix
    state_name = filename.replace('_CensusData.html', '')
    file_path = os.path.join(html_dir, filename)

    # Read the HTML file and parse it with BeautifulSoup
    with open(file_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    # Only the table with the ID 'fixedHeaders' is extracted
    table = soup.find('table', {'id': 'fixedHeaders'})
    if table is None:
        # Best effort: report the file and carry on with the remaining states
        print(f"No table found in {filename}")
        continue

    # Wrap the markup in StringIO so read_html receives a file-like object
    html_str = str(table)
    df = pd.read_html(StringIO(html_str))[0]

    # Tag every row with the state it came from
    df['State'] = state_name
    dfs.append(df)

# pd.concat fails with an opaque "No objects to concatenate" on an empty list;
# raise the same exception type with a message that points at the input folder.
if not dfs:
    raise ValueError(f"No census tables could be parsed from {html_dir}")

# Concatenate all per-state tables into a single DataFrame
final_df = pd.concat(dfs, ignore_index=True)

# Display the first few rows of the combined DataFrame
print(final_df.head())

# Save the combined DataFrame to a CSV file
final_df.to_csv('../data/raw/census_data_combined.csv', index=False)

    Type Area (Sq Km)               Male             Female  \
0  Rural         8211  1,26,287  (33.2%)  1,10,806  (29.1%)   
1  Urban           37    76,584  (20.1%)    66,904  (17.6%)   
2  Total         8249  2,02,871  (53.3%)  1,77,710  (46.7%)   
3  Rural            -  5,46,011  (39.5%)  5,20,347  (37.6%)   
4  Urban            -  1,67,901  (12.1%)  1,49,468  (10.8%)   

     Population Total Male SC Female SC Total SC            Male ST  \
0   2,37,093  (62.3%)       -         -        -     13,837  (3.6%)   
1   1,43,488  (37.7%)       -         -        -        894  (0.2%)   
2  3,80,581  (100.0%)       -         -        -     14,731  (3.9%)   
3  10,66,358  (77.1%)       -         -        -  3,90,625  (28.2%)   
4   3,17,369  (22.9%)       -         -        -     77,765  (5.6%)   

           Female ST           Total ST     Male Literates   Female Literates  \
0     12,878  (3.4%)     26,715  (7.0%)    99,960  (26.3%)    78,065  (20.5%)   
1        921  (0.2%)      1,815 

In [6]:
# Show the last 20 rows of the combined census frame as this cell's output
final_df.tail(20)

Unnamed: 0,Type,Area (Sq Km),Male,Female,Population Total,Male SC,Female SC,Total SC,Male ST,Female ST,Total ST,Male Literates,Female Literates,Total Literates,State
82,Urban,6632,"89,09,250 (13.0%)","81,38,835 (11.9%)","1,70,48,085 (24.9%)","13,97,001 (2.0%)","12,87,629 (1.9%)","26,84,630 (3.9%)","2,88,127 (0.4%)","2,57,284 (0.4%)","5,45,411 (0.8%)","67,83,823 (9.9%)","50,19,673 (7.3%)","1,18,03,496 (17.2%)",Rajasthan
83,Total,342239,"3,55,50,997 (51.9%)","3,29,97,440 (48.1%)","6,85,48,437 (100.0%)","63,55,564 (9.3%)","58,66,029 (8.6%)","1,22,21,593 (17.8%)","47,42,943 (6.9%)","44,95,591 (6.6%)","92,38,534 (13.5%)","2,36,88,412 (34.6%)","1,45,86,870 (21.3%)","3,82,75,282 (55.8%)",Rajasthan
84,Rural,7057,"2,42,797 (39.8%)","2,14,202 (35.1%)","4,56,999 (74.8%)","10,496 (1.7%)","9,839 (1.6%)","20,335 (3.3%)","86,059 (14.1%)","81,087 (13.3%)","1,67,146 (27.4%)","1,84,245 (30.2%)","1,37,685 (22.5%)","3,21,930 (52.7%)",Sikkim
85,Urban,38,"80,273 (13.1%)","73,305 (12.0%)","1,53,578 (25.2%)","3,958 (0.6%)","3,982 (0.7%)","7,940 (1.3%)","19,202 (3.1%)","20,012 (3.3%)","39,214 (6.4%)","67,024 (11.0%)","55,998 (9.2%)","1,23,022 (20.1%)",Sikkim
86,Total,7096,"3,23,070 (52.9%)","2,87,507 (47.1%)","6,10,577 (100.0%)","14,454 (2.4%)","13,821 (2.3%)","28,275 (4.6%)","1,05,261 (17.2%)","1,01,099 (16.6%)","2,06,360 (33.8%)","2,51,269 (41.2%)","1,93,683 (31.7%)","4,44,952 (72.9%)",Sikkim
87,Rural,116427,"1,86,79,065 (25.9%)","1,85,50,525 (25.7%)","3,72,29,590 (51.6%)","47,36,003 (6.6%)","47,39,472 (6.6%)","94,75,475 (13.1%)","3,33,178 (0.5%)","3,27,102 (0.5%)","6,60,280 (0.9%)","1,36,65,839 (18.9%)","1,08,36,356 (15.0%)","2,45,02,195 (34.0%)",Tamil Nadu
88,Urban,13632,"1,74,58,910 (24.2%)","1,74,58,530 (24.2%)","3,49,17,440 (48.4%)","24,68,684 (3.4%)","24,94,286 (3.5%)","49,62,970 (6.9%)","67,890 (0.1%)","66,527 (0.1%)","1,34,417 (0.2%)","1,43,74,652 (19.9%)","1,29,60,660 (18.0%)","2,73,35,312 (37.9%)",Tamil Nadu
89,Total,130060,"3,61,37,975 (50.1%)","3,60,09,055 (49.9%)","7,21,47,030 (100.0%)","72,04,687 (10.0%)","72,33,758 (10.0%)","1,44,38,445 (20.0%)","4,01,068 (0.6%)","3,93,629 (0.5%)","7,94,697 (1.1%)","2,80,40,491 (38.9%)","2,37,97,016 (33.0%)","5,18,37,507 (71.8%)",Tamil Nadu
90,Rural,10094,"13,87,173 (37.8%)","13,25,291 (36.1%)","27,12,464 (73.8%)","2,24,498 (6.1%)","2,13,495 (5.8%)","4,37,993 (11.9%)","5,63,908 (15.3%)","5,53,658 (15.1%)","11,17,566 (30.4%)","10,81,503 (29.4%)","9,11,270 (24.8%)","19,92,773 (54.2%)",Tripura
91,Urban,391,"4,87,203 (13.3%)","4,74,250 (12.9%)","9,61,453 (26.2%)","1,09,872 (3.0%)","1,07,053 (2.9%)","2,16,925 (5.9%)","24,419 (0.7%)","24,828 (0.7%)","49,247 (1.3%)","4,19,866 (11.4%)","3,92,144 (10.7%)","8,12,010 (22.1%)",Tripura
