In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import os
from io import StringIO

In [2]:
# Input directories holding the candidate-background HTML files, one directory
# per election year (2019 and 2024). Both paths are relative, so they resolve
# against the notebook's current working directory. Only files ending in
# '.html' are read from them, and the part of each file name before the first
# '_' is used as the state label.
html_files_dir_2019 = "../data/html/candidate-background-2019"
html_files_dir_2024 = "../data/html/candidate-background-2024"

In [3]:
# Function to process HTML content and extract data
def process_html(file_path):
    """Read the first ``<table>`` of an HTML file into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Path to an HTML file; it is opened as UTF-8 text.

    Returns
    -------
    pandas.DataFrame
        The first ``<table>`` element of the document, as parsed by
        ``pandas.read_html``.

    Raises
    ------
    ValueError
        If the document contains no ``<table>`` element.
    """
    # BeautifulSoup consumes the whole file while constructing the tree, so the
    # handle can be released before any further work is done.
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    table = soup.find('table')
    if table is None:
        # Without this guard the literal text "None" would be handed to pandas,
        # which fails with a parser error that does not name the offending file.
        raise ValueError(f"No <table> element found in {file_path}")

    # read_html expects a file-like object for literal markup, hence StringIO.
    return pd.read_html(StringIO(str(table)))[0]

In [4]:

# Function to process all HTML files in a directory and return a DataFrame
def process_html_directory(directory_path):
    """Parse every ``.html`` file in a directory into one combined DataFrame.

    Each file is parsed with ``process_html`` and tagged with a ``State``
    column taken from the part of the file name before the first ``_``
    (the extension is stripped first, so ``Goa.html`` yields ``Goa``).

    Parameters
    ----------
    directory_path : str or path-like
        Directory to scan (not recursive) for files ending in ``.html``.

    Returns
    -------
    pandas.DataFrame
        Rows of all parsed tables stacked in file-name order with a fresh
        0..n-1 index and without the ``'Sno .'`` column. An empty DataFrame is
        returned when the directory holds no ``.html`` files.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result (and of any CSV written from it) reproducible.
    html_files = sorted(
        name for name in os.listdir(directory_path) if name.endswith('.html')
    )

    frames = []
    for html_file in html_files:
        # Drop the extension before splitting so a name without '_' does not
        # leak '.html' into the state label.
        state = os.path.splitext(html_file)[0].split('_')[0]
        file_path = os.path.join(directory_path, html_file)
        frames.append(process_html(file_path).assign(State=state))

    # pd.concat raises on an empty list, so keep the "no files -> empty frame"
    # behaviour explicit.
    if not frames:
        return pd.DataFrame()

    # A single concat avoids re-copying the accumulated rows on every iteration
    # and avoids concatenating with an empty placeholder frame.
    all_data = pd.concat(frames, ignore_index=True)

    # The serial-number column restarts in every source table, so it carries no
    # meaning after stacking; ignore it silently when it is absent.
    return all_data.drop(columns=['Sno .'], errors='ignore')

In [5]:
# Build one combined table per election year from the HTML directories.
data_2019 = process_html_directory(html_files_dir_2019)
data_2024 = process_html_directory(html_files_dir_2024)

# Write each combined table to its CSV file (without the index column).
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first; exist_ok keeps re-runs of this cell harmless.
csv_outputs = [
    (data_2019, '../data/raw/candidate_background_2019.csv'),
    (data_2024, '../data/raw/candidate_background_2024.csv'),
]
for frame, csv_path in csv_outputs:
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    frame.to_csv(csv_path, index=False)

print("Data saved to CSV files successfully.")

Data saved to CSV files successfully.
