In [11]:
from bs4 import BeautifulSoup
import pandas as pd
import os
from io import StringIO

In [12]:
# Directory containing HTML files
html_files_dir = "../data/html"

# List all HTML files in the directory. os.listdir() returns entries in
# arbitrary order, so sort the names to keep the row order of the combined
# output reproducible across runs and machines.
html_files = sorted(
    file_name
    for file_name in os.listdir(html_files_dir)
    if file_name.endswith('.html')
)

# Initialize an empty DataFrame to store all data
all_data = pd.DataFrame()

In [13]:
# Function to process HTML content and extract data
def process_html(file_path):
    """Read the first HTML table found in a file into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Path to the HTML file; it is opened as UTF-8 text.

    Returns
    -------
    pandas.DataFrame
        The first DataFrame pandas parses from the file's first
        ``<table>`` element.

    Raises
    ------
    ValueError
        If the file contains no ``<table>`` element, or if pandas cannot
        parse a table out of the element that was found.
    """
    # BeautifulSoup reads the whole handle while constructing the tree, so the
    # file can be closed as soon as parsing is done.
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    # Find the first table within the HTML
    table = soup.find('table')
    if table is None:
        # Without this guard the literal text "None" would be handed to
        # pandas, which fails with an error that does not name the file.
        raise ValueError(f"No <table> element found in {file_path!r}")

    # Wrap the markup in StringIO: passing a literal HTML string to
    # pd.read_html is deprecated in recent pandas versions.
    return pd.read_html(StringIO(str(table)))[0]

In [14]:

# Process each HTML file and collect the resulting DataFrames.
# The frames are gathered in a list and concatenated once: concatenating
# inside the loop copies the accumulated data on every iteration, and
# accumulating into a variable created in another cell would append duplicate
# rows every time this cell is re-run.
state_frames = []
for html_file in html_files:
    # Extract the state name from the file name (the text before the first underscore)
    state = html_file.split('_')[0]

    # Construct the full file path
    file_path = os.path.join(html_files_dir, html_file)

    # Process the HTML file
    df = process_html(file_path)

    # Add the state column to the DataFrame
    df['State'] = state

    state_frames.append(df)

if not state_frames:
    # Fail with a clear message instead of an unrelated KeyError from the
    # column drop below.
    raise FileNotFoundError(f"No .html files found in {html_files_dir!r}")

# Build the combined DataFrame from scratch so this cell is idempotent
all_data = pd.concat(state_frames, ignore_index=True)

# drop the Sno . column
all_data = all_data.drop(columns=['Sno .'])

# NOTE: duplicate rows are not removed before saving.
# Save the combined DataFrame to a CSV file, creating the output directory
# first so a fresh checkout does not fail on the write.
output_path = '../data/raw/candidate_background_2024.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
all_data.to_csv(output_path, index=False)

In [15]:
# Show the combined DataFrame as this cell's output for a visual sanity check
all_data

Unnamed: 0,Candidate,Party,Criminal Cases,Education,Age,Total Assets,Liabilities,State
0,Anand Ramnath Arlekar,IND,0,10th Pass,58,"Rs 8,43,025 ~ 8 Lacs+","Rs 13,36,346 ~ 13 Lacs+",Andaman and Nicobar Islands
1,Bishnu Pada Ray,BJP,0,Graduate,73,"Rs 2,74,39,170 ~ 2 Crore+","Rs 3,02,788 ~ 3 Lacs+",Andaman and Nicobar Islands
2,D Ayyappan,CPI(M),0,Post Graduate,60,"Rs 1,32,87,710 ~ 1 Crore+","Rs 26,49,000 ~ 26 Lacs+",Andaman and Nicobar Islands
3,Dr Arun Kumar Mallik,BSP,0,Post Graduate,64,"Rs 11,18,72,135 ~ 11 Crore+",Rs 0 ~,Andaman and Nicobar Islands
4,K J B Selvaraj,AIADMK,0,Graduate,50,,,Andaman and Nicobar Islands
...,...,...,...,...,...,...,...,...
8333,Uttam Barik S/O Late Narayan Barik,IND,0,8th Pass,57,"Rs 7,82,132 ~ 7 Lacs+",Rs 0 ~,West Bengal
8334,Uttam Chatterjee,SUCI(C),0,Others,64,"Rs 78,44,105 ~ 78 Lacs+","Rs 2,00,000 ~ 2 Lacs+",West Bengal
8335,Vaskar Malik,IND,0,12th Pass,25,,,West Bengal
8336,Yasmin Islam,Social Democratic Party Of India,0,12th Pass,42,"Rs 34,36,438 ~ 34 Lacs+",Rs 0 ~,West Bengal
