In [None]:
import json
import lxml.html
import httpx
from lxml.cssselect import CSSSelector

# Wikipedia page that the scraper below is pointed at.
url = "https://en.wikipedia.org/wiki/List_of_Chicago_alderpersons_since_1923"

def _cell_text(cell):
    """Return the text of *cell* with its text nodes joined by single spaces.

    Leading/trailing whitespace is stripped and any remaining newlines are
    replaced by spaces.
    """
    return ' '.join(cell.xpath('.//text()')).strip().replace('\n', ' ')


def scrape_wikipedia_tables(url: str, output_path='scraped_data.json') -> list:
    """
    Scrape data about Chicago alderpersons from a Wikipedia page.

    Every ``h3`` element on the page is treated as a ward header. For each
    header, the first following ``table`` whose class attribute is exactly
    ``"wikitable sortable"`` is read row by row (skipping the first ``tr``),
    provided that header is the nearest ``h3`` preceding the table. Rows
    without a linked name in a ``th[scope="row"]`` cell are skipped.

    Args:
        url (str): The URL of the Wikipedia page to scrape.
        output_path (str | None): Where to write the scraped rows as UTF-8
            JSON. Defaults to ``'scraped_data.json'``. Pass ``None`` to skip
            writing a file.

    Returns:
        list: A list of dictionaries, one per table row. The keys are:
        - "Ward": Text of the ``h3`` header (e.g., "1st Ward")
        - "Alderperson": Text of the first link in the row-header cell
        - "Start Date": Text before the first en dash in the 4th ``td``
        - "End Date": Text between the first and second en dash in the 4th
          ``td``, or "Present" when the cell contains no en dash
        - "Party": Text of the first link in the 5th ``td``
        - "Notes": Text of the 6th ``td``
        A value is "N/A" when the corresponding cell (or link) is missing.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
    """
    response = httpx.get(url)
    # Fail loudly on an error response rather than parsing an error page
    # into an empty result and overwriting the output file with it.
    response.raise_for_status()
    root = lxml.html.fromstring(response.content)

    # The selectors do not depend on the ward or the row, so compile them once.
    alderperson_selector = CSSSelector('th[scope="row"] a')
    term_selector = CSSSelector('td:nth-of-type(4)')
    party_selector = CSSSelector('td:nth-of-type(5) a')
    notes_selector = CSSSelector('td:nth-of-type(6)')

    all_data = []

    # Every h3 element is a candidate ward header.
    for ward_header in root.cssselect('h3'):
        ward_name = ward_header.text_content().strip()

        # First matching table anywhere after this header in document order.
        tables = ward_header.xpath(
            './following::table[@class="wikitable sortable"][1]'
        )
        if not tables:
            continue  # No table after this header
        table = tables[0]

        # "following::" happily reaches past later headers, so an h3 without
        # a table of its own would otherwise re-scrape the next section's
        # table under the wrong ward name. Only accept the table when this
        # header is the nearest h3 before it.
        nearest_header = table.xpath('./preceding::h3[1]')
        if not nearest_header or nearest_header[0] is not ward_header:
            continue

        # tr:nth-of-type(n+2) skips the first row (the column headers).
        for row in table.cssselect('tr:nth-of-type(n+2)'):
            alderperson = alderperson_selector(row)

            # Skip the row if it has no linked alderperson name
            if not alderperson:
                continue

            term = term_selector(row)
            party = party_selector(row)
            notes = notes_selector(row)

            # Split the term cell on the en dash into start and end dates
            if term:
                term_parts = _cell_text(term[0]).split('–')
                start_date = term_parts[0].strip()
                end_date = term_parts[1].strip() if len(term_parts) > 1 else "Present"
            else:
                start_date = end_date = 'N/A'

            all_data.append({
                "Ward": ward_name,
                "Alderperson": alderperson[0].text_content().strip(),
                "Start Date": start_date,
                "End Date": end_date,
                "Party": party[0].text_content().strip() if party else 'N/A',
                "Notes": _cell_text(notes[0]) if notes else 'N/A',
            })

    # Persist the rows as JSON unless the caller opted out
    if output_path is not None:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(all_data, f, ensure_ascii=False, indent=4)

    return all_data

# Run the scraper against the page above and keep the resulting rows in memory
scraped_data1 = scrape_wikipedia_tables(url=url)



In [None]:
import pandas as pd

# Load the JSON file from disk into a DataFrame
df = pd.read_json('scraped_data.json')

# Remove pandas' row limit so the printed table is not truncated
pd.options.display.max_rows = None

# Print every record as a plain-text table
print(df)

