In [None]:
import json
import lxml.html
import httpx
from lxml.cssselect import CSSSelector
from collections import defaultdict

# Source page for the scrape: Wikipedia's list of Chicago alderpersons since 1923.
url = "https://en.wikipedia.org/wiki/List_of_Chicago_alderpersons_since_1923"

def scrape_wikipedia_tables(url):
    '''
    Scrape data about Chicago alderpersons from a Wikipedia page.

    Every ``h3`` heading on the page is treated as a ward header, and the
    first ``wikitable sortable`` table that belongs to that heading is parsed
    row by row. Rows without a linked alderperson name are ignored.

    Side effects:
        - Writes one record per table row to ``table_data.json`` in the
          current working directory. Each record has the keys:
            - "Ward": The ward heading text (e.g., "1st Ward")
            - "Alderperson": The name of the alderperson
            - "Start Date": The start date of their term ("N/A" if the term
              cell is missing)
            - "End Date": The end date of their term ("Present" if the term
              has no end date, "N/A" if the term cell is missing)
            - "Party": The political party affiliation ("N/A" if missing)
            - "Notes": Any additional notes ("N/A" if missing)
        - Prints progress messages and a report of alderpersons who appear
          under more than one ward.

    Args:
        url (str): The URL of the Wikipedia page to scrape.

    Returns:
        list: A list of dictionaries, one per distinct alderperson name, in
              order of first appearance. The keys in each dictionary are:
              - "Alderperson": The name of the alderperson
              - "Wards": Comma-separated distinct wards the alderperson
                appears under, in order of first appearance

    Raises:
        httpx.HTTPStatusError: If the page request returns a 4xx/5xx status.
    '''

    def cell_text(cells):
        # Collapse every text node of the first matched cell into one line.
        return ' '.join(cells[0].xpath('.//text()')).strip().replace('\n', ' ')

    # Get the webpage content. Fail loudly on an error status rather than
    # parsing an error page and silently writing an empty JSON file.
    response = httpx.get(url)
    response.raise_for_status()
    root = lxml.html.fromstring(response.content)

    all_data = []
    # Maps alderperson name -> distinct wards they appear under (insertion order)
    alderperson_wards = defaultdict(list)

    # The column selectors do not depend on the ward, so build them once.
    alderperson_selector = CSSSelector('th[scope="row"] a')
    term_selector = CSSSelector('td:nth-of-type(4)')
    party_selector = CSSSelector('td:nth-of-type(5) a')
    notes_selector = CSSSelector('td:nth-of-type(6)')

    # Find all h3 elements (ward headers)
    for ward_header in root.cssselect('h3'):
        # Extract full ward name
        ward_name = ward_header.text_content().strip()
        print(f"Processing ward: {ward_name}")

        # Find the table following this ward header
        candidates = ward_header.xpath('./following::table[@class="wikitable sortable"][1]')
        table = candidates[0] if candidates else None

        if table is not None:
            # "following::" looks through the rest of the document, so a
            # heading without its own table would otherwise pick up the next
            # heading's table and file its rows under the wrong ward. Accept
            # the table only if this heading is the nearest h3 before it.
            nearest_header = table.xpath('./preceding::h3[1]')
            if not nearest_header or nearest_header[0] is not ward_header:
                table = None

        if table is None:
            print(f"Warning: No table found for ward: {ward_name}. Skipping.")
            continue  # Skip if no table found for this ward

        # Iterate through each row after the header row
        for row in table.cssselect('tr:nth-of-type(n+2)'):
            alderperson = alderperson_selector(row)

            # Skip the row if 'Alderperson' is empty
            if not alderperson:
                continue

            term = term_selector(row)
            party = party_selector(row)
            notes = notes_selector(row)

            # Extract and format the term dates ("start – end", en dash)
            if term:
                term_parts = cell_text(term).split('–')
                start_date = term_parts[0].strip()
                end_date = term_parts[1].strip() if len(term_parts) > 1 else ''
                # No dash, or nothing after the dash: the term is still open.
                end_date = end_date or "Present"
            else:
                start_date = end_date = 'N/A'

            notes_text = cell_text(notes) if notes else 'N/A'
            alderperson_name = alderperson[0].text_content().strip()

            # Record the ward once per alderperson. Several terms in the same
            # ward must not make them look like they served multiple wards.
            wards_served = alderperson_wards[alderperson_name]
            if ward_name not in wards_served:
                wards_served.append(ward_name)

            # Append the row's data to the all_data list
            all_data.append({
                "Ward": ward_name,
                "Alderperson": alderperson_name,
                "Start Date": start_date,
                "End Date": end_date,
                "Party": party[0].text_content().strip() if party else 'N/A',
                "Notes": notes_text
            })
            print(f"Added alderperson: {alderperson_name}, ward: {ward_name}")

    # Build the summary in which each alderperson appears exactly once,
    # together with every distinct ward they were listed under.
    unique_alderpersons = []

    for name, wards in alderperson_wards.items():
        joined_wards = ', '.join(wards)
        unique_alderpersons.append({
            "Alderperson": name,
            "Wards": joined_wards,
        })
        print(f"Processed alderperson: {name} -> Wards: {joined_wards}")

    # Print out repeated alderpersons and their wards
    print("\nAlderpersons who served in multiple wards over time:")
    for name, wards in alderperson_wards.items():
        if len(wards) > 1:
            print(f"{name} served in wards: {', '.join(wards)}")

    # Save the per-term records to a JSON file
    with open('table_data.json', 'w', encoding='utf-8') as f:
        json.dump(all_data, f, ensure_ascii=False, indent=4)

    return unique_alderpersons

# Call the function to scrape data. The call also writes the per-term records
# to table_data.json; scraped_data holds the returned per-alderperson summary
# (each name with the wards it was listed under).
scraped_data = scrape_wikipedia_tables(url)




In [7]:
import pandas as pd

# Load the records saved in table_data.json into a DataFrame
df = pd.read_json('table_data.json')

# Remove the row cap so the whole table is shown instead of a truncated view
pd.options.display.max_rows = None

# Print every row of the DataFrame
print(df)



          Ward               Alderperson          Start Date  \
0     1st Ward             John Coughlin      April 16, 1923   
1     1st Ward             Michael Kenna      April 12, 1939   
2     1st Ward             John Budinger       April 9, 1943   
3     1st Ward           John D'Arco Sr.                1951   
4     1st Ward                 Fred Roti                1968   
5     1st Ward                Ted Mazola                1991   
6     1st Ward             Jesse Granato                1995   
7     1st Ward             Manuel Flores        May 19, 2003   
8     1st Ward          Proco Joe Moreno      March 26, 2010   
9     1st Ward           Daniel La Spata        May 20, 2019   
10    2nd Ward         Louis B. Anderson      April 16, 1923   
11    2nd Ward         William L. Dawson                1933   
12    2nd Ward         Earl B. Dickerson      April 12, 1939   
13    2nd Ward         William H. Harvey       April 9, 1943   
14    2nd Ward           Fred D. Hubbard