In [None]:
import json
import lxml.html
import httpx
from lxml.cssselect import CSSSelector

def _first_element_text(matches):
    """Return the stripped text content of the first match, or 'N/A' if there is none."""
    return matches[0].text_content().strip() if matches else 'N/A'


def _first_cell_text(matches, tighten_dash=False):
    """Return the space-joined text nodes of the first matched cell, or 'N/A'.

    Newlines inside the cell are replaced by spaces. When ``tighten_dash`` is
    true, the space following an en dash is removed as well.
    """
    if not matches:
        return 'N/A'
    text = ' '.join(matches[0].xpath('.//text()')).strip()
    if tighten_dash:
        text = text.replace('– ', '–')
    return text.replace('\n', ' ')


def scrape_wikipedia_tables(url, output_path='scraped_data.json', *, verbose=True):
    """Scrape every ``table.wikitable.sortable`` on a page into a list of rows.

    Each kept row is a four-item list: the text of the link inside the row's
    ``th[scope="row"]`` header, the text of the 4th ``td``, the text of the
    link inside the 5th ``td`` and the text of the 6th ``td``. A value that
    cannot be found is recorded as ``'N/A'``; rows without a header link are
    skipped entirely.

    Args:
        url: Address of the page to download.
        output_path: File the rows are written to as JSON (UTF-8, indent 4).
        verbose: When true, print per-row progress and the extracted values.

    Returns:
        The list of extracted rows (the same data written to ``output_path``).

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
    """
    response = httpx.get(url)
    # Fail loudly on an error response instead of parsing an error page and
    # silently producing an empty result.
    response.raise_for_status()
    root = lxml.html.fromstring(response.content)

    tables = root.cssselect('table.wikitable.sortable')

    # Selectors are compiled once and reused for every row.
    alderperson_selector = CSSSelector('th[scope="row"] a')
    term_selector = CSSSelector('td:nth-of-type(4)')
    party_selector = CSSSelector('td:nth-of-type(5) a')
    notes_selector = CSSSelector('td:nth-of-type(6)')

    all_data = []

    for table_index, table in enumerate(tables):
        # nth-of-type(n+2) leaves out the first <tr> among its siblings
        # (presumably the header row of the table).
        rows = table.cssselect('tr:nth-of-type(n+2)')

        for index, row in enumerate(rows):
            alderperson = alderperson_selector(row)

            # Values are computed before formatting: a backslash such as '\n'
            # inside an f-string expression is a SyntaxError before Python 3.12.
            alderperson_text = _first_element_text(alderperson)
            term_text = _first_cell_text(term_selector(row), tighten_dash=True)
            party_text = _first_element_text(party_selector(row))
            notes_text = _first_cell_text(notes_selector(row))

            if verbose:
                print(f"Processing table {table_index + 1}, row {index + 1}")
                print(f"Alderperson: {alderperson_text}")
                print(f"Term: {term_text}")
                print(f"Party: {party_text}")
                print(f"Notes: {notes_text}")

            # Skip the row if no alderperson link was found.
            if not alderperson:
                continue

            all_data.append([alderperson_text, term_text, party_text, notes_text])

    # ensure_ascii=False keeps non-ASCII characters (e.g. en dashes) readable.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(all_data, f, ensure_ascii=False, indent=4)

    return all_data

# Wikipedia page whose tables are scraped
url = 'https://en.wikipedia.org/wiki/List_of_Chicago_alderpersons_since_1923'

# Run the scraper; the returned rows are kept in `scraped_data`
scraped_data = scrape_wikipedia_tables(url=url)

print("Scraping completed and data saved to 'scraped_data.json'.")



In [373]:
import pandas as pd

# Lift pandas' row limit so printing a frame shows every row
pd.set_option('display.max_rows', None)

# Load the scraped rows from the JSON file into a DataFrame
df = pd.read_json('scraped_data.json')

# Print the full table
print(df)



                            0  \
0               John Coughlin   
1               Michael Kenna   
2               John Budinger   
3             John D'Arco Sr.   
4                   Fred Roti   
5                  Ted Mazola   
6               Jesse Granato   
7               Manuel Flores   
8            Proco Joe Moreno   
9             Daniel La Spata   
10          Louis B. Anderson   
11          William L. Dawson   
12          Earl B. Dickerson   
13          William H. Harvey   
14            Fred D. Hubbard   
15            William Barnett   
16                 Bobby Rush   
17         Madeline Haithcock   
18            Robert Fioretti   
19           Brian K. Hopkins   
20          Robert R. Jackson   
21           Benjaim A. Grant   
22    Oscar Stanton De Priest   
23        Archibald Carey Jr.   
24             Ralph Metcalfe   
25           Tyrone T. Kenner   
26            Dorothy Tillman   
27                 Pat Dowell   
28           Abraham H. Cohen   
29        