In [30]:
# Read in
import lxml.html
import httpx 
import pandas as pd
import re
import json
from lxml.cssselect import CSSSelector

In [31]:
def scrape_wikipedia_tables(url):
    """
    Scrape data about Chicago alderpersons from tables in a Wikipedia page.

    This function extracts information about alderpersons for each ward in 
    Chicago from tables in a given Wikipedia URL. It collects data including 
    the ward number, alderperson name, start and end dates of their term, party 
    affiliation, and any additional notes.

    Args:
        url (str): The URL of the Wikipedia page to scrape.

    Returns:
        list: A list of dictionaries, where each dictionary contains information about
        an alderperson. The keys in each dictionary are:
        - "Ward": The ward number
        - "Alderperson": The name of the alderperson
        - "Start Date": The start date of their term
        - "End Date": The end date of their term (or "Present" if still in office)
        - "Party": The political party affiliation of the alderperson
        - "Notes": Any additional notes about the alderperson or their term
    """

    response = httpx.get(url)
    root = lxml.html.fromstring(response.content)
    
    all_data = []

    # Find all tables with class "wikitable sortable"
    tables = root.cssselect('table.wikitable.sortable')

    for table in tables:
        # Try to find the preceding h3 element for ward information
        ward_header = table.xpath('./preceding::h3[1]')
        if ward_header:
            ward_text = ward_header[0].text_content().strip()
            ward_number = ward_text.split()[0]  
        else:
            continue  # Skip if no ward header found

        # Define specific CSS selectors for each column
        alderperson_selector = CSSSelector('th[scope="row"] a')
        term_selector = CSSSelector('td:nth-of-type(4)')
        party_selector = CSSSelector('td:nth-of-type(5) a')
        notes_selector = CSSSelector('td:nth-of-type(6)')

        # Extract rows from the table
        rows = table.cssselect('tr:nth-of-type(n+2)')

        # Iterate through each row
        for row in rows:
            # Extract data using the defined CSS selectors
            alderperson = alderperson_selector(row)
            if not alderperson:
                continue
            alderperson = alderperson[0].text_content().strip()
            
            term = term_selector(row)
            if term:
                term_text = ' '.join(term[0].xpath('.//text()')).strip().replace('\n', ' ')
                term_parts = term_text.split('–')
                start_date = term_parts[0].strip()  # Get the start date
                end_date = term_parts[1].strip() if len(term_parts) > 1 else "Present"  # Get the end date
            else:
                start_date = end_date = 'N/A'

            party = party_selector(row)
            party = party[0].text_content().strip() if party else 'N/A'
            
            notes = notes_selector(row)
            notes_text = ' '.join(notes[0].xpath('.//text()')).strip().replace('\n', ' ') if notes else 'N/A'

            row_data = {
                "Ward": ward_number,
                "Alderperson": alderperson,
                "Start Date": start_date,
                "End Date": end_date,
                "Party": party,
                "Notes": notes_text
            }

            # Append the data to the list
            all_data.append(row_data)

    # Save the data to a JSON file
    with open('getnet_data.json', 'w', encoding='utf-8') as f:
        json.dump(all_data, f, ensure_ascii=False, indent=4)

    return all_data

# Call the function and store the scraped data
#scraped_data = scrape_wikipedia_tables(url)


# Scraping Wikipedia: Bullet Points

We broke up scraping into two tasks: scraping tables and scraping bullet points. This jupyter notebook walks you through Libby's bullet scraping process. 

## Ultimate Goal
Compile a database of all the aldermen who served between 2012 and 2023. 

## Steps
This will be broken down into two parts. Part 1 is where I will grab all the bullet points from wikipedia. Part 2 is where I scrape the pages of people who have profile links.

# Part 1

In [32]:
def compile_aldermen_for_wards(ul_element, ward_number, aldermen_dict):
    """
    Gather each the ward aldermen for wards whose aldermen are listed as bullet oints. 
    
    inputs:
        ul_element: element known to contain bullet points
        ward_number: str associated with number of ward
        aldermen_dict: dictionary to contain ward_numbers as the key, values 
        will be a list of dictionaries. Key is an alderman's name, value is any 
        link associated with them
    
    outputs: 
        aldermen_dict: dictionary to contain ward_numbers as the key, values 
        will be a list of dictionaries. Key is an alderman's name, value is any 
        link associated with them
    
    """
    # Define key elements
    
    ward_aldermen = []
    bullet_points = ul_element.cssselect("li")
    for bullet in bullet_points:
        links = bullet.cssselect('a')
        profile_link = None        
        if links is not None:
            for element in links:
                # Wikipedia will often link to a file or its bibliography. I 
                # only want a link if it has the person's biography there
                href = element.get('href')
                if re.search("File", str(href), re.IGNORECASE) or \
                    re.search("cite_note", str(href), re.IGNORECASE):
                    continue
                else:
                    profile_link = "https://en.wikipedia.org/" + href
                    break
        # Now I should create a dictionary with the indivudual as the key and 
        # the links as the value
        # I will then add this dictionary to a list. This list will then be the 
        # value of final dictionary where the key is the ward
        
        # clean name courtesy of: https://stackoverflow.com/questions/640001/how-can-i-remove-text-within-parentheses-with-a-regex
        clean_name = re.sub(r'\([^)]*\)', '', bullet.text_content())
        person_tuple = (str.strip(clean_name), profile_link)
        ward_aldermen.append(person_tuple)
    
    aldermen_dict[ward_number] = ward_aldermen
    
    return aldermen_dict
    

In [33]:
def find_ward():

    aldermen_dict = {}
    
    # Define key elements
    url = "https://en.wikipedia.org/wiki/List_of_Chicago_alderpersons_since_1923"
    url_text = httpx.get(url)
    root = lxml.html.fromstring(url_text.text)

    body_content = root.cssselect('div.mw-body-content')[0]
    heading = body_content.cssselect('div.mw-heading.mw-heading3')

    for ward_number in heading:
        num = re.findall("\\d+", ward_number.text_content())
        ward = num[0]
        
        # Each part of the page is structured a bit differently. 
        # Sometimes, I a ul element is found immediately after the heading. Other 
        # times, they have figures or paragraphs after the heading and before the 
        # list of aldermen. For that reason, I grab the next five elements.
        
        sibling_1 = ward_number.getnext()
        sibling_2 = sibling_1.getnext()
        sibling_3 = sibling_2.getnext()
        sibling_4 = sibling_3.getnext()
        sibling_5 = sibling_4.getnext()
        
        for element in [sibling_1, sibling_2, sibling_3, sibling_4, sibling_5]:
            if re.search("table", str(element), re.IGNORECASE):
                # if there's a table associated with this ward, I don't care about
                # it, so I move onto the next ward
                break
            elif re.search("ul", str(element), re.IGNORECASE):
                aldermen_dict = \
                    compile_aldermen_for_wards(element, ward, aldermen_dict)
                break
            else:
                continue

    return aldermen_dict
        
    

In [34]:
aldermen_dict = find_ward()

aldermen_dict

{'5': [('Charles S. Eaton', None),
  ('Leonard J. Grossman', None),
  ('Charles S. Eaton', None),
  ('Irving J. Schreiber', None),
  ('James J. Cusack Jr.', None),
  ('Paul Howard Douglas',
   'https://en.wikipedia.org//wiki/Paul_Howard_Douglas'),
  ('Bertram B. Moss', None),
  ('Robert E. Merriam', 'https://en.wikipedia.org//wiki/Robert_E._Merriam'),
  ('Leon Despres', 'https://en.wikipedia.org//wiki/Leon_Despres'),
  ('Ross Lathrop', None),
  ('Lawrence Bloom', None),
  ('Barbara Holt', None),
  ('Leslie Hairston', 'https://en.wikipedia.org//wiki/Leslie_Hairston')],
 '6': [('Guy Guernsey', None),
  ('John F. Healy', None),
  ('Patrick Sheridan Smith', None),
  ('Francis J. Hogan', None),
  ('David R. Muir', None),
  ('Sydney A. Jones Jr.', None),
  ('Robert H. Miller', None),
  ('A. A. Rayner Jr.',
   'https://en.wikipedia.org//w/index.php?title=A._A._Rayner_Jr.&action=edit&redlink=1'),
  ('Eugene Sawyer', 'https://en.wikipedia.org//wiki/Eugene_Sawyer'),
  ('John O. Steele',
   'http

# Part 2
Great! Now that I have my dictionary. I find the start and end year for each person. I am particularly interested in the years 2012-2023, so I won't bother to get data for everyone. 

In [46]:
def find_alder_link(aldermen_dict):
    aldermen_dates_dict = {}
    for ward, aldermen_and_links in aldermen_dict.items():
        all_info = []
        for tup in aldermen_and_links:
            alderperson = tup[0]
            link = tup[1]
            # we have a series of different labels for dates so we can evaluate 
            # what has happened
            if link is not None:
                dates = "link exists"
                resp = httpx.get(link)
                if resp.status_code == 200:
                    root = lxml.html.fromstring(resp.text)
                    # This is the box that has the in office dates, but sometimes 
                    # it doesn't exist
                    info_box = root.cssselect("table.infobox.vcard")
                    print(alderperson,info_box, len(info_box))
                    if len(info_box) == 1:
                        info_elements = info_box[0].cssselect('th.infobox-header')
                        print(info_elements)
                        if len(info_elements) == 0:
                            dates = "no headings"
                        else:
                            for element in info_elements:
                                print(f"{ward}, {alderperson}, has this heading: {element.text_content()}")
                                # we want to find the row header that is associated 
                                # with being a Chicago alderperson — and not the council president                       
                                if re.search("ward", (element.text_content()), re.IGNORECASE) \
                                    or re.search("alder", element.text_content(), \
                                        re.IGNORECASE) or re.search("Chicago City Council", \
                                            element.text_content(), re.IGNORECASE) and \
                                            not re.search("President", element.text_content(), re.IGNORECASE):
                                    print("Good heading!")
                                    # We also want to see if they have the correct ward listed -- if there is one. 
                                    if re.search("\\d+", element.text_content(), re.IGNORECASE) is None or ward in element.text_content():
                                        print("either ward in text or no ward")
                                        # We want to find the row itself this is associated 
                                        # with rather than the actual header     
                                        row_parent = element.getparent()
                                        # The header we want is in the next two rows
                                        header_option1 = row_parent.getnext()
                                        header_option2 = header_option1.getnext()
                                        if re.search("office", header_option1.text_content(), re.IGNORECASE):
                                            dates_raw = header_option1.text_content()
                                            dates_raw = re.sub("In office", "", dates_raw)
                                            dates_raw = re.sub("Assumed office", "", dates_raw)
                        # clean name courtesy of: https://stackoverflow.com/questions/640001/how-can-i-remove-text-within-parentheses-with-a-regex
                                            dates_raw = re.sub(r'\([^)]*\)', ' ', dates_raw)
                                            dates_raw = re.sub(r'\xa0', ' ', dates_raw)
                                            dates = str.strip(re.sub(r'\[[^)]*\]', ' ', dates_raw))
                                        elif re.search("office", header_option2.text_content(), re.IGNORECASE):
                                            dates_raw = header_option2.text_content()
                                            dates_raw = re.sub("In office", "", dates_raw)
                                            dates_raw = re.sub("Assumed office", "", dates_raw)
                        # clean name courtesy of: https://stackoverflow.com/questions/640001/how-can-i-remove-text-within-parentheses-with-a-regex
                                            dates_raw = re.sub(r'\([^)]*\)', ' ', dates_raw)
                                            dates_raw = re.sub(r'\xa0', ' ', dates_raw)
                                            dates = str.strip(re.sub(r'\[[^)]*\]', '', dates_raw))
                                        else:
                                            dates = "unknown from link"
                                        break
                                else:
                                    print("bad heading")
                                    dates = "no suitable heading"
                    else:
                        dates = "ERROR, BOX ELEMENT"
                    
                else:
                    dates = "ERROR, BAD LINK"       
            else:
                dates = "Unknown"
                
            person_tup = (alderperson, dates)
            all_info.append(person_tup)
            
        aldermen_dates_dict[ward] = all_info
            
    return aldermen_dates_dict

I've checked through the bad links and decided I'm okay with letting those go. 

Now it's time to finalize a list of aldermen who have a start date and end date, add them to a dictionary

In [47]:
aldermen_dates_dict = find_alder_link(aldermen_dict)

Paul Howard Douglas [<Element table at 0x11f8cd180>] 1
[<Element th at 0x12841b250>, <Element th at 0x12841a990>, <Element th at 0x12841a120>, <Element th at 0x128418af0>, <Element th at 0x11fc5e3a0>]
5, Paul Howard Douglas, has this heading: United States Senatorfrom Illinois
bad heading
5, Paul Howard Douglas, has this heading: Member of the Chicago City Councilfrom the 5th ward
Good heading!
either ward in text or no ward
Robert E. Merriam [] 0
Leon Despres [<Element table at 0x11f8cd180>] 1
[<Element th at 0x11f89dd10>, <Element th at 0x12804b7f0>]
5, Leon Despres, has this heading: Chicago Alderman from the 5th ward
Good heading!
either ward in text or no ward
Leslie Hairston [<Element table at 0x12841a120>] 1
[<Element th at 0x128c984b0>, <Element th at 0x12841b250>]
5, Leslie Hairston, has this heading: Member of the Chicago City Councilfrom the 5th ward
Good heading!
either ward in text or no ward
Eugene Sawyer [<Element table at 0x128c98550>] 1
[<Element th at 0x128418af0>, <E

In [48]:
aldermen_dates_dict

{'5': [('Charles S. Eaton', 'Unknown'),
  ('Leonard J. Grossman', 'Unknown'),
  ('Charles S. Eaton', 'Unknown'),
  ('Irving J. Schreiber', 'Unknown'),
  ('James J. Cusack Jr.', 'Unknown'),
  ('Paul Howard Douglas', '1939–1942'),
  ('Bertram B. Moss', 'Unknown'),
  ('Robert E. Merriam', 'ERROR, BOX ELEMENT'),
  ('Leon Despres', '1955–1975'),
  ('Ross Lathrop', 'Unknown'),
  ('Lawrence Bloom', 'Unknown'),
  ('Barbara Holt', 'Unknown'),
  ('Leslie Hairston', 'May 1999 – May 15, 2023')],
 '6': [('Guy Guernsey', 'Unknown'),
  ('John F. Healy', 'Unknown'),
  ('Patrick Sheridan Smith', 'Unknown'),
  ('Francis J. Hogan', 'Unknown'),
  ('David R. Muir', 'Unknown'),
  ('Sydney A. Jones Jr.', 'Unknown'),
  ('Robert H. Miller', 'Unknown'),
  ('A. A. Rayner Jr.', 'ERROR, BAD LINK'),
  ('Eugene Sawyer', 'February 28, 1971  – December 2, 1987'),
  ('John O. Steele', 'ERROR, BAD LINK'),
  ('Freddrenna Lyle', 'February 8, 1998 – May 15, 2011'),
  ('Roderick Sawyer', 'May 16, 2011 – May 15, 2023')],
 '7

In [11]:
def aldermen_and_dates(aldermen_dates_dict):
    all_data = []
    alderman_and_dates = {}

    for ward, values in aldermen_dates_dict.items():
        aldermen = []
        for alder_info in values:
            alderperson = alder_info[0]
            dates = alder_info[1]
            if re.search("\\d+", dates):
                # Two types of hyphens I have to be aware of here
                if re.search('–', dates):
                    beg_end = re.split('–', dates)
                    begin = beg_end[0]
                elif re.search('-', dates):
                    beg_end = re.split('-', dates)
                    begin = beg_end[0]
                else:
                    beg_end = []
                    begin = dates
                if len(beg_end) == 2:
                    end = beg_end[1]
                else:
                    end = "present"
                    
            # To ensure I'll match Getnet's data, I'll copy the format he used
                row_data = {
                    "Ward": ward,
                    "Alderperson": alderperson,
                    "Start Date": str.strip(begin),
                    "End Date": str.strip(end),
                    "Party": None,
                    "Notes": None
                }
                all_data.append(row_data)
                    
            else:
                continue
            
    #with open('../data/wiki/libbys_scraped_data.json', 'w', encoding='utf-8') as f:
    #    json.dump(all_data, f, ensure_ascii=False, indent=4)

    return all_data
            
                
            

# Test Combining

In [16]:
url = 'https://en.wikipedia.org/wiki/List_of_Chicago_alderpersons_since_1923'
    
    # Call the function and store the scraped data
scraped_data = scrape_wikipedia_tables(url)

alderperson_dict = find_ward()
alder_dates = find_alder_link(alderperson_dict)
bullet_data = aldermen_and_dates(alder_dates)

5, Paul Howard Douglas, has this heading: United States Senatorfrom Illinois
5, Paul Howard Douglas, has this heading: Member of the Chicago City Councilfrom the 5th ward
Good heading!
either ward in text or no ward
5, Leon Despres, has this heading: Chicago Alderman from the 5th ward
Good heading!
either ward in text or no ward
5, Leslie Hairston, has this heading: Member of the Chicago City Councilfrom the 5th ward
Good heading!
either ward in text or no ward
6, Eugene Sawyer, has this heading: 53rd Mayor of Chicago
6, Eugene Sawyer, has this heading: Member of the Chicago City Councilfrom the 6th ward
Good heading!
either ward in text or no ward
6, Freddrenna Lyle, has this heading: Judge of the Circuit Court of Cook County
6, Freddrenna Lyle, has this heading: Member of the Chicago City Council from the 6th ward
Good heading!
either ward in text or no ward
6, Roderick Sawyer, has this heading: Member of the Chicago City Councilfrom the 6th ward
Good heading!
either ward in text or 

In [19]:
all_data = []

all_data.extend(scraped_data)

all_data.extend(bullet_data)

In [20]:
all_data

[{'Ward': '1st',
  'Alderperson': 'John Coughlin',
  'Start Date': 'April 16, 1923',
  'End Date': 'November 11, 1938',
  'Party': 'Democratic',
  'Notes': 'Had been serving since 1892    Died in office'},
 {'Ward': '1st',
  'Alderperson': 'Michael Kenna',
  'Start Date': 'April 12, 1939',
  'End Date': 'April 9, 1943',
  'Party': 'Democratic',
  'Notes': 'Had previously served from 1897 to 1923'},
 {'Ward': '1st',
  'Alderperson': 'John Budinger',
  'Start Date': 'April 9, 1943',
  'End Date': '1951',
  'Party': 'Democratic',
  'Notes': 'Had previously served (previous iteration of) the 4th ward from 1910 to 1912'},
 {'Ward': '1st',
  'Alderperson': "John D'Arco Sr.",
  'Start Date': '1951',
  'End Date': '1963',
  'Party': 'Democratic',
  'Notes': ''},
 {'Ward': '1st',
  'Alderperson': 'Fred Roti',
  'Start Date': '1968',
  'End Date': '1991',
  'Party': 'Democratic',
  'Notes': ''},
 {'Ward': '1st',
  'Alderperson': 'Ted Mazola',
  'Start Date': '1991',
  'End Date': '1995',
  'Part

In [21]:
pd.DataFrame(all_data)

Unnamed: 0,Ward,Alderperson,Start Date,End Date,Party,Notes
0,1st,John Coughlin,"April 16, 1923","November 11, 1938",Democratic,Had been serving since 1892 Died in office
1,1st,Michael Kenna,"April 12, 1939","April 9, 1943",Democratic,Had previously served from 1897 to 1923
2,1st,John Budinger,"April 9, 1943",1951,Democratic,Had previously served (previous iteration of) ...
3,1st,John D'Arco Sr.,1951,1963,Democratic,
4,1st,Fred Roti,1968,1991,Democratic,
...,...,...,...,...,...,...
199,46,Angela Clay,"May 15, 2023",present,,
200,48,Kathy Osterman,1987,1989,,
201,48,Mary Ann Smith,1989,2011,,
202,48,Harry Osterman,"May 16, 2011","May 15, 2023",,
