# Project: FML - Senator list 
#### Get Canadian Senate members, their province (division) and political party from 1972-2016 (29th-42nd Parliament)
Senators' names and birthdays are scraped from their Wikipedia pages.

NOTES:
 - From the 29th to the 42nd Parliament, there are a total of 1806 senator entries (combined across all parliaments; a senator is listed once for every parliament they served in)
 - <b>433 unique Senator Wiki profiles</b>
 - There are no missing Wiki profiles (the two links the scraper could not find are filled in manually below)
 - From 433 wiki profiles, this script found 431 actual birthdays 

Date: March 31, 2021<br/>
Python notebook: Kole Krstev

In [27]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

# Canadian Senate Parliaments WIKI pages url:
#######################################################################
# 29-42 parliament, e.g.: url = "https://en.wikipedia.org/wiki/List_of_senators_in_the_36th_Parliament_of_Canada"
#######################################################################

# Ordinals of the parliaments to scrape; each one is spliced into the list-page URL
parliaments = ["29th","30th","31st","32nd","33rd","34th","35th","36th","37th","38th","39th","40th","41st","42nd"]

wiki = "https://en.wikipedia.org"

# One combined dataframe per parliament is collected in this list
all_senators_29_43 = []

# A Name cell may end in a one-digit footnote marker (e.g. "Name1") that is
# not part of the anchor text of the senator's link
FOOTNOTE_DIGITS = "0123456789"


for par in tqdm(parliaments):

    url = wiki + "/wiki/List_of_senators_in_the_" + par + "_Parliament_of_Canada"
    # Fail loudly on a hung connection or an HTTP error page instead of
    # trying to parse it as a senator list
    r = requests.get(url, timeout=30)
    r.raise_for_status()

    #--------------------------------------------------------------------------------------------

    # Use Pandas read_html to read the tables straight from the scraped page,
    # keeping only the tables that contain the text "Date appointed".
    # The HTML is wrapped in StringIO because passing a literal HTML string
    # to read_html is deprecated.
    tables = pd.read_html(StringIO(r.text), match="Date appointed", header=0)

    # read_html does not keep hyperlinks, so the page is parsed once more with
    # Beautiful Soup to look up each senator's wiki link by name. The parse
    # only depends on the page, so it is done once per parliament.
    soup = BeautifulSoup(r.text, "html.parser")

    for table in tables:
        links = []
        for k in table["Name"]:
            anchors = []
            # skip empty / non-text cells (e.g. NaN) instead of crashing on k[-1]
            if isinstance(k, str) and k:
                if k[-1] in FOOTNOTE_DIGITS:
                    k = k[:-1]
                # find_all(..., limit=1) returns an empty list when nothing
                # matches, so only the first link with this exact anchor text is used
                anchors = soup.find_all('a', href=True, string=k, limit=1)

            # no link found -> null value
            links.append(wiki + anchors[0]["href"] if anchors else np.nan)

        table["Wiki_link"] = links
        # add the number of the Parliament as new column ("29th" -> "29")
        table["Parliament"] = par[:-2]

        # for 41-42 parliament tables, drop the last columns (start and end)
        if par in ["41st","42nd"]:
            table.drop(columns=['Start', 'End'], inplace=True, errors="ignore")

        # unify the column names (labels that are not present are left alone)
        table.rename(
            columns={
                "Appointed by1": "Appointed by",
                "Province (Division)[Details]": "Province (Division)",
            },
            inplace=True,
        )

        # drop the column that is not consistent with all tables
        table.drop(columns=['On the Advice of'], inplace=True, errors="ignore")

        # drop the first column (has different name for each table, so we'll drop it by position)
        table.drop(columns=table.columns[0], inplace=True)

    # here we concatenate all provinces as a Dataframe from the N-th Parliament
    all_province_tables = pd.concat(tables, ignore_index=True)

    # Append this Dataframe into the list
    all_senators_29_43.append(all_province_tables)


# create a final Dataframe of all Senators from 29-42 parliament
Canadian_Sen_29_43 = pd.concat(all_senators_29_43, ignore_index=True)


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




In [28]:
# Display the combined dataframe built from all the parliament pages
Canadian_Sen_29_43

Unnamed: 0,Name,Party,Province (Division),Date appointed,Appointed by,Left office,Reason,Wiki_link,Parliament
0,John Black Aird,Liberal,Ontario,"November 10, 1964",Pearson,"November 28, 1974",Resignation,https://en.wikipedia.org/wiki/John_Black_Aird,29
1,Hazen Argue,Liberal,Saskatchewan,"February 24, 1966",Pearson,"October 2, 1991",Death,https://en.wikipedia.org/wiki/Hazen_Argue,29
2,Martial Asselin,Progressive Conservative,Quebec,"September 1, 1972",Trudeau,"August 7, 1990",Resignation,https://en.wikipedia.org/wiki/Martial_Asselin,29
3,Michael Basha,Liberal,Newfoundland and Labrador,"January 24, 1951",St. Laurent,"November 18, 1976",Resignation,https://en.wikipedia.org/wiki/Michael_Basha,29
4,Louis-Philippe Beaubien,Progressive Conservative,Quebec,"November 16, 1960",Diefenbaker,"March 28, 1985",Death,https://en.wikipedia.org/wiki/Louis-Philippe_B...,29
...,...,...,...,...,...,...,...,...,...
1801,Howard Wetston,Non-affiliated,Ontario,"November 10, 2016","Trudeau, J.",Incumbent,,https://en.wikipedia.org/wiki/Howard_Wetston,42
1802,Howard Wetston,Independent Senators Group,Ontario,"November 10, 2016","Trudeau, J.",Incumbent,,https://en.wikipedia.org/wiki/Howard_Wetston,42
1803,Vernon White,Conservative,Ontario,"February 20, 2012",Harper,Incumbent,,https://en.wikipedia.org/wiki/Vernon_White_(po...,42
1804,Yuen Pau Woo,Non-affiliated,British Columbia,"November 10, 2016","Trudeau, J.",Incumbent,,https://en.wikipedia.org/wiki/Yuen_Pau_Woo,42


In [30]:
# (rows, columns) of the combined dataframe
Canadian_Sen_29_43.shape

(1806, 9)

In [31]:
# number of rows for which no wiki profile link was found (Wiki_link is null)
Canadian_Sen_29_43["Wiki_link"].isnull().sum()

2

In [32]:
# Two rows (both for the same person) did not get a wiki link from the scraper,
# so list them here and fill them in manually
Canadian_Sen_29_43[Canadian_Sen_29_43["Wiki_link"].isnull()]

Unnamed: 0,Name,Party,Province (Division),Date appointed,Appointed by,Left office,Reason,Wiki_link,Parliament
1392,Nancy Ruth,Conservative,Ontario (Toronto),"March 24, 2005",Martin,"January 6, 2017",Retirement,,40
1547,Nancy Ruth,Conservative,Ontario (Toronto),"March 24, 2005",Martin,"January 6, 2017",Retirement,,41


In [36]:
# Fill in the link for every "Nancy Ruth" row that is still missing one.
# The rows are selected by name and missing link rather than by hard-coded
# row labels, which shift whenever the scraped Wikipedia tables change.
nancy_ruth_missing = (
    (Canadian_Sen_29_43["Name"] == "Nancy Ruth")
    & Canadian_Sen_29_43["Wiki_link"].isnull()
)
Canadian_Sen_29_43.loc[nancy_ruth_missing, "Wiki_link"] = "https://en.wikipedia.org/wiki/Nancy_Ruth"

In [39]:
# re-count the rows with a missing wiki link after the manual fix
Canadian_Sen_29_43["Wiki_link"].isnull().sum()

0

In [40]:
# Save this dataframe to a CSV file (the row index is written as the first column)
Canadian_Sen_29_43.to_csv('FINAL_FINAL_Senators_from_29_43_parliament.csv')

### Get birthdays for all Senators from the dataframe above

In [41]:
# Reload the saved senators CSV, using its first column as the row index, and display it
sen_29_43 = pd.read_csv("FINAL_FINAL_Senators_from_29_43_parliament.csv", index_col=0)
sen_29_43

Unnamed: 0,Name,Party,Province (Division),Date appointed,Appointed by,Left office,Reason,Wiki_link,Parliament
0,John Black Aird,Liberal,Ontario,"November 10, 1964",Pearson,"November 28, 1974",Resignation,https://en.wikipedia.org/wiki/John_Black_Aird,29
1,Hazen Argue,Liberal,Saskatchewan,"February 24, 1966",Pearson,"October 2, 1991",Death,https://en.wikipedia.org/wiki/Hazen_Argue,29
2,Martial Asselin,Progressive Conservative,Quebec,"September 1, 1972",Trudeau,"August 7, 1990",Resignation,https://en.wikipedia.org/wiki/Martial_Asselin,29
3,Michael Basha,Liberal,Newfoundland and Labrador,"January 24, 1951",St. Laurent,"November 18, 1976",Resignation,https://en.wikipedia.org/wiki/Michael_Basha,29
4,Louis-Philippe Beaubien,Progressive Conservative,Quebec,"November 16, 1960",Diefenbaker,"March 28, 1985",Death,https://en.wikipedia.org/wiki/Louis-Philippe_B...,29
...,...,...,...,...,...,...,...,...,...
1801,Howard Wetston,Non-affiliated,Ontario,"November 10, 2016","Trudeau, J.",Incumbent,,https://en.wikipedia.org/wiki/Howard_Wetston,42
1802,Howard Wetston,Independent Senators Group,Ontario,"November 10, 2016","Trudeau, J.",Incumbent,,https://en.wikipedia.org/wiki/Howard_Wetston,42
1803,Vernon White,Conservative,Ontario,"February 20, 2012",Harper,Incumbent,,https://en.wikipedia.org/wiki/Vernon_White_(po...,42
1804,Yuen Pau Woo,Non-affiliated,British Columbia,"November 10, 2016","Trudeau, J.",Incumbent,,https://en.wikipedia.org/wiki/Yuen_Pau_Woo,42


In [43]:
###############################################################################################################
######## FINAL SCRIPT TO GET ALL SENATORS (29-42 parliament) images and birthdate from THEIR WIKI PAGE  ##################
##############################################################################################################


from bs4 import BeautifulSoup
import requests
import re

import pandas as pd
from tqdm.auto import tqdm

# Text inside the first "( ... )" pair; compiled once and reused for every page
_PARENS = re.compile(r'\((.*?)\)')


def _first_parenthetical(text):
    """Return the text inside the first "( )" pair of *text*, or NaN if there is none."""
    found = _PARENS.search(text)
    return found.group(1) if found else np.nan


def _extract_birthday(soup, has_infobox):
    """Return the raw birthday text found on a parsed wiki page, or NaN.

    Lookup order:
      1. (pages with an infobox only) the cell next to the "Born" / "Born:"
         table header;
      2. the first paragraph of the page, if it contains "born" or "Born";
      3. the first text in "( )": taken from the element that follows the
         "More citations needed" box when the page has no infobox but has
         that box, otherwise from the "bodyContent" div.
    """
    if has_infobox:
        born_header = soup.find("th", string="Born")
        if born_header is None:
            born_header = soup.find("th", string="Born:")
        if born_header is not None:
            # .next_sibling can be a whitespace text node or None, so ask
            # explicitly for the following <td> cell
            born_cell = born_header.find_next_sibling("td")
            if born_cell is not None:
                return born_cell.get_text()

    # This might not always work but at least we get some birthdays
    first_paragraph = soup.find("p")
    if first_paragraph is None:
        return np.nan
    paragraph_text = first_paragraph.get_text()
    if ("born" in paragraph_text) or ("Born" in paragraph_text):
        return paragraph_text

    if not has_infobox:
        # A "Citation" box at the very top of the page might be mistaken for
        # the first paragraph of the content; the real first paragraph is
        # then the next sibling of that box.
        citation_box = soup.find("table", {"class":"box-More_citations_needed"})
        if citation_box is not None:
            after_box = citation_box.find_next_sibling()
            if after_box is None:
                return np.nan
            return _first_parenthetical(after_box.get_text())

    body = soup.find("div", {"id":"bodyContent"})
    if body is None:
        return np.nan
    # NOTE(review): the search runs over the div's HTML markup (str(body)),
    # not only its visible text, so the first "( )" may come from markup.
    return _first_parenthetical(str(body))


def _scrape_profile(url):
    """Fetch one wiki profile and return ``(image_src, birthday_text)``.

    Either element is NaN when it could not be found; both are NaN when the
    server does not answer with a successful status, so an error page is
    never parsed as a profile.
    """
    r = requests.get(url, timeout=30)
    if not r.ok:
        return np.nan, np.nan
    soup = BeautifulSoup(r.text, "html.parser")

    # A wiki page can have more than one infobox section (main top right
    # corner info container), but only the first one is used for the image
    infobox = soup.find("table", {"class":"infobox"})
    image_src = np.nan
    if infobox is not None:
        image = infobox.find("img")
        if image is not None and image.get("src"):
            image_src = image.get("src")

    return image_src, _extract_birthday(soup, infobox is not None)


# The same wiki link appears on many rows, so each distinct link is fetched
# only once and the result is mapped back onto every row that uses it.
# Rows without a link keep NaN in both new columns.
birthdays = {}
images = {}
for link in tqdm(sen_29_43["Wiki_link"].dropna().unique()):
    images[link], birthdays[link] = _scrape_profile(link)

sen_29_43["Birthday"] = sen_29_43["Wiki_link"].map(birthdays)
sen_29_43["Wiki_image"] = sen_29_43["Wiki_link"].map(images)

# remove all new lines in the Birthday column
sen_29_43["Birthday"] = sen_29_43["Birthday"].str.replace('\n','', regex=False)

#export to csv
sen_29_43.to_csv("VERY_FINAL_FINAL_Senators_from_29_43_parliament_IMAGES_and_BRITHDATES.csv")

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [44]:
# Display the dataframe with the scraped Birthday and Wiki_image columns
sen_29_43

Unnamed: 0,Name,Party,Province (Division),Date appointed,Appointed by,Left office,Reason,Wiki_link,Parliament,Birthday,Wiki_image
0,John Black Aird,Liberal,Ontario,"November 10, 1964",Pearson,"November 28, 1974",Resignation,https://en.wikipedia.org/wiki/John_Black_Aird,29,"(1923-05-05)May 5, 1923Toronto, Ontario, Canada",
1,Hazen Argue,Liberal,Saskatchewan,"February 24, 1966",Pearson,"October 2, 1991",Death,https://en.wikipedia.org/wiki/Hazen_Argue,29,"Hazen Robert Argue(1921-01-06)January 6, 1921K...",//upload.wikimedia.org/wikipedia/commons/thumb...
2,Martial Asselin,Progressive Conservative,Quebec,"September 1, 1972",Trudeau,"August 7, 1990",Resignation,https://en.wikipedia.org/wiki/Martial_Asselin,29,"(1924-02-03)February 3, 1924La Malbaie, Quebec...",
3,Michael Basha,Liberal,Newfoundland and Labrador,"January 24, 1951",St. Laurent,"November 18, 1976",Resignation,https://en.wikipedia.org/wiki/Michael_Basha,29,"(1896-01-20)20 January 1896Baalbeck, Lebanon",
4,Louis-Philippe Beaubien,Progressive Conservative,Quebec,"November 16, 1960",Diefenbaker,"March 28, 1985",Death,https://en.wikipedia.org/wiki/Louis-Philippe_B...,29,"(1903-03-03)March 3, 1903Montreal, Quebec, Canada",
...,...,...,...,...,...,...,...,...,...,...,...
1801,Howard Wetston,Non-affiliated,Ontario,"November 10, 2016","Trudeau, J.",Incumbent,,https://en.wikipedia.org/wiki/Howard_Wetston,42,"Howard I. Wetston (1947-06-03) June 3, 1947 (a...",
1802,Howard Wetston,Independent Senators Group,Ontario,"November 10, 2016","Trudeau, J.",Incumbent,,https://en.wikipedia.org/wiki/Howard_Wetston,42,"Howard I. Wetston (1947-06-03) June 3, 1947 (a...",
1803,Vernon White,Conservative,Ontario,"February 20, 2012",Harper,Incumbent,,https://en.wikipedia.org/wiki/Vernon_White_(po...,42,"(1959-02-21) February 21, 1959 (age 62)New Wa...",//upload.wikimedia.org/wikipedia/commons/thumb...
1804,Yuen Pau Woo,Non-affiliated,British Columbia,"November 10, 2016","Trudeau, J.",Incumbent,,https://en.wikipedia.org/wiki/Yuen_Pau_Woo,42,"(1963-03-02) March 2, 1963 (age 58)Malaysia",


In [45]:
# (rows, columns) after adding the Birthday and Wiki_image columns
sen_29_43.shape

(1806, 11)

In [46]:
# number of rows with no Birthday text (null values)
sen_29_43["Birthday"].isnull().sum()

0

In [48]:
# number of distinct Wiki profile links
sen_29_43["Wiki_link"].nunique()

433

In [49]:
# number of distinct Birthday strings (raw scraped text, not parsed dates)
sen_29_43["Birthday"].nunique()

418