# Project: FML
#### Get Canadian Members of Parliament by Province, Electoral district and Political Party from 1972-2019 (29th-43rd Parliament)
MPs' names and birthdays are scraped from their Wikipedia pages

NOTES:
 - From the 29th to the 43rd Parliament, a total of 4882 MPs were elected (combined total)
 - <b>1950 unique MP names</b>
 - <b>1795 unique Wiki profiles</b>
 - Missing Wiki profiles are, in most cases, for MPs who switched parties, were replaced by someone else, etc.
 - From the 1795 Wiki profiles, this script retrieves 1744 birthdays; only 51 don't show an actual birthday (because of redirections and/or additional obstacles)
 

Date: March 29, 2021<br/>
Python script: Kole Krstev


In [None]:
import pandas as pd
import requests
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
import numpy as np

from io import StringIO

# Wikipedia page that lists the members of each Parliament:
#######################################################################
# 29-35 parliament: url = 'https://en.wikipedia.org/wiki/34th_Canadian_Parliament'
# 36-43 parliament: url = 'https://en.wikipedia.org/wiki/List_of_House_members_of_the_41st_Parliament_of_Canada'
#######################################################################

# The per-province member tables use a different column layout depending on the Parliament:
# 33rd and 37th-43rd parliaments have the ("Name", "Party", "Electoral district") layout
# 29th-32nd and 34th-36th parliaments have the ("Riding", "Member", "Political Party") layout
# (the 34th and 35th pages spell that last header "Political party")


parliaments = ["29th","30th","31st","32nd","33rd","34th","35th","36th","37th","38th","39th","40th","41st","42nd","43rd"]

wiki = "https://en.wikipedia.org"

# Parliaments whose member tables live on the "<N>_Canadian_Parliament" page
parliament_page_only = ("29th","30th","31st","32nd","33rd","34th","35th")

# Parliaments whose tables use the ("Name", "Party", "Electoral district") layout
name_first_layout = ("33rd","37th","38th","39th","40th","41st","42nd","43rd")

all_mps_29_43 = []


def _wiki_link_for(soup, name):
    """Return the absolute URL of the first link whose anchor text is *name*.

    Returns NaN when the page has no such link, so the row can later be
    recognised as "no Wiki profile".
    """
    anchor = soup.find('a', href=True, string=name)
    if anchor is None:
        return np.nan
    return wiki + anchor["href"]


for par in tqdm(parliaments):

    # the wiki URL pattern depends on the parliament
    if par in parliament_page_only:
        url = wiki + "/wiki/" + par + "_Canadian_Parliament"
    else:
        url = wiki + "/wiki/List_of_House_members_of_the_" + par + "_Parliament_of_Canada"

    # a timeout keeps the run from hanging forever; a failed request is reported
    # as an HTTP error instead of a confusing "no tables found" further down
    r = requests.get(url, timeout=30)
    r.raise_for_status()

    #--------------------------------------------------------------------------------------------

    # pick the header text that identifies the member tables and the column holding the MP's name
    if par in name_first_layout:
        header_match, name_column = "Electoral district", "Name"
    elif par in ("34th", "35th"):
        header_match, name_column = "Political party", "Member"
    else:
        header_match, name_column = "Political Party", "Member"

    # read_html extracts every table containing the header text;
    # the HTML is wrapped in StringIO because passing a literal string is deprecated
    tables = pd.read_html(StringIO(r.text), match=header_match, header=0)

    # Remove the last table on the 38th page: there is an additional table
    # matching "Electoral district" at the very end of that page
    if par == "38th":
        tables.pop()

    # read_html drops the hyperlinks, so parse the page once with Beautiful Soup
    # and look up each MP's link by anchor text
    soup = BeautifulSoup(r.text, "html.parser")

    for table in tables:
        # whole-column assignment avoids filling a new column cell by cell
        # with a mix of NaN and strings
        table["Wiki_link"] = [_wiki_link_for(soup, name) for name in table[name_column]]

        # number of the Parliament without its ordinal suffix, e.g. "29th" -> "29"
        table["Parliament"] = par[:-2]

    # concatenate all province tables of this Parliament into one Dataframe
    # and keep it for the final merge
    all_mps_29_43.append(pd.concat(tables, ignore_index=True))


# Now that we have a list of all dataframes from the 29th-43rd parliament,
# unify them so they all have the same columns before concatenating
#--------------------------------------------------------------------------------------------

# parliaments that have table layout (Riding, Member, Political Party) - that's 29,30,31,32,34,35,36 parliaments
table_layout_1 = ["29th","30th","31st","32nd","34th","35th","36th"]
indeces_29_30_31_32_34_35_36 = [parliaments.index(x) for x in table_layout_1]

# parliaments that have table layout (Name, Party, Electoral district) - that's 33,37-43 parliaments
table_layout_2 = ["33rd","37th","38th","39th","40th","41st","42nd","43rd"]
indeces_33_37_43 = [parliaments.index(x) for x in table_layout_2]

# the 34th and 35th pages spell the header "Political party" (lower-case "p");
# looked up by name instead of relying on hard-coded positions 5 and 6
lowercase_party_header = {parliaments.index("34th"), parliaments.index("35th")}

# the only columns kept in the final Dataframe, in this order
final_columns = ['Name', 'Party', 'Electoral district', 'Wiki_link', 'Parliament']


for index, election in enumerate(all_mps_29_43):

    if index in indeces_29_30_31_32_34_35_36:
        if index in lowercase_party_header:
            party_header = "Political party"
        else:
            party_header = "Political Party"
        election = election.drop(columns=['Riding']).rename(
            columns={"Riding.1": "Electoral district", "Member": "Name", party_header: "Party"}
        )

    elif index in indeces_33_37_43:
        election = election.drop(columns=['Unnamed: 0'])

    else:
        continue

    # Store the trimmed frame back into the list. Rebinding the loop variable
    # alone leaves the list entry untouched, so the column selection would be lost.
    all_mps_29_43[index] = election[final_columns]

#--------------------------------------------------------------------------------------------

# create a final Dataframe of all MPs from the 29th-43rd parliament
Canadian_Mps_29_43 = pd.concat(all_mps_29_43, ignore_index=True)

# save a CSV file from this dataframe
Canadian_Mps_29_43.to_csv('FINAL_FINAL_MPs_from_29_43_parliament.csv')

In [None]:
Canadian_Mps_29_43

In [None]:
Canadian_Mps_29_43.shape

### EDA of the Dataframe with MPs from the 29th-43rd Parliament


In [None]:
# Reload the combined MP table from the CSV file; the first CSV column is used as the index
Canadian_Mps_29_43 = pd.read_csv('FINAL_FINAL_MPs_from_29_43_parliament.csv',index_col=0)

In [None]:
Canadian_Mps_29_43

In [None]:
Canadian_Mps_29_43["Name"].unique()

In [None]:
# make sure we drop fully duplicated rows if there are any
Canadian_Mps_29_43.drop_duplicates(keep="first", inplace=True)

# remove every "*" from the MPs' names
# regex=False: "*" is a regex metacharacter and the default of `regex` differs between
# pandas versions, so state explicitly that this is a literal replacement
Canadian_Mps_29_43["Name"] = Canadian_Mps_29_43["Name"].str.replace("*", "", regex=False)

In [None]:
Canadian_Mps_29_43.shape

In [None]:
#number of Unique Wiki profiles -> total of 1794
Canadian_Mps_29_43["Wiki_link"].nunique()

In [None]:
# number of missing wiki profiles - null values (total of 214)
Canadian_Mps_29_43["Wiki_link"].isnull().sum()

### Export Unique MP's as CSV

In [None]:
### Sift the MPs with a unique wiki profile
# Only rows without a Wiki link are discarded: a bare dropna() would also throw away
# MPs that have a link but a missing value in some other column.
# Building a new frame also avoids mutating the result of drop_duplicates in place.
Mps_29_43_unique = (
    Canadian_Mps_29_43
    .dropna(subset=['Wiki_link'])
    .drop_duplicates(subset=['Wiki_link'], keep='first')
    .reset_index(drop=True)
)

In [None]:
Mps_29_43_unique

In [None]:
# Write the unique-Wiki-link MPs to CSV (the index is written as the first column)
Mps_29_43_unique.to_csv('MP_from_29_43_parliament_UNIQUE_WIKI_link.csv')

### Export Missing Wiki profile MP's as CSV file

In [None]:
# Collect every row whose Wiki_link is null into its own dataframe (renumbered from 0),
# so another way of finding those birthdays can be tried, and save it as CSV
Mps_29_43_no_wiki_profile = (
    Canadian_Mps_29_43
    .loc[Canadian_Mps_29_43["Wiki_link"].isna()]
    .reset_index(drop=True)
)
Mps_29_43_no_wiki_profile.to_csv('MP_from_29_43_parliament_WITHOUT_WIKI_link.csv')

In [None]:
Mps_29_43_no_wiki_profile

# Final script to get MPs' images and birthdates from their Wiki pages

In [None]:
# Load the MPs that have a unique Wiki link; the first CSV column is used as the index
mp_29_43 = pd.read_csv("MP_from_29_43_parliament_UNIQUE_WIKI_link.csv", index_col=0)
# Display the loaded frame
mp_29_43

In [None]:
###############################################################################################################
######## FINAL SCRIPT TO GET ALL MPs' (29-43 parliament) images and birthdates from THEIR WIKI PAGE  ##########
###############################################################################################################

import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

# first "( ... )" group on a single line; compiled once instead of on every page
_PARENS_RE = re.compile(r'\((.*?)\)')


def _first_parenthesised(text):
    """Return the content of the first "(...)" found in *text*, or NaN if there is none."""
    found = _PARENS_RE.search(text)
    return found.group(1) if found else np.nan


def _birthday_from_page(soup, has_infobox):
    """Return the best available birthday text for one MP page, or NaN.

    Tried in order:
      1. (infobox pages only) the cell next to a "Born" / "Born:" table header
      2. the first paragraph of the page, when it contains "born" or "Born"
      3. (pages without infobox) the first "(...)" in the element following a
         "More citations needed" box, when the page has such a box
      4. the first "(...)" in the markup of the page body
    """
    first_paragraph = soup.find("p")

    if has_infobox:
        # find the table header of the Personal details section
        # that contains the text "Born" or "Born:"
        for label in ("Born", "Born:"):
            header = soup.find("th", string=label)
            if header is None:
                continue
            # ask for the <td> explicitly: the direct next sibling may be
            # missing or a whitespace text node
            cell = header.find_next_sibling("td")
            if cell is not None:
                return cell.get_text()
        if first_paragraph is None:
            return np.nan

    if first_paragraph is not None:
        paragraph_text = first_paragraph.text
        if "born" in paragraph_text or "Born" in paragraph_text:
            return paragraph_text

    if not has_infobox:
        # a "Citation" box at the very top of the page might be mistaken for the
        # first paragraph of the content; the real one is the element right after it
        citation_box = soup.find("table", {"class": "box-More_citations_needed"})
        if citation_box is not None:
            following = citation_box.find_next_siblings()
            if following and following[0].text:
                return _first_parenthesised(following[0].text)
            return np.nan

    # last resort: the first "(...)" anywhere in the page body
    # NOTE(review): this searches the raw markup, so the match can contain tags
    body = soup.find("div", {"id": "bodyContent"})
    if body is not None:
        return _first_parenthesised(str(body))
    return np.nan


# results are collected in lists and assigned as whole columns afterwards, so both
# columns always exist and do not depend on the frame's index labels
wiki_images = []
birthdays = []

for k in tqdm(mp_29_43["Wiki_link"]):
    try:
        r = requests.get(k, timeout=30)
    except requests.RequestException as err:
        # best effort: one unreachable page must not abort the whole run
        tqdm.write("Skipping " + str(k) + ": " + str(err))
        wiki_images.append(np.nan)
        birthdays.append(np.nan)
        continue

    soup = BeautifulSoup(r.text, "html.parser")

    # A wiki page can have more than one infobox (main top-right info container),
    # but only the first one is used to get the image and the birthday
    # NOTE: We got better images for MPs from 36-43 parliament from the House of Commons website
    # If we need to use photos, we have the dataframe as csv file already
    infobox = soup.find("table", {"class": "infobox"})
    image = infobox.find("img") if infobox is not None else None

    wiki_images.append(image.get("src") if image is not None else np.nan)
    birthdays.append(_birthday_from_page(soup, infobox is not None))

mp_29_43["Wiki_image"] = wiki_images
# object dtype keeps the .str accessor usable even if no birthday was found at all
mp_29_43["Birthday"] = pd.Series(birthdays, index=mp_29_43.index, dtype=object)

# remove all new lines in the Birthday column
mp_29_43["Birthday"] = mp_29_43["Birthday"].str.replace('\n', '', regex=False)

#export to csv
mp_29_43.to_csv("VERY_FINAL_FINAL_MP_from_29_43_parliament_IMAGES_and_BRITHDATES.csv")

In [None]:
# Reload the exported images/birthdates file into mp_29_43, the name the following cells use.
# It was previously bound to the misspelled name "mp_29_4", so the reloaded data was never used.
mp_29_43 = pd.read_csv("VERY_FINAL_FINAL_MP_from_29_43_parliament_IMAGES_and_BRITHDATES.csv",index_col=0)

In [None]:
mp_29_43

In [None]:
#missing Birthday fields
mp_29_43["Birthday"].isnull().sum()

In [None]:
mp_29_43[mp_29_43["Birthday"].isnull()]