In [1]:
#Beautiful Soup is a Python package for parsing HTML and XML documents. 
#It creates a parse tree for parsed pages that can be used to extract data from HTML, which is useful for web scraping. 
#Prettify() function in BeautifulSoup will enable us to view how the tags are nested in the document.

#https://medium.com/analytics-vidhya/web-scraping-wiki-tables-using-beautifulsoup-and-python-6b9ea26d8722
#https://erikrood.com/Python_References/web_scrape.html

# Import packages for web scraping & data manipulation

In [3]:
#Packages
#--Web scraping packages
from bs4 import BeautifulSoup
import requests
#Pandas/numpy for data manipulation
import pandas as pd
import numpy as np

1) Using Beautiful Soup for web scraping

In [5]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)

# Fail loudly on an HTTP error (4xx/5xx) instead of silently parsing an error page.
response.raise_for_status()

# NOTE: despite its name, `website_url` holds the page's HTML text, not the URL.
website_url = response.text

# Empty list that will receive one record per scraped table row
postalcode_Canada = []

# Parse the HTML with BeautifulSoup for web scraping
soup = BeautifulSoup(website_url, "html.parser")

# Identify the table we want to scrape (None if no table has this class)
postalcode_Table = soup.find('table',{'class':'wikitable sortable'})

2) Extract the data from the table

In [6]:
# Collect one record per row of the first table on the page, keeping only
# rows that contain exactly three <td> cells.
# The list is rebuilt from scratch so re-running this cell does not append
# duplicate rows.
postalcode_Canada = []
try:
    for row in soup.find_all('table')[0].find_all('tr'):
        cols = row.find_all('td')
        if len(cols) == 3:
            # The first field is `website_url` (the fetched page text); it is kept
            # so every record still has four fields.
            postalcode_Canada.append((website_url, cols[0].text.strip(), cols[1].text.strip(), cols[2].text.strip()))
except IndexError:
    # find_all('table') returned an empty list: the page has no table at all.
    # Report it rather than hiding every possible error behind a bare except.
    print("No <table> element found on the page; no postal codes were extracted.")

# Convert the records to an array and check how many rows were collected.
# dtype=object stores references to the Python strings; the default fixed-width
# unicode dtype would size every cell to the longest string in the array.
postalcode_array = np.asarray(postalcode_Canada, dtype=object)
len(postalcode_array)

288

3) Convert the array to a DataFrame and rename its columns

In [15]:
# Wrap the scraped records in a DataFrame and label its four columns.
df = pd.DataFrame(postalcode_array)
df.columns = ['URL','PostalCode', 'Borough','Neighboorhood']

# Leave out the URL column; keep the three remaining columns and preview them.
df1 = df.drop(columns=['URL'])
df1.head(5)

Unnamed: 0,PostalCode,Borough,Neighboorhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [16]:
# Keep only the rows whose Borough is something other than "Not assigned".
df2 = df1.loc[df1['Borough'].ne("Not assigned")]
df2.head(5)

Unnamed: 0,PostalCode,Borough,Neighboorhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [17]:
#https://stackoverflow.com/questions/54216702/pandas-grouping-by-column-one-and-adding-comma-separated-entries-from-column-two
# Combine the neighbourhoods of each (PostalCode, Borough) pair into a single
# comma-separated string.
# dict.fromkeys() removes duplicates while keeping first-seen order; iterating
# a set of strings has no stable order between interpreter runs, which made
# the joined text non-reproducible.
dfResult = (
    df2.groupby(['PostalCode','Borough'])['Neighboorhood']
    .agg(lambda names: ', '.join(dict.fromkeys(names)))
    .reset_index()
)
dfResult.head(5)

Unnamed: 0,PostalCode,Borough,Neighboorhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek"
2,M1E,Scarborough,"Morningside, Guildwood, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


6) Replace the Neighboorhood value with the Borough value when Neighboorhood is "Not assigned" but Borough has a value.

In [18]:
# Rows that have a real Borough but whose Neighboorhood is still "Not assigned"
# take the Borough name as their Neighboorhood; all other rows are left as they are.
needs_borough_name = dfResult['Borough'].ne("Not assigned") & dfResult['Neighboorhood'].eq("Not assigned")
dfResult.loc[needs_borough_name, 'Neighboorhood'] = dfResult.loc[needs_borough_name, 'Borough']
dfResult.head(5)

Unnamed: 0,PostalCode,Borough,Neighboorhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek"
2,M1E,Scarborough,"Morningside, Guildwood, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [19]:
# Show the size of the aggregated table as (rows, columns).
print((len(dfResult.index), len(dfResult.columns)))

(103, 3)
