In [1]:
import os
import urllib.request

import bs4 as bs
import pandas as pd
from geopy.geocoders import GoogleV3, Nominatim

In [2]:
# Download the Wikipedia page; the context manager closes the HTTP response
# as soon as the body has been read.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M') as response:
    source = response.read()

# Locate the postal-code table
soup = bs.BeautifulSoup(source, 'html.parser')
table = soup.find('table', attrs={'class':'wikitable sortable'})
if table is None:
    # Fail with a clear message instead of an AttributeError on None below
    raise ValueError("No table with class 'wikitable sortable' was found on the page")

# Get all table rows
table_rows = table.find_all('tr')

# Column names come from the <th> cells of the first row, with spaces removed
# (e.g. "Postal Code" -> "PostalCode")
ths = table_rows[0].find_all('th')
cols = [th.text.strip().replace(" ", "") for th in ths]

# Collect the text of every <td> cell, row by row. Rows without any <td>
# (such as the header row, which only has <th> cells) are skipped.
data = []
for tr in table_rows:
    tds = tr.find_all('td')
    row = [td.text.strip() for td in tds]
    if len(row) > 0:
        data.append(row)

In [3]:
# Build the DataFrame from the scraped rows, using the table headers as column labels
df_table = pd.DataFrame(data=data, columns=cols)
df_table.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Checking for repeated postal codes

In [4]:
# Count rows per postal code, largest first; any count above 1 would mean a repeated code
(
    df_table
    .groupby('PostalCode')[['Borough']]
    .count()
    .sort_values(by='Borough', ascending=False)
)

Unnamed: 0_level_0,Borough
PostalCode,Unnamed: 1_level_1
M1A,1
M6S,1
M6V,1
M6W,1
M6X,1
...,...
M4B,1
M4C,1
M4E,1
M4G,1


In [5]:
# Show every row recorded for postal code M5A
df_table.loc[df_table['PostalCode'].eq('M5A')]

Unnamed: 0,PostalCode,Borough,Neighbourhood
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Therefore, in the new table on Wikipedia, postal codes are not repeated. Now let's remove the rows whose Borough is Not assigned.

In [6]:
# Keep only the rows whose Borough is something other than 'Not assigned'
df_table = df_table.loc[df_table['Borough'].ne('Not assigned')]

The rows with a Not assigned Borough have now been removed from the table. Next, let's check whether any Neighbourhood still has the value Not assigned.

In [7]:
# List any remaining rows whose Neighbourhood is 'Not assigned'
df_table.loc[df_table['Neighbourhood'].eq('Not assigned')]

Unnamed: 0,PostalCode,Borough,Neighbourhood


The empty result shows that there are no Neighbourhoods with the value Not assigned. Therefore, we are done cleaning the table.

In [8]:
# (number of rows, number of columns) of the cleaned table
df_table.shape

(103, 3)

The cleaned table has 103 rows, one for each postal code with an assigned Borough.

In [13]:
# Read the Google Geocoding API key from the environment so the secret is
# never stored in the notebook source or its saved outputs.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this cell")

# Geocoder used by get_latlng to resolve "<postal code>, <borough>" queries
geolocator = GoogleV3(api_key=GOOGLE_API_KEY)

In [10]:
def get_latlng(row):
    """Geocode one table row and return a copy with coordinates attached.

    The query sent to the module-level ``geolocator`` is
    ``"<PostalCode>, <Borough>"``. The lookup result is printed so progress
    can be followed while ``DataFrame.apply`` walks the table.

    Parameters
    ----------
    row : pandas.Series
        A row exposing ``PostalCode`` and ``Borough``.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with ``Latitude`` and ``Longitude`` added. Both are
        NaN when the geocoder finds no match.
    """
    g = geolocator.geocode('{0}, {1}'.format(row.PostalCode, row.Borough))

    # Work on a copy: the object handed over by ``apply`` should not be mutated.
    result = row.copy()

    # The geocoder yields None when nothing matches; record missing
    # coordinates instead of crashing on ``g.latitude``.
    if g is None:
        print('Searched Address: {0}, {1} --> No address found'
              .format(row.PostalCode, row.Borough))
        result['Latitude'] = float('nan')
        result['Longitude'] = float('nan')
        return result

    print('Searched Address: {0}, {1} --> Found Address: {2} | lat: {3} | lng : {4}'
          .format(row.PostalCode, row.Borough, str(g), g.latitude, g.longitude))
    result['Latitude'] = g.latitude
    result['Longitude'] = g.longitude

    return result

In [11]:
# Geocode the table row by row; each returned row carries Latitude and Longitude
df_latlng = df_table.apply(func=get_latlng, axis='columns')

Searched Address: M3A, North York --> Found Address: North York, ON M3A, Canada | lat: 43.7532586 | lng : -79.3296565
Searched Address: M3A, North York --> Found Address: North York, ON M3A, Canada | lat: 43.7532586 | lng : -79.3296565
Searched Address: M4A, North York --> Found Address: North York, ON M4A, Canada | lat: 43.72588229999999 | lng : -79.3155716
Searched Address: M5A, Downtown Toronto --> Found Address: Toronto, ON M5A, Canada | lat: 43.6542599 | lng : -79.36063589999999
Searched Address: M6A, North York --> Found Address: North York, ON M6A, Canada | lat: 43.718518 | lng : -79.4647633
Searched Address: M7A, Downtown Toronto --> Found Address: North York, ON M7A, Canada | lat: 43.6623015 | lng : -79.3894938
Searched Address: M9A, Etobicoke --> Found Address: Etobicoke, ON M9A, Canada | lat: 43.6678556 | lng : -79.5322424
Searched Address: M1B, Scarborough --> Found Address: Scarborough, ON M1B, Canada | lat: 43.8066863 | lng : -79.1943534
Searched Address: M3B, North York 

Searched Address: M5P, Central Toronto --> Found Address: York, ON M5P, Canada | lat: 43.6969476 | lng : -79.4113072
Searched Address: M6P, West Toronto --> Found Address: Toronto, ON, Canada | lat: 43.653226 | lng : -79.3831843
Searched Address: M9P, Etobicoke --> Found Address: Etobicoke, ON M9P, Canada | lat: 43.696319 | lng : -79.5322424
Searched Address: M1R, Scarborough --> Found Address: North York, ON M1R, Canada | lat: 43.7500715 | lng : -79.2958491
Searched Address: M2R, North York --> Found Address: North York, ON M2R, Canada | lat: 43.7827364 | lng : -79.4422593
Searched Address: M4R, Central Toronto --> Found Address: North York, ON M4R, Canada | lat: 43.7153834 | lng : -79.4056784
Searched Address: M5R, Central Toronto --> Found Address: Toronto, ON M5R, Canada | lat: 43.6727097 | lng : -79.4056784
Searched Address: M6R, West Toronto --> Found Address: W Toronto St, Toronto, ON M6N, Canada | lat: 43.6694459 | lng : -79.46914749999999
Searched Address: M7R, Mississauga -->

In [12]:
# Preview the first five geocoded rows
df_latlng.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
