### First, import the libraries.

In [1]:
import json
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the Wikipedia list of Canadian postal codes starting with "M" and
# parse the HTML.  The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() turns an HTTP error into an exception
# instead of letting an error page be parsed as if it were the real article.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
Webcontent = response.text
Soup = BeautifulSoup(Webcontent,'lxml')

#### Find the table on the page.

In [3]:
# Locate the sortable wiki table in the parsed page, then keep only its
# text content (one table cell per line) and show it.
postcode_table = Soup.find('table', class_='wikitable sortable')
table = postcode_table.text
print(table)



Postcode
Borough
Neighbourhood


M1A
Not assigned
Not assigned


M2A
Not assigned
Not assigned


M3A
North York
Parkwoods


M4A
North York
Victoria Village


M5A
Downtown Toronto
Harbourfront


M5A
Downtown Toronto
Regent Park


M6A
North York
Lawrence Heights


M6A
North York
Lawrence Manor


M7A
Queen's Park
Not assigned


M8A
Not assigned
Not assigned


M9A
Etobicoke
Islington Avenue


M1B
Scarborough
Rouge


M1B
Scarborough
Malvern


M2B
Not assigned
Not assigned


M3B
North York
Don Mills North


M4B
East York
Woodbine Gardens


M4B
East York
Parkview Hill


M5B
Downtown Toronto
Ryerson


M5B
Downtown Toronto
Garden District


M6B
North York
Glencairn


M7B
Not assigned
Not assigned


M8B
Not assigned
Not assigned


M9B
Etobicoke
Cloverdale


M9B
Etobicoke
Islington


M9B
Etobicoke
Martin Grove


M9B
Etobicoke
Princess Gardens


M9B
Etobicoke
West Deane Park


M1C
Scarborough
Highland Creek


M1C
Scarborough
Rouge Hill


M1C
Scarborough
Port Union


M2C
Not assigned
Not assigned

In [4]:
t = table.strip()   # drop the leading/trailing whitespace (blank lines) around the table text

#### Transform the text into a list and drop all the empty values.

In [5]:
# Split the table text into lines and keep only the non-empty ones.
# One filtering pass replaces the repeated `'' in t2` / `t2.remove('')` calls,
# each of which rescans the list from the start.
t2 = [line for line in t.split('\n') if line != '']
print(t2)

['Postcode', 'Borough', 'Neighbourhood', 'M1A', 'Not assigned', 'Not assigned', 'M2A', 'Not assigned', 'Not assigned', 'M3A', 'North York', 'Parkwoods', 'M4A', 'North York', 'Victoria Village', 'M5A', 'Downtown Toronto', 'Harbourfront', 'M5A', 'Downtown Toronto', 'Regent Park', 'M6A', 'North York', 'Lawrence Heights', 'M6A', 'North York', 'Lawrence Manor', 'M7A', "Queen's Park", 'Not assigned', 'M8A', 'Not assigned', 'Not assigned', 'M9A', 'Etobicoke', 'Islington Avenue', 'M1B', 'Scarborough', 'Rouge', 'M1B', 'Scarborough', 'Malvern', 'M2B', 'Not assigned', 'Not assigned', 'M3B', 'North York', 'Don Mills North', 'M4B', 'East York', 'Woodbine Gardens', 'M4B', 'East York', 'Parkview Hill', 'M5B', 'Downtown Toronto', 'Ryerson', 'M5B', 'Downtown Toronto', 'Garden District', 'M6B', 'North York', 'Glencairn', 'M7B', 'Not assigned', 'Not assigned', 'M8B', 'Not assigned', 'Not assigned', 'M9B', 'Etobicoke', 'Cloverdale', 'M9B', 'Etobicoke', 'Islington', 'M9B', 'Etobicoke', 'Martin Grove', 'M9B

In [6]:
# Regroup the flat list into [postcode, borough, neighbourhood] rows and keep
# only the rows whose borough is assigned.  The first row is the table header
# ('Postcode', 'Borough', 'Neighbourhood'); it passes the filter and stays at
# t3[0], which is why later cells read the data starting from t3[1].
# Stopping the range at len(t2) - 2 yields complete three-item rows only, so
# t2[i+1] can never index past the end when the list length is not a
# multiple of three.
t3 = [
    t2[i:i+3]
    for i in range(0, len(t2) - 2, 3)
    if t2[i+1] != 'Not assigned'
]
print(t3)

[['Postcode', 'Borough', 'Neighbourhood'], ['M3A', 'North York', 'Parkwoods'], ['M4A', 'North York', 'Victoria Village'], ['M5A', 'Downtown Toronto', 'Harbourfront'], ['M5A', 'Downtown Toronto', 'Regent Park'], ['M6A', 'North York', 'Lawrence Heights'], ['M6A', 'North York', 'Lawrence Manor'], ['M7A', "Queen's Park", 'Not assigned'], ['M9A', 'Etobicoke', 'Islington Avenue'], ['M1B', 'Scarborough', 'Rouge'], ['M1B', 'Scarborough', 'Malvern'], ['M3B', 'North York', 'Don Mills North'], ['M4B', 'East York', 'Woodbine Gardens'], ['M4B', 'East York', 'Parkview Hill'], ['M5B', 'Downtown Toronto', 'Ryerson'], ['M5B', 'Downtown Toronto', 'Garden District'], ['M6B', 'North York', 'Glencairn'], ['M9B', 'Etobicoke', 'Cloverdale'], ['M9B', 'Etobicoke', 'Islington'], ['M9B', 'Etobicoke', 'Martin Grove'], ['M9B', 'Etobicoke', 'Princess Gardens'], ['M9B', 'Etobicoke', 'West Deane Park'], ['M1C', 'Scarborough', 'Highland Creek'], ['M1C', 'Scarborough', 'Rouge Hill'], ['M1C', 'Scarborough', 'Port Union'

In [7]:
# Empty frame that defines the three target columns.
column_names = ['PostalCode','Borough','Neighborhood']
neighborhoods = pd.DataFrame(columns=column_names)
# Display it to confirm the (still empty) column layout.
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood


#### Load the data into the DataFrame

In [8]:
# Build the frame in a single call from every row after the header (t3[0]),
# instead of enlarging it one `.loc[i]` assignment at a time.  Each row is
# [postcode, borough, neighbourhood], matching the order of column_names.
neighborhoods = pd.DataFrame(t3[1:], columns=column_names)
    

In [9]:
# Preview the first rows to sanity-check the load.
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [10]:
# (rows, columns) of the loaded frame.
neighborhoods.shape

(212, 3)

#### OK! 212 items in all. Now let's try to get the latitude and longitude information.

In [12]:
import geocoder

In [14]:
# Empty frame with the two coordinate columns; displayed to confirm the layout.
geo_columns = ['Latitude', 'Longitude']
geoinf = pd.DataFrame(columns=geo_columns)
geoinf

Unnamed: 0,Latitude,Longitude


In [15]:
# Look up the coordinates of every data row's postal code (t3[0] is the
# header row and is skipped).
#
# * Each distinct postal code is geocoded once and the answer is reused,
#   because several neighbourhoods share one postal code.
# * Retries are bounded: an unbounded `while` loop never finishes if the
#   service keeps returning no result.  Tune the two constants if the
#   service is merely flaky.
MAX_GEOCODE_ATTEMPTS = 10    # lookups per postal code before giving up
GEOCODE_RETRY_DELAY = 1.0    # seconds to wait after a failed lookup

coords_by_code = {}
for row in t3[1:]:
    postal_code = row[0]
    if postal_code in coords_by_code:
        continue
    for _attempt in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        # A missing result (None, or an empty value) counts as a failed lookup.
        if lat_lng_coords:
            coords_by_code[postal_code] = (lat_lng_coords[0], lat_lng_coords[1])
            break
        time.sleep(GEOCODE_RETRY_DELAY)
    else:
        raise RuntimeError(
            'No coordinates returned for postal code {} after {} attempts'.format(
                postal_code, MAX_GEOCODE_ATTEMPTS))

# One (latitude, longitude) row per data row of t3, in the same order, built
# in a single call rather than grown one `.loc[i]` assignment at a time.
geoinf = pd.DataFrame(
    [coords_by_code[row[0]] for row in t3[1:]],
    columns=['Latitude', 'Longitude'],
)
geoinf.head()

Unnamed: 0,Latitude,Longitude
0,43.753259,-79.329656
1,43.725882,-79.315572
2,43.65426,-79.360636
3,43.65426,-79.360636
4,43.718518,-79.464763


#### Combine the two dataframes

In [16]:
# Check the shape before combining; the row count is expected to equal that of `neighborhoods`.
geoinf.shape

(212, 2)

In [17]:
# Put the coordinates next to the neighbourhood data (aligned on the row index).
result = pd.concat([neighborhoods, geoinf], axis=1)

# Where a borough has no named neighbourhood, use the borough name instead.
# A boolean mask covers however many rows the frame has, so nothing depends
# on a hard-coded row count.
unnamed = result['Neighborhood'] == 'Not assigned'
result.loc[unnamed, 'Neighborhood'] = result.loc[unnamed, 'Borough']
result.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
4,M6A,North York,Lawrence Heights,43.718518,-79.464763
