## Wikipedia scrape notebook - Toronto postal codes with Lat Long Coordinates

In [3]:
from io import StringIO
from urllib.request import urlopen

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [4]:
# URL of the Wikipedia page listing the Canadian postal codes that start with "M".
wiki_page = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [5]:
# Query the website and parse the returned HTML into the variable `soup`.
# The context manager closes the HTTP response as soon as parsing has finished,
# so the connection is not left open for the lifetime of the kernel.
with urlopen(wiki_page) as page:
    soup = BeautifulSoup(page, 'html.parser')

Now that the Wikipedia page has been parsed and stored in the BeautifulSoup object `soup`, we can extract the table and convert it into a dataframe.

In [6]:
# Extract the first <table> on the page and convert it into a dataframe.
table = soup.find_all('table')[0]
# read_html already returns DataFrames, so no extra pd.DataFrame() wrap is needed.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
df = pd.read_html(StringIO(str(table)))[0]
# Depending on the pandas version, the header cells are either parsed as the
# column labels or left as data row 0. Promote row 0 only in the latter case,
# otherwise the first real data row would be consumed as the header.
if 'Postcode' not in df.columns:
    header = df.iloc[0]
    df = df[1:]
    df = df.rename(columns=header)
df.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
10,M8A,Not assigned,Not assigned


Drop the rows whose Borough is "Not assigned", replace "Not assigned" neighbourhoods with the borough name, and combine rows that share the same Postcode into one row (neighbourhood names joined by commas).

In [8]:
# Keep only the postcodes that have been assigned to a borough.
# .copy() makes the result an independent frame, so later column assignments on
# `df` do not raise SettingWithCopyWarning or write to a temporary slice.
df = df[df['Borough'] != 'Not assigned'].copy()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


In [9]:
# Where the neighbourhood is 'Not assigned', fall back to the row's borough name.
# Series.where keeps the existing value where the condition is True and takes the
# Borough value elsewhere; this replaces the slower row-by-row apply().
# assign() returns a new frame, so nothing is written into a filtered slice.
df = df.assign(
    Neighbourhood=df['Neighbourhood'].where(
        df['Neighbourhood'] != 'Not assigned', df['Borough']
    )
)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


In [10]:
# Collapse rows that share a Postcode/Borough pair into one row whose
# Neighbourhood is the comma-joined list of names, keeping first-seen order.
df_grp = (
    df.groupby(by=['Postcode', 'Borough'], sort=False)['Neighbourhood']
    .agg(','.join)
    .reset_index()
)
df_grp.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [11]:
# Display the (rows, columns) shape of the grouped dataframe.
df_grp.shape

(103, 3)

The dataframe above lists each postal code along with its borough name and neighbourhood name(s).

Now let's get the latitude and longitude for the data above.

In [12]:
# Install (if missing) and import the geocoding / mapping libraries.
# NOTE(review): in the cells visible here only geopy's Nominatim is actually used;
# `geocoder` and `folium` are imported but never referenced — confirm before removing.
!pip install geocoder
import geocoder 
!pip install folium
import folium
import geopy
from geopy.geocoders import Nominatim

Requirement not upgraded as not directly required: geocoder in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages
Requirement not upgraded as not directly required: ratelim in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: requests in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: click in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: six in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: future in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: decorator in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from ratelim->geocoder)
Requirement not upgraded as not directly required: chardet<3.1.0,

In [13]:
# Dry run: build the geocoder query string for every postcode row without
# sending any request. Only the last neighbourhood of each row is used.
for index, row in df_grp.iterrows():
    address_1 = row['Neighbourhood']
    # rpartition yields the text after the final comma, or the whole string
    # when the row holds a single neighbourhood.
    address_2 = address_1.rpartition(',')[2]
    address_3 = ",".join([address_2, "Toronto,Canada"])
    # Uncomment to inspect the generated queries:
    # print(address_3)

In [14]:
# Empty two-column table for the geocoded coordinates.
column_names = [
    'Latitude',
    'Longitude',
]
n_hood = pd.DataFrame(data=None, columns=column_names)
n_hood.shape

(0, 2)

In [15]:
# Geocode every postcode row with Nominatim and store the result in `n_hood`.
#
# - The geocoder is created once (it is loop-invariant) and is given an explicit
#   user_agent, which current geopy releases require for Nominatim.
# - Coordinates are collected in a list and turned into a dataframe once at the
#   end; DataFrame.append in a loop is quadratic and was removed in pandas 2.0.
# - Exactly one entry is recorded per row of df_grp (NaN when nothing is found),
#   so `n_hood` stays index-aligned with `df_grp`, and re-running this cell
#   rebuilds `n_hood` instead of appending duplicate rows to it.
# NOTE(review): no delay is added between requests — confirm this respects the
# rate limits of the geocoding service being used.
geolocator = Nominatim(user_agent="toronto_postcode_notebook")


def _lookup_coordinates(place):
    """Return (latitude, longitude) for `place` in Toronto, or None if not found."""
    location = geolocator.geocode(place + "," + "Toronto,Canada")
    if location is None:
        return None
    return (location.latitude, location.longitude)


coordinates = []
for index, row in df_grp.iterrows():
    # Only the last neighbourhood listed for the postcode is used as search term.
    last_neighbourhood = row['Neighbourhood'].split(',')[-1]
    try:
        # Fall back to the borough when the neighbourhood cannot be geocoded.
        found = _lookup_coordinates(last_neighbourhood) or _lookup_coordinates(row['Borough'])
    except ValueError:
        print("Error")
        found = None
    # Record a NaN pair instead of skipping the row, to keep rows aligned.
    coordinates.append(found if found is not None else (np.nan, np.nan))

n_hood = pd.DataFrame(coordinates, columns=['Latitude', 'Longitude'], index=df_grp.index)

In [16]:
# Preview the first rows of the geocoded coordinates.
n_hood.head()

Unnamed: 0,Latitude,Longitude
0,43.757846,-79.315975
1,43.732658,-79.311189
2,43.660706,-79.360457
3,43.722079,-79.437507
4,43.65998,-79.390369


In [19]:
# Place the coordinate columns next to the postcode table, matching rows by index.
df = pd.concat(
    objs=[df_grp, n_hood.loc[:, ['Latitude', 'Longitude']]],
    axis='columns',
)
df.head(25)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.757846,-79.315975
1,M4A,North York,Victoria Village,43.732658,-79.311189
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.660706,-79.360457
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.722079,-79.437507
4,M7A,Queen's Park,Queen's Park,43.65998,-79.390369
5,M9A,Etobicoke,Islington Avenue,43.627663,-79.516454
6,M1B,Scarborough,"Rouge,Malvern",43.809196,-79.221701
7,M3B,North York,Don Mills North,43.737178,-79.343451
8,M4B,East York,"Woodbine Gardens,Parkview Hill",43.706298,-79.321907
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.656502,-79.377128


The dataframe above lists each postal code along with its borough, neighbourhood name(s), and latitude/longitude coordinates.