In [24]:
# importing the required libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd 
import re

import geocoder # import geocoder
print('Import Done')

Import Done


<h3> Scraping the Data from the Wikipedia Page Using the BeautifulSoup Library </h3>

In [25]:
# URL of the Wikipedia page listing Canadian postal codes beginning with "M"
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error (4xx/5xx) instead of silently
# parsing an error page as if it were the article.
page = requests.get(URL, timeout=30)
page.raise_for_status()

soup = BeautifulSoup(page.content, 'html.parser')

# Find the postal-code table in the parsed document
postal_code_data = soup.find('table', attrs={'class': 'wikitable sortable'})
if postal_code_data is None:
    # Fail with a clear message rather than an AttributeError on None below
    raise ValueError("No 'wikitable sortable' table found at {}".format(URL))

# Regex matching HTML tags and HTML entities (named, decimal or hex) so that
# they can be stripped from the raw cell markup
cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')

# Collect every <td>..</td> element of the table. If the markup has no explicit
# <tbody>, search the table element itself instead of failing on None.
table_body = postal_code_data.tbody
if table_body is None:
    table_body = postal_code_data
postal_code_data = table_body.find_all('td')


In [26]:
# Each table row is made of 3 consecutive <td> cells
# (they become PostalCode, Borough and Neighbourhood in the DataFrame).
CELLS_PER_ROW = 3


def _clean_cell(cell):
    """Return the markup of one <td> element as plain text.

    Newlines are removed and the string is stripped first; then everything
    matched by `cleanr` (HTML tags and HTML entities) is deleted.
    """
    return re.sub(cleanr, '', str(cell).replace('\n', '').strip())


# Clean every cell once, in document order
cleaned_cells = [_clean_cell(item) for item in postal_code_data]

# Slice the flat list of cells into rows of CELLS_PER_ROW items. A trailing
# partial row (if any) is kept, but no empty row is produced when there are
# no cells at all.
table_data = [
    cleaned_cells[start:start + CELLS_PER_ROW]
    for start in range(0, len(cleaned_cells), CELLS_PER_ROW)
]

# Kept for compatibility with the earlier loop-based version of this cell,
# which left the last row in `temp_list`.
temp_list = table_data[-1] if table_data else []


<h3> Data Preparation </h3>

In [28]:
# Build a DataFrame from the scraped rows of Canadian postal codes
post_code_df = pd.DataFrame(table_data, columns=['PostalCode', 'Borough', 'Neighbourhood'])

# Drop rows whose Borough is 'Not assigned'. The explicit .copy() makes the
# result an independent frame, so the column assignment below does not raise
# SettingWithCopyWarning. The original index is kept on purpose.
post_code_df = post_code_df.loc[post_code_df['Borough'] != 'Not assigned'].copy()

# Where the Neighbourhood is 'Not assigned', use the Borough name instead.
# Series.mask is the vectorized equivalent of a row-wise apply with a conditional.
post_code_df['Neighbourhood'] = post_code_df['Neighbourhood'].mask(
    post_code_df['Neighbourhood'] == 'Not assigned',
    post_code_df['Borough'],
)
post_code_df

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [29]:
# Sanity check: (number of rows, number of columns) of the cleaned DataFrame
post_code_df.shape

(103, 3)

In [32]:
# reading the csv containing geospatial coordinates
df_coord = pd.read_csv('Geospatial_Coordinates.csv')
df_coord

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


<h3>Getting the Geospatial Coordinates from the CSV</h3>

In [35]:
# Attach the coordinates to each postal code: inner join of the postal-code
# frame ('PostalCode') with the coordinates frame ('Postal Code').
# Both key columns are retained in the result.
# NOTE: this rebinds post_code_df, so re-running the cell merges a second time.
post_code_df = pd.merge(
    left=post_code_df,
    right=df_coord,
    how='inner',
    left_on='PostalCode',
    right_on='Postal Code',
)

post_code_df

Unnamed: 0,PostalCode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
1,M4A,North York,Victoria Village,M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",M5A,43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",M6A,43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",M7A,43.662301,-79.389494
...,...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",M8X,43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,M4Y,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",M7Y,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",M8Y,43.636258,-79.498509
