In [8]:
from bs4 import BeautifulSoup as bs
import requests
import lxml.html as lx_html
import pandas as pd
import numpy as np


url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
res = requests.get(url)
soup = bs(res.content,'lxml')
table = soup.find_all('table')[0]
df = pd.read_html(str(table))
data = pd.read_json(df[0].to_json(orient='records'))



In [17]:
data

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [16]:
# Choosing only data where field Borough doesn't have not assigned value
raw_data_selected = data[data['Borough'] != 'Not assigned']
raw_data_selected

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [15]:
# Group the Data
group = raw_data_selected.groupby(['Borough', 'Postal Code'], as_index=False).agg(','.join)
group

Unnamed: 0,Borough,Postal Code,Neighbourhood
0,Central Toronto,M4N,Lawrence Park
1,Central Toronto,M4P,Davisville North
2,Central Toronto,M4R,"North Toronto West, Lawrence Park"
3,Central Toronto,M4S,Davisville
4,Central Toronto,M4T,"Moore Park, Summerhill East"
...,...,...,...
98,York,M6C,Humewood-Cedarvale
99,York,M6E,Caledonia-Fairbanks
100,York,M6M,"Del Ray, Mount Dennis, Keelsdale and Silverthorn"
101,York,M6N,"Runnymede, The Junction North"


In [18]:
# Replacing values in Neighbourhood field with Borough where Neighbourhood is not assigned
old_value = raw_data_selected['Neighbourhood'] == 'Not assigned'
old_value

2      False
3      False
4      False
5      False
6      False
       ...  
160    False
165    False
168    False
169    False
178    False
Name: Neighbourhood, Length: 103, dtype: bool

In [19]:
change = data.iloc[:, [2]] 
change

Unnamed: 0,Neighbourhood
0,Not assigned
1,Not assigned
2,Parkwoods
3,Victoria Village
4,"Regent Park, Harbourfront"
...,...
175,Not assigned
176,Not assigned
177,Not assigned
178,"Mimico NW, The Queensway West, South of Bloor,..."


In [20]:
# Shape of Data
raw_data_selected.shape

(103, 3)

In [32]:
geospatial_url = "https://cocl.us/Geospatial_data"
geospatial_data = pd.read_csv(geospatial_url)
geospatial_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [55]:
# Renaming the columns
geospatial_data.columns = ['Postal_Code', 'Latitude', 'Longitude']

geospatial_data.columns
geospatial_data.head()

Unnamed: 0,Postal_Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [56]:
# Check whether the two tables have at least one similar column to which we can merge them on
print(geospatial_data.columns)
print()
print('*'*125)
print(raw_data_selected.columns)

Index(['Postal_Code', 'Latitude', 'Longitude'], dtype='object')

*****************************************************************************************************************************
Index(['Postal Code', 'Borough', 'Neighbourhood'], dtype='object')


In [77]:
# Merge dataframes
merged_data = pd.merge(raw_data_selected, geospatial_data, left_on='Postal Code', right_on='Postal_Code')

# Drop column 'Postal_Code' to avoid duplicate of column name
merged_data = merged_data.drop('Postal_Code', 1)
merged_data.head(12)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [91]:
merged_data['Coordinates'] = list(zip(merged_data['Latitude'], merged_data['Longitude']))

In [92]:
merged_data.head(12)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Coordinates
0,M3A,North York,Parkwoods,43.753259,-79.329656,"(43.7532586, -79.3296565)"
1,M4A,North York,Victoria Village,43.725882,-79.315572,"(43.725882299999995, -79.31557159999998)"
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,"(43.6542599, -79.3606359)"
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,"(43.718517999999996, -79.46476329999999)"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,"(43.6623015, -79.3894938)"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242,"(43.6678556, -79.53224240000002)"
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,"(43.806686299999996, -79.19435340000001)"
7,M3B,North York,Don Mills,43.745906,-79.352188,"(43.745905799999996, -79.352188)"
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,"(43.7063972, -79.309937)"
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,"(43.6571618, -79.37893709999999)"


In [90]:
# !pip install geocoder
import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.google(f'{Postal Code}, Toronto, Ontario')
  lat_lng_coords = g.latlng

latitude = coo[0]
longitude = lat_lng_coords[1]

# merged_data['Coordinates'] = merged_data['Coordinates'].apply(Point)


KeyError: 'Coordinates'

In [79]:
gdf = gpd.GeoDataFrame(merged_data, geometry='Coordinates')
gdf.head()

NameError: name 'gpd' is not defined

In [80]:

# set up map
cities = gpd.read_file(gpd.datasets.get_path('naturalearth_cities'))
ax = cities[cities.name == "Toronto"].plot( color='green', edgecolor='black')
# plot and show
gdf.plot(ax=ax, color='red')

plt.show()

NameError: name 'gpd' is not defined