Installing the Beautiful Soup library for web scraping

In [55]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


 Importing the required libraries for web scraping and sending an HTTP request to fetch the page that contains the required table

In [56]:
import pandas as pd
import numpy as np
import requests
import bs4 as bs
from bs4 import BeautifulSoup
import requests

# Download the Wikipedia page listing Canadian postal codes that start with "M".
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()
html_doc = response.text

# Parse the HTML with the standard-library parser backend.
soup = BeautifulSoup(html_doc, 'html.parser')

Creating a table out of the scraped data

In [57]:
# Take the first <table> element on the page and fail loudly if it is missing,
# rather than with an AttributeError on None further down.
table = soup.find("table")
if table is None:
    raise ValueError("No <table> element found in the downloaded page")

# One list of <td> cell texts per <tr>. A row without any <td> cells
# (e.g. a header row built from <th>) yields an empty list.
# find_all is the current name of the legacy findAll alias.
output_rows = [
    [cell.text for cell in table_row.find_all('td')]
    for table_row in table.find_all('tr')
]
    

Applying the conditions asked for in the problem statement

In [58]:
names = ['PostalCode', 'Borough', 'Neighbourhood']

# Build the frame from the scraped rows, remove newline characters from every
# cell, and discard rows in which all three columns are missing.
df = (
    pd.DataFrame(data=output_rows, columns=names)
    .replace('\n', '', regex=True)
    .dropna(how='all', axis=0)
)

# A neighbourhood marked 'Not assigned' takes the name of its borough.
hood_missing = df['Neighbourhood'] == 'Not assigned'
df['Neighbourhood'] = np.where(hood_missing, df['Borough'], df['Neighbourhood'])

# Keep only rows with an assigned borough, then collapse rows that share a
# postal code and borough into one row with comma-joined neighbourhood names.
has_borough = df['Borough'] != 'Not assigned'
df = (
    df.loc[has_borough]
    .groupby(['PostalCode', 'Borough'])
    .agg(lambda hoods: ','.join(hoods))
    .reset_index()
)

Reading the shape of final table dataframe

In [59]:
# Report the frame's shape, then render the frame itself as the cell output.
frame_shape = df.shape
print('The dataframe shape is:', frame_shape)
df

The dataframe shape is: (103, 3)


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


Extracting the Latitude and Longitude information

In [60]:
# Load the per-postal-code coordinates and rename the key column so it matches
# the 'PostalCode' column used by the scraped table.
latlong = (
    pd.read_csv('http://cocl.us/Geospatial_data')
    .rename(columns={"Postal Code": "PostalCode"})
)
latlong.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merging the two tables to get the latitude/longitude information into the main table

In [61]:
# Attach coordinates to each postal code (default inner join on PostalCode).
# Latitude/Longitude columns left over from an earlier run of this cell are
# dropped first; otherwise a re-run would produce suffixed duplicates
# (Latitude_x / Latitude_y) and break the later df['Latitude'] lookups.
df = pd.merge(
    df.drop(columns=['Latitude', 'Longitude'], errors='ignore'),
    latlong,
    on=['PostalCode'],
)
df

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",43.739416,-79.588437


Use geopy library to get the latitude and longitude values of Toronto.
Cluster Toronto Neighbourhoods and load Folium library.

In [62]:
# Install geopy and folium (pinned to 0.5.0) from the conda-forge channel via
# shell commands; --yes answers conda's confirmation prompt automatically.
!conda install -c conda-forge geopy --yes
!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library
# Nominatim is the geocoder class used below to look up Toronto's coordinates.
from geopy.geocoders import Nominatim
print('Libraries imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.8.2

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

Solving environment: done


  current version: 4.5.11
  latest version: 4.8.2

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

Libraries imported.


Grouping postcodes into borough clusters

In [66]:
# Geocode the city to obtain the coordinates on which the map is centred.
address = 'Toronto'
geolocator = Nominatim(user_agent="JAC")
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when the service finds no match.
    raise ValueError('Could not geocode address: {}'.format(address))
lat = location.latitude
lon = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(lat, lon))

# create map of Toronto postcodes using latitude and longitude values
map_toronto = folium.Map(location=[lat, lon], zoom_start=10)

# add one circle marker per postal code; the loop uses its own variable names so
# that `lat` / `lon` keep holding the Toronto centre for later cells
for row_lat, row_lon, borough, neighbourhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [row_lat, row_lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [69]:
from folium import plugins

# Centre the map on the geocoded Toronto location. Reading it from `location`
# (instead of `lat` / `lon`) keeps the centre correct even if those names were
# reassigned by a loop in another cell.
map_toronto = folium.Map(
    location=[location.latitude, location.longitude],
    zoom_start=10,
)

# instantiate a marker cluster object for the postcodes in the dataframe
postcodes = plugins.MarkerCluster().add_to(map_toronto)

# loop through the dataframe and add each data point to the marker cluster
for row_lat, row_lon, postcode in zip(df['Latitude'], df['Longitude'], df['PostalCode']):
    # The popup shows this marker's own coordinates, not the city centre.
    label = 'lat-long: {}<br>Postcode: {}'.format((row_lat, row_lon), postcode)
    # parse_html=False so the <br> in the label is rendered as a line break.
    label = folium.Popup(label, parse_html=False)
    folium.Marker(
        location=[row_lat, row_lon],
        icon=folium.Icon(color='green', icon='ok-sign'),
        popup=label
    ).add_to(postcodes)

# display map
map_toronto