In [1]:
# setting up the imports required for the notebook
import pandas as pd
import requests
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import io
import folium
from geopy.geocoders import Nominatim
%matplotlib inline

#setting up the page which needs to be scraped from wikipedia using wikipedia library as wp
my_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html,"html.parser")



In [2]:
# Using pandas dataframe to get all the tables on webpage
dfs = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

# finding out number of tables in the dfs and glimpse of what it stores
for df in dfs :
    print(df.head(5))


  Postcode           Borough      Neighborhood
0      M1A      Not assigned      Not assigned
1      M2A      Not assigned      Not assigned
2      M3A        North York         Parkwoods
3      M4A        North York  Victoria Village
4      M5A  Downtown Toronto      Harbourfront
                                                  0   \
0                                                NaN   
1  NL NS PE NB QC ON MB SK AB BC NU/NT YT A B C E...   
2                                                 NL   
3                                                  A   

                                                  1   \
0                              Canadian postal codes   
1  NL NS PE NB QC ON MB SK AB BC NU/NT YT A B C E...   
2                                                 NS   
3                                                  B   

                                                  2    3    4    5    6    7   \
0                                                NaN  NaN  NaN  NaN  NaN  N

In [3]:
# From above results we find that the first dataframe is the table we need
df = dfs[0]
df.head(10)

# Now we need to filter out the rows which have not assigned in Borough and Neighbourhood
df = df[df.Borough != 'Not assigned']
df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Not assigned
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [4]:
# Now we have the rows which were not helpful deleted from the table
#Now we would group them by Boroughs and join the neighbourhoods by , 
df = df.groupby(['Postcode','Borough'])['Neighborhood'].apply(', '.join).reset_index()
df.columns = ['Postcode','Borough','Neighbourhood']
df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [5]:
df.shape

(103, 3)

In [6]:
# getting the geospatial data for the Boroughs 
geo_url = "http://cocl.us/Geospatial_data"
s = requests.get(geo_url).content
geo_data = pd.read_csv(io.StringIO(s.decode('utf-8')))

# Checking the dataframe made for columns names and data
geo_data.head(5)

# Changing column name of postal code to postcode for merge with original df
geo_data.columns = ['Postcode','Latitude','Longitude']

geo_data.head(5)
df = pd.merge(geo_data,df , on ='Postcode')
df.head(5)

Unnamed: 0,Postcode,Latitude,Longitude,Borough,Neighbourhood
0,M1B,43.806686,-79.194353,Scarborough,"Rouge, Malvern"
1,M1C,43.784535,-79.160497,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae


In [7]:
# Reordering the columns in datafrmae
df = df[['Postcode','Borough','Neighbourhood','Latitude','Longitude']]
df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [8]:
# Finding out unique boroughs and number of neighbourhoods in dataframe
print('The dataframe has {} boroughs and {} neighbourhoods.'.format(
        len(df['Borough'].unique()),
        df.shape[0]))

The dataframe has 11 boroughs and 103 neighbourhoods.


In [9]:
# Finding Toronto coordinates
address = 'Toronto'
geolocator = Nominatim(user_agent = "Toronto_explorer")
location = geolocator.geocode(address)
latitude_toronto = location.latitude
longitude_toronto = location.longitude

# Print the coordiantes of the Toronto city
print('The geographical coordinates of Toronto city are Latitude:{} and Longitude:{}'.
     format(latitude_toronto,longitude_toronto))

The geographical coordinates of Toronto city are Latitude:43.653963 and Longitude:-79.387207


In [13]:
# creating map of Toronto using latitude and longitude in dataframe
#latitude_toronto = 43.651070
#longitude_toronto = -79.347015
map_toronto = folium.Map(Location =[latitude_toronto,longitude_toronto], zoom_start=10)

# adding markers to map made above 
for lat, lon, borough, neighbourhood  in zip(df['Latitude'],df['Longitude'],df['Borough'],df['Neighbourhood']):
    label = '{},{}'.format(neighbourhood,borough)
    label = folium.Popup(label,parse_html= True)
    folium.CircleMarker(
        [lat,lon],
        radius = 5,
        popup = label,
        color = 'red',
        fill = True,
        fill_color = 'lightred',
        fill_opacity = 0.6,
        parse_html= False).add_to(map_toronto)
    
    
map_toronto