### 1. Extract data from Wikipedia into a dataframe

In [1]:
# Data wrangling modules
import io
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

Download the Wikipedia page of Toronto postal codes and parse its HTML with BeautifulSoup:

In [2]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # do not hang indefinitely if the server is unreachable
)
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
# NOTE: despite its name, `url` holds the page's HTML text, not an address.
url = response.text
soup = BeautifulSoup(url, "lxml")

Construct table with postal codes

In [3]:
# Locate the postal-code table and collect the text of every data row.
table = soup.find('table', class_='wikitable sortable')
if table is None:
    raise ValueError("Postal-code table ('wikitable sortable') not found on the page")
rows = table.find_all('tr')
loc = []
for tr in rows:
    td = tr.find_all('td')
    # Keep empty cells so every value stays aligned with its own column.
    row = [cell.text.strip() for cell in td]
    # Rows without any <td> text (e.g. header rows) are skipped.
    if any(row):
        loc.append(row)
loc[:3]

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods']]

Create dataframes from the data

In [4]:
# Turn the scraped rows into a three-column dataframe and preview it.
df = pd.DataFrame(
    data=loc,
    columns=["Postalcode", "Borough", "Neighbourhood"],
)
df.head(n=3)

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods


### 2. Clean Dataframe

Remove rows whose borough is "Not assigned"

In [5]:
# Keep only postal codes that have a borough; missing values are treated
# like the 'Not assigned' placeholder and dropped as well.
# A boolean mask is used instead of a chained `inplace=True` replace, which
# does not modify `df` when pandas Copy-on-Write is active.
# Row labels are intentionally left untouched here.
df = df[df['Borough'].notna() & df['Borough'].ne('Not assigned')].copy()
df.head(3)

Unnamed: 0,Postalcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [6]:
# One row per (Postalcode, Borough) pair: the neighbourhood names sharing a
# pair are joined into a single comma-separated string.
df = (
    df.groupby(['Postalcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)
df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Check whether any neighbourhoods are "Not assigned" and, if so, give them the same name as their borough

In [7]:
# Remember which rows carry the placeholder so they can be shown afterwards.
i = df.index[df['Neighbourhood'].eq('Not assigned')]
# Swap the placeholder for the borough name; all other rows keep their value.
df['Neighbourhood'] = df['Neighbourhood'].mask(
    df['Neighbourhood'].eq('Not assigned'),
    df['Borough'],
)
df.loc[i]

Unnamed: 0,Postalcode,Borough,Neighbourhood
85,M7A,Queen's Park,Queen's Park


Combine rows with identical postal code

In [8]:
# Join neighbourhood names per (Postalcode, Borough) pair into one
# comma-separated string and rebuild a fresh integer index.
df = (
    df.groupby(['Postalcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)
df.head(3)

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"


Shape (rows, columns) of the cleaned dataframe

In [9]:
# Dimensions of the cleaned dataframe as (rows, columns).
df.shape

(103, 3)

Download the file with latitude and longitude data for Toronto postal codes

In [10]:
# Address of the CSV file holding the postal-code coordinates.
geodata = "https://cocl.us/Geospatial_data"
# pandas reads the CSV straight from the address into a dataframe.
geo_df = pd.read_csv(filepath_or_buffer=geodata)

In [11]:
# Rename the columns positionally so the first one is 'Postalcode', the same
# name used in `df`. Assumes the CSV columns arrive in the order
# postal code, latitude, longitude — TODO confirm against the file header.
geo_df.columns = ['Postalcode', 'Latitude','Longitude']

In [12]:
#geo_df = geo_df.sort_values(by='Longitude', ascending = True)

In [13]:
# Preview the first three coordinate rows.
geo_df.head(n=3)

Unnamed: 0,Postalcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711


Merge the coordinates into the neighbourhood dataframe, matching rows on the postal code

In [14]:
# Inner join on the postal code: only codes present in both frames are kept.
full_df = df.merge(geo_df, on='Postalcode', how='inner')

In [15]:
# Preview the merged frame (head() shows five rows by default).
full_df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### 3. Geotag and plot Toronto map

Install conda dependencies

In [16]:
#!conda install -c conda-forge geopy --yes

In [17]:
#!conda install -c conda-forge folium=0.5.0 --yes

Import geocoding, clustering and map-plotting libraries

In [18]:
# For handling JSON files
import json
# Convert an address into latitude and longitude values
from geopy.geocoders import Nominatim

# Transform JSON into a pandas dataframe.
# The `pandas.io.json` location has been deprecated since pandas 1.0, so the
# top-level name is tried first and the old path is kept only as a fallback.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans
# Map rendering library
import folium

# Do not truncate columns or rows when displaying dataframes
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

Fetch geocoordinates for Toronto

In [19]:
address = 'Toronto'

# Nominatim must be given an identifying user agent; current geopy releases
# refuse to build the geocoder without one.
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; stop with a clear message
# rather than an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('Geocordinates of Toronto City are {}, {}.'.format(latitude, longitude))



Geocordinates of Toronto City are 43.653963, -79.387207.


Select the rows whose **Borough** name contains "Toronto"

In [20]:
# Keep only the rows whose borough name contains "Toronto" and renumber them.
# na=False makes a missing borough count as "no match" instead of breaking
# the boolean selection.
Toronto_boroughs = full_df[
    full_df['Borough'].str.contains("Toronto", na=False)
].reset_index(drop=True)
Toronto_boroughs.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


Plot geotagged map of Toronto with Toronto Boroughs markers 

In [21]:
# create a map centred on the geocoded latitude and longitude of Toronto
Toronto_boroughs_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row, with the neighbourhood name(s) as its popup
for lat, lng, label in zip(Toronto_boroughs['Latitude'], Toronto_boroughs['Longitude'],
                           Toronto_boroughs['Neighbourhood']):
    # `parse_html` is an option of folium.Popup, so it is passed only here
    # and not to CircleMarker.
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(Toronto_boroughs_map)

# last expression of the cell: renders the interactive map
Toronto_boroughs_map