# Segmenting and Clustering Neighbourhoods in Toronto
# Part 1
## Webscraping using BeautifulSoup and creating dataframe

In [1]:
#installing required packages and importing libraries

!pip install beautifulsoup4
!pip install lxml
!pip install html5lib
!pip install requests

from bs4 import BeautifulSoup
import requests
import urllib.request

print ("done")

done


In [2]:
#Using beautifulsoup for webscraping 

with urllib.request.urlopen ('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M') as html_file :
    soup = BeautifulSoup(html_file, 'lxml')
    print (soup.prettify())     #prettying up the html script for ease of use

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );
  </script>
  <script>
   (window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":876823784,"wgRevisionId":876823784,"wgArticleId":539066,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wg

In [3]:
#Finding the required fields in the table with class_ = 'Wikitable sortable' 

match = soup.find('table', class_ = 'wikitable sortable')
print (match)

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>

In [5]:
#Use tags to remove html and extract fields -Postcode, Borough and Neighborhood
import pandas as pd

table_post = soup.find('table')
fields = table_post.find_all('td')

postcode = []
borough = []
neighbourhood = []

for i in range(0, len(fields), 3):
    postcode.append(fields[i].text.strip())
    borough.append(fields[i+1].text.strip())
    neighbourhood.append(fields[i+2].text.strip())
        
df_pc = pd.DataFrame(data=[postcode, borough, neighbourhood]).transpose()
df_pc.columns = ['Postcode', 'Borough', 'Neighbourhood']
df_pc.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [6]:
#Replacing 'not assigned' values in 'borough' column with np.nan values and dropping them
import numpy as np
df_pc['borough'] = df_pc.replace ('Not assigned', np.nan, inplace = True)
df_pc.dropna(subset = ['Borough'], inplace = True)
df_pc.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,borough
2,M3A,North York,Parkwoods,
3,M4A,North York,Victoria Village,
4,M5A,Downtown Toronto,Harbourfront,
5,M5A,Downtown Toronto,Regent Park,
6,M6A,North York,Lawrence Heights,


In [7]:
df_pc = df_pc.drop(['borough'], axis =1)
df_pc.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [8]:
#Resetting index

df_pc.reset_index()

Unnamed: 0,index,Postcode,Borough,Neighbourhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,Harbourfront
3,5,M5A,Downtown Toronto,Regent Park
4,6,M6A,North York,Lawrence Heights
5,7,M6A,North York,Lawrence Manor
6,8,M7A,Queen's Park,
7,10,M9A,Etobicoke,Islington Avenue
8,11,M1B,Scarborough,Rouge
9,12,M1B,Scarborough,Malvern


In [9]:
#Groupby post code and consolidate neighborhoods under each post code
df = df_pc.groupby(['Postcode','Borough'])['Neighbourhood'].apply(lambda x: ", ".join(x.astype(str))).reset_index()
df = df.sample(frac=1).reset_index(drop=True)
df.head(10)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village"
1,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
2,M3M,North York,Downsview Central
3,M3A,North York,Parkwoods
4,M1G,Scarborough,Woburn
5,M9B,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ..."
6,M1R,Scarborough,"Maryvale, Wexford"
7,M1V,Scarborough,"Agincourt North, L'Amoreaux East, Milliken, St..."
8,M6B,North York,Glencairn
9,M6S,West Toronto,"Runnymede, Swansea"


In [10]:
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
for index, row in df.iterrows():
    if row['Neighbourhood'] == 'Not assigned':
        row['Neighbourhood'] = row['Borough']
        
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village"
1,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
2,M3M,North York,Downsview Central
3,M3A,North York,Parkwoods
4,M1G,Scarborough,Woburn
5,M9B,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ..."
6,M1R,Scarborough,"Maryvale, Wexford"
7,M1V,Scarborough,"Agincourt North, L'Amoreaux East, Milliken, St..."
8,M6B,North York,Glencairn
9,M6S,West Toronto,"Runnymede, Swansea"


In [11]:
df.shape

(103, 3)

# Part 2
## Including Latitude, Longitude by downloading csv file and adding it to dataframe

In [12]:
ll_data = pd.read_csv('https://cocl.us/Geospatial_data')

ll_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
#Removing whitespaces in column name of above dataframe

ll_data.columns = ll_data.columns.str.replace(' ', '')

In [14]:
ll_data.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [15]:
#renaming column Postcode to Postalcode in original dataframe

df = df.rename(columns = {'Postcode' : 'PostalCode'})
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village"
1,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
2,M3M,North York,Downsview Central
3,M3A,North York,Parkwoods
4,M1G,Scarborough,Woburn


In [16]:
#Merging the dataframes on 'Postalcode' key with outer join

finaldf = pd.merge(df, ll_data, on='PostalCode', how='outer')


In [17]:
#Merged dataframe with Lat, long values included

finaldf

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191
1,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
2,M3M,North York,Downsview Central,43.728496,-79.495697
3,M3A,North York,Parkwoods,43.753259,-79.329656
4,M1G,Scarborough,Woburn,43.770992,-79.216917
5,M9B,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ...",43.650943,-79.554724
6,M1R,Scarborough,"Maryvale, Wexford",43.750072,-79.295849
7,M1V,Scarborough,"Agincourt North, L'Amoreaux East, Milliken, St...",43.815252,-79.284577
8,M6B,North York,Glencairn,43.709577,-79.445073
9,M6S,West Toronto,"Runnymede, Swansea",43.651571,-79.484450


# Part 3
## Acquiring Lat, Long details of Toronto
## Creating Map of Toronto and visualizing neighborhoods

In [18]:
from geopy.geocoders import Nominatim
import folium

address = 'Toronto, CA'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [19]:
# create map of Toronto using latitude and longitude values

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighbourhood in zip(finaldf['Latitude'], finaldf['Longitude'], finaldf['Borough'], finaldf['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='yellow',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto