# Segmenting and Clustering Neighborhoods in Toronto Notebook - Q1 #
Questions 2 and 3 are further down in this notebook.

Let's start with adding the libraries for web scraping

In [1]:
import requests # library to handle requests
from bs4 import BeautifulSoup # library to handle scraping the page

Load the webpage and turn it into a soup object

In [2]:
# Download Wikipedia's list of Canadian postal codes starting with "M".
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an
# error page be parsed as if it were the real article.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
website_url = response.text  # NOTE: despite the name, this holds the page's HTML text, not a URL
soup = BeautifulSoup(website_url, 'lxml')  # parse the HTML with the lxml parser

Extract the data table

In [3]:
# Grab the first <table> element whose class attribute is 'wikitable sortable'
my_table = soup.find(name='table', attrs={'class': 'wikitable sortable'})

Split the rows

In [4]:
# Collect every <tr> element of the table (the header row is rows[0])
rows = my_table.find_all(name='tr')

Assign the cells of each row to the corresponding PostalCode, Borough and Neighborhood lists, applying these rules:
* Ignore rows whose borough is **Not assigned**
* If a postal code appears on consecutive rows with different neighborhoods, those rows are combined into one row with the neighborhoods separated by commas
* A **Not assigned** neighborhood is replaced by the name of its borough

_We are ignoring the first row as it has the headers_

In [5]:
# Initiate lists (kept parallel: entry n of each list describes the same postal code)
postcode = []
borough = []
neighborhood = []


def _neighborhood_or_borough(cells):
    """Return the neighborhood text of one table row, falling back to the borough.

    cells is the list of <td> tags of a row, in the order postcode, borough,
    neighborhood. The neighborhood text is right-stripped before use; when it
    reads 'Not assigned' the borough text is returned instead.
    """
    name = cells[2].text.rstrip()
    return cells[1].text if name == 'Not assigned' else name


# The index of the first row with non 'Not Assigned' borough
i = 1

# Find the index of the first row with non 'Not Assigned' borough
for j in range(1, len(rows)):  # starts from 1 because the first row is the header
    parsed_row = rows[j].find_all('td')
    if parsed_row[1].text != 'Not assigned':
        i = j
        break
print("The index of the first row with non 'Not Assigned' borough: " + str(i))

# Seed the lists with that first row, so the loop below can always compare
# against postcode[-1].
parsed_row = rows[i].find_all('td')
postcode.append(parsed_row[0].text)
borough.append(parsed_row[1].text)
# The original called `.text` on the result of append() (which is None) and
# appended the tag itself; append the resolved text instead.
neighborhood.append(_neighborhood_or_borough(parsed_row))

print("First row with non 'Not Assigned' borough: " + str(postcode + borough + neighborhood))

# Walk the remaining rows
for k in range(i + 1, len(rows)):
    parsed_row = rows[k].find_all('td')
    if parsed_row[0].text == postcode[-1]:
        # Same postal code as the previous kept row: merge into its neighborhood string
        neighborhood[-1] += ', ' + _neighborhood_or_borough(parsed_row)
    elif parsed_row[1].text != 'Not assigned':
        # New postal code with an assigned borough: start a new entry
        postcode.append(parsed_row[0].text)
        borough.append(parsed_row[1].text)
        neighborhood.append(_neighborhood_or_borough(parsed_row))

The index of the first row with non 'Not Assigned' borough: 3
First row with non 'Not Assigned' borough: ['M3A', 'North York', 'Parkwoods']


Import pandas library and zip the lists to convert to a dataframe

In [6]:
import pandas as pd # library for data analysis
# Pair up the three parallel lists row by row, then label the columns
column_names = ['PostalCode', 'Borough', 'Neighborhood']
list_of_tuples = [*zip(postcode, borough, neighborhood)]
df = pd.DataFrame(data=list_of_tuples, columns=column_names)
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


There are 103 rows in the dataframe as seen below

In [7]:
# (number of rows, number of columns) of the scraped dataframe
df.shape

(103, 3)

# Segmenting and Clustering Neighborhoods in Toronto Notebook - Q2 #

Load csv file with the coordinates as a new dataframe

In [8]:
# Read the coordinates CSV; the relative path is resolved against the working directory
df_coordinates = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')

Rename the key column to match the original dataframe and look at the head

In [9]:
# Give the key column the same name as in df ('PostalCode') so the two frames
# can be joined on it, then preview the first rows
df_coordinates = df_coordinates.rename(columns={'Postal Code': 'PostalCode'})
df_coordinates.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Join two dataframes using 'PostalCode' column as the key and look at the head

In [10]:
# Left join: keep every row of df and attach the coordinates that share its PostalCode
df2 = df.merge(df_coordinates, on='PostalCode', how='left')
df2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


There are still the same number of rows as seen below

In [11]:
# (number of rows, number of columns) of the merged dataframe
df2.shape

(103, 5)

# Segmenting and Clustering Neighborhoods in Toronto Notebook - Q3 #

Import the necessary libraries

In [12]:
import numpy as np # library to handle data in a vectorized manner

!pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip install folium
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


Set pandas options

In [13]:
# Lift pandas' display limits: None means no cap on the columns / rows shown
pd.options.display.max_columns = None
pd.options.display.max_rows = None

Select the rows containing 'Toronto'

In [14]:
# Keep only the rows whose Borough name contains 'Toronto'.
# Matching on the Borough column alone avoids picking up rows from other
# boroughs that merely mention 'Toronto' in another text column (for example
# in a neighborhood name). regex=False does a plain substring search and
# na=False treats a missing borough as "no match".
mask = df2['Borough'].str.contains('Toronto', regex=False, na=False)
df3 = df2[mask]
df3.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


Set the map location to 'Toronto, CA'

In [15]:
address = 'Toronto, CA'

# Look the address up through the Nominatim geocoding service
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on `None.latitude`.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geographical coordinates of Toronto are 43.653963, -79.387207.


Let's visualize Toronto and the neighborhoods in it.

In [16]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row of df3
# The loop variables are deliberately NOT called `borough` / `neighborhood`:
# those names hold the lists built while parsing the table, and reusing them
# here would overwrite the lists and break a re-run of the earlier cells.
for lat, lng, borough_name, neighborhood_name in zip(
        df3['Latitude'], df3['Longitude'], df3['Borough'], df3['Neighborhood']):
    # popup text shown when a marker is clicked
    label = folium.Popup('{}, {}'.format(neighborhood_name, borough_name), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

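Let's group the markers into clusters. Markers that are close together on the map are merged into a single cluster icon labelled with the number of markers it contains; zooming in splits a cluster back into its individual markers.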

To implement this, we start off by instantiating a MarkerCluster object and adding all the data points in the dataframe to this object.

In [17]:
from folium.plugins import MarkerCluster

# let's start again with a clean copy of the map of Toronto
map_toronto = folium.Map(location = [latitude, longitude], zoom_start = 11)

# instantiate a marker cluster object and attach it to the map
boroughs = MarkerCluster().add_to(map_toronto)

# loop through the dataframe and add each data point to the marker cluster
# The loop variables are deliberately NOT called `borough` / `neighborhood`:
# those names hold the lists built while parsing the table, and reusing them
# here would overwrite the lists and break a re-run of the earlier cells.
for lat, lng, borough_name, neighborhood_name in zip(
        df3['Latitude'], df3['Longitude'], df3['Borough'], df3['Neighborhood']):
    # popup text shown when a marker is clicked
    label = folium.Popup('{}, {}'.format(neighborhood_name, borough_name), parse_html=True)
    folium.Marker(
        location=[lat, lng],
        icon=None,
        popup=label,
    ).add_to(boroughs)

# display map
map_toronto