# Clustering Neighborhood - Toronto

In [1]:
# import modules
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation


!pip install geopy
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

!pip install geocoder
# Libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# Tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize


!pip install folium==0.5.0
import folium # Plotting library

# Matplotlib modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# Library SKlearn for clustering
from sklearn.cluster import KMeans




# Extracting Data From Wikipedia page of "List of postal codes of Canada"

In [7]:
from bs4 import BeautifulSoup #To retrive data from website using BeautifulSoup
toronto_neighbor_page = requests.get('https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=1011037969').text
soup = BeautifulSoup(toronto_neighbor_page, 'html.parser')
allTable = soup.find('table')
allElement = allTable.find_all('td')

In [8]:
#Extract postal code, borough and neighborhood
postalCode = []
borough = []
neighborhood = []
counter = 1
for element in allElement[0:]:
    textHere = element.text[0:-1]
    if counter == 1:
        counter = counter + 1
        postalCode.append(textHere)
    elif counter == 2:
        counter = counter + 1
        borough.append(textHere)
    else: 
        counter = 1
        neighborhood.append(textHere)
# print(postalCode)
# print(borough)
# print(neighborhood)

In [9]:
# Constructing a dataframe
df = pd.DataFrame({'PostalCode':postalCode,'Borough':borough,'Neighborhood':neighborhood})
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [10]:
# perform postal code check
postal_codes = df['PostalCode']
postal_codes

0      M1A
1      M2A
2      M3A
3      M4A
4      M5A
      ... 
175    M5Z
176    M6Z
177    M7Z
178    M8Z
179    M9Z
Name: PostalCode, Length: 180, dtype: object

In [11]:
# data obtained
df.shape
df['Borough'].value_counts()
df=df[(df['Borough']!='Not assigned')]
postal_codes = df['PostalCode']
postal_codes
df.shape #the size of the cleaned data
df

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


# Finding the latitude and longitude

In [12]:
import geocoder
# test geolocator for toronto
geolocator = Nominatim(user_agent="My_App")

location = geolocator.geocode('{}, Toronto, ON'.format('M3A'))

location

# define function to retrieve latitude and longitude from postal code
def get_latlon(postal_code):
    lat_lng_coords = None
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
    return lat_lng_coords

get_latlon('M9B') # test function

[43.65034000000003, -79.55361999999997]

In [13]:
coord = {}
for code in postal_codes:
    loc = get_latlon(code)
    coord[code] =loc 


In [14]:
latitude=[]
longitude=[]
for code in postal_codes:
    latitude.append(coord[code][0])
    longitude.append(coord[code][1])
df['Latitude']=latitude
df['Longitude']=longitude
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.75245,-79.32991
3,M4A,North York,Victoria Village,43.73057,-79.31306
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188
...,...,...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.65319,-79.51113
165,M4Y,Downtown Toronto,Church and Wellesley,43.66659,-79.38133
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.64869,-79.38544
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.63278,-79.48945


In [15]:
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.65319,-79.51113
99,M4Y,Downtown Toronto,Church and Wellesley,43.66659,-79.38133
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.64869,-79.38544
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.63278,-79.48945


# Toronto

In [16]:
# Extract Toronto data
toronto = df[df['Borough'].str.contains('Toronto')].reset_index(drop=True)
toronto #The data set for toronto

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804
3,M5C,Downtown Toronto,St. James Town,43.65215,-79.37587
4,M4E,East Toronto,The Beaches,43.67709,-79.29547
5,M5E,Downtown Toronto,Berczy Park,43.64536,-79.37306
6,M5G,Downtown Toronto,Central Bay Street,43.65609,-79.38493
7,M6G,Downtown Toronto,Christie,43.66869,-79.42071
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.6497,-79.38258
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.66505,-79.43891


In [17]:
#To understand the size of the array
toronto.shape

(40, 5)

In [18]:
# find the mean latitude and longitude for map plotting
meanLatitude = toronto['Latitude'].mean(axis = 0)
meanLongitude = toronto['Longitude'].mean(axis = 0) 

In [19]:
# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(location=[meanLatitude, meanLongitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto['Latitude'], toronto['Longitude'], toronto['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto