# Segmenting and Clustering Neighborhoods in Toronto

## Week 3

#### Xinyu Zou

### For this assignment, you will be required to explore and cluster the neighborhoods in Toronto.

In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
print('Libraries imported.')

Libraries imported.


In [3]:
# Download the Wikipedia page "List of postal codes of Canada: M" and parse
# its HTML so the postal-code table can be scraped in the next step.
source = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # avoid hanging the kernel indefinitely on a stalled connection
)
# Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
source.raise_for_status()
soup = BeautifulSoup(source.text, 'lxml')
print('Read url')


Read url


**Scrape the table**

In [4]:
# Turn the first <table> on the page into a DataFrame of
# (PostalCode, Borough, Neighborhood) records.
table = soup.find("table")
if table is None:
    raise ValueError("No <table> element found in the downloaded page")

# Use the explicit <tbody> when present; otherwise search the table itself so a
# missing <tbody> does not surface as an opaque AttributeError on None.
table_body = table.tbody if table.tbody is not None else table
table_rows = table_body.find_all("tr")

data = []

for tr in table_rows:
    # Rows without any <td> cells (presumably the header row) yield an empty list.
    row = [cell.text for cell in tr.find_all("td")]

    # Skip empty rows and any row too short to hold all three fields; indexing
    # row[1] / row[2] on such a row would raise IndexError.
    if len(row) < 3:
        continue
    # Postal codes whose borough is "Not assigned" are dropped entirely.
    if row[1] == "Not assigned\n":
        continue
    # A neighborhood that is not assigned falls back to the borough's name.
    if "Not assigned\n" in row[2]:
        row[2] = row[1]
    data.append(row)

# Cell text still carries the trailing "\n" from the HTML at this point.
df = pd.DataFrame(data, columns = ["PostalCode", "Borough", "Neighborhood"])
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A\n,North York\n,Parkwoods\n
1,M4A\n,North York\n,Victoria Village\n
2,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"
3,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights\n"
4,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government\n"
5,M9A\n,Etobicoke\n,"Islington Avenue, Humber Valley Village\n"
6,M1B\n,Scarborough\n,"Malvern, Rouge\n"
7,M3B\n,North York\n,Don Mills\n
8,M4B\n,East York\n,"Parkview Hill, Woodbine Gardens\n"
9,M5B\n,Downtown Toronto\n,"Garden District, Ryerson\n"


**Remove the "\n" at the end of each string**

In [5]:
# Every scraped cell ends with "\n"; remove it from each of the three text
# columns so the values can be compared and joined on cleanly.
for text_column in ('PostalCode', 'Borough', 'Neighborhood'):
    df[text_column] = df[text_column].str.replace("\n", "")
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


**Get the shape of the data frame**

In [6]:
# (rows, columns) of the scraped table after rows with an unassigned borough were dropped.
df.shape

(103, 3)

In [7]:
# Per-column dtype and non-null count, to confirm no column has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   PostalCode    103 non-null    object
 1   Borough       103 non-null    object
 2   Neighborhood  103 non-null    object
dtypes: object(3)
memory usage: 2.5+ KB


### Getting the latitude and longitude of each postal code (from the Geospatial_data CSV rather than the Geocoder package)

In [19]:
from geopy.geocoders import Nominatim
# json_normalize is exposed at the top level of pandas since 1.0; the old
# pandas.io.json import path is deprecated and fails on recent pandas releases.
from pandas import json_normalize
import folium

In [11]:
# Load the lookup table of coordinates per postal code from the hosted CSV
# (columns: "Postal Code", "Latitude", "Longitude").
df_geocoder = pd.read_csv("http://cocl.us/Geospatial_data")
df_geocoder.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
# Attach coordinates to every scraped postal code. A left join keeps all rows
# of df; the duplicate key column from the lookup table is dropped afterwards.
df_loc = (
    df.merge(
        df_geocoder,
        how = 'left',
        left_on = 'PostalCode',
        right_on = 'Postal Code',
    )
    .drop(columns = 'Postal Code')
)
df_loc.head(12)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


### Explore and Cluster the neighborhoods in Toronto

**Get the latitude and longitude of Toronto**

In [55]:
# Resolve the city's coordinates through Nominatim; they are used as the
# centre point of the map.
address = "Toronto , ON"
geolocator = Nominatim(user_agent = "toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; raise a clear error
# here instead of an AttributeError on `.latitude` below.
if location is None:
    raise ValueError('Nominatim returned no result for address {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The latitude and longitude of Toronto are {}, {}.'.format(latitude, longitude))


The latitude and longitude of Toronto are 43.6534817, -79.3839347.


**Create a map for Toronto**

In [62]:
# Base map centred on the geocoded Toronto coordinates; ending the cell with the
# map object renders it inline.
map_toronto = folium.Map(location = [latitude, longitude], zoom_start = 10)
map_toronto

In [64]:
# Draw one circle marker per postal-code row, with a "<neighborhood>, <borough>"
# popup. NOTE: map_toronto is mutated in place, so re-running this cell without
# re-creating the map adds a second copy of every marker.

# The coordinates come from a left join, so unmatched postal codes would carry
# NaN here; folium rejects NaN locations, so those rows are left off the map.
df_loc_valid = df_loc.dropna(subset = ['Latitude', 'Longitude'])

for lat, lng, borough, neighborhood in zip(
        df_loc_valid['Latitude'],
        df_loc_valid['Longitude'],
        df_loc_valid['Borough'],
        df_loc_valid['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html=True escapes the label text so characters such as the
    # apostrophe in "Queen's Park" cannot break the generated HTML/JS.
    label = folium.Popup(label, parse_html = True)
    # parse_html is a Popup option, not a CircleMarker one, so it is no longer
    # passed to the marker itself.
    folium.CircleMarker(
        [lat, lng],
        radius = 3,
        popup = label,
        color = '#466d77',
        fill = True,
        fill_color = '#d4dde3',
        fill_opacity = 0.9,
    ).add_to(map_toronto)
map_toronto