# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

Waiting for a Spark session to start...
Spark Initialization Done! ApplicationId = app-20181016113350-0020


In [2]:
# Create the dataframe consist of three columns: PostalCode, Borough, and Neighborhood

URL ='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M' 
Rq = requests.get(URL)
BS = BeautifulSoup(Rq.content,'lxml')
table = BS.find_all('table')[0] 
df = pd.read_html(str(table))[0]
df_pcode = pd.DataFrame(df)

df_pcode.head()

Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [3]:
#  Drop the first coulmn and rename the coulum names

df_pcode.drop(0,inplace=True)
df_pcode.columns = ['Postcode','Borough','Neighborhood']
df_pcode.head()

Unnamed: 0,Postcode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [4]:
# Remove boroughs with 'not assigned' 

df_pcode1 = df_pcode[df_pcode.Borough.str.contains("Not assigned") == False]

# Reset index

df_pcode2 = df_pcode1.reset_index()
df_pcode3 = df_pcode2.drop(df_pcode2.columns[0],axis=1)
df_pcode3.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [5]:
# Combine rows with the same Postcode & Borough

df_pcode4 = df_pcode3.groupby(['Postcode', 'Borough'])['Neighborhood'].apply(', '.join).reset_index()
df_pcode4.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [6]:
# If a cell has a borough but a Not assigned neighborhood, then assign the neighborhood cell the same as the borough.

row=0
for row in range(103):
    if df_pcode4.Neighborhood[row]=='Not assigned':
        df_pcode4.Neighborhood[row] = df_pcode4.Borough[row]
        row=row+1
    
df_pcode4.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
# size of the dataframe
df_pcode4.shape

(103, 3)

## Add the latitude and the longitude coordinates of each neighborhood


In [8]:
#https://geocoder.readthedocs.io/index.html
!pip install geocoder

Collecting geocoder
  Using cached https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl
Collecting ratelim (from geocoder)
  Using cached https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Collecting future (from geocoder)
Collecting click (from geocoder)
  Using cached https://files.pythonhosted.org/packages/fa/37/45185cb5abbc30d7257104c434fe0b07e5a195a6847506c074527aa599ec/Click-7.0-py2.py3-none-any.whl
Collecting six (from geocoder)
  Using cached https://files.pythonhosted.org/packages/67/4b/141a581104b1f6397bfa78ac9d43d8ad29a7ca43ea90a2d863fe3056e86a/six-1.11.0-py2.py3-none-any.whl
Collecting requests (from geocoder)
  Using cached https://files.pythonhosted.org/packages/65/47/7e02164a2a3db50ed6d8a6ab1d6d60b69c4c3fdf57a284257925dfc12bda/requests-2.19.1-py2.py3-none-any.whl
Collecting decorator (from ra

In [9]:
import geocoder
import time

In [10]:
# Add columns for Latitude and Longitude

df_pcode4['Latitude'] = 0.0
df_pcode4['Longitude'] = 0.0
df_pcode4.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",0.0,0.0
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",0.0,0.0
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",0.0,0.0
3,M1G,Scarborough,Woburn,0.0,0.0
4,M1H,Scarborough,Cedarbrae,0.0,0.0


#### I have used the following code but the used geocoder package doesn't respond. Thus, the provided csv file has been used to populate the geographical coordinates in the dataframe   

start_time=time.time()
row=0
for row in range(103):
    print(row)
    lat_lng_coords = None
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(df_pcode4.Postcode[row]))
        lat_lng_coords = g.latlng
        
    print("--- %s seconds ---" % round((time.time() - start_time), 2))
    df_pcode4.Latitude[row] = lat_lng_coords[0]
    df_pcode4.Longitude[row] = lat_lng_coords[1]
    row=row+1
    
df_pcode4

In [11]:
# download the csv file of longitude and latitude

!wget -O latlog.csv http://cocl.us/Geospatial_data

--2018-10-16 11:33:57--  http://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 169.48.113.201
Connecting to cocl.us (cocl.us)|169.48.113.201|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cocl.us/Geospatial_data [following]
--2018-10-16 11:33:57--  https://cocl.us/Geospatial_data
Connecting to cocl.us (cocl.us)|169.48.113.201|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2018-10-16 11:33:57--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 107.152.27.197
Connecting to ibm.box.com (ibm.box.com)|107.152.27.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.ent.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2018-10-16 11:33:58--  https://ibm.ent.box.com/shared/

In [14]:
df_latlog = pd.read_csv("latlog.csv", delimiter=",")
df_pcode4['Latitude'] = df_latlog[['Latitude']].values
df_pcode4['Longitude'] = df_latlog[['Longitude']].values
df_pcode4

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
