# Assignment 2: Neighborhoods in Toronto

First we start by importing needed libraries

In [161]:
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner

from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

import folium # plotting library
from bs4 import BeautifulSoup
import geocoder

from sklearn.cluster import KMeans

## Scraping Wikipedia

We use the 'BeautifulSoup' library to scrape the Wikipedia page.

In [162]:
# Scrape wiki site for Toronto neighborhood data
wiki_link = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
src = wiki_link.content
soup = BeautifulSoup(src)

We then find the table using HTML tags and format it into a list of lists

In [163]:
# Parse out table and rows
table = soup.find('tbody')
table_rows = table.find_all('tr')

# Format into list
l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l.append(row)

## Data Cleansing

The goal is to get it into a pandas dataframe and clean it up. Each step in the cell below is annotated with a comment.

In [164]:
# Format into df
toronto_neigh = pd.DataFrame(l, columns=['Postcode', 'Borough', 'Neighbourhood'])  # Transform list into df
toronto_neigh = toronto_neigh[toronto_neigh.Borough.str.contains('Not assigned') == False]  # Get rid of 'Not assigned' in Boroughs
toronto_neigh['Neighbourhood'] = toronto_neigh.Neighbourhood.str[:-1]  # Get rid of extra '/n' token in Neighbourhood

# If Neighbourhood is 'Not assigned', assign it the Borough value
for i in range(len(toronto_neigh)):
    if toronto_neigh['Neighbourhood'].iloc[i] == 'Not assigned':
        toronto_neigh['Neighbourhood'].iloc[i] = toronto_neigh['Borough'].iloc[i]
        
toronto_neigh = toronto_neigh.groupby(['Postcode', 'Borough']).agg(lambda x: ', '.join(x))  # Join Postcodes together
toronto_neigh.reset_index(inplace=True)  # Reset index
toronto_neigh.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


We then display the shape of the dataframe.

In [165]:
toronto_neigh.shape

(103, 3)

## Getting location data

I created a function to get the latitude and longitude coordinates using geopy.

In [106]:
def get_coords(row):
    '''
    Returns latitude and longitude coords of an address.
    '''
    postal_code = row.Postcode
    
    geolocator = Nominatim(user_agent='my-application')
    
    try:
        location = geolocator.geocode(f'{postal_code}, Toronto, Ontario')
    except GeocoderTimedOut:
        return get_coords(row)
    
    if location is None:
        return -1, -1
    
    return location.latitude, location.longitude

I then use the function to fill in the latitude and longitude values.

In [107]:
toronto_neigh['Latitude'] = -1
toronto_neigh['Longitude'] = -1
for i in range(len(toronto_neigh)):
    lat, lon = get_coords(toronto_neigh.iloc[i])
    toronto_neigh['Latitude'].iloc[i] = lat
    toronto_neigh['Longitude'].iloc[i] = lon
    
toronto_neigh.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.653963,-79.387207
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.653963,-79.387207
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",-1.0,-1.0
3,M1G,Scarborough,Woburn,43.765717,-79.221898
4,M1H,Scarborough,Cedarbrae,-1.0,-1.0


## Backup using csv

Below is a backup method using the csv document containing the latitude and longitude coordinates. I used this because geopy would not get many of the locations.

In [166]:
path = '../../Geospatial_Coordinates.csv'
geospatial_data = pd.read_csv(path)
geospatial_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [167]:
toronto_neigh = toronto_neigh.set_index('Postcode').join(geospatial_data.set_index('Postal Code'))
toronto_neigh.reset_index(inplace=True)
toronto_neigh

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437


## Clustering Toronto Neighborhoods

We cluster the neighborhoods below.

In [168]:
X = toronto_neigh[['Latitude', 'Longitude', 'Neighbourhood']]

In [169]:
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters).fit(X[['Latitude', 'Longitude']])
kmeans.labels_

array([1, 1, 1, 1, 1, 1, 1, 4, 1, 4, 1, 4, 1, 1, 1, 1, 1, 0, 4, 0, 0, 0,
       0, 0, 0, 4, 4, 4, 0, 0, 0, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 2, 4, 2,
       0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2,
       2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 3, 3, 2, 3, 2, 3, 4,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])

In [170]:
X['Labels'] = kmeans.labels_
X.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Latitude,Longitude,Neighbourhood,Labels
0,43.806686,-79.194353,"Rouge, Malvern",1
1,43.784535,-79.160497,"Highland Creek, Rouge Hill, Port Union",1
2,43.763573,-79.188711,"Guildwood, Morningside, West Hill",1
3,43.770992,-79.216917,Woburn,1
4,43.773136,-79.239476,Cedarbrae,1


## Display data

We will make a map with folium that displays the clustered neighborhoods.

In [171]:
geolocator = Nominatim(user_agent='my-application')
toronto_location = geolocator.geocode('Toronto, Ontario')

In [172]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[toronto_location.latitude, toronto_location.longitude], zoom_start=10)

# Color dict
color = {0: 'blue', 1: 'red', 2: 'purple', 3: 'orange', 4: 'cyan'}

# add markers to map
for lat, lng, label, col in zip(X['Latitude'], X['Longitude'], X['Neighbourhood'], X['Labels']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=color[col],
        fill=True,
        fill_color=color[col],
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto