# Segmenting and Clustering Neighborhoods in Toronto

## Install necessary libraries

In [20]:
import numpy as np # library to handle data in a vectorized manner
!conda install -c conda-forge geopy --yes

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!pip install beautifulsoup4
#!pip install lxml
#!pip install requests

from bs4 import BeautifulSoup
import requests
import pandas as pd

!conda install -c conda-forge folium=0.5.0 --yes 
import folium

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



## Webscraping

#### Bring in Beautifulsoup package

In [21]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')

#### Parse html and remove "Not assigned"

In [22]:
Nhoods_df = pd.DataFrame(columns = ['PostalCode', 'Borough', 'Neighborhood'])

#table = soup.find('table', class_= 'wikitable sortable jquery-tablesorter')
table = soup.find('table')
#table_body = table.find('tbody') #tbody is the name on wikipedia
table_body = table.tbody

rows = table_body.find_all('tr')

for row in rows:
    cols = row.find_all('td')
    counter = -1
    # Placeholder for each row entry
    table_split = pd.DataFrame([["Empty", "Empty", "Empty"]], columns = ['PostalCode', 'Borough', 'Neighborhood'])
    
    #Start assuming row will not be rejected
    reject = 0
    
    # Loop through all columns in each row
    for col in cols:
        counter = counter + 1
        # Last column had /n at the back of everything
        if counter == 2:
            new_text = col.get_text()[:-1]
            # Check if neighbourhood missing
            if new_text == "Not assigned":
                new_text = table_split.iloc[0,counter - 1]
        else:
            new_text = col.get_text()
        table_split.iloc[0,counter] = new_text
            
        # Remove "Not assigned", start with blank slate of reject = 0
        if new_text == "Not assigned" and counter == 1:
            reject = 1 
    
    if reject == 0:
        Nhoods_df = Nhoods_df.append(table_split)
        
Nhoods_df = Nhoods_df[1:]
    
Nhoods_df.to_csv(r'C:\Users\Tim Blake\Documents\Work\Training\IBM data science specialization\Final Capstone Project\test2.csv')


#### Combining neighborhoods

In [23]:
Nhoods_df_merge = Nhoods_df.groupby(by=['PostalCode','Borough']).agg(lambda x: ','.join(x))
Nhoods_df_merge.reset_index(level=['PostalCode','Borough'], inplace=True)
Nhoods_df_merge


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


#### Give shape of dataframe

In [52]:
Nhoods_df_merge.shape

(103, 3)

## Geocoding

#### Import packages

In [26]:
from geopy.geocoders import Nominatim


#### Toronto Geocodes

In [70]:
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
tor_latitude = location.latitude
tor_longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.775347, -79.3459439.


#### Other geocodes on unmerged dataset

In [71]:
# Add latitude and longitude to unmerged dataset
Nhoods_df['Latitude'] = 0
Nhoods_df['Longitude'] = 0

# initialize your variable to None
#lat_lng_coords = None

row_count = len(Nhoods_df.index)

#for i in range (0,row_count):
for i in range (0,row_count):

    # Find coordinates
    try:
        address = Nhoods_df.iloc[i,2] + ', Toronto, Ontario'
        geolocator = Nominatim(user_agent="ny_explorer")
        location = geolocator.geocode(address)
        latitude = location.latitude
        longitude = location.longitude
        #print('The geograpical coordinates are {}, {}.'.format(latitude, longitude))

        Nhoods_df.iloc[i,3] = latitude
        Nhoods_df.iloc[i,4] = longitude
    
    except:
        Nhoods_df.iloc[i,3] = tor_latitude
        Nhoods_df.iloc[i,4] = tor_longitude
        
Nhoods_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7588,-79.320197
0,M4A,North York,Victoria Village,43.732658,-79.311189
0,M5A,Downtown Toronto,Harbourfront,43.64008,-79.38015
0,M6A,North York,Lawrence Heights,43.722778,-79.450933
0,M6A,North York,Lawrence Manor,43.722079,-79.437507
0,M7A,Downtown Toronto,Queen's Park,43.659659,-79.39034
0,M9A,Queen's Park,Queen's Park,43.659659,-79.39034
0,M1B,Scarborough,Rouge,43.80493,-79.165837
0,M1B,Scarborough,Malvern,43.809196,-79.221701
0,M3B,North York,Don Mills North,43.775347,-79.345944


#### Re-do merge

In [87]:
#Merge neighbourhoods
Nhoods_df_final = Nhoods_df.groupby(by=['PostalCode','Borough']).agg({'Neighborhood': lambda x: ','.join(x), 'Latitude': 'mean', 'Longitude': 'mean'})
Nhoods_df_final.reset_index(level=['PostalCode','Borough'], inplace=True)
Nhoods_df_final

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.807063,-79.193769
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.781964,-79.14627
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.773314,-79.194191
3,M1G,Scarborough,Woburn,43.759824,-79.225291
4,M1H,Scarborough,Cedarbrae,43.756467,-79.226692
5,M1J,Scarborough,Scarborough Village,43.743742,-79.211632
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.725012,-79.267197
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711279,-79.286143
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.725617,-79.232014
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.706641,-79.254134


In [94]:
Nhoods_df_final.shape

(103, 5)

## Visualization

In [90]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(Nhoods_df_final['Latitude'], Nhoods_df_final['Longitude'], Nhoods_df_final['Borough'], Nhoods_df_final['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### Cluster into 10 clusters

In [None]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

In [106]:
# set number of clusters
kclusters = 10

end_points = Nhoods_df_final[['Latitude', 'Longitude']]

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(end_points)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=0)

#### Update dataframe with clusters and visualize

In [110]:
# Import necessary packages

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [None]:
# add clustering labels
Nhoods_df_final.insert(0, 'Cluster Labels', kmeans.labels_)

In [111]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Nhoods_df_final['Latitude'], Nhoods_df_final['Longitude'], Nhoods_df_final['Neighborhood'], Nhoods_df_final['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters