# Segmenting and Clustering Neighborhoods in Toronto

### This notebook was created and tested in IBM Watson Studio

## Part 1: Scrape and clean the postal code table

In [None]:
import pandas as pd
print('library imported')

In [None]:
# Parse every HTML table found on the Wikipedia page that lists
# the Canadian postal codes starting with "M"
tables = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
)
print('table parsed')

In [None]:
# Take the first table parsed from the page as the working dataframe.
# Later cells read its 'Postal Code', 'Borough' and 'Neighbourhood' columns.
postalcode = tables[0]
postalcode

In [None]:
# Keep only the rows whose borough is something other than "Not assigned"
postalcode = postalcode.loc[postalcode['Borough'].ne("Not assigned")]
postalcode

In [None]:
# A single postal code area can cover several neighbourhoods (the example
# the author cites is 'M5A'); display the row at position 2 to illustrate
postalcode.iloc[2]

In [None]:
# If a row has a borough but a "Not assigned" neighbourhood, the neighbourhood
# becomes the same as the borough.
unassigned = postalcode['Neighbourhood'] == 'Not assigned'

# postalcodeA: rows whose neighbourhood is "Not assigned".
# .copy() gives an independent frame, so the assignment below does not write
# into a slice of `postalcode` (which raises SettingWithCopyWarning).
postalcodeA = postalcode[unassigned].copy()
postalcodeA['Neighbourhood'] = postalcodeA['Borough']

# postalcodeB: all remaining rows, left untouched
postalcodeB = postalcode[~unassigned]

# Combine the two tables, postalcodeA first and postalcodeB second.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
postalcode = pd.concat([postalcodeA, postalcodeB])
postalcode

In [None]:
# .shape is an attribute holding (number of rows, number of columns);
# its first entry is the number of rows of the dataframe
postalcode.shape

## Part 2: Geocode the postal codes

In [None]:
# install library -- grocoder
!pip install geocoder

# import geocoder
import geocoder
print('library imported')

In [None]:
# create a function to get latitude and longitude
def getll(postal_code, max_attempts=5):
    """Geocode a postal code to a ``(latitude, longitude)`` pair.

    The query ``'<postal_code>, Toronto, Ontario'`` is sent to the ``arcgis``
    provider first. ``geolytica`` and then ``google`` are consulted only when
    the previous provider returned ``None`` as its ``latlng``.

    Parameters
    ----------
    postal_code : str
        Text placed in front of ``', Toronto, Ontario'`` in the query.
    max_attempts : int, default 5
        How many full passes over the three providers are made before giving
        up. The previous implementation retried without any limit, which hung
        the notebook forever for a query that no provider can resolve.

    Returns
    -------
    tuple
        ``(latitude, longitude)``: the first two entries of the provider's
        ``latlng`` value.

    Raises
    ------
    RuntimeError
        If no provider returned coordinates within ``max_attempts`` passes.
    """
    query = '{}, Toronto, Ontario'.format(postal_code)
    provider_names = ('arcgis', 'geolytica', 'google')

    for _ in range(max_attempts):
        for provider_name in provider_names:
            # look the provider up lazily so a fallback is only touched
            # when the providers before it failed
            lat_lng_coords = getattr(geocoder, provider_name)(query).latlng
            if lat_lng_coords is not None:
                latitude = lat_lng_coords[0]
                longitude = lat_lng_coords[1]
                return latitude, longitude

    raise RuntimeError(
        'could not geocode {!r} after {} attempt(s)'.format(query, max_attempts)
    )


print('function created')

In [None]:
# Choose which postal codes will be geocoded. Alternatives kept for reference:
#   postalcode_q = postalcode              # every postal code
#   postalcode_q = postalcode.sample(n=2)  # two random rows, for a quick test
# Active choice: boroughs whose name contains the word "Toronto"
postalcode_q = postalcode.loc[postalcode['Borough'].str.contains('Toronto')]

# number of selected rows
i = len(postalcode_q)
print('neighbourhoods range defined')

In [None]:
# create table with latitude and longitude
column_names = ['Postal Code', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude']

# Collect one record per postal code and build the dataframe once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call, which made the loop quadratic.
records = []

# Countdown shown next to each row; derived here from the frame itself so
# re-running this cell always starts from the full row count.
remaining = postalcode_q.shape[0]
for P, B, N in zip(postalcode_q['Postal Code'], postalcode_q['Borough'], postalcode_q['Neighbourhood']):
    print(remaining, P, B, N)
    remaining -= 1
    lat, lon = getll(P)
    records.append({'Postal Code': P,
                    'Borough': B,
                    'Neighbourhood': N,
                    'Latitude': lat,
                    'Longitude': lon})

# columns= keeps the expected layout even when no rows were selected
neighbourhoods = pd.DataFrame(records, columns=column_names)
neighbourhoods

## Part 3: Explore and cluster the neighbourhoods

In [None]:
import requests
print('library imported')

In [None]:
# create a function to get nearby venues
def getNearbyVenues(names, latitudes, longitudes, radius = 500,
                    client_id=None, client_secret=None):
    """Collect the venues the Foursquare ``venues/explore`` endpoint returns
    around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of every location to query; they are walked in
        parallel with ``zip``.
    radius : int, default 500
        Passed to the API as the ``radius`` parameter.
    client_id, client_secret : str, optional
        Foursquare credentials. When omitted they are read from the
        ``FOURSQUARE_CLIENT_ID`` / ``FOURSQUARE_CLIENT_SECRET`` environment
        variables, so no secret has to be stored in the notebook.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these columns even when no venue was found.

    Raises
    ------
    ValueError
        If no credentials were passed and the environment variables are unset.
    requests.HTTPError
        If the API answers with an error status.
    """
    import os  # local import: the cell works without another cell importing os

    venue_columns = ['Neighbourhood',
                     'Neighbourhood Latitude',
                     'Neighbourhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    if client_id is None:
        client_id = os.environ.get('FOURSQUARE_CLIENT_ID')
    if client_secret is None:
        client_secret = os.environ.get('FOURSQUARE_CLIENT_SECRET')
    if not client_id or not client_secret:
        raise ValueError(
            'Foursquare credentials missing: pass client_id/client_secret or set '
            'FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET'
        )

    venue_rows = []
    for name, lat, lon in zip(names, latitudes, longitudes):
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'll': '{},{}'.format(lat, lon),
                'radius': radius,
                'limit': 100,
                'client_id': client_id,
                'client_secret': client_secret,
                'v': '20210101',
            },
            timeout=30,  # never block the notebook on a stalled connection
        )
        response.raise_for_status()

        # a response without any group simply contributes no venues
        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        for v in items:
            venue = v['venue']
            # a venue may carry an empty category list
            categories = venue.get('categories') or []
            venue_rows.append((name, lat, lon,
                               venue['name'],
                               venue['location']['lat'],
                               venue['location']['lng'],
                               categories[0]['name'] if categories else None))

    # passing columns= here (instead of assigning them afterwards) keeps the
    # call from failing when venue_rows is empty
    nearby_venues = pd.DataFrame(venue_rows, columns=venue_columns)

    return(nearby_venues)

print('function created')

In [None]:
# Fetch the venues around every geocoded neighbourhood
Toronto_venues = getNearbyVenues(
    names=neighbourhoods['Neighbourhood'],
    latitudes=neighbourhoods['Latitude'],
    longitudes=neighbourhoods['Longitude'],
)

# Report how many distinct venue categories came back
distinct_categories = Toronto_venues['Venue Category'].unique()
print("there are {} unique category".format(len(distinct_categories)))

# Non-null entries per column for each neighbourhood
Toronto_venues.groupby('Neighbourhood').count()


In [None]:
# Analyze
# One-hot encode the venue category: one indicator column per category,
# named after the category itself (no prefix)
toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Put the neighbourhood of each venue back next to its indicators
toronto_onehot['Neighbourhood'] = Toronto_venues['Neighbourhood']

# Rotate the column order so that the last column becomes the first one
fixed_columns = toronto_onehot.columns.tolist()
fixed_columns = fixed_columns[-1:] + fixed_columns[:-1]
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot

In [None]:
# Average the indicator columns per neighbourhood, i.e. the share of each
# venue category among the neighbourhood's venues; 'Neighbourhood' stays a
# regular column
toronto_grouped = toronto_onehot.groupby('Neighbourhood', as_index=False).mean()
toronto_grouped

In [None]:
# print each neighbourhood along with its top 5 venue categories
num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    # transpose the neighbourhood's single row into (category, frequency) pairs
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    # Drop the first row (it holds the neighbourhood name), then convert and
    # round in one chain. Assigning a column onto the iloc slice, as before,
    # can raise SettingWithCopyWarning.
    temp = (
        temp.iloc[1:]
        .assign(freq=lambda frame: frame['freq'].astype(float))
        .round({'freq': 2})
    )
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

In [None]:
# create function to sort the venues in descending order

def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``.

    The first entry of ``row`` is skipped (in this notebook it holds the
    neighbourhood name). The remaining entries are sorted from largest to
    smallest and the index labels of the first ``num_top_venues`` of them
    are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]
print('function created')

In [None]:
import numpy as np
print('library imported')

In [None]:
# top 10 venues for each neighbourhood
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in range(num_top_venues):
    # Ranks 1-3 take 'st'/'nd'/'rd', every later rank takes 'th'. An explicit
    # bounds check replaces the bare `except:` that hid every kind of error.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

# fill every row with that neighbourhood's categories, most frequent first
for ind in range(toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

In [None]:
from sklearn.cluster import KMeans
print('library imported')

In [None]:
# cluster neighbourhoods
# set number of clusters
kclusters = 5

# Features are the per-category frequencies; the name column is not one.
# `columns=` replaces the positional axis argument (`drop('Neighbourhood', 1)`),
# which pandas 2.0 no longer accepts.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighbourhood')

# run k-means clustering (fixed random_state keeps the labels reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:100]

In [None]:
# create a new data frame that includes the top 10 venues for each neighbourhood
# Add the clustering labels. A 'Cluster Labels' column left over from an
# earlier run of this cell is dropped first, because DataFrame.insert raises
# when the column already exists.
if 'Cluster Labels' in neighbourhoods_venues_sorted.columns:
    neighbourhoods_venues_sorted = neighbourhoods_venues_sorted.drop(columns='Cluster Labels')
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = neighbourhoods

# merge the ranked venues into the geocoded neighbourhoods so every row has
# its latitude/longitude, cluster label and top venues
toronto_merged = toronto_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')
# NOTE(review): rows without a match in the join get 0 in every joined column,
# including 'Cluster Labels', so they become indistinguishable from cluster 0
toronto_merged = toronto_merged.replace(np.nan, 0)

toronto_merged

In [None]:
# install library -- folium
!pip install folium

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
print('library imported')

In [None]:
# visualize the result
# Centre the map on the result of geocoding ', Toronto, Ontario'. One lookup
# is enough: the previous code sent the same request twice.
center_lat, center_lon = getll('')
map_clusters = folium.Map(location=[center_lat, center_lon], zoom_start=12)

# set color scheme for the clusters: one evenly spaced rainbow colour each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add markers to the map. The loop variables have their own names so they do
# not overwrite the map centre.
for hood_lat, hood_lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels start at 0, so they index the palette directly. The former
    # `cluster-1` only worked because -1 wraps around to the last colour.
    marker_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [hood_lat, hood_lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## Examine clusters


In [None]:
# Cluster 0: show column 1 (the borough) together with every column from
# position 5 onwards (cluster label and ranked venues)
n = 0
toronto_merged[toronto_merged['Cluster Labels'] == n].iloc[:, [1] + list(range(5, toronto_merged.shape[1]))]

In [None]:
# Cluster 1: show column 1 (the borough) together with every column from
# position 5 onwards (cluster label and ranked venues)
n = 1
toronto_merged[toronto_merged['Cluster Labels'] == n].iloc[:, [1] + list(range(5, toronto_merged.shape[1]))]

In [None]:
# Cluster 2: show column 1 (the borough) together with every column from
# position 5 onwards (cluster label and ranked venues)
n = 2
toronto_merged[toronto_merged['Cluster Labels'] == n].iloc[:, [1] + list(range(5, toronto_merged.shape[1]))]

In [None]:
# Cluster 3: show column 1 (the borough) together with every column from
# position 5 onwards (cluster label and ranked venues)
n = 3
toronto_merged[toronto_merged['Cluster Labels'] == n].iloc[:, [1] + list(range(5, toronto_merged.shape[1]))]

In [None]:
# Cluster 4: show column 1 (the borough) together with every column from
# position 5 onwards (cluster label and ranked venues)
n = 4
toronto_merged[toronto_merged['Cluster Labels'] == n].iloc[:, [1] + list(range(5, toronto_merged.shape[1]))]

In [None]:
# Thank you and have a nice day!