# Segmenting and Clustering Neighborhoods in Toronto

### 1. Requirements

In [1]:
import os # library to read settings from environment variables

import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import requests  # library to handle requests

# Installing the BeautifulSoup package
!conda install -c conda-forge beautifulsoup4 --yes
from bs4 import BeautifulSoup

# Installing the geocoder package
!conda install -c conda-forge geocoder --yes 
import geocoder 

# Installing the geopy package
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# Installing the folium package
!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

Solving environment: done


  current version: 4.5.11
  latest version: 4.8.2

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    scikit-learn-0.20.1        |   py36h22eb022_0         5.7 MB
    liblapack-3.8.0            |      11_openblas          10 KB  conda-forge
    numpy-1.18.1               |   py36h95a1406_0         5.2 MB  conda-forge
    liblapacke-3.8.0           |      11_openblas          10 KB  conda-forge
    libopenblas-0.3.6          |       h5a2b251_2         7.7 MB
    scipy-1.4.1                |   py36h921218d_0        18.9 MB  conda-forge
    beautifulsoup4-4.8.2       |           py36_

### 2. Scraping and Cleaning data

In [2]:
# Download the Wikipedia page that lists the postal codes starting with "M".
res = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,  # do not let the cell hang forever on a stalled connection
)
res.raise_for_status()  # fail here on an HTTP error instead of parsing an error page
soup = BeautifulSoup(res.content,"html.parser")
table = soup.find_all('table')[0]  # only the first table of the page is scraped

# Scrape table and pass data to array.
# Each kept row contributes the stripped text of its first three <td> cells.
# Rows without <td> cells (e.g. header rows) are skipped, and so are rows with
# fewer than three cells, which would otherwise raise an IndexError.
board_members = [
    tuple(cell.text.strip() for cell in cols[:3])
    for cols in (row.find_all('td') for row in table.find_all('tr'))
    if len(cols) >= 3
]
board_array = np.asarray(board_members)

# Convert array to dataframe
df = pd.DataFrame(board_array)
df.columns = ['PostalCode', 'Borough', 'Neighborhood']

# Ignore cells with a borough that is Not assigned
df = df[df['Borough'] != 'Not assigned']

# A neighborhood that is Not assigned takes the name of its borough
df = df.assign(
    Neighborhood=df['Neighborhood'].where(df['Neighborhood'] != 'Not assigned', df['Borough'])
)

# Combine the rows that share a postal code: the neighborhoods are joined with
# ", " and the borough of the first row is kept. sort=False keeps the postal
# codes in their order of first appearance, and duplicates are merged even
# when they are not on adjacent rows.
df = (
    df.groupby('PostalCode', sort=False)
      .agg({'Borough': 'first', 'Neighborhood': ', '.join})
      .reset_index()
)

# Read the CSV file that maps each postal code to its coordinates
lat_lng = pd.read_csv('Geospatial_Coordinates.csv')

# One row per postal code (the first match wins), keyed like df
coordinates = (
    lat_lng[['Postal Code', 'Latitude', 'Longitude']]
    .drop_duplicates(subset='Postal Code')
    .rename(columns={'Postal Code': 'PostalCode'})
)

# Fail with the offending codes spelled out rather than with an opaque IndexError
missing_codes = df.loc[~df['PostalCode'].isin(coordinates['PostalCode']), 'PostalCode'].tolist()
if missing_codes:
    raise ValueError('No coordinates found for postal codes: {}'.format(missing_codes))

# Add the latitude and longitude columns with a single join on the postal code.
# Any existing coordinate columns are dropped first so the cell can be re-run;
# the merged columns keep the numeric dtype read from the CSV.
df = (
    df.drop(columns=['Latitude', 'Longitude'], errors='ignore')
      .merge(coordinates, on='PostalCode', how='left')
)

df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7533,-79.3297
1,M4A,North York,Victoria Village,43.7259,-79.3156
2,M5A,Downtown Toronto,Harbourfront,43.6543,-79.3606
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.7185,-79.4648
4,M7A,Downtown Toronto,Queen's Park,43.6623,-79.3895
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.6537,-79.5069
99,M4Y,Downtown Toronto,Church and Wellesley,43.6659,-79.3832
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.6627,-79.3216
101,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So...",43.6363,-79.4985


### 3. Exploring and clustering the neighborhoods in Toronto

In [3]:
# To keep things simple, segment and cluster only the boroughs whose name
# contains the word "Toronto". The filter is a plain substring match
# (regex=False); rows without a borough name are dropped (na=False). The index
# is renumbered from 0 so later positional lookups stay valid.
df = df[df['Borough'].str.contains('Toronto', regex=False, na=False)].reset_index(drop=True)
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.6543,-79.3606
1,M7A,Downtown Toronto,Queen's Park,43.6623,-79.3895
2,M5B,Downtown Toronto,"Ryerson, Garden District",43.6572,-79.3789
3,M5C,Downtown Toronto,St. James Town,43.6515,-79.3754
4,M4E,East Toronto,The Beaches,43.6764,-79.293
5,M5E,Downtown Toronto,Berczy Park,43.6448,-79.3733
6,M5G,Downtown Toronto,Central Bay Street,43.658,-79.3874
7,M6G,Downtown Toronto,Christie,43.6695,-79.4226
8,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.6506,-79.3846
9,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669,-79.4423


#### The geographical coordinates of Toronto.

In [4]:
# Resolve the city name to coordinates with the Nominatim geocoder.
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() returns None when nothing matches; stop with a clear message
# instead of an AttributeError on `.latitude` below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


####  Create a map of Toronto with neighborhoods 

In [5]:
# Base map centred on the Toronto coordinates computed earlier.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# Appearance shared by every neighborhood marker.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle marker per row of df; the popup shows the postal code.
for lat, lng, postal_code in zip(df['Latitude'], df['Longitude'], df['PostalCode']):
    popup = folium.Popup(postal_code, parse_html=True)
    marker = folium.CircleMarker([lat, lng], popup=popup, **marker_style)
    marker.add_to(map_toronto)

map_toronto

#### Define Foursquare Credentials and Version

In [6]:
# The Foursquare credentials are read from the environment so that they are
# never stored in the notebook source or echoed into its saved output.
# Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting Jupyter.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Report only whether the credentials are present, never their values.
if CLIENT_ID and CLIENT_SECRET:
    print('Foursquare credentials loaded from the environment.')
else:
    print('Foursquare credentials are missing: set FOURSQUARE_CLIENT_ID and '
          'FOURSQUARE_CLIENT_SECRET before running the cells below.')

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


#### Explore Neighborhoods in Toronto

In [7]:
# Let's create a function to repeat the same process for all the neighborhoods in Toronto
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100):
    """Collect the venues Foursquare reports around each given location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences; element i describes one location (its label and
        its coordinates). They are consumed together with zip().
    radius : int, optional
        Value sent as the ``radius`` query parameter (default 500).
    limit : int, optional
        Value sent as the ``limit`` query parameter (default 100).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty, with the same columns, when no venue is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # make the GET request; requests builds and escapes the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30,
        )
        # surface rejected requests here rather than as a KeyError below
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category gets None instead of raising IndexError
                categories[0]['name'] if categories else None))

    # passing the column names to the constructor also works when rows is empty
    return pd.DataFrame(rows, columns=columns)

In [8]:
# Query the venues around every postal code of df and collect them in a new
# dataframe called toronto_venues (the postal code is used as the location label).
toronto_venues = getNearbyVenues(
    names=df['PostalCode'],
    latitudes=df['Latitude'],
    longitudes=df['Longitude'],
)
toronto_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M5A,43.654260,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,M5A,43.654260,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,M5A,43.654260,-79.360636,Cooper Koo Family YMCA,43.653191,-79.357947,Gym / Fitness Center
3,M5A,43.654260,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,M5A,43.654260,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
...,...,...,...,...,...,...,...
1686,M7Y,43.662744,-79.321558,The Ten Spot,43.664815,-79.324213,Spa
1687,M7Y,43.662744,-79.321558,Toronto Yoga Mamas,43.664824,-79.324335,Yoga Studio
1688,M7Y,43.662744,-79.321558,Olliffe On Queen,43.664503,-79.324768,Butcher
1689,M7Y,43.662744,-79.321558,TTC Stop #03049,43.664470,-79.325145,Light Rail Station


#### Analyze Each Neighborhood

In [42]:
# one hot encoding: one indicator column per venue category
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# A category that is itself called "Neighborhood" would clash with the label
# column; the label column wins, so that indicator is left out.
category_columns = toronto_onehot.drop(columns='Neighborhood', errors='ignore')

# Put the label column first and attach all indicators in one concatenation
# (adding hundreds of columns one at a time fragments the frame).
toronto_df = pd.concat([toronto_venues[['Neighborhood']], category_columns], axis=1)
toronto_df

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1686,M7Y,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1687,M7Y,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1688,M7Y,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1689,M7Y,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Next, let's group rows by neighborhood, taking the mean of the frequency of occurrence of each category

In [43]:
# Average the indicator columns within each neighborhood; the group label is
# turned back into a regular 'Neighborhood' column.
toronto_grouped = (
    toronto_df
    .groupby('Neighborhood')
    .mean()
    .reset_index()
)
toronto_grouped

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025,0.0,...,0.0,0.025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04878,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.02439,0.0,0.0,0.02439
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,M4P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,M4R,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05
7,M4S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,M4T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,M4V,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,...,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0


#### Let's put that into a *pandas* dataframe

In [44]:
# Helper that ranks the entries of one row, largest value first.

def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of `row`.

    The first element of `row` is skipped; the remaining entries are sorted
    by value in descending order and the index labels of at most
    `num_top_venues` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [60]:
#Now let's create the new dataframe and display the top 10 venues for each neighborhood.
num_top_venues = 10

# ordinal suffixes for ranks 1 to 3; every later rank uses 'th'
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # explicit bounds check instead of a bare `except:`, which would also hide
    # unrelated errors
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# rank the categories of every neighborhood, one array of labels per row
top_venues = [
    return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    for ind in np.arange(toronto_grouped.shape[0])
]

# create the new dataframe in one go, then put the neighborhood label first
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:], index=toronto_grouped.index)
neighborhoods_venues_sorted.insert(0, 'Neighborhood', toronto_grouped['Neighborhood'])

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,Park,Trail,Health Food Store,Pub,Cupcake Shop,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store
1,M4K,Greek Restaurant,Italian Restaurant,Coffee Shop,Ice Cream Shop,Furniture / Home Store,Yoga Studio,Frozen Yogurt Shop,Pub,Pizza Place,Liquor Store
2,M4L,Sandwich Place,Brewery,Ice Cream Shop,Pub,Fish & Chips Shop,Sushi Restaurant,Light Rail Station,Fast Food Restaurant,Food & Drink Shop,Italian Restaurant
3,M4M,Café,Coffee Shop,Gastropub,Bakery,Brewery,Italian Restaurant,American Restaurant,Sandwich Place,Cheese Shop,Pet Store
4,M4N,Park,Swim School,Bus Line,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run
5,M4P,Park,Gym,Breakfast Spot,Sandwich Place,Food & Drink Shop,Hotel,Department Store,Deli / Bodega,Dumpling Restaurant,Donut Shop
6,M4R,Clothing Store,Coffee Shop,Yoga Studio,Bagel Shop,Gym / Fitness Center,Fast Food Restaurant,Diner,Dessert Shop,Mexican Restaurant,Chinese Restaurant
7,M4S,Sandwich Place,Dessert Shop,Pizza Place,Gym,Italian Restaurant,Café,Sushi Restaurant,Coffee Shop,Gas Station,Indian Restaurant
8,M4T,Park,Playground,Dance Studio,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store
9,M4V,Coffee Shop,Pub,Sushi Restaurant,Fried Chicken Joint,Sports Bar,Pizza Place,Restaurant,American Restaurant,Liquor Store,Vietnamese Restaurant


#### Cluster Neighborhoods

In [61]:
#Run k-means to cluster the neighborhood into 5 clusters
# set number of clusters
kclusters = 5

# feature matrix: every column except the neighborhood label
# (the axis is named with columns=; a positional axis is rejected by pandas 2.x)
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering; n_init is spelled out so the number of restarts does
# not depend on the default of the installed scikit-learn release
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 3, 0, 0, 0, 2, 0], dtype=int32)

#### Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood

In [62]:
# add clustering labels
# Any existing 'Cluster Labels' column is dropped first and the rename is a
# no-op the second time, so this cell can be re-run without insert() failing.
neighborhoods_venues_sorted = (
    neighborhoods_venues_sorted
    .drop(columns='Cluster Labels', errors='ignore')
    .rename(columns={'Neighborhood': 'PostalCode'})
)
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# join the cluster label and top venues onto df, which carries the borough,
# neighborhood and latitude/longitude of each postal code
toronto_merged = df.join(neighborhoods_venues_sorted.set_index('PostalCode'), on='PostalCode')

# Postal codes absent from neighborhoods_venues_sorted get NaN from the join
# and cannot be placed in a cluster: drop them and keep integer labels.
toronto_merged = toronto_merged.dropna(subset=['Cluster Labels'])
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype(int)

toronto_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,Harbourfront,43.6543,-79.3606,0,Coffee Shop,Pub,Park,Café,Bakery,Mexican Restaurant,Theater,Breakfast Spot,Performing Arts Venue,Chocolate Shop
1,M7A,Downtown Toronto,Queen's Park,43.6623,-79.3895,0,Coffee Shop,Park,Gym,Yoga Studio,Burrito Place,Beer Bar,Italian Restaurant,Seafood Restaurant,Sandwich Place,Juice Bar
2,M5B,Downtown Toronto,"Ryerson, Garden District",43.6572,-79.3789,0,Coffee Shop,Clothing Store,Japanese Restaurant,Café,Cosmetics Shop,Diner,Bookstore,Bakery,Electronics Store,Bubble Tea Shop
3,M5C,Downtown Toronto,St. James Town,43.6515,-79.3754,0,Coffee Shop,Café,Restaurant,American Restaurant,Bakery,Beer Bar,Clothing Store,Diner,Cosmetics Shop,Hotel
4,M4E,East Toronto,The Beaches,43.6764,-79.293,0,Park,Trail,Health Food Store,Pub,Cupcake Shop,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store
5,M5E,Downtown Toronto,Berczy Park,43.6448,-79.3733,0,Coffee Shop,Farmers Market,Beer Bar,Seafood Restaurant,Cocktail Bar,Bakery,Cheese Shop,Steakhouse,Café,Italian Restaurant
6,M5G,Downtown Toronto,Central Bay Street,43.658,-79.3874,0,Coffee Shop,Café,Italian Restaurant,Juice Bar,Burger Joint,Sandwich Place,Japanese Restaurant,Ice Cream Shop,Department Store,Bar
7,M6G,Downtown Toronto,Christie,43.6695,-79.4226,0,Grocery Store,Café,Park,Athletics & Sports,Diner,Italian Restaurant,Restaurant,Baby Store,Candy Store,Gas Station
8,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.6506,-79.3846,0,Coffee Shop,Steakhouse,Thai Restaurant,Café,Burger Joint,Bar,Bakery,Cosmetics Shop,Sushi Restaurant,Restaurant
9,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669,-79.4423,0,Bakery,Pharmacy,Grocery Store,Gym / Fitness Center,Middle Eastern Restaurant,Music Venue,Pool,Café,Brewery,Bar


#### Finally, let's visualize the resulting clusters

In [64]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'],
                                  toronto_merged['Longitude'],
                                  toronto_merged['PostalCode'],
                                  toronto_merged['Cluster Labels']):
    # a row without a cluster label (NaN) cannot be coloured: skip it
    if pd.isna(cluster):
        continue
    # labels may arrive as floats; a list index must be an int
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # cluster k uses colour k (no reliance on negative-index wrap-around)
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters