# Data handling
Ignore cells with a borough that is Not assigned

More than one neighborhood can exist in one postal code; these will be combined with the neighborhoods separated with a comma

If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

In [6]:
# bs4 ref: https://www.digitalocean.com/community/tutorials/how-to-scrape-web-pages-with-beautiful-soup-and-python-3
# bs4 ref: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

import pandas as pd
from bs4 import BeautifulSoup
import requests

# Download the Wikipedia page listing Toronto ("M") postal codes and locate the table.
result = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
result.raise_for_status()   # fail loudly on an HTTP error instead of parsing an error page
soup = BeautifulSoup(result.text, 'lxml')
table_list = soup.find("table", class_ = "wikitable sortable")
if table_list is None:
    raise RuntimeError("Could not find the 'wikitable sortable' table on the Wikipedia page")

# Keep only rows that have exactly three <td> cells (postal code, borough, neighborhood).
# strip() removes surrounding whitespace/newlines from every cell, so the postal code
# can be used as a clean merge key and no real character is ever chopped off.
rows = []
for entry in table_list.find_all('tr'):
    items = entry.find_all('td')
    if len(items) == 3:
        rows.append([item.text.strip() for item in items])

df = pd.DataFrame(rows, columns=["PostalCode", "Borough", "Neighborhood"])

# Ignore cells with a borough that is Not assigned.
# .copy() gives an independent frame so the assignment below is not a write to a slice.
df = df[df.Borough != "Not assigned"].copy()

# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
unnamed = df.Neighborhood == "Not assigned"
df.loc[unnamed, "Neighborhood"] = df.loc[unnamed, "Borough"]

# More than one neighborhood can exist in one postal code; these will be combined with the neighborhoods separated with a comma.
# A single groupby/agg replaces the row-by-row DataFrame.append loop; groups come out
# sorted by postal code, and the borough of the first row in each group is kept.
df_final = (
    df.groupby("PostalCode", as_index=False)
      .agg({"Borough": "first", "Neighborhood": ", ".join})
)

print(df_final.shape)
print(df_final.head())

(103, 3)
  PostalCode      Borough                            Neighborhood
0        M1B  Scarborough                          Rouge, Malvern
1        M1C  Scarborough  Highland Creek, Rouge Hill, Port Union
2        M1E  Scarborough       Guildwood, Morningside, West Hill
3        M1G  Scarborough                                  Woburn
4        M1H  Scarborough                               Cedarbrae


# Get the latitude and the longitude coordinates of each neighborhood

In [7]:
# Load the per-postal-code coordinates and attach them to the cleaned table.
# The key is spelled "PostalCode" on the left and "Postal Code" in the CSV,
# so both key columns remain in the merged frame.
df_coords = pd.read_csv("Geospatial_Coordinates.csv")
df_full = df_final.merge(
    df_coords,
    left_on='PostalCode',
    right_on='Postal Code',
)
display(df_full)

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,M1J,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",M1K,43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",M1L,43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",M1M,43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",M1N,43.692657,-79.264848


# Work with only boroughs that contain the word Toronto 
# Cluster into 5 chunks

In [84]:
import json, requests

def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=5,
                    client_id=None, client_secret=None, version='20180323'):
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location to query; they are zipped together.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.
    limit : int, default 5
        Value sent as the ``limit`` query parameter.
    client_id, client_secret : str, optional
        Foursquare credentials. When omitted they are read from the
        ``FOURSQUARE_CLIENT_ID`` / ``FOURSQUARE_CLIENT_SECRET`` environment
        variables so that secrets are never stored in the notebook.
    version : str, default '20180323'
        Value sent as the ``v`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude', 'Venue Category'.
        The frame is empty (but has these columns) when nothing is returned.

    Raises
    ------
    ValueError
        If no credentials are supplied and the environment variables are unset.
    requests.HTTPError
        If Foursquare answers with an HTTP error status.
    """
    import os   # local import: only needed for the credential lookup

    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    # Credentials must not be committed with the notebook; any key that was
    # previously hard-coded here should be treated as leaked and rotated.
    client_id = client_id or os.environ.get('FOURSQUARE_CLIENT_ID')
    client_secret = client_secret or os.environ.get('FOURSQUARE_CLIENT_SECRET')
    if not client_id or not client_secret:
        raise ValueError(
            'Foursquare credentials missing: pass client_id/client_secret or set '
            'FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET')

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': client_id,
                'client_secret': client_secret,
                'v': version,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30,   # never hang the notebook on a stalled connection
        )
        response.raise_for_status()

        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category gets None instead of raising IndexError
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the schema even for zero rows.
    return pd.DataFrame(rows, columns=columns)

In [85]:
# Restrict the table to boroughs whose name contains "Toronto",
# then look up the venues around each remaining neighborhood.
is_toronto = df_full["Borough"].str.contains("Toronto")
df_toronto = df_full[is_toronto]

toronto_venues = getNearbyVenues(
    names=df_toronto['Neighborhood'],
    latitudes=df_toronto['Latitude'],
    longitudes=df_toronto['Longitude'],
)

In [95]:
import numpy as np

kclusters = 5

# One indicator column per venue category, one row per venue.
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# A venue category that is itself named "Neighborhood" would collide with the
# label column added below (the labels would silently overwrite the indicator),
# so rename it first. This is a no-op when no such category is present.
toronto_onehot = toronto_onehot.rename(columns={'Neighborhood': 'Neighborhood (venue category)'})

# Add the neighborhood label as the first column; rows align on the shared index.
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'])

# Mean of the indicators = share of each category among a neighborhood's venues.
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()


def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first element of ``row`` is skipped (callers pass a row whose first
    entry is the neighborhood name). The remaining entries are sorted in
    descending order and the index labels of the first ``num_top_venues``
    of them are returned as a NumPy array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index[:num_top_venues].values

num_top_venues = 5
indicators = ['st', 'nd', 'rd']

# Column headers: "1st Most Common Venue", "2nd ...", "3rd ...", then "Nth ..." for the rest.
# The suffix is chosen with an explicit bounds check rather than a bare `except`,
# which would also have hidden unrelated errors.
columns = ['Neighborhood']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# Rank the categories of every neighborhood, then build the table in one go
# instead of filling an empty frame row by row.
top_venues = [
    return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    for ind in range(toronto_grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:])
neighborhoods_venues_sorted.insert(0, 'Neighborhood', toronto_grouped['Neighborhood'].values)

# display(neighborhoods_venues_sorted.head())

from sklearn.cluster import KMeans

# Feature matrix for clustering: the per-neighborhood category shares without the label column.
# `columns=` is used because passing the axis positionally is rejected by pandas 2.x.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering; n_init is pinned so results do not depend on the
# scikit-learn version's default
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)   # add clustering labels

# Attach the cluster label and top venues to each Toronto row (which already carries
# latitude/longitude), then order by cluster. Built as one chain so no frame is
# mutated in place.
toronto_merged = (
    df_toronto
    .join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
    .sort_values('Cluster Labels')
    .reset_index(drop=True)
)
display(toronto_merged)

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M4T,Central Toronto,"Moore Park, Summerhill East",M4T,43.689574,-79.38316,0,Playground,Vegetarian / Vegan Restaurant,Dance Studio,Coffee Shop,Comic Shop
1,M5H,Downtown Toronto,"Adelaide, King, Richmond",M5H,43.650571,-79.384568,1,Greek Restaurant,Concert Hall,Steakhouse,Plaza,Hotel
2,M6P,West Toronto,"High Park, The Junction South",M6P,43.661608,-79.464763,1,Gastropub,Speakeasy,Park,Italian Restaurant,Bar
3,M6J,West Toronto,"Little Portugal, Trinity",M6J,43.647927,-79.41975,1,Ice Cream Shop,Pizza Place,Korean Restaurant,Asian Restaurant,Brewery
4,M6H,West Toronto,"Dovercourt Village, Dufferin",M6H,43.669005,-79.442259,1,Bar,Supermarket,Brewery,Bakery,Middle Eastern Restaurant
5,M5W,Downtown Toronto,Stn A PO Boxes 25 The Esplanade,M5W,43.646435,-79.374846,1,Museum,Thai Restaurant,Cocktail Bar,Steakhouse,Concert Hall
6,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",M5V,43.628947,-79.39442,1,Airport,Airport Food Court,Airport Lounge,Airport Terminal,Harbor / Marina
7,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",M5R,43.67271,-79.405678,1,Vegetarian / Vegan Restaurant,American Restaurant,Park,Café,Indian Restaurant
8,M5P,Central Toronto,"Forest Hill North, Forest Hill West",M5P,43.696948,-79.411307,1,Trail,Sushi Restaurant,Bus Line,Jewelry Store,Vegetarian / Vegan Restaurant
9,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",M5J,43.640816,-79.381752,1,Salad Place,Park,Lake,Sporting Goods Shop,Vegetarian / Vegan Restaurant


# Visualization & Conclusion

k (# of clusters) is set to 5.

The main conclusion is that, unless the Foursquare query limit is set to a small number (<10), there is usually just one big cluster (areas with a cafe/coffee shop vs. areas without).

We got a somewhat more interesting result when we reduced the query limit to 5.

However, an extra interpretive layer is needed to translate Foursquare categories before the output is useful (e.g., grouping cafe with coffee shop, combining the various ethnic restaurants, fine-tuning the differences between broad categories such as "diner" and "restaurant"). There is also (apparently) an inherent limitation in applying k-means clustering to categorical (0 or 1) variables instead of continuous variables.

With that said, we observe the following:

- Clusters 0 and 4 have a unique profile: 3 of the top 5 results are non-food, and there are no coffee shops or cafes.
- Cluster 1 contains mostly food places, with very few cafes and coffee shops.
- Cluster 2 tends to contain coffee shops and cafes.
- Cluster 3 tends to contain coffee shops and cafes along with ethnic restaurants or gyms.

In [96]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

map_clusters = folium.Map(location=[43.6658599, -79.3831599], zoom_start=12)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    # A row that matched no cluster in the join carries NaN (and turns the whole
    # column into floats); such rows cannot be colored, so skip them.
    if pd.isna(cluster):
        continue
    cluster = int(cluster)   # list indices must be ints, even if the column is float
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # cluster i uses color i directly (no reliance on negative-index wraparound)
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters