# Part 1
### Assumptions:
Boroughs can have multiple Postcodes  
Postcodes can have multiple Boroughs

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
#### Scraping
'''
Resources:
# https://medium.com/analytics-vidhya/web-scraping-wiki-tables-using-beautifulsoup-and-python-6b9ea26d8722
# https://www.youtube.com/watch?v=ng2o98k983k
'''
website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(website_url,'lxml')
#print(soup.prettify())

In [3]:
#### Pulls data from html
My_table = soup.find('table',{'class':'wikitable sortable'})
#My_table
cell_data = My_table.find_all('td')
#cell_data

In [4]:
#### Creates df from HTML data
'''
gets data into a list with only 1 column / no column distinction
# counter = 0
# table = []
# row = []
# for cell in cell_data:
#     #print(cell.get_text())
#     row.append(cell.get_text().strip())
#     counter+=1
# print(row)

'''
counter = 0
table = []
row = []
for cell in cell_data:
    if counter < 3:
        #print(cell.get_text())
        row.append(cell.get_text().strip())
        counter+=1
    else:
        #print(row)
        table.append(row)
        counter = 0
        row = []
        #print(cell.get_text())
        row.append(cell.get_text().strip())
        counter+=1
#print(table)
headers=['Postcode','Borough','Neighbourhood']
df = pd.DataFrame(table, columns=headers)
#df.head()

#### Cleans data: removes non borough rows and if neighbourhood is not assigned, assigns neighbourhood as borough
def clean_data1 (row):
    #print (row)
    #print (row['Neighbourhood'])
    if row['Neighbourhood'] == 'Not assigned':
        row['Neighbourhood'] = row['Borough']
    return row
modified_df=df.apply(clean_data1, axis=1)
modified_df = modified_df[modified_df['Borough'] != 'Not assigned']
#modified_df.head()

In [5]:
# Grouping Postcode and Borough while merging Neighbourhoods with same Postcode and Borough via groupby
'''
Resources:
https://stackoverflow.com/questions/17841149/pandas-groupby-how-to-get-a-union-of-strings
https://stackoverflow.com/questions/54216702/pandas-grouping-by-column-one-and-adding-comma-separated-entries-from-column-two

# Test for correct size for M5A
#grouped_df = modified_df.groupby(['Postcode', 'Borough']).size().reset_index()#['Neighbourhood']
#grouped_df[grouped_df['Postcode'] =='M5A']
'''

grouped_df = modified_df.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(lambda x: "%s" % ', '.join(x)).reset_index()#(lambda x: x.sum())
grouped_df.head(10)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [6]:
grouped_df.shape

(103, 3)

# Part 2

In [7]:
import geocoder # import geocoder

def get_coordinates1 (row):
    # initialize your variable to None
    lat_lng_coords = None
    print(row['Postcode'])
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(row['Postcode']))
        lat_lng_coords = g.latlng
        print('1')

    row['Latitude'] = lat_lng_coords[0]
    row['Longitude'] = lat_lng_coords[1]
    print(row['Latitude'], row['Longitude'])
    return row

coords_df = pd.read_csv('Geospatial_Coordinates.csv')
def get_coordinates2 (row):
    temp_df = coords_df[coords_df['Postal Code'] == row['Postcode']]
    row['Latitude'] = float(temp_df['Latitude'])
    row['Longitude'] = float(temp_df['Longitude'])
    return row
    
df_with_coords = grouped_df
df_with_coords['Latitude'] = np.nan
df_with_coords['Longitude'] = np.nan
df_with_coords = df_with_coords.apply(get_coordinates2, axis=1)
df_with_coords.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


# Part 3
## Analysis of similar postcodes and boroughs
### Hypothesis: postcodes within the same borough will be clustered together

### Map of Toronto

In [8]:
import folium
# create map of New York using latitude and longitude values
latitude = 43.6532
longitude = -79.3832
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)
df_toronto_neighborhoods = df_with_coords.rename(index=str, columns={"Neighbourhood": "Neighborhood"})
df_toronto_neighborhoods['Neighborhood'] = '['+ df_toronto_neighborhoods['Postcode'] + '] ' + df_toronto_neighborhoods['Neighborhood']
# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto_neighborhoods['Latitude'], df_toronto_neighborhoods['Longitude'], df_toronto_neighborhoods['Borough'], df_toronto_neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough.upper())
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  

map_toronto


In [9]:
df_toronto_neighborhoods.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"[M1B] Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"[M1C] Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"[M1E] Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,[M1G] Woburn,43.770992,-79.216917
4,M1H,Scarborough,[M1H] Cedarbrae,43.773136,-79.239476


In [10]:
CLIENT_ID = 'MCHW4SBP13AEUUZPAB0NDIHD5PALKHUPJ0XFPNVMNU2M45C5' #'your-client-ID' # your Foursquare ID
CLIENT_SECRET = 'YTMSWOLIR03PGEP3ZKVV2OXFLIMCX1IDRQPAJGIT1ZW1HP3D' # your Foursquare Secret
VERSION = '20180604'
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: MCHW4SBP13AEUUZPAB0NDIHD5PALKHUPJ0XFPNVMNU2M45C5
CLIENT_SECRET:YTMSWOLIR03PGEP3ZKVV2OXFLIMCX1IDRQPAJGIT1ZW1HP3D


### Getting venue data via foursquare

In [11]:
LIMIT = 100
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [12]:
df_toronto_venues = getNearbyVenues(names=df_toronto_neighborhoods['Neighborhood'],
                                   latitudes=df_toronto_neighborhoods['Latitude'],
                                   longitudes=df_toronto_neighborhoods['Longitude'])
print(df_toronto_venues.shape)
#df_toronto_venues.head()
df_toronto_venues.groupby('Neighborhood').count()

[M1B] Rouge, Malvern
[M1C] Highland Creek, Rouge Hill, Port Union
[M1E] Guildwood, Morningside, West Hill
[M1G] Woburn
[M1H] Cedarbrae
[M1J] Scarborough Village
[M1K] East Birchmount Park, Ionview, Kennedy Park
[M1L] Clairlea, Golden Mile, Oakridge
[M1M] Cliffcrest, Cliffside, Scarborough Village West
[M1N] Birch Cliff, Cliffside West
[M1P] Dorset Park, Scarborough Town Centre, Wexford Heights
[M1R] Maryvale, Wexford
[M1S] Agincourt
[M1T] Clarks Corners, Sullivan, Tam O'Shanter
[M1V] Agincourt North, L'Amoreaux East, Milliken, Steeles East
[M1W] L'Amoreaux West
[M1X] Upper Rouge
[M2H] Hillcrest Village
[M2J] Fairview, Henry Farm, Oriole
[M2K] Bayview Village
[M2L] Silver Hills, York Mills
[M2M] Newtonbrook, Willowdale
[M2N] Willowdale South
[M2P] York Mills West
[M2R] Willowdale West
[M3A] Parkwoods
[M3B] Don Mills North
[M3C] Flemingdon Park, Don Mills South
[M3H] Bathurst Manor, Downsview North, Wilson Heights
[M3J] Northwood Park, York University
[M3K] CFB Toronto, Downsview East
[M

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"[M1B] Rouge, Malvern",1,1,1,1,1,1
"[M1C] Highland Creek, Rouge Hill, Port Union",2,2,2,2,2,2
"[M1E] Guildwood, Morningside, West Hill",8,8,8,8,8,8
[M1G] Woburn,3,3,3,3,3,3
[M1H] Cedarbrae,7,7,7,7,7,7
[M1J] Scarborough Village,2,2,2,2,2,2
"[M1K] East Birchmount Park, Ionview, Kennedy Park",5,5,5,5,5,5
"[M1L] Clairlea, Golden Mile, Oakridge",10,10,10,10,10,10
"[M1M] Cliffcrest, Cliffside, Scarborough Village West",3,3,3,3,3,3
"[M1N] Birch Cliff, Cliffside West",4,4,4,4,4,4


### ^May have random "group" error, just run again and it should work^

### How many Unique Categories

In [13]:
print('There are {} uniques categories.'.format(len(df_toronto_venues['Venue Category'].unique())))

There are 277 uniques categories.


In [14]:
# one hot encoding
toronto_onehot = pd.get_dummies(df_toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = df_toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()
#toronto_onehot.shape

Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,"[M1B] Rouge, Malvern",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[M1C] Highland Creek, Rouge Hill, Port Union",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"[M1E] Guildwood, Morningside, West Hill",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,[M1G] Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,[M1H] Cedarbrae,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Top 5 Types of Venues per Postal Code

In [16]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----[M1B] Rouge, Malvern----
                             venue  freq
0             Fast Food Restaurant   1.0
1                    Metro Station   0.0
2  Molecular Gastronomy Restaurant   0.0
3       Modern European Restaurant   0.0
4                Mobile Phone Shop   0.0


----[M1C] Highland Creek, Rouge Hill, Port Union----
                             venue  freq
0                              Bar   0.5
1                   History Museum   0.5
2                      Yoga Studio   0.0
3  Molecular Gastronomy Restaurant   0.0
4       Modern European Restaurant   0.0


----[M1E] Guildwood, Morningside, West Hill----
                 venue  freq
0          Pizza Place  0.12
1         Intersection  0.12
2  Rental Car Location  0.12
3   Mexican Restaurant  0.12
4    Electronics Store  0.12


----[M1G] Woburn----
                             venue  freq
0                      Coffee Shop  0.67
1                Korean Restaurant  0.33
2                    Metro Station  0.00
3  Molecular 

               venue  freq
0               Park  0.33
1        Coffee Shop  0.33
2  Convenience Store  0.33
3        Yoga Studio  0.00
4      Metro Station  0.00


----[M4K] The Danforth West, Riverdale----
                venue  freq
0    Greek Restaurant  0.18
1         Coffee Shop  0.09
2      Ice Cream Shop  0.07
3  Italian Restaurant  0.05
4           Bookstore  0.05


----[M4L] The Beaches West, India Bazaar----
                venue  freq
0      Sandwich Place  0.11
1           Pet Store  0.05
2      Ice Cream Shop  0.05
3             Brewery  0.05
4  Italian Restaurant  0.05


----[M4M] Studio District----
                venue  freq
0                Café  0.11
1         Coffee Shop  0.08
2              Bakery  0.05
3  Italian Restaurant  0.05
4           Gastropub  0.05


----[M4N] Lawrence Park----
                venue  freq
0                Park  0.33
1            Bus Line  0.33
2         Swim School  0.33
3         Yoga Studio  0.00
4  Mexican Restaurant  0.00


----[M4P] 

                venue  freq
0  Turkish Restaurant  0.25
1      Sandwich Place  0.25
2          Restaurant  0.25
3   Convenience Store  0.25
4       Metro Station  0.00


----[M6N] The Junction North, Runnymede----
                  venue  freq
0               Brewery   0.2
1              Bus Line   0.2
2     Convenience Store   0.2
3         Grocery Store   0.2
4  Caribbean Restaurant   0.2


----[M6P] High Park, The Junction South----
                venue  freq
0                 Bar  0.08
1                Café  0.08
2  Mexican Restaurant  0.08
3           Speakeasy  0.04
4           Bookstore  0.04


----[M6R] Parkdale, Roncesvalles----
            venue  freq
0  Breakfast Spot  0.12
1       Gift Shop  0.12
2       Bookstore  0.06
3    Dessert Shop  0.06
4            Bank  0.06


----[M6S] Runnymede, Swansea----
                venue  freq
0         Coffee Shop  0.10
1         Pizza Place  0.07
2                Café  0.07
3    Sushi Restaurant  0.05
4  Italian Restaurant  0.05


----

### Most common Venues for each postal code

In [17]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"[M1B] Rouge, Malvern",Fast Food Restaurant,Women's Store,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dive Bar,Dog Run,Doner Restaurant,Drugstore
1,"[M1C] Highland Creek, Rouge Hill, Port Union",Bar,History Museum,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dive Bar,Dog Run,Doner Restaurant,Women's Store
2,"[M1E] Guildwood, Morningside, West Hill",Electronics Store,Pizza Place,Rental Car Location,Intersection,Medical Center,Breakfast Spot,Spa,Mexican Restaurant,Drugstore,Donut Shop
3,[M1G] Woburn,Coffee Shop,Korean Restaurant,Women's Store,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dive Bar,Dog Run,Doner Restaurant
4,[M1H] Cedarbrae,Hakka Restaurant,Athletics & Sports,Thai Restaurant,Bakery,Bank,Caribbean Restaurant,Fried Chicken Joint,Drugstore,Donut Shop,Doner Restaurant


### Clustering

In [18]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 6

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([5, 1, 2, 2, 1, 2, 2, 1, 2, 2], dtype=int32)

In [19]:
# add clustering labels
try:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
except:
    pass

toronto_merged = df_toronto_neighborhoods

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels']#.astype(int)
toronto_merged['Cluster Labels'].dropna(inplace=True)
toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"[M1B] Rouge, Malvern",43.806686,-79.194353,5.0,Fast Food Restaurant,Women's Store,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dive Bar,Dog Run,Doner Restaurant,Drugstore
1,M1C,Scarborough,"[M1C] Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,1.0,Bar,History Museum,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dive Bar,Dog Run,Doner Restaurant,Women's Store
2,M1E,Scarborough,"[M1E] Guildwood, Morningside, West Hill",43.763573,-79.188711,2.0,Electronics Store,Pizza Place,Rental Car Location,Intersection,Medical Center,Breakfast Spot,Spa,Mexican Restaurant,Drugstore,Donut Shop
3,M1G,Scarborough,[M1G] Woburn,43.770992,-79.216917,2.0,Coffee Shop,Korean Restaurant,Women's Store,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dive Bar,Dog Run,Doner Restaurant
4,M1H,Scarborough,[M1H] Cedarbrae,43.773136,-79.239476,1.0,Hakka Restaurant,Athletics & Sports,Thai Restaurant,Bakery,Bank,Caribbean Restaurant,Fried Chicken Joint,Drugstore,Donut Shop,Doner Restaurant


In [20]:
toronto_merged['Cluster Labels'].value_counts()

2.0    76
0.0    12
1.0     7
4.0     2
3.0     1
5.0     1
Name: Cluster Labels, dtype: int64

In [21]:
import matplotlib.cm as cm
import matplotlib.colors as colors
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Conclusion: Postal codes in the same borough were not clustered together, it is possible that the each borough has an even amount of different venues arounds its area so a k-means clustering like this would give us a lot of 'similar' postal codes within a single cluster. ex. code ABC is in Borough X and code DEF is in Borough Y, both have many shopping areas and similar venues that cater towards the upper-middle class of the Borough, thus they are similar and would be clustered together