In [1]:
pip install beautifulsoup4

Collecting beautifulsoup4
[?25l  Downloading https://files.pythonhosted.org/packages/cb/a1/c698cf319e9cfed6b17376281bd0efc6bfc8465698f54170ef60a485ab5d/beautifulsoup4-4.8.2-py3-none-any.whl (106kB)
[K     |████████████████████████████████| 112kB 7.1MB/s eta 0:00:01
[?25hCollecting soupsieve>=1.2 (from beautifulsoup4)
  Downloading https://files.pythonhosted.org/packages/81/94/03c0f04471fc245d08d0a99f7946ac228ca98da4fa75796c507f61e688c2/soupsieve-1.9.5-py2.py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.8.2 soupsieve-1.9.5
Note: you may need to restart the kernel to use updated packages.


## Importing libraries after installing beautifulSoup

In [2]:
#import
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Scrap data from Wikipedia page into a DataFrame using Beautifulsoup

In [3]:
#web parser
page = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(page.content, 'html.parser')

In [4]:
table = soup.find('tbody')
rows = table.select('tr')
row = [r.get_text() for r in rows]

## Creating Dataframe

In [5]:
#creating df
df = pd.DataFrame(row)
df.head()

Unnamed: 0,0
0,\nPostcode\nBorough\nNeighbourhood\n
1,\nM1A\nNot assigned\nNot assigned\n
2,\nM2A\nNot assigned\nNot assigned\n
3,\nM3A\nNorth York\nParkwoods\n
4,\nM4A\nNorth York\nVictoria Village\n


## Cleaning Data and assigning column names

In [6]:
#cleaning data
df1 = df[0].str.split('\n', expand=True)
df1.head()

Unnamed: 0,0,1,2,3,4
0,,Postcode,Borough,Neighbourhood,
1,,M1A,Not assigned,Not assigned,
2,,M2A,Not assigned,Not assigned,
3,,M3A,North York,Parkwoods,
4,,M4A,North York,Victoria Village,


In [22]:
df2 = df1.rename(columns=df1.iloc[0])
df3 = df2.drop(df2.index[0])
df3.head()

Unnamed: 0,Unnamed: 1,Postcode,Borough,Neighbourhood,Unnamed: 5
1,,M1A,Not assigned,Not assigned,
2,,M2A,Not assigned,Not assigned,
3,,M3A,North York,Parkwoods,
4,,M4A,North York,Victoria Village,
5,,M5A,Downtown Toronto,Harbourfront,


In [42]:
df3.rename(columns = {'Postcode':'PostalCode'}, inplace = True) 
df3.rename(columns = {'Neighbourhood':'Neighborhood'}, inplace = True) 
df3.head()

Unnamed: 0,Unnamed: 1,PostalCode,Borough,Neighborhood,Unnamed: 5
1,,M1A,Not assigned,Not assigned,
2,,M2A,Not assigned,Not assigned,
3,,M3A,North York,Parkwoods,
4,,M4A,North York,Victoria Village,
5,,M5A,Downtown Toronto,Harbourfront,


## Ignored rows with boroughs that is "not assigned"

In [43]:
#ignoring 'not assigned' rows
df4 = df3[df3.Borough != 'Not assigned']
df4.head()

Unnamed: 0,Unnamed: 1,PostalCode,Borough,Neighborhood,Unnamed: 5
3,,M3A,North York,Parkwoods,
4,,M4A,North York,Victoria Village,
5,,M5A,Downtown Toronto,Harbourfront,
6,,M6A,North York,Lawrence Heights,
7,,M6A,North York,Lawrence Manor,


## Group neighborhoods in the same borough

In [44]:
#groupby postalcode and Borough
df5= df4.groupby(['PostalCode' , 'Borough'], sort = False).agg(','.join)
df5.reset_index(inplace = True)
df5.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Queen's Park,Not assigned
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


## For Neighborhood="Not assigned", make the value the same as Borough

In [45]:
# for Neighborhood="Not assigned", make the value the same as Borough
for index, row in df5.iterrows():
    if row["Neighborhood"] == "Not assigned":
        row["Neighborhood"] = row["Borough"]
        
df5.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Queen's Park,Queen's Park
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


## Used .shape to print number of rows and columns

In [46]:
df5.shape

(103, 3)

## Load coordinates from csv file

In [47]:
## get all geographical coordinates of the neighborhoods.
coordinates = pd.read_csv('https://cocl.us/Geospatial_data')
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [49]:
# rename the column "PostalCode"
coordinates.rename(columns={"Postal Code": "PostalCode"}, inplace=True)
coordinates.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Merging the two tables

In [50]:
# merge two table on the column "PostalCode"
df5 = df5.merge(coordinates, on="PostalCode", how="left")
df5.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


## Use geopy library to get the latitude and longitude values of Toronto

In [51]:
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
address = 'Toronto'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

Solving environment: done


  current version: 4.5.11
  latest version: 4.8.2

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

The geograpical coordinate of Toronto are 43.653963, -79.387207.


## Create a map of Toronto with neighborhoods superimposed on top

In [52]:
# create map of Toronto using latitude and longitude values
!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df5['Latitude'], df5['Longitude'], df5['Borough'], df5['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

Solving environment: done


  current version: 4.5.11
  latest version: 4.8.2

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.



## Filter only boroughs that contain the word Toronto

In [53]:
# filter borough names that contain the word Toronto
borough_names = list(df5.Borough.unique())

borough_with_toronto = []

for x in borough_names:
    if "toronto" in x.lower():
        borough_with_toronto.append(x)
        
borough_with_toronto

['Downtown Toronto', 'East Toronto', 'West Toronto', 'Central Toronto']

In [60]:
# create a new DataFrame with only boroughs that contain the word Toronto
df6= df5[df5['Borough'].isin(borough_with_toronto)].reset_index(drop=True)
df6.shape

(39, 5)

In [61]:
df6.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
1,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
2,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


## Visualize boroughs having 'Toronto'

In [63]:
map_toronto1 = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df6['Latitude'], df6['Longitude'], df6['Borough'], df6['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto1)  
    
map_toronto1

## Use the Foursquare API to explore the neighborhoods

In [64]:
# define Foursquare Credentials and Version
CLIENT_ID = 'BRFKUNQD0BWTEPLDZGFJ4VM3KWDQMCNS05A5TVRJC0CF1TI2' # your Foursquare ID
CLIENT_SECRET = '2JLWIJJ1PQ0WSWVLYEMSAFNKTEG5RBVJDHX5OCNPUGWWTIQJ' # your Foursquare Secret
VERSION = '20190822' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: BRFKUNQD0BWTEPLDZGFJ4VM3KWDQMCNS05A5TVRJC0CF1TI2
CLIENT_SECRET:2JLWIJJ1PQ0WSWVLYEMSAFNKTEG5RBVJDHX5OCNPUGWWTIQJ


## Getting the top 100 venues that are within a radius of 500 meters

In [65]:
radius = 500
LIMIT = 100

venues = []

for lat, long, post, borough, neighborhood in zip(df6['Latitude'], df6['Longitude'], df6['PostalCode'], df6['Borough'], df6['Neighborhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            post, 
            borough,
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [66]:
# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues)

# define the column names
venues_df.columns = ['PostalCode', 'Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()

(1702, 9)


Unnamed: 0,PostalCode,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,Cooper Koo Family YMCA,43.653191,-79.357947,Gym / Fitness Center
3,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot


## Venues were returned for each PostalCode

In [67]:
venues_df.groupby(["PostalCode", "Borough", "Neighborhood"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
PostalCode,Borough,Neighborhood,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
M4E,East Toronto,The Beaches,5,5,5,5,5,5
M4K,East Toronto,"The Danforth West,Riverdale",41,41,41,41,41,41
M4L,East Toronto,"The Beaches West,India Bazaar",22,22,22,22,22,22
M4M,East Toronto,Studio District,41,41,41,41,41,41
M4N,Central Toronto,Lawrence Park,3,3,3,3,3,3
M4P,Central Toronto,Davisville North,8,8,8,8,8,8
M4R,Central Toronto,North Toronto West,19,19,19,19,19,19
M4S,Central Toronto,Davisville,35,35,35,35,35,35
M4T,Central Toronto,"Moore Park,Summerhill East",2,2,2,2,2,2
M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West",14,14,14,14,14,14


## Number of unique categories that can be curated from all the returned venues

In [68]:
print('There are {} uniques categories.'.format(len(venues_df['VenueCategory'].unique())))

There are 227 uniques categories.


In [69]:
venues_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,Cooper Koo Family YMCA,43.653191,-79.357947,Gym / Fitness Center
3,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot


## Analyze each neighborhood

In [71]:
# one hot encoding
toronto_onehotEn = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")
# add postal, borough and neighborhood column back to dataframe
toronto_onehotEn['PostalCode'] = venues_df['PostalCode'] 
toronto_onehotEn['Borough'] = venues_df['Borough'] 
toronto_onehotEn['Neighborhoods'] = venues_df['Neighborhood'] 

# move postal, borough and neighborhood column to the first column
fixed_columns = list(toronto_onehotEn.columns[-3:]) + list(toronto_onehotEn.columns[:-3])
toronto_onehotEn = toronto_onehotEn[fixed_columns]

print(toronto_onehotEn.shape)
toronto_onehotEn.head()

(1702, 230)


Unnamed: 0,PostalCode,Borough,Neighborhoods,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M5A,Downtown Toronto,Harbourfront,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M5A,Downtown Toronto,Harbourfront,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M5A,Downtown Toronto,Harbourfront,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M5A,Downtown Toronto,Harbourfront,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M5A,Downtown Toronto,Harbourfront,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [72]:
group_df = toronto_onehotEn.groupby(["PostalCode", "Borough", "Neighborhoods"]).mean().reset_index()

print(group_df.shape)
group_df

(39, 230)


Unnamed: 0,PostalCode,Borough,Neighborhoods,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M4E,East Toronto,The Beaches,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,East Toronto,"The Danforth West,Riverdale",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.02439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02439
2,M4L,East Toronto,"The Beaches West,India Bazaar",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,East Toronto,Studio District,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.02439,0.0,0.0,0.02439
4,M4N,Central Toronto,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,M4P,Central Toronto,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,M4R,Central Toronto,North Toronto West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632
7,M4S,Central Toronto,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,M4T,Central Toronto,"Moore Park,Summerhill East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0


## Display the top 10 venues for each PostalCode

In [74]:
import numpy as np
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
areaColumns = ['PostalCode', 'Borough', 'Neighborhoods']
freqColumns = []
for ind in np.arange(num_top_venues):
    try:
        freqColumns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        freqColumns.append('{}th Most Common Venue'.format(ind+1))
columns = areaColumns+freqColumns

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['PostalCode'] = group_df['PostalCode']
neighborhoods_venues_sorted['Borough'] = group_df['Borough']
neighborhoods_venues_sorted['Neighborhoods'] = group_df['Neighborhoods']

for ind in np.arange(group_df.shape[0]):
    row_categories = group_df.iloc[ind, :].iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    neighborhoods_venues_sorted.iloc[ind, 3:] = row_categories_sorted.index.values[0:num_top_venues]

print(neighborhoods_venues_sorted.shape)
neighborhoods_venues_sorted

(39, 13)


Unnamed: 0,PostalCode,Borough,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,Trail,Health Food Store,Park,Pub,Neighborhood,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Yoga Studio
1,M4K,East Toronto,"The Danforth West,Riverdale",Greek Restaurant,Italian Restaurant,Coffee Shop,Ice Cream Shop,Furniture / Home Store,Yoga Studio,Frozen Yogurt Shop,Pub,Pizza Place,Liquor Store
2,M4L,East Toronto,"The Beaches West,India Bazaar",Park,Sandwich Place,Food & Drink Shop,Burger Joint,Burrito Place,Italian Restaurant,Fast Food Restaurant,Fish & Chips Shop,Steakhouse,Ice Cream Shop
3,M4M,East Toronto,Studio District,Café,Coffee Shop,Gastropub,Brewery,Bakery,Italian Restaurant,American Restaurant,Sandwich Place,Cheese Shop,Pet Store
4,M4N,Central Toronto,Lawrence Park,Park,Swim School,Bus Line,Department Store,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run
5,M4P,Central Toronto,Davisville North,Department Store,Sandwich Place,Park,Gym,Breakfast Spot,Food & Drink Shop,Convenience Store,Hotel,Diner,Discount Store
6,M4R,Central Toronto,North Toronto West,Coffee Shop,Clothing Store,Yoga Studio,Spa,Café,Mexican Restaurant,Fast Food Restaurant,Dessert Shop,Sporting Goods Shop,Salon / Barbershop
7,M4S,Central Toronto,Davisville,Coffee Shop,Dessert Shop,Sandwich Place,Pizza Place,Sushi Restaurant,Italian Restaurant,Gym,Café,Pharmacy,Seafood Restaurant
8,M4T,Central Toronto,"Moore Park,Summerhill East",Restaurant,Playground,Dance Studio,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store
9,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",Coffee Shop,Pub,Light Rail Station,American Restaurant,Sushi Restaurant,Restaurant,Sports Bar,Fried Chicken Joint,Pizza Place,Liquor Store


## Run k-means to cluster the Toronto areas into 5 clusters

In [78]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 5

toronto_clustering = group_df.drop(["PostalCode", "Borough", "Neighborhoods"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 1, 1, 1, 1, 1, 1, 1, 3, 1], dtype=int32)

In [79]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
toronto_merged = df6.copy()

# add clustering labels
toronto_merged["Cluster Labels"] = kmeans.labels_

# merge data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.drop(["Borough", "Neighborhoods"], 1).set_index("PostalCode"), on="PostalCode")

print(toronto_merged.shape)
toronto_merged.head() # check the last columns!

(39, 16)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,0,Coffee Shop,Café,Park,Pub,Bakery,Mexican Restaurant,Theater,Breakfast Spot,Performing Arts Venue,Chocolate Shop
1,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,1,Coffee Shop,Park,Gym,Yoga Studio,Burger Joint,Beer Bar,Italian Restaurant,Japanese Restaurant,Juice Bar,Seafood Restaurant
2,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937,1,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Bakery,Japanese Restaurant,Bubble Tea Shop,Diner,Restaurant,Ramen Restaurant
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1,Coffee Shop,Café,Restaurant,Cocktail Bar,American Restaurant,Beer Bar,Cosmetics Shop,Bakery,Clothing Store,Hotel
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Trail,Health Food Store,Park,Pub,Neighborhood,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Yoga Studio


In [80]:
# sort the results by Cluster Labels
print(toronto_merged.shape)
toronto_merged.sort_values(["Cluster Labels"], inplace=True)
toronto_merged

(39, 16)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,0,Coffee Shop,Café,Park,Pub,Bakery,Mexican Restaurant,Theater,Breakfast Spot,Performing Arts Venue,Chocolate Shop
10,M5J,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station",43.640816,-79.381752,0,Coffee Shop,Aquarium,Café,Hotel,Italian Restaurant,Scenic Lookout,Fried Chicken Joint,Restaurant,Brewery,Baseball Stadium
21,M5P,Central Toronto,"Forest Hill North,Forest Hill West",43.696948,-79.411307,1,Jewelry Store,Trail,Mexican Restaurant,Sushi Restaurant,Yoga Studio,Dessert Shop,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
24,M5R,Central Toronto,"The Annex,North Midtown,Yorkville",43.67271,-79.405678,1,Sandwich Place,Café,Coffee Shop,Park,Pizza Place,Burger Joint,Middle Eastern Restaurant,Indian Restaurant,Flower Shop,Pub
25,M6R,West Toronto,"Parkdale,Roncesvalles",43.64896,-79.456325,1,Gift Shop,Bookstore,Dog Run,Restaurant,Bar,Dessert Shop,Italian Restaurant,Movie Theater,Cuban Restaurant,Eastern European Restaurant
26,M4S,Central Toronto,Davisville,43.704324,-79.38879,1,Coffee Shop,Dessert Shop,Sandwich Place,Pizza Place,Sushi Restaurant,Italian Restaurant,Gym,Café,Pharmacy,Seafood Restaurant
27,M5S,Downtown Toronto,"Harbord,University of Toronto",43.662696,-79.400049,1,Café,Bookstore,Bar,Bakery,Japanese Restaurant,Coffee Shop,Restaurant,Comfort Food Restaurant,Beer Bar,Dessert Shop
28,M6S,West Toronto,"Runnymede,Swansea",43.651571,-79.48445,1,Coffee Shop,Café,Sushi Restaurant,Pizza Place,Italian Restaurant,Juice Bar,Burrito Place,Restaurant,Pub,Dessert Shop
29,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316,1,Restaurant,Playground,Dance Studio,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store
30,M5T,Downtown Toronto,"Chinatown,Grange Park,Kensington Market",43.653206,-79.400049,1,Bar,Café,Chinese Restaurant,Coffee Shop,Dumpling Restaurant,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Mexican Restaurant,Grocery Store,Park


## View the clusters

In [82]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, post, bor, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['PostalCode'], toronto_merged['Borough'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup('{} ({}): {} - Cluster {}'.format(bor, post, poi, cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine Clusters

### Cluster 1

In [88]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,0,Coffee Shop,Café,Park,Pub,Bakery,Mexican Restaurant,Theater,Breakfast Spot,Performing Arts Venue,Chocolate Shop
10,Downtown Toronto,0,Coffee Shop,Aquarium,Café,Hotel,Italian Restaurant,Scenic Lookout,Fried Chicken Joint,Restaurant,Brewery,Baseball Stadium


### Cluster 2

In [84]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
21,Central Toronto,1,Jewelry Store,Trail,Mexican Restaurant,Sushi Restaurant,Yoga Studio,Dessert Shop,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
24,Central Toronto,1,Sandwich Place,Café,Coffee Shop,Park,Pizza Place,Burger Joint,Middle Eastern Restaurant,Indian Restaurant,Flower Shop,Pub
25,West Toronto,1,Gift Shop,Bookstore,Dog Run,Restaurant,Bar,Dessert Shop,Italian Restaurant,Movie Theater,Cuban Restaurant,Eastern European Restaurant
26,Central Toronto,1,Coffee Shop,Dessert Shop,Sandwich Place,Pizza Place,Sushi Restaurant,Italian Restaurant,Gym,Café,Pharmacy,Seafood Restaurant
27,Downtown Toronto,1,Café,Bookstore,Bar,Bakery,Japanese Restaurant,Coffee Shop,Restaurant,Comfort Food Restaurant,Beer Bar,Dessert Shop
28,West Toronto,1,Coffee Shop,Café,Sushi Restaurant,Pizza Place,Italian Restaurant,Juice Bar,Burrito Place,Restaurant,Pub,Dessert Shop
29,Central Toronto,1,Restaurant,Playground,Dance Studio,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store
30,Downtown Toronto,1,Bar,Café,Chinese Restaurant,Coffee Shop,Dumpling Restaurant,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Mexican Restaurant,Grocery Store,Park
31,Central Toronto,1,Coffee Shop,Pub,Light Rail Station,American Restaurant,Sushi Restaurant,Restaurant,Sports Bar,Fried Chicken Joint,Pizza Place,Liquor Store
32,Downtown Toronto,1,Airport Lounge,Airport Terminal,Coffee Shop,Boutique,Bar,Rental Car Location,Sculpture Garden,Plane,Boat or Ferry,Harbor / Marina


### Cluster 3

In [85]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,West Toronto,2,Bar,Thai Restaurant,Café,Mexican Restaurant,Flea Market,Bakery,Italian Restaurant,Cajun / Creole Restaurant,Speakeasy,Diner


### Cluster 4

In [86]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,Downtown Toronto,3,Coffee Shop,Steakhouse,Thai Restaurant,Café,Burger Joint,Bar,Bakery,Sushi Restaurant,Asian Restaurant,Cosmetics Shop


### Cluster 5

In [87]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
23,Central Toronto,4,Coffee Shop,Clothing Store,Yoga Studio,Spa,Café,Mexican Restaurant,Fast Food Restaurant,Dessert Shop,Sporting Goods Shop,Salon / Barbershop


## Observations

Most of the neighborhoods are in Cluster 2 with a lot of coffee shops, parks& playgrounds, restaurants etc.
Cluster 3 has a lot of bars , Cluster 1, 4 & 5 have a lot of coffee shops