### Scrape the following Wikipedia page: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [1]:
# import the library we use to open URLs
import urllib.request
# specify which URL/web page we are going to be scraping
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# open the url using urllib.request and put the HTML into the page variable
page = urllib.request.urlopen(url)

In [2]:
from bs4 import BeautifulSoup

In [3]:
soup = BeautifulSoup(page, "lxml")

In [4]:
right_table=soup.find('table', class_='wikitable sortable')
right_table

<table class="wikitable sortable">
<tbody><tr>
<th>Postal Code
</th>
<th>Borough
</th>
<th>Neighborhood
</th></tr>
<tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M2A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M3A
</td>
<td>North York
</td>
<td>Parkwoods
</td></tr>
<tr>
<td>M4A
</td>
<td>North York
</td>
<td>Victoria Village
</td></tr>
<tr>
<td>M5A
</td>
<td>Downtown Toronto
</td>
<td>Regent Park, Harbourfront
</td></tr>
<tr>
<td>M6A
</td>
<td>North York
</td>
<td>Lawrence Manor, Lawrence Heights
</td></tr>
<tr>
<td>M7A
</td>
<td>Downtown Toronto
</td>
<td>Queen's Park, Ontario Provincial Government
</td></tr>
<tr>
<td>M8A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M9A
</td>
<td>Etobicoke
</td>
<td>Islington Avenue, Humber Valley Village
</td></tr>
<tr>
<td>M1B
</td>
<td>Scarborough
</td>
<td>Malvern, Rouge
</td></tr>
<tr>
<td>M2B
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M3B
</td>
<td>North York
</td>
<td>Don Mills
</td></tr>
<tr>
<

In [5]:
A=[]
B=[]
C=[]

for row in right_table.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==3:
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))
    

In [6]:
import pandas as pd
df=pd.DataFrame(A,columns=['PostalCode'])
df['Borough']=B
df['Neighborhood']=C
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


### Ignore rows that have Boroughs Not Assigned

#### Went the route of converting 'Not assigned' to NaN in numpy. More than one way to achieve results.

In [7]:
import numpy as np
df = df.replace('Not assigned',np.nan, regex=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [8]:
df.dropna(subset=["Borough"], axis=0, inplace=True)
df = df.reset_index(drop=True)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
PostalCode      103 non-null object
Borough         103 non-null object
Neighborhood    103 non-null object
dtypes: object(3)
memory usage: 2.5+ KB


### Cell has a borough but a Not assigned neighborhood, the neighborhood will be the same as the borough

In [10]:
df[df['Neighborhood'] == 0]
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [11]:
mask = df['Neighborhood'] == "Not assigned"
df.loc[mask, 'Neighborhood'] = df.loc[mask, 'Borough']
df


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Verify that there are no repeating postal codes

In [12]:
df['PostalCode'].unique()

array(['M3A\n', 'M4A\n', 'M5A\n', 'M6A\n', 'M7A\n', 'M9A\n', 'M1B\n',
       'M3B\n', 'M4B\n', 'M5B\n', 'M6B\n', 'M9B\n', 'M1C\n', 'M3C\n',
       'M4C\n', 'M5C\n', 'M6C\n', 'M9C\n', 'M1E\n', 'M4E\n', 'M5E\n',
       'M6E\n', 'M1G\n', 'M4G\n', 'M5G\n', 'M6G\n', 'M1H\n', 'M2H\n',
       'M3H\n', 'M4H\n', 'M5H\n', 'M6H\n', 'M1J\n', 'M2J\n', 'M3J\n',
       'M4J\n', 'M5J\n', 'M6J\n', 'M1K\n', 'M2K\n', 'M3K\n', 'M4K\n',
       'M5K\n', 'M6K\n', 'M1L\n', 'M2L\n', 'M3L\n', 'M4L\n', 'M5L\n',
       'M6L\n', 'M9L\n', 'M1M\n', 'M2M\n', 'M3M\n', 'M4M\n', 'M5M\n',
       'M6M\n', 'M9M\n', 'M1N\n', 'M2N\n', 'M3N\n', 'M4N\n', 'M5N\n',
       'M6N\n', 'M9N\n', 'M1P\n', 'M2P\n', 'M4P\n', 'M5P\n', 'M6P\n',
       'M9P\n', 'M1R\n', 'M2R\n', 'M4R\n', 'M5R\n', 'M6R\n', 'M7R\n',
       'M9R\n', 'M1S\n', 'M4S\n', 'M5S\n', 'M6S\n', 'M1T\n', 'M4T\n',
       'M5T\n', 'M1V\n', 'M4V\n', 'M5V\n', 'M8V\n', 'M9V\n', 'M1W\n',
       'M4W\n', 'M5W\n', 'M8W\n', 'M9W\n', 'M1X\n', 'M4X\n', 'M5X\n',
       'M8X\n', 'M4Y

### Use the .shape method to print the number of rows of your dataframe

In [13]:
print('The scrubbed dataframe has' , df.shape, 'rows and columns')

The scrubbed dataframe has (103, 3) rows and columns


### Use geocoder to include latitude and longitude to dataframe (used arcgis)

In [14]:
!conda install -c conda-forge geocoder --yes

Solving environment: | ^C
failed

CondaError: KeyboardInterrupt



In [15]:
def get_geocoder(postal_code_from_df):
    
    lat_lng_coords = None
    
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code_from_df.strip()))
        lat_lng_coords = g.latlng
        latitude = lat_lng_coords[0]
        longitude = lat_lng_coords[1]
    return latitude,longitude

In [18]:
import geocoder

In [19]:
df['Latitude'], df['Longitude'] = zip(*df['PostalCode'].apply(get_geocoder))
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.752935,-79.335641
1,M4A,North York,Victoria Village,43.728102,-79.31189
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939


In [20]:
df.shape

(103, 5)

In [21]:
!pip -q install folium

In [22]:
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium
from sklearn.cluster import KMeans

In [23]:

address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, Cananda are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, Cananda are 43.6534817, -79.3839347.


### Create a map of Toronto with neighborhoods superimposed on top

In [24]:
# create map of Toronto using latitude and longitude values
map_tor = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, Borough, Neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(Neighborhood, Borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_tor)  
    
map_tor

### Narrowing focus to Boroughs that contain "Toronto"

In [25]:

toronto_data=df[df['Borough'].str.contains("Toronto")]
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657491,-79.377529
15,M5C,Downtown Toronto,St. James Town,43.651734,-79.375554
19,M4E,East Toronto,The Beaches,43.678148,-79.295349


### Preparing to use Foursquare to explore venue data of Neighborhoods

In [26]:
CLIENT_ID = 'UBIBXNK5NDHAMJB5WQJN5INIYHCFE35KPBJ0MNLUD0H1X1BM' # your Foursquare ID
CLIENT_SECRET = 'DKQFRYYW3DU4G3F2IIQVFKGHWAGHB5SVZIVDQKXT1VKC4L44' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: UBIBXNK5NDHAMJB5WQJN5INIYHCFE35KPBJ0MNLUD0H1X1BM
CLIENT_SECRET:DKQFRYYW3DU4G3F2IIQVFKGHWAGHB5SVZIVDQKXT1VKC4L44


### Define function to collect venues from each neighborhood

In [27]:
def getNearbyVenues(names, latitudes, longitudes):
    radius=500
    LIMIT=100
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Process results received from Foursquare

In [28]:
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

In [29]:

toronto_venues = getNearbyVenues(names=toronto_data['Neighborhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

Regent Park, Harbourfront

Queen's Park, Ontario Provincial Government

Garden District, Ryerson

St. James Town

The Beaches

Berczy Park

Central Bay Street

Christie

Richmond, Adelaide, King

Dufferin, Dovercourt Village

Harbourfront East, Union Station, Toronto Islands

Little Portugal, Trinity

The Danforth West, Riverdale

Toronto Dominion Centre, Design Exchange

Brockton, Parkdale Village, Exhibition Place

India Bazaar, The Beaches West

Commerce Court, Victoria Hotel

Studio District

Lawrence Park

Roselawn

Davisville North

Forest Hill North & West, Forest Hill Road Park

High Park, The Junction South

North Toronto West,  Lawrence Park

The Annex, North Midtown, Yorkville

Parkdale, Roncesvalles

Davisville

University of Toronto, Harbord

Runnymede, Swansea

Moore Park, Summerhill East

Kensington Market, Chinatown, Grange Park

Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park

CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay

In [30]:
print(toronto_venues.shape)
toronto_venues.head()

(1588, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.650964,-79.353041,Souk Tabule,43.653756,-79.35439,Mediterranean Restaurant
1,"Regent Park, Harbourfront",43.650964,-79.353041,Young Centre for the Performing Arts,43.650825,-79.357593,Performing Arts Venue
2,"Regent Park, Harbourfront",43.650964,-79.353041,SOMA chocolatemaker,43.650622,-79.358127,Chocolate Shop
3,"Regent Park, Harbourfront",43.650964,-79.353041,BATLgrounds,43.647088,-79.351306,Athletics & Sports
4,"Regent Park, Harbourfront",43.650964,-79.353041,Cluny Bistro & Boulangerie,43.650565,-79.357843,French Restaurant


In [31]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,63,63,63,63,63,63
"Brockton, Parkdale Village, Exhibition Place",44,44,44,44,44,44
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",100,100,100,100,100,100
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",62,62,62,62,62,62
Central Bay Street,54,54,54,54,54,54
Christie,12,12,12,12,12,12
Church and Wellesley,84,84,84,84,84,84
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,26,26,26,26,26,26
Davisville North,5,5,5,5,5,5


In [32]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 225 uniques categories.


### Use one hot encoding to create binary information on results

In [33]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store
0,Berczy Park,0.015873,0.0,0.0,0.0,0.0,0.015873,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.015873,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.022727,0.0,0.0,0.0,0.022727,0.0,0.022727,0.0,...,0.022727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.02,0.0,0.01,0.0,0.0,0.03,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016129,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.018519,0.018519,0.018519,0.0,0.0
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.011905,0.0,0.011905,0.011905,0.0,0.0,0.0,0.011905,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011905,0.0
7,"Commerce Court, Victoria Hotel",0.01,0.0,0.0,0.04,0.0,0.01,0.0,0.0,0.01,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,...,0.0,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Find the top five venues by way of frequency for each neighborhood

In [35]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park
----
          venue  freq
0   Coffee Shop  0.10
1  Cocktail Bar  0.05
2          Café  0.03
3      Beer Bar  0.03
4    Restaurant  0.03


----Brockton, Parkdale Village, Exhibition Place
----
                    venue  freq
0                    Café  0.07
1             Coffee Shop  0.07
2  Thrift / Vintage Store  0.05
3               Gift Shop  0.05
4                   Diner  0.05


----Business reply mail Processing Centre, South Central Letter Processing Plant Toronto
----
                 venue  freq
0          Coffee Shop  0.07
1                Hotel  0.05
2  Japanese Restaurant  0.03
3                 Café  0.03
4     Asian Restaurant  0.03


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
----
               venue  freq
0        Coffee Shop  0.06
1               Café  0.06
2         Restaurant  0.05
3  French Restaurant  0.05
4               Park  0.05


----Central Bay Street
----
                   

In [36]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Create dataframe of the ten top venues for each neighborhood

In [37]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Hotel,Bakery,Beer Bar,Seafood Restaurant,Cheese Shop,Restaurant,Breakfast Spot,Café
1,"Brockton, Parkdale Village, Exhibition Place",Coffee Shop,Café,Diner,Gift Shop,Pizza Place,Thrift / Vintage Store,Pet Store,Caribbean Restaurant,French Restaurant,Indian Restaurant
2,"Business reply mail Processing Centre, South C...",Coffee Shop,Hotel,Café,Restaurant,Asian Restaurant,Japanese Restaurant,Bar,Tea Room,Sandwich Place,Seafood Restaurant
3,"CN Tower, King and Spadina, Railway Lands, Har...",Coffee Shop,Café,French Restaurant,Restaurant,Park,Speakeasy,Italian Restaurant,Gym / Fitness Center,Bar,Lounge
4,Central Bay Street,Coffee Shop,Clothing Store,Restaurant,Japanese Restaurant,Sandwich Place,Bubble Tea Shop,Plaza,Middle Eastern Restaurant,Fast Food Restaurant,Café


### Use statistical K-means to generate neighborhood clusters (5)

In [38]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [39]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041,0,Pub,Café,Coffee Shop,Athletics & Sports,Bank,Thai Restaurant,Mediterranean Restaurant,Mexican Restaurant,Food Truck,Chocolate Shop
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939,0,Coffee Shop,Café,Bank,Sandwich Place,Burrito Place,Smoothie Shop,Creperie,Middle Eastern Restaurant,Bookstore,Fried Chicken Joint
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657491,-79.377529,0,Coffee Shop,Clothing Store,Sandwich Place,Middle Eastern Restaurant,Restaurant,Bar,Café,Hotel,Italian Restaurant,Cosmetics Shop
15,M5C,Downtown Toronto,St. James Town,43.651734,-79.375554,0,Coffee Shop,Café,Cocktail Bar,American Restaurant,Cosmetics Shop,Gastropub,Italian Restaurant,Clothing Store,Lingerie Store,Theater
19,M4E,East Toronto,The Beaches,43.678148,-79.295349,0,Health Food Store,Trail,Pub,Church,Creperie,Dog Run,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant


### Map of clusters

In [40]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [41]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Downtown Toronto,0,Pub,Café,Coffee Shop,Athletics & Sports,Bank,Thai Restaurant,Mediterranean Restaurant,Mexican Restaurant,Food Truck,Chocolate Shop
4,Downtown Toronto,0,Coffee Shop,Café,Bank,Sandwich Place,Burrito Place,Smoothie Shop,Creperie,Middle Eastern Restaurant,Bookstore,Fried Chicken Joint
9,Downtown Toronto,0,Coffee Shop,Clothing Store,Sandwich Place,Middle Eastern Restaurant,Restaurant,Bar,Café,Hotel,Italian Restaurant,Cosmetics Shop
15,Downtown Toronto,0,Coffee Shop,Café,Cocktail Bar,American Restaurant,Cosmetics Shop,Gastropub,Italian Restaurant,Clothing Store,Lingerie Store,Theater
19,East Toronto,0,Health Food Store,Trail,Pub,Church,Creperie,Dog Run,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant
20,Downtown Toronto,0,Coffee Shop,Cocktail Bar,Hotel,Bakery,Beer Bar,Seafood Restaurant,Cheese Shop,Restaurant,Breakfast Spot,Café
24,Downtown Toronto,0,Coffee Shop,Clothing Store,Restaurant,Japanese Restaurant,Sandwich Place,Bubble Tea Shop,Plaza,Middle Eastern Restaurant,Fast Food Restaurant,Café
25,Downtown Toronto,0,Grocery Store,Café,Playground,Candy Store,Baby Store,Athletics & Sports,Coffee Shop,Park,Dance Studio,Eastern European Restaurant
30,Downtown Toronto,0,Coffee Shop,Café,Restaurant,Clothing Store,Thai Restaurant,Sushi Restaurant,Salad Place,Gym,Hotel,Deli / Bodega
31,West Toronto,0,Park,Grocery Store,Athletics & Sports,Brazilian Restaurant,Café,Bank,Bakery,Pool,Smoke Shop,Bus Line


In [42]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
73,Central Toronto,1,Playground,Gym Pool,Park,Garden,Dog Run,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Ethiopian Restaurant
91,Downtown Toronto,1,Playground,Grocery Store,Park,Candy Store,Dog Run,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Ethiopian Restaurant


In [43]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
68,Central Toronto,2,Park,Women's Store,Dog Run,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Ethiopian Restaurant,Elementary School


In [44]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
61,Central Toronto,3,Bus Line,Swim School,Women's Store,Donut Shop,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Ethiopian Restaurant


In [45]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
62,Central Toronto,4,IT Services,Clothing Store,Women's Store,Dog Run,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Ethiopian Restaurant
