In [172]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
from pandas.io.json import json_normalize  # transform json files to pandas dataframes
!pip install geopy
from geopy.geocoders import Nominatim # 
import numpy as np
import csv
!pip install folium
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

You should consider upgrading via the '/Applications/anaconda3/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Applications/anaconda3/bin/python3 -m pip install --upgrade pip' command.[0m


# Scraping the neighborhood data to make a table

In [173]:
# The wikipedia site link
site_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

##### Get source code html data from the website

In [174]:
source = requests.get(site_link).text

##### Use BeautifulSoup to parse it

In [175]:
soup = BeautifulSoup(source, 'lxml')

##### Get the table that contains the data we want to scrape

In [176]:
My_table = soup.find('table',{'class':'wikitable sortable'})

# My_table

##### we can see that all the data we want are between the $&lt;td&gt;$ brackets, let's get the data between the td brackets

In [177]:
links = My_table.find_all('td')
# links

##### Loop through links and extract only the text elements

In [178]:
text_links = []

for link in links:
    text_links.append(link.text)



#text_links

##### Clean links and keep only rows with Borough. Out of which we shall rename rows without Neighborhood as Boroughs

In [179]:
cleaned_links = []

while True:
    
    if len(text_links) < 3:
        break
    
    sub = text_links[:3]
    # If 'Not ' in borough then skip that row of data
    if 'Not ' in sub[1]:
        text_links = text_links[3:]
    else:
        cleaned_links.append(text_links[:3])
        
        # Let's strip off the \n at the end of each neighborhood data
        for i in range(0,len(cleaned_links)):
            cleaned_links[i][0] = cleaned_links[i][0].strip('\n')
            cleaned_links[i][1] = cleaned_links[i][1].strip('\n')
            cleaned_links[i][2] = cleaned_links[i][2].strip('\n')
        
        # If the Borough is available but the Neighborhood is missing
        # make Neighborhood same as Borough
        if 'Not ' in cleaned_links[-1][-1]:
            cleaned_links[-1][-1] = cleaned_links[-1][-2]
        text_links = text_links[3:]

#cleaned_links

In [180]:
len(cleaned_links)

103

##### Add neighborhood data of each duplicate Postal Codes together to the first instance or row that contains the PostalCode

In [181]:
link = []
for i in range(len(cleaned_links)):
    x = cleaned_links[i][0]
    if x in link:
        cleaned_links[link.index(x)][-1] += ', ' + cleaned_links[i][-1]
    link.append(x)
#cleaned_links

###### Pass cleaned _links to a data frame and set index to postal code so that we can easily work on it

In [182]:
df = pd.DataFrame(cleaned_links, columns=['PostalCode','Borough','Neighborhood'])
df.index= df.PostalCode

In [183]:
#viewing the data frame
df.head()

Unnamed: 0_level_0,PostalCode,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M3A,M3A,North York,Parkwoods
M4A,M4A,North York,Victoria Village
M5A,M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Use pandas duplicate method to drop duplicate index

In [184]:
df = df.loc[~df.index.duplicated(keep='first')]
df.head()

Unnamed: 0_level_0,PostalCode,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M3A,M3A,North York,Parkwoods
M4A,M4A,North York,Victoria Village
M5A,M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


##### Reset index so it is like the example on the assignment page

In [185]:
df = df.reset_index(drop=True)
df.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [186]:
df.shape

(103, 3)

# Appending the Latitude and Longitude data

##### Define a simple method that we can apply to each Borough to get its Latitude and Longitude using the apply() method

In [187]:
def latitude_longitude(Borough):
    """ Method takes a Series object and returns
    a list of Latitude and corresponding Longitude data,
    using the geopy library.
    This method also prints out the coordinate data"""
    
    address = Borough
    
    geolocator = Nominatim(user_agent="CA_explorer")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))
    return [latitude, longitude]

##### Append list containing corresponding lat and lon data to column Latitude

In [188]:
df['Latitude'] = df.Borough.apply(latitude_longitude)

The geograpical coordinate of North York are 43.7543263, -79.44911696639593.
The geograpical coordinate of North York are 43.7543263, -79.44911696639593.
The geograpical coordinate of Downtown Toronto are 43.6541737, -79.38081164513409.
The geograpical coordinate of North York are 43.7543263, -79.44911696639593.
The geograpical coordinate of Downtown Toronto are 43.6541737, -79.38081164513409.
The geograpical coordinate of Etobicoke are 43.6435559, -79.5656326.
The geograpical coordinate of Scarborough are 54.2820009, -0.4011868.
The geograpical coordinate of North York are 43.7543263, -79.44911696639593.
The geograpical coordinate of East York are 43.699971000000005, -79.33251996261595.
The geograpical coordinate of Downtown Toronto are 43.6541737, -79.38081164513409.
The geograpical coordinate of North York are 43.7543263, -79.44911696639593.
The geograpical coordinate of Etobicoke are 43.6435559, -79.5656326.
The geograpical coordinate of Scarborough are 54.2820009, -0.4011868.
The 

##### View the updated data with Latitude containing lists of lats and lons data

In [189]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude
0,M3A,North York,Parkwoods,"[43.7543263, -79.44911696639593]"
1,M4A,North York,Victoria Village,"[43.7543263, -79.44911696639593]"
2,M5A,Downtown Toronto,"Regent Park, Harbourfront","[43.6541737, -79.38081164513409]"
3,M6A,North York,"Lawrence Manor, Lawrence Heights","[43.7543263, -79.44911696639593]"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government","[43.6541737, -79.38081164513409]"


##### Loop through the data frame and separate Latitude from Longitude and make lat and lons just numbers not lists

In [190]:
lon_list = []
for i, j in df.iterrows():
    lon_list.append(j.Latitude[1])
    j.Latitude = j.Latitude[0]
    
# next let's assign the lon_list as the value of the Longitude Column

df['Longitude'] = lon_list
df.head(100)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7543,-79.449117
1,M4A,North York,Victoria Village,43.7543,-79.449117
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6542,-79.380812
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7543,-79.449117
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6542,-79.380812
...,...,...,...,...,...
95,M1X,Scarborough,Upper Rouge,54.282,-0.401187
96,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.6542,-79.380812
97,M5X,Downtown Toronto,"First Canadian Place, Underground city",43.6542,-79.380812
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.6436,-79.565633


# EDA of Toronto Neighborhood Clusters

##### explore and cluster the neighborhoods in Toronto. We can decide to work with only boroughs that contain the word Toronto and then replicate the same analysis we did earlier to the New York City data.

In [191]:
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7543,-79.449117
1,M4A,North York,Victoria Village,43.7543,-79.449117
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6542,-79.380812
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7543,-79.449117
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6542,-79.380812
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.6436,-79.565633
6,M1B,Scarborough,"Malvern, Rouge",54.282,-0.401187
7,M3B,North York,Don Mills,43.7543,-79.449117
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.7,-79.33252
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.6542,-79.380812


##### View unique Borough names we have in the Data Frame

In [192]:
df.Borough.unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

##### Loop through the data frame and get the index of Boroughs that end with 'Toronto'

In [193]:
toronto_list = [i for i in df.index if df.iloc[i,1].endswith('Toronto')]

#### Slice through the data frame to select only these rows of Boroughs ending with Toronto

In [194]:
Toronto_df = df.iloc[toronto_list,].reset_index(drop=True)
Toronto_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6542,-79.380812
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6542,-79.380812
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.6542,-79.380812
3,M5C,Downtown Toronto,St. James Town,43.6542,-79.380812
4,M4E,East Toronto,The Beaches,43.6262,-79.396962
5,M5E,Downtown Toronto,Berczy Park,43.6542,-79.380812
6,M5G,Downtown Toronto,Central Bay Street,43.6542,-79.380812
7,M6G,Downtown Toronto,Christie,43.6542,-79.380812
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.6542,-79.380812
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.6535,-79.383935


##### View unique Boroughs once again in Toronto

In [195]:
Toronto_df.Borough.unique()

array(['Downtown Toronto', 'East Toronto', 'West Toronto',
       'Central Toronto'], dtype=object)

##### Get geographic coordinates of Toronto

In [196]:
address = 'Toronto'

geolocator = Nominatim(user_agent='CA_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

print('The geograpical coordinates of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


#### Visualize Toronto and the neighborhoods in it.

In [197]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(Toronto_df['Latitude'], Toronto_df['Longitude'], Toronto_df['Borough'], Toronto_df['Neighborhood']):
    label = "{}, {}".format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto  # Feel free to zoom in to see more

### Explore the data using foursquare API

In [198]:
CLIENT_ID = 'L2XUYCRD44VP2YBDMZ3XY0Z4XKO1JODM5SMG3IIYRZ5330FD' # your Foursquare ID
CLIENT_SECRET = '5KI00FG4RT2VJQTIDPXFY1N4AOKTIXLIZEAO0AHPP3DPD0U1' # your Foursquare Secret
VERSION = '20190727' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: L2XUYCRD44VP2YBDMZ3XY0Z4XKO1JODM5SMG3IIYRZ5330FD
CLIENT_SECRET:5KI00FG4RT2VJQTIDPXFY1N4AOKTIXLIZEAO0AHPP3DPD0U1


##### Confirm the shape of Toronto_df

In [199]:
Toronto_df.shape  # This shows there are 39 rows and 5 columns

(39, 5)

In [200]:
# Next let's see the unique borough names in Toronto_df
Toronto_df.Borough.unique()

array(['Downtown Toronto', 'East Toronto', 'West Toronto',
       'Central Toronto'], dtype=object)

### Explore the first location in 'Downtown Toronto'

##### Get the details of the first entry

In [201]:
first_entry_Downtown_Toronto = Toronto_df[Toronto_df.Borough =='Downtown Toronto'].head(1)

first_entry_Downtown_Toronto

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6542,-79.380812


##### Get the top 100 venues that are in the first entry within a radius of 500 meters.

##### First create the get request url

In [202]:
radius = 500
LIMIT=100

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    first_entry_Downtown_Toronto.Latitude[0], 
    first_entry_Downtown_Toronto.Longitude[0], 
    radius, 
    LIMIT)

##### Send the GET request and examine the resutls

In [203]:
result = requests.get(url).json()

result.keys()

#result

dict_keys(['meta', 'response'])

##### Function that extracts the category of the venue

In [204]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [205]:
venues = result['response']['groups'][0]['items']

#venues

In [206]:
nearby_venues = json_normalize(venues) # flatten JSON

  nearby_venues = json_normalize(venues) # flatten JSON


In [207]:
# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng', 'venue.location.distance']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng,distance
0,Elgin And Winter Garden Theatres,Theater,43.653394,-79.378507,204
1,UNIQLO ユニクロ,Clothing Store,43.65591,-79.380641,193
2,Indigo,Bookstore,43.653515,-79.380696,73
3,LUSH,Cosmetics Shop,43.653557,-79.3804,76
4,Yonge-Dundas Square,Plaza,43.656054,-79.380495,210


##### And how many venues were returned by Foursquare

In [208]:
print('The total number of venues returned is {}'.format(len(nearby_venues)))

The total number of venues returned is 92


### Explore venues in 'Central Toronto'


In [209]:
central_toronto_df = Toronto_df[Toronto_df.Borough == 'Central Toronto']

central_toronto_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
18,M4N,Central Toronto,Lawrence Park,43.6535,-79.383935
19,M5N,Central Toronto,Roselawn,43.6535,-79.383935
20,M4P,Central Toronto,Davisville North,43.6535,-79.383935
21,M5P,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",43.6535,-79.383935
23,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.6535,-79.383935
24,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.6535,-79.383935
26,M4S,Central Toronto,Davisville,43.6535,-79.383935
29,M4T,Central Toronto,"Moore Park, Summerhill East",43.6535,-79.383935
31,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.6535,-79.383935


##### Create a function to repeat the same process to all the neighborhoods in Central Toronto

In [210]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name'],
            v['venue']['location']['distance']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category',
                  'Venue Distance']
    
    return(nearby_venues)

In [211]:
central_toronto_venues = getNearbyVenues(names=central_toronto_df['Neighborhood'],
                                   latitudes=central_toronto_df['Latitude'],
                                   longitudes=central_toronto_df['Longitude'])

Lawrence Park
Roselawn
Davisville North
Forest Hill North & West, Forest Hill Road Park
North Toronto West,  Lawrence Park
The Annex, North Midtown, Yorkville
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park


##### Check the size of the resulting dataframe

In [212]:
print(central_toronto_venues.shape)

central_toronto_venues.head()

(648, 8)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Venue Distance
0,Lawrence Park,43.653482,-79.383935,Downtown Toronto,43.653232,-79.385296,Neighborhood,113
1,Lawrence Park,43.653482,-79.383935,Nathan Phillips Square,43.65227,-79.383516,Plaza,138
2,Lawrence Park,43.653482,-79.383935,Japango,43.655268,-79.385165,Sushi Restaurant,222
3,Lawrence Park,43.653482,-79.383935,Poke Guys,43.654895,-79.385052,Poke Place,181
4,Lawrence Park,43.653482,-79.383935,Indigo,43.653515,-79.380696,Bookstore,260


##### Check how many venues were returned per neighborhood

In [213]:
central_toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Venue Distance
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Davisville,72,72,72,72,72,72,72
Davisville North,72,72,72,72,72,72,72
"Forest Hill North & West, Forest Hill Road Park",72,72,72,72,72,72,72
Lawrence Park,72,72,72,72,72,72,72
"Moore Park, Summerhill East",72,72,72,72,72,72,72
"North Toronto West, Lawrence Park",72,72,72,72,72,72,72
Roselawn,72,72,72,72,72,72,72
"Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park",72,72,72,72,72,72,72
"The Annex, North Midtown, Yorkville",72,72,72,72,72,72,72


##### Find the number of unique categories  that can be curated from all the returned venues

In [214]:
print('There are {} unique categories of Venues'.format(central_toronto_venues['Venue Category'].nunique()))

There are 55 unique categories of Venues


### Analyze Each Neighborhood

In [215]:
# one hot encoding
central_toronto_onehot = pd.get_dummies(central_toronto_venues[['Venue Category']], prefix="", prefix_sep="")

In [216]:
# add neighborhood column back to dataframe
central_toronto_onehot['Neighborhood'] = central_toronto_venues['Neighborhood']

In [217]:
# move neighborhood column to the first column
fixed_columns = [central_toronto_onehot.columns[-1]] + list(central_toronto_onehot.columns[:-1])
central_toronto_onehot = central_toronto_onehot[fixed_columns]

pd.set_option('display.max_columns', None)
central_toronto_onehot.head()

Unnamed: 0,Vietnamese Restaurant,American Restaurant,Art Museum,Bank,Bookstore,Breakfast Spot,Bubble Tea Shop,Burger Joint,Café,Clothing Store,Cocktail Bar,Coffee Shop,Colombian Restaurant,Comic Shop,Concert Hall,Cosmetics Shop,Department Store,Diner,Electronics Store,Fast Food Restaurant,Food Court,Furniture / Home Store,Gastropub,Gym / Fitness Center,Hotel,Jazz Club,Latin American Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Modern European Restaurant,Monument / Landmark,Movie Theater,Music Venue,Neighborhood,New American Restaurant,Noodle House,Office,Opera House,Pizza Place,Plaza,Poke Place,Restaurant,Salad Place,Seafood Restaurant,Shoe Store,Shopping Mall,Smoothie Shop,Steakhouse,Sushi Restaurant,Tanning Salon,Thai Restaurant,Theater,Vegetarian / Vegan Restaurant,Video Game Store
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Lawrence Park,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Lawrence Park,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


##### Examine the new Data frame siz

In [218]:
central_toronto_onehot.shape

(648, 55)

##### Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [219]:
central_toronto_grouped = central_toronto_onehot.groupby('Neighborhood').mean().reset_index()
central_toronto_grouped

Unnamed: 0,Neighborhood,Vietnamese Restaurant,American Restaurant,Art Museum,Bank,Bookstore,Breakfast Spot,Bubble Tea Shop,Burger Joint,Café,Clothing Store,Cocktail Bar,Coffee Shop,Colombian Restaurant,Comic Shop,Concert Hall,Cosmetics Shop,Department Store,Diner,Electronics Store,Fast Food Restaurant,Food Court,Furniture / Home Store,Gastropub,Gym / Fitness Center,Hotel,Jazz Club,Latin American Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Modern European Restaurant,Monument / Landmark,Movie Theater,Music Venue,New American Restaurant,Noodle House,Office,Opera House,Pizza Place,Plaza,Poke Place,Restaurant,Salad Place,Seafood Restaurant,Shoe Store,Shopping Mall,Smoothie Shop,Steakhouse,Sushi Restaurant,Tanning Salon,Thai Restaurant,Theater,Vegetarian / Vegan Restaurant,Video Game Store
0,Davisville,0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.083333,0.013889,0.083333,0.013889,0.013889,0.013889,0.027778,0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.013889,0.013889
1,Davisville North,0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.083333,0.013889,0.083333,0.013889,0.013889,0.013889,0.027778,0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.013889,0.013889
2,"Forest Hill North & West, Forest Hill Road Park",0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.083333,0.013889,0.083333,0.013889,0.013889,0.013889,0.027778,0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.013889,0.013889
3,Lawrence Park,0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.083333,0.013889,0.083333,0.013889,0.013889,0.013889,0.027778,0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.013889,0.013889
4,"Moore Park, Summerhill East",0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.083333,0.013889,0.083333,0.013889,0.013889,0.013889,0.027778,0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.013889,0.013889
5,"North Toronto West, Lawrence Park",0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.083333,0.013889,0.083333,0.013889,0.013889,0.013889,0.027778,0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.013889,0.013889
6,Roselawn,0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.083333,0.013889,0.083333,0.013889,0.013889,0.013889,0.027778,0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.013889,0.013889
7,"Summerhill West, Rathnelly, South Hill, Forest...",0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.083333,0.013889,0.083333,0.013889,0.013889,0.013889,0.027778,0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.013889,0.013889
8,"The Annex, North Midtown, Yorkville",0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.083333,0.013889,0.083333,0.013889,0.013889,0.013889,0.027778,0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.013889,0.027778,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.013889,0.027778,0.013889,0.013889


In [220]:
central_toronto_grouped.shape

(9, 55)

##### View each neighborhood along with the top 5 most common venues

In [221]:
num_top_venues = 5

for hood in central_toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = central_toronto_grouped[central_toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Davisville----
            venue  freq
0     Coffee Shop  0.08
1  Clothing Store  0.08
2  Cosmetics Shop  0.03
3           Diner  0.03
4           Plaza  0.03


----Davisville North----
            venue  freq
0     Coffee Shop  0.08
1  Clothing Store  0.08
2  Cosmetics Shop  0.03
3           Diner  0.03
4           Plaza  0.03


----Forest Hill North & West, Forest Hill Road Park----
            venue  freq
0     Coffee Shop  0.08
1  Clothing Store  0.08
2  Cosmetics Shop  0.03
3           Diner  0.03
4           Plaza  0.03


----Lawrence Park----
            venue  freq
0     Coffee Shop  0.08
1  Clothing Store  0.08
2  Cosmetics Shop  0.03
3           Diner  0.03
4           Plaza  0.03


----Moore Park, Summerhill East----
            venue  freq
0     Coffee Shop  0.08
1  Clothing Store  0.08
2  Cosmetics Shop  0.03
3           Diner  0.03
4           Plaza  0.03


----North Toronto West,  Lawrence Park----
            venue  freq
0     Coffee Shop  0.08
1  Clothing Store  0.

##### Convert above results to a Pandas Data frame

##### First write a function tp sort the venues in descending order

In [222]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

##### Later create the new Data frame and display the Top 10 venues for each neighborhood

In [223]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = central_toronto_grouped['Neighborhood']

for ind in np.arange(central_toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(central_toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Davisville,Coffee Shop,Clothing Store,Plaza,Café,Theater,American Restaurant,Cosmetics Shop,Restaurant,Diner,Department Store
1,Davisville North,Coffee Shop,Clothing Store,Plaza,Café,Theater,American Restaurant,Cosmetics Shop,Restaurant,Diner,Department Store
2,"Forest Hill North & West, Forest Hill Road Park",Coffee Shop,Clothing Store,Plaza,Café,Theater,American Restaurant,Cosmetics Shop,Restaurant,Diner,Department Store
3,Lawrence Park,Coffee Shop,Clothing Store,Plaza,Café,Theater,American Restaurant,Cosmetics Shop,Restaurant,Diner,Department Store
4,"Moore Park, Summerhill East",Coffee Shop,Clothing Store,Plaza,Café,Theater,American Restaurant,Cosmetics Shop,Restaurant,Diner,Department Store


### Cluster Neighborhoods

In [224]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

##### Run k-means to cluster the neighborhood into 5 clusters.

In [225]:
# set number of clusters
kclusters = 5

c_toronto_grouped_clustering = central_toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(c_toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

  kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(c_toronto_grouped_clustering)


array([0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

##### Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood

In [226]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

c_toronto_merged = central_toronto_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
c_toronto_merged = c_toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

c_toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
18,M4N,Central Toronto,Lawrence Park,43.6535,-79.383935,0,Coffee Shop,Clothing Store,Plaza,Café,Theater,American Restaurant,Cosmetics Shop,Restaurant,Diner,Department Store
19,M5N,Central Toronto,Roselawn,43.6535,-79.383935,0,Coffee Shop,Clothing Store,Plaza,Café,Theater,American Restaurant,Cosmetics Shop,Restaurant,Diner,Department Store
20,M4P,Central Toronto,Davisville North,43.6535,-79.383935,0,Coffee Shop,Clothing Store,Plaza,Café,Theater,American Restaurant,Cosmetics Shop,Restaurant,Diner,Department Store
21,M5P,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",43.6535,-79.383935,0,Coffee Shop,Clothing Store,Plaza,Café,Theater,American Restaurant,Cosmetics Shop,Restaurant,Diner,Department Store
23,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.6535,-79.383935,0,Coffee Shop,Clothing Store,Plaza,Café,Theater,American Restaurant,Cosmetics Shop,Restaurant,Diner,Department Store


##### Finally, visualize the resulting clusters

In [227]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(c_toronto_merged['Latitude'], c_toronto_merged['Longitude'], c_toronto_merged['Neighborhood'], c_toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine Clusters

##### Now, you can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, you can then assign a name to each cluster. I will leave this exercise to you.

##### Cluster 1

In [228]:
c_toronto_merged.loc[c_toronto_merged['Cluster Labels'] == 0, c_toronto_merged.columns[[1] + list(range(5, c_toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
18,Central Toronto,0,Coffee Shop,Clothing Store,Plaza,Café,Theater,American Restaurant,Cosmetics Shop,Restaurant,Diner,Department Store
19,Central Toronto,0,Coffee Shop,Clothing Store,Plaza,Café,Theater,American Restaurant,Cosmetics Shop,Restaurant,Diner,Department Store
20,Central Toronto,0,Coffee Shop,Clothing Store,Plaza,Café,Theater,American Restaurant,Cosmetics Shop,Restaurant,Diner,Department Store
21,Central Toronto,0,Coffee Shop,Clothing Store,Plaza,Café,Theater,American Restaurant,Cosmetics Shop,Restaurant,Diner,Department Store
23,Central Toronto,0,Coffee Shop,Clothing Store,Plaza,Café,Theater,American Restaurant,Cosmetics Shop,Restaurant,Diner,Department Store
24,Central Toronto,0,Coffee Shop,Clothing Store,Plaza,Café,Theater,American Restaurant,Cosmetics Shop,Restaurant,Diner,Department Store
26,Central Toronto,0,Coffee Shop,Clothing Store,Plaza,Café,Theater,American Restaurant,Cosmetics Shop,Restaurant,Diner,Department Store
29,Central Toronto,0,Coffee Shop,Clothing Store,Plaza,Café,Theater,American Restaurant,Cosmetics Shop,Restaurant,Diner,Department Store
31,Central Toronto,0,Coffee Shop,Clothing Store,Plaza,Café,Theater,American Restaurant,Cosmetics Shop,Restaurant,Diner,Department Store
