<h4>Reading the table from wiki web-page:</h4>

In [1]:
import pandas as pd

toronto_data = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", header=0)

Toronto=toronto_data[0]

<h4>Looking at the result:</h4>

In [2]:
Toronto

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [3]:
print(Toronto.shape)

(180, 3)


<h4>Replacing the "Not assigned" meanings with no-values NaN :</h4>

In [4]:
import numpy as np
Toronto.replace("Not assigned", np.nan, inplace = True)

In [5]:
Toronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


<h4>Removing lines without values in the column "Borough":</h4>

In [6]:
Toronto.dropna(subset=["Borough"], axis=0, inplace=True)

In [7]:
print(Toronto.shape)

(103, 3)


In [8]:
Toronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


<h4>Groupping the data frame by Poctal Code in order to avoid duplications:</h4>

In [9]:
neighborhood=Toronto.groupby("Postal Code", axis=0).sum()

In [10]:
neighborhood

Unnamed: 0_level_0,Borough,Neighborhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern, Rouge"
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
M1J,Scarborough,Scarborough Village
M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
M1N,Scarborough,"Birch Cliff, Cliffside West"


In [11]:
neighborhood.reset_index("Postal Code", inplace=True)

<h4>Checking the result:</h4>

In [12]:
neighborhood

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


<h4>Check if we have missing data:</h4>

In [13]:
missing_data = neighborhood.isnull()
missing_data.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False


<h4>We don't have missing data:</h4>

In [14]:
for column in missing_data.columns.values.tolist():
    print(column)
    print (missing_data[column].value_counts())
    print("")

Postal Code
False    103
Name: Postal Code, dtype: int64

Borough
False    103
Name: Borough, dtype: int64

Neighborhood
False    103
Name: Neighborhood, dtype: int64



<h4>Method shape applied to my result:</h4>

In [15]:
print(neighborhood.shape)

(103, 3)


<h4>Reading geographical coordinates from csv-file:</h4>

In [16]:
geogr=pd.read_csv('https://cocl.us/Geospatial_data')

In [17]:
geogr.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


<h4>Create new data frame that will containe combined information:</h4>

In [18]:
# define the dataframe columns
column_names = ['Postal Code','Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
Toronto_neighborhoods = pd.DataFrame(columns=column_names)

In [19]:
Toronto_neighborhoods

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude


<h4>Merge first data frame "neighborhood" with geographical coordinates data frame "geogr":</h4>

In [20]:
Toronto_neighborhoods=neighborhood.merge(geogr)


<h4>Verify the result:</h4>

In [21]:
Toronto_neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [22]:
Toronto_neighborhoods.tail()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
102,M9W,Etobicoke,"Northwest, West Humber - Clairville",43.706748,-79.594054


In [23]:
print(Toronto_neighborhoods.shape)

(103, 5)


<h4>Check if we have missing data:</h4>

In [24]:
missing_data = Toronto_neighborhoods.isnull()
missing_data.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False


<h4>No missing data:</h4>

In [25]:
for column in missing_data.columns.values.tolist():
    print(column)
    print (missing_data[column].value_counts())
    print("")

Postal Code
False    103
Name: Postal Code, dtype: int64

Borough
False    103
Name: Borough, dtype: int64

Neighborhood
False    103
Name: Neighborhood, dtype: int64

Latitude
False    103
Name: Latitude, dtype: int64

Longitude
False    103
Name: Longitude, dtype: int64



<h4>Importing libraries:</h4>

In [26]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans



Solving environment: done

# All requested packages already installed.



In [27]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Folium installed and imported!')

Solving environment: done

# All requested packages already installed.

Folium installed and imported!


<h4>Use geopy library to get the latitude and longitude values of Toronto</h4>

In [28]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


<h4>Create a map of Toronto with neighborhoods superimposed on top.</h4>

In [29]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(Toronto_neighborhoods['Latitude'], Toronto_neighborhoods['Longitude'], Toronto_neighborhoods['Borough'], Toronto_neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

 <h4>Slice the original dataframe and create a new dataframe of the Central Toronto data.</h4>

In [30]:
toronto_data1 = Toronto_neighborhoods[Toronto_neighborhoods['Borough'] == 'Central Toronto'].reset_index(drop=True)
toronto_data1.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
1,M4P,Central Toronto,Davisville North,43.712751,-79.390197
2,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
3,M4S,Central Toronto,Davisville,43.704324,-79.38879
4,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316


 <h4>Let's get the geographical coordinates of Central Toronto.</h4>

In [31]:
address = 'Central Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Cenral Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Cenral Toronto are 43.6534817, -79.3839347.


 <h4>As we did with all of Toronto, let's visualizate Central Toronto the neighborhoods in it.</h4>

In [32]:
# create map of Centrl Toronto using latitude and longitude values
map_centraltoronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_data1['Latitude'], toronto_data1['Longitude'], toronto_data1['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_centraltoronto)  
    
map_centraltoronto

 <h4>Define Foursquare Credentials and Version</h4>

In [33]:
CLIENT_ID = 'OJ5IYDCIRSMUJT1JJBMWRJXBH4CNAUNAWF3SS0MSKVXSZKEJ' # your Foursquare ID
CLIENT_SECRET = 'GENH4G3YUSH1PLFBJHWYSGL50FWQNF2Q10TLPLRBGQ1155ZU' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: OJ5IYDCIRSMUJT1JJBMWRJXBH4CNAUNAWF3SS0MSKVXSZKEJ
CLIENT_SECRET:GENH4G3YUSH1PLFBJHWYSGL50FWQNF2Q10TLPLRBGQ1155ZU


 <h4>Let's explore the first neighborhood in our dataframe.</h4>

In [34]:
toronto_data1.loc[0, 'Neighborhood']

'Lawrence Park'

 <h4>Get the neighborhood's latitude and longitude values.</h4>

In [35]:
neighborhood_latitude = toronto_data1.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto_data1.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto_data1.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Lawrence Park are 43.7280205, -79.3887901.


<h4>Now, let's get the top 100 venues that are in Marble Hill within a radius of 500 meters. First, let's create the GET request URL. Name your URL url.</h4>

In [36]:
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=OJ5IYDCIRSMUJT1JJBMWRJXBH4CNAUNAWF3SS0MSKVXSZKEJ&client_secret=GENH4G3YUSH1PLFBJHWYSGL50FWQNF2Q10TLPLRBGQ1155ZU&v=20180605&ll=43.7280205,-79.3887901&radius=500&limit=100'

 <h4>Send the GET request and examine the resutls.</h4>

In [37]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5ef8bc792f5bc4555b4d4bb6'},
 'response': {'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.7325205045, 'lng': -79.3825744605273},
   'sw': {'lat': 43.7235204955, 'lng': -79.3950057394727}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '50e6da19e4b0d8a78a0e9794',
       'name': 'Lawrence Park Ravine',
       'location': {'address': '3055 Yonge Street',
        'crossStreet': 'Lawrence Avenue East',
        'lat': 43.72696303913755,
        'lng': -79.39438246708775,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.72696303913755,
          'lng': -79.39438246708775}],
        'distance': 465,
        'c

 <h4>let's borrow the get_category_type function from the Foursquare lab.</h4>

In [38]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

 <h4>Now we are ready to clean the json and structure it into a pandas dataframe.</h4>

In [39]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Lawrence Park Ravine,Park,43.726963,-79.394382
1,Dim Sum Deluxe,Dim Sum Restaurant,43.726953,-79.39426
2,Zodiac Swim School,Swim School,43.728532,-79.38286
3,TTC Bus #162 - Lawrence-Donway,Bus Line,43.728026,-79.382805


 <h4>And how many venues were returned by Foursquare?</h4>

In [40]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

4 venues were returned by Foursquare.


 <h4>Let's create a function to repeat the same process to all the neighborhoods in Central Toronto </h4>

In [41]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

 <h4>Run the above function on each neighborhood and create a new dataframe called manhattan_venues</h4>

In [42]:

centraltoronto_venues = getNearbyVenues(names=toronto_data1['Neighborhood'],
                                   latitudes=toronto_data1['Latitude'],
                                   longitudes=toronto_data1['Longitude']
                                  )


Lawrence Park
Davisville North
North Toronto West, Lawrence Park
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
Roselawn
Forest Hill North & West, Forest Hill Road Park
The Annex, North Midtown, Yorkville


 <h4>Let's check the size of the resulting dataframe</h4>

In [43]:
print(centraltoronto_venues.shape)
centraltoronto_venues.head()

(110, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Lawrence Park,43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,Lawrence Park,43.72802,-79.38879,Dim Sum Deluxe,43.726953,-79.39426,Dim Sum Restaurant
2,Lawrence Park,43.72802,-79.38879,Zodiac Swim School,43.728532,-79.38286,Swim School
3,Lawrence Park,43.72802,-79.38879,TTC Bus #162 - Lawrence-Donway,43.728026,-79.382805,Bus Line
4,Davisville North,43.712751,-79.390197,Sherwood Park,43.716551,-79.387776,Park


 <h4>Let's check the size of the resulting dataframe</h4>

In [44]:
centraltoronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Davisville,32,32,32,32,32,32
Davisville North,10,10,10,10,10,10
"Forest Hill North & West, Forest Hill Road Park",4,4,4,4,4,4
Lawrence Park,4,4,4,4,4,4
"Moore Park, Summerhill East",1,1,1,1,1,1
"North Toronto West, Lawrence Park",19,19,19,19,19,19
Roselawn,3,3,3,3,3,3
"Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park",16,16,16,16,16,16
"The Annex, North Midtown, Yorkville",21,21,21,21,21,21


In [45]:
print('There are {} uniques categories.'.format(len(centraltoronto_venues['Venue Category'].unique())))

There are 61 uniques categories.


 <h4>Analyze Each Neighborhood</h4>

In [46]:
# one hot encoding
centraltoronto_onehot = pd.get_dummies(centraltoronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
centraltoronto_onehot['Neighborhood'] = centraltoronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [centraltoronto_onehot.columns[-1]] + list(centraltoronto_onehot.columns[:-1])
centraltoronto_onehot = centraltoronto_onehot[fixed_columns]

centraltoronto_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,BBQ Joint,Bagel Shop,Bank,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,Cheese Shop,Chinese Restaurant,Clothing Store,Coffee Shop,Dance Studio,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Donut Shop,Farmers Market,Fast Food Restaurant,Food & Drink Shop,Fried Chicken Joint,Garden,Gas Station,Gourmet Shop,Greek Restaurant,Gym,Gym / Fitness Center,History Museum,Home Service,Hotel,Indian Restaurant,Italian Restaurant,Jewelry Store,Light Rail Station,Liquor Store,Mexican Restaurant,Middle Eastern Restaurant,Park,Pharmacy,Pizza Place,Pool,Pub,Restaurant,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Shoe Store,Spa,Sporting Goods Shop,Sports Bar,Supermarket,Sushi Restaurant,Swim School,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,Lawrence Park,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Davisville North,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


 <h4>And let's examine the new dataframe size. </h4>

In [47]:
centraltoronto_onehot.shape

(110, 62)

 <h4>Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category </h4>

In [48]:
centraltoronto_grouped = centraltoronto_onehot.groupby('Neighborhood').mean().reset_index()
centraltoronto_grouped

Unnamed: 0,Neighborhood,American Restaurant,BBQ Joint,Bagel Shop,Bank,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,Cheese Shop,Chinese Restaurant,Clothing Store,Coffee Shop,Dance Studio,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Donut Shop,Farmers Market,Fast Food Restaurant,Food & Drink Shop,Fried Chicken Joint,Garden,Gas Station,Gourmet Shop,Greek Restaurant,Gym,Gym / Fitness Center,History Museum,Home Service,Hotel,Indian Restaurant,Italian Restaurant,Jewelry Store,Light Rail Station,Liquor Store,Mexican Restaurant,Middle Eastern Restaurant,Park,Pharmacy,Pizza Place,Pool,Pub,Restaurant,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Shoe Store,Spa,Sporting Goods Shop,Sports Bar,Supermarket,Sushi Restaurant,Swim School,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Davisville,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.0625,0.0,0.0,0.0,0.0625,0.0,0.0,0.09375,0.0,0.03125,0.0,0.03125,0.0,0.0,0.0,0.0,0.03125,0.03125,0.03125,0.0625,0.0,0.0,0.0,0.0,0.03125,0.0625,0.0,0.0,0.0,0.0,0.0,0.03125,0.03125,0.0625,0.0,0.0,0.03125,0.03125,0.09375,0.03125,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.03125,0.03125,0.0,0.0,0.0,0.0
1,Davisville North,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.1,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Forest Hill North & West, Forest Hill Road Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0
3,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0
4,"Moore Park, Summerhill East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"North Toronto West, Lawrence Park",0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.052632,0.157895,0.105263,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.052632,0.0,0.0,0.0,0.0,0.052632,0.052632,0.0,0.0,0.052632,0.052632,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632
6,Roselawn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Summerhill West, Rathnelly, South Hill, Forest...",0.0625,0.0,0.0625,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0625,0.0,0.0,0.0,0.0,0.0625,0.0,0.125,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0625,0.0625,0.0,0.0,0.0,0.0,0.0,0.0625,0.0
8,"The Annex, North Midtown, Yorkville",0.0,0.047619,0.0,0.0,0.0,0.0,0.047619,0.0,0.142857,0.047619,0.0,0.0,0.095238,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.047619,0.0,0.0,0.0,0.047619,0.0,0.047619,0.047619,0.047619,0.047619,0.0,0.047619,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0


 <h4>Let's confirm the new size</h4>

In [None]:
centraltoronto_grouped.shape

 <h4>Let's print each neighborhood along with the top 5 most common venues </h4>

In [49]:
num_top_venues = 5

for hood in centraltoronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = centraltoronto_grouped[centraltoronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Davisville----
                venue  freq
0        Dessert Shop  0.09
1      Sandwich Place  0.09
2         Coffee Shop  0.06
3  Italian Restaurant  0.06
4                Café  0.06


----Davisville North----
                  venue  freq
0                 Hotel   0.2
1        Breakfast Spot   0.1
2     Food & Drink Shop   0.1
3  Gym / Fitness Center   0.1
4      Department Store   0.1


----Forest Hill North & West, Forest Hill Road Park----
                 venue  freq
0        Jewelry Store  0.25
1                Trail  0.25
2     Sushi Restaurant  0.25
3                 Park  0.25
4  American Restaurant  0.00


----Lawrence Park----
                venue  freq
0         Swim School  0.25
1            Bus Line  0.25
2                Park  0.25
3  Dim Sum Restaurant  0.25
4        Liquor Store  0.00


----Moore Park, Summerhill East----
                 venue  freq
0                 Park   1.0
1  American Restaurant   0.0
2                Hotel   0.0
3   Italian Restaurant   0.0

<h4>Let's put that into a pandas dataframe. First, let's write a function to sort the venues in descending order.</h4>

In [50]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

<h4>Now let's create the new dataframe and display the top 10 venues for each neighborhood.</h4>

In [51]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = centraltoronto_grouped['Neighborhood']

for ind in np.arange(centraltoronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(centraltoronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Davisville,Dessert Shop,Sandwich Place,Sushi Restaurant,Coffee Shop,Italian Restaurant,Gym,Pizza Place,Café,Thai Restaurant,Gas Station
1,Davisville North,Hotel,Department Store,Sandwich Place,Food & Drink Shop,Park,Pizza Place,Dance Studio,Gym / Fitness Center,Breakfast Spot,Bus Line
2,"Forest Hill North & West, Forest Hill Road Park",Jewelry Store,Trail,Park,Sushi Restaurant,Fast Food Restaurant,Dim Sum Restaurant,Diner,Donut Shop,Farmers Market,Yoga Studio
3,Lawrence Park,Swim School,Bus Line,Park,Dim Sum Restaurant,Yoga Studio,Dessert Shop,Greek Restaurant,Gourmet Shop,Gas Station,Garden
4,"Moore Park, Summerhill East",Park,Yoga Studio,Dessert Shop,Gym,Greek Restaurant,Gourmet Shop,Gas Station,Garden,Fried Chicken Joint,Food & Drink Shop


<h4>Cluster Neighborhoods.Run k-means to cluster the neighborhood into 5 clusters.</h4>

In [52]:
# set number of clusters
kclusters = 5

centraltoronto_grouped_clustering = centraltoronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(centraltoronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 3, 4, 2, 0, 1, 0, 0], dtype=int32)

 <h4>Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood. </h4>

In [53]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

centraltoronto_merged = toronto_data1

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
centraltoronto_merged = centraltoronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

centraltoronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,4,Swim School,Bus Line,Park,Dim Sum Restaurant,Yoga Studio,Dessert Shop,Greek Restaurant,Gourmet Shop,Gas Station,Garden
1,M4P,Central Toronto,Davisville North,43.712751,-79.390197,0,Hotel,Department Store,Sandwich Place,Food & Drink Shop,Park,Pizza Place,Dance Studio,Gym / Fitness Center,Breakfast Spot,Bus Line
2,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678,0,Clothing Store,Coffee Shop,Shoe Store,Gym / Fitness Center,Fast Food Restaurant,Diner,Mexican Restaurant,Park,Restaurant,Salon / Barbershop
3,M4S,Central Toronto,Davisville,43.704324,-79.38879,0,Dessert Shop,Sandwich Place,Sushi Restaurant,Coffee Shop,Italian Restaurant,Gym,Pizza Place,Café,Thai Restaurant,Gas Station
4,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,2,Park,Yoga Studio,Dessert Shop,Gym,Greek Restaurant,Gourmet Shop,Gas Station,Garden,Fried Chicken Joint,Food & Drink Shop


 <h4>Finally, let's visualize the resulting clusters</h4>

In [54]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(centraltoronto_merged['Latitude'], centraltoronto_merged['Longitude'], centraltoronto_merged['Neighborhood'], centraltoronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

 <h4>Examine Clusters</h4>

Clsuter 1

In [55]:
centraltoronto_merged.loc[centraltoronto_merged['Cluster Labels'] == 0, centraltoronto_merged.columns[[1] + list(range(5, centraltoronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Central Toronto,0,Hotel,Department Store,Sandwich Place,Food & Drink Shop,Park,Pizza Place,Dance Studio,Gym / Fitness Center,Breakfast Spot,Bus Line
2,Central Toronto,0,Clothing Store,Coffee Shop,Shoe Store,Gym / Fitness Center,Fast Food Restaurant,Diner,Mexican Restaurant,Park,Restaurant,Salon / Barbershop
3,Central Toronto,0,Dessert Shop,Sandwich Place,Sushi Restaurant,Coffee Shop,Italian Restaurant,Gym,Pizza Place,Café,Thai Restaurant,Gas Station
5,Central Toronto,0,Pub,Coffee Shop,Sports Bar,Fried Chicken Joint,Vietnamese Restaurant,Light Rail Station,Liquor Store,Pizza Place,Restaurant,American Restaurant
8,Central Toronto,0,Café,Sandwich Place,Coffee Shop,Pub,Park,Pharmacy,Pizza Place,Liquor Store,Donut Shop,Indian Restaurant


Clsuter 2


In [56]:
centraltoronto_merged.loc[centraltoronto_merged['Cluster Labels'] == 1, centraltoronto_merged.columns[[1] + list(range(5, centraltoronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Central Toronto,1,Home Service,Garden,Pool,Dessert Shop,Gym,Greek Restaurant,Gourmet Shop,Gas Station,Fried Chicken Joint,Food & Drink Shop


Clsuter 3

In [57]:
centraltoronto_merged.loc[centraltoronto_merged['Cluster Labels'] == 2, centraltoronto_merged.columns[[1] + list(range(5, centraltoronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Central Toronto,2,Park,Yoga Studio,Dessert Shop,Gym,Greek Restaurant,Gourmet Shop,Gas Station,Garden,Fried Chicken Joint,Food & Drink Shop


Clsuter 4

In [58]:
centraltoronto_merged.loc[centraltoronto_merged['Cluster Labels'] == 3, centraltoronto_merged.columns[[1] + list(range(5, centraltoronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
7,Central Toronto,3,Jewelry Store,Trail,Park,Sushi Restaurant,Fast Food Restaurant,Dim Sum Restaurant,Diner,Donut Shop,Farmers Market,Yoga Studio


Clsuter 5

In [59]:
centraltoronto_merged.loc[centraltoronto_merged['Cluster Labels'] == 4, centraltoronto_merged.columns[[1] + list(range(5, centraltoronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,4,Swim School,Bus Line,Park,Dim Sum Restaurant,Yoga Studio,Dessert Shop,Greek Restaurant,Gourmet Shop,Gas Station,Garden
