# Neighborhoods in Toronto (data on Wikipedia was last updated on 15 July 2019)

## Importing Required Libraries

In [0]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import folium
import json
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

## Scraping the Wikipedia page and getting the table content

In [0]:
wiki = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
wiki_content = wiki.content
soup = BeautifulSoup(wiki_content)
table_content = soup.find('table', attrs = {'class': 'wikitable'})
table_rows = table_content.find_all('tr')


## Transforming table content in pandas dataframe and removing rows having values 'Not assigned'

In [0]:
res = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td if tr.text.strip()]
    if row:
        res.append(row)


df = pd.DataFrame(res, columns=["Postcode", "Borough", "Neighborhood"])
df = df[df.Borough != 'Not assigned']
df = df[df.Neighborhood != 'Not assigned']

## Combining rows on basis of Boroughs

In [4]:
df = df.groupby('Borough').agg({'Postcode':'first', 
                             'Neighborhood': ', '.join }).reset_index()
df

Unnamed: 0,Borough,Postcode,Neighborhood
0,Central Toronto,M4N,"Lawrence Park, Roselawn, Davisville North, For..."
1,Downtown Toronto,M5A,"Harbourfront, Regent Park, Ryerson, Garden Dis..."
2,East Toronto,M4E,"The Beaches, The Danforth West, Riverdale, The..."
3,East York,M4B,"Woodbine Gardens, Parkview Hill, Woodbine Heig..."
4,Etobicoke,M9A,"Islington Avenue, Cloverdale, Islington, Marti..."
5,Mississauga,M7R,Canada Post Gateway Processing Centre
6,North York,M3A,"Parkwoods, Victoria Village, Lawrence Heights,..."
7,Scarborough,M1B,"Rouge, Malvern, Highland Creek, Rouge Hill, Po..."
8,West Toronto,M6H,"Dovercourt Village, Dufferin, Little Portugal,..."
9,York,M6C,"Humewood-Cedarvale, Caledonia-Fairbanks, Del R..."


## Using shape method to print number of rows in the dataframe

In [5]:
df.shape

(10, 3)

## Mounting Google Drive to use Geospatial CSV file provided

In [6]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Converting CSV file into a dataframe

In [0]:
geospatial_df = pd.read_csv('gdrive/My Drive/Geospatial_Coordinates.csv')

## Merging dataframes on basis of Postcode

In [8]:
geospatial_df.rename(columns={'Postal Code': 'Postcode'}, inplace=True)
geospatial_df
geo_df = pd.merge(df, geospatial_df, on=['Postcode'], how='inner')
geo_df

Unnamed: 0,Borough,Postcode,Neighborhood,Latitude,Longitude
0,Central Toronto,M4N,"Lawrence Park, Roselawn, Davisville North, For...",43.72802,-79.38879
1,Downtown Toronto,M5A,"Harbourfront, Regent Park, Ryerson, Garden Dis...",43.65426,-79.360636
2,East Toronto,M4E,"The Beaches, The Danforth West, Riverdale, The...",43.676357,-79.293031
3,East York,M4B,"Woodbine Gardens, Parkview Hill, Woodbine Heig...",43.706397,-79.309937
4,Etobicoke,M9A,"Islington Avenue, Cloverdale, Islington, Marti...",43.667856,-79.532242
5,Mississauga,M7R,Canada Post Gateway Processing Centre,43.636966,-79.615819
6,North York,M3A,"Parkwoods, Victoria Village, Lawrence Heights,...",43.753259,-79.329656
7,Scarborough,M1B,"Rouge, Malvern, Highland Creek, Rouge Hill, Po...",43.806686,-79.194353
8,West Toronto,M6H,"Dovercourt Village, Dufferin, Little Portugal,...",43.669005,-79.442259
9,York,M6C,"Humewood-Cedarvale, Caledonia-Fairbanks, Del R...",43.693781,-79.428191


## Plotting all the boroughs

In [9]:
tor_lat = 43.651070
tor_long = -79.347015
map_toronto = folium.Map(location=[tor_lat, tor_long], zoom_start=10)

for lat, lng, borough, neighborhood in zip(geo_df['Latitude'], geo_df['Longitude'], geo_df['Borough'], geo_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## Creating separate dataframe for Toronto Boroughs

In [10]:
toronto_df = geo_df[geo_df['Borough'].str.contains('Toronto')]
toronto_df

Unnamed: 0,Borough,Postcode,Neighborhood,Latitude,Longitude
0,Central Toronto,M4N,"Lawrence Park, Roselawn, Davisville North, For...",43.72802,-79.38879
1,Downtown Toronto,M5A,"Harbourfront, Regent Park, Ryerson, Garden Dis...",43.65426,-79.360636
2,East Toronto,M4E,"The Beaches, The Danforth West, Riverdale, The...",43.676357,-79.293031
8,West Toronto,M6H,"Dovercourt Village, Dufferin, Little Portugal,...",43.669005,-79.442259


## Initializing Foursquare details

In [0]:
CLIENT_ID = '*******' # your Foursquare ID
CLIENT_SECRET = '*******' # your Foursquare Secret
VERSION = '20190913' # Foursquare API version
downtor_lat = toronto_df.loc[1, 'Latitude']
downtor_long = toronto_df.loc[1, 'Longitude']
LIMIT = 50
radius = 500

## Gathering nearby venues for all the neighborhoods

In [12]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
                CLIENT_ID, 
                CLIENT_SECRET, 
                VERSION, 
                downtor_lat, 
                downtor_long, 
                radius, 
                LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

toronto_venues = getNearbyVenues(names=toronto_df['Neighborhood'],
                                   latitudes=toronto_df['Latitude'],
                                   longitudes=toronto_df['Longitude']
                                  )
toronto_venues

Lawrence Park, Roselawn, Davisville North, Forest Hill North, Forest Hill West, North Toronto West, The Annex, North Midtown, Yorkville, Davisville, Moore Park, Summerhill East, Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Harbourfront, Regent Park, Ryerson, Garden District, St. James Town, Berczy Park, Central Bay Street, Christie, Adelaide, King, Richmond, Harbourfront East, Toronto Islands, Union Station, Design Exchange, Toronto Dominion Centre, Commerce Court, Victoria Hotel, Harbord, University of Toronto, Chinatown, Grange Park, Kensington Market, CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara, Rosedale, Stn A PO Boxes 25 The Esplanade, Cabbagetown, St. James Town, First Canadian Place, Underground city, Church and Wellesley
The Beaches, The Danforth West, Riverdale, The Beaches West, India Bazaar, Studio District, Business Reply Mail Processing Centre 969 Eastern
Dovercourt Village, Dufferin, Littl

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Lawrence Park, Roselawn, Davisville North, For...",43.728020,-79.388790,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Lawrence Park, Roselawn, Davisville North, For...",43.728020,-79.388790,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Lawrence Park, Roselawn, Davisville North, For...",43.728020,-79.388790,Toronto Cooper Koo Family Cherry St YMCA Centre,43.653191,-79.357947,Gym / Fitness Center
3,"Lawrence Park, Roselawn, Davisville North, For...",43.728020,-79.388790,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Lawrence Park, Roselawn, Davisville North, For...",43.728020,-79.388790,Impact Kitchen,43.656369,-79.356980,Restaurant
5,"Lawrence Park, Roselawn, Davisville North, For...",43.728020,-79.388790,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
6,"Lawrence Park, Roselawn, Davisville North, For...",43.728020,-79.388790,Figs Breakfast & Lunch,43.655675,-79.364503,Breakfast Spot
7,"Lawrence Park, Roselawn, Davisville North, For...",43.728020,-79.388790,The Extension Room,43.653313,-79.359725,Gym / Fitness Center
8,"Lawrence Park, Roselawn, Davisville North, For...",43.728020,-79.388790,Corktown Common,43.655618,-79.356211,Park
9,"Lawrence Park, Roselawn, Davisville North, For...",43.728020,-79.388790,Dominion Pub and Kitchen,43.656919,-79.358967,Pub


## Categories of Venue present in the dataframe

In [13]:
toronto_venues['Venue Category'].unique()

array(['Bakery', 'Coffee Shop', 'Gym / Fitness Center', 'Spa',
       'Restaurant', 'Breakfast Spot', 'Park', 'Pub', 'Historic Site',
       'Chocolate Shop', 'Farmers Market', 'Dessert Shop',
       'Performing Arts Venue', 'Café', 'Mexican Restaurant',
       'French Restaurant', 'Theater', 'Yoga Studio', 'Shoe Store',
       'Event Space', 'Ice Cream Shop', 'Art Gallery',
       'Electronics Store', 'Brewery', 'Cosmetics Shop', 'Bank',
       'Beer Store', 'Hotel', 'Health Food Store'], dtype=object)

## Analyzing each neighborhood as per category

In [0]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

## Calculating the mean of a venue category for each neighborhood

In [15]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Art Gallery,Bakery,Bank,Beer Store,Breakfast Spot,Brewery,Café,Chocolate Shop,Coffee Shop,Cosmetics Shop,Dessert Shop,Electronics Store,Event Space,Farmers Market,French Restaurant,Gym / Fitness Center,Health Food Store,Historic Site,Hotel,Ice Cream Shop,Mexican Restaurant,Park,Performing Arts Venue,Pub,Restaurant,Shoe Store,Spa,Theater,Yoga Studio
0,"Dovercourt Village, Dufferin, Little Portugal,...",0.02,0.06,0.02,0.02,0.04,0.02,0.06,0.02,0.18,0.02,0.02,0.02,0.02,0.02,0.02,0.04,0.02,0.02,0.02,0.02,0.04,0.06,0.02,0.06,0.04,0.02,0.02,0.04,0.02
1,"Harbourfront, Regent Park, Ryerson, Garden Dis...",0.02,0.06,0.02,0.02,0.04,0.02,0.06,0.02,0.18,0.02,0.02,0.02,0.02,0.02,0.02,0.04,0.02,0.02,0.02,0.02,0.04,0.06,0.02,0.06,0.04,0.02,0.02,0.04,0.02
2,"Lawrence Park, Roselawn, Davisville North, For...",0.02,0.06,0.02,0.02,0.04,0.02,0.06,0.02,0.18,0.02,0.02,0.02,0.02,0.02,0.02,0.04,0.02,0.02,0.02,0.02,0.04,0.06,0.02,0.06,0.04,0.02,0.02,0.04,0.02
3,"The Beaches, The Danforth West, Riverdale, The...",0.02,0.06,0.02,0.02,0.04,0.02,0.06,0.02,0.18,0.02,0.02,0.02,0.02,0.02,0.02,0.04,0.02,0.02,0.02,0.02,0.04,0.06,0.02,0.06,0.04,0.02,0.02,0.04,0.02


## Getting top 5 venues for each neighborhood

In [16]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Dovercourt Village, Dufferin, Little Portugal, Trinity, Brockton, Exhibition Place, Parkdale Village, High Park, The Junction South, Parkdale, Roncesvalles, Runnymede, Swansea----
         venue  freq
0  Coffee Shop  0.18
1         Park  0.06
2       Bakery  0.06
3         Café  0.06
4          Pub  0.06


----Harbourfront, Regent Park, Ryerson, Garden District, St. James Town, Berczy Park, Central Bay Street, Christie, Adelaide, King, Richmond, Harbourfront East, Toronto Islands, Union Station, Design Exchange, Toronto Dominion Centre, Commerce Court, Victoria Hotel, Harbord, University of Toronto, Chinatown, Grange Park, Kensington Market, CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara, Rosedale, Stn A PO Boxes 25 The Esplanade, Cabbagetown, St. James Town, First Canadian Place, Underground city, Church and Wellesley----
         venue  freq
0  Coffee Shop  0.18
1         Park  0.06
2       Bakery  0.06
3         Café  0

## Function to get most common venues for a neighborhood

In [0]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

## Getting 10 most common venues for each neighborhood

In [18]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Dovercourt Village, Dufferin, Little Portugal,...",Coffee Shop,Café,Bakery,Pub,Park,Mexican Restaurant,Restaurant,Theater,Gym / Fitness Center,Breakfast Spot
1,"Harbourfront, Regent Park, Ryerson, Garden Dis...",Coffee Shop,Café,Bakery,Pub,Park,Mexican Restaurant,Restaurant,Theater,Gym / Fitness Center,Breakfast Spot
2,"Lawrence Park, Roselawn, Davisville North, For...",Coffee Shop,Café,Bakery,Pub,Park,Mexican Restaurant,Restaurant,Theater,Gym / Fitness Center,Breakfast Spot
3,"The Beaches, The Danforth West, Riverdale, The...",Coffee Shop,Café,Bakery,Pub,Park,Mexican Restaurant,Restaurant,Theater,Gym / Fitness Center,Breakfast Spot


## Running K means clustering on the dataframe

In [19]:
# set number of clusters
kclusters = 4

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

  return_n_iter=True)


array([0, 0, 0, 0], dtype=int32)

## Creating a single dataframe containing all the data

In [20]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Borough,Postcode,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,M4N,"Lawrence Park, Roselawn, Davisville North, For...",43.72802,-79.38879,0,Coffee Shop,Café,Bakery,Pub,Park,Mexican Restaurant,Restaurant,Theater,Gym / Fitness Center,Breakfast Spot
1,Downtown Toronto,M5A,"Harbourfront, Regent Park, Ryerson, Garden Dis...",43.65426,-79.360636,0,Coffee Shop,Café,Bakery,Pub,Park,Mexican Restaurant,Restaurant,Theater,Gym / Fitness Center,Breakfast Spot
2,East Toronto,M4E,"The Beaches, The Danforth West, Riverdale, The...",43.676357,-79.293031,0,Coffee Shop,Café,Bakery,Pub,Park,Mexican Restaurant,Restaurant,Theater,Gym / Fitness Center,Breakfast Spot
8,West Toronto,M6H,"Dovercourt Village, Dufferin, Little Portugal,...",43.669005,-79.442259,0,Coffee Shop,Café,Bakery,Pub,Park,Mexican Restaurant,Restaurant,Theater,Gym / Fitness Center,Breakfast Spot


## Mapping clusters

In [21]:
# create map
map_clusters = folium.Map(location=[tor_lat, tor_long], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examining the Cluster

In [22]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Postcode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4N,0,Coffee Shop,Café,Bakery,Pub,Park,Mexican Restaurant,Restaurant,Theater,Gym / Fitness Center,Breakfast Spot
1,M5A,0,Coffee Shop,Café,Bakery,Pub,Park,Mexican Restaurant,Restaurant,Theater,Gym / Fitness Center,Breakfast Spot
2,M4E,0,Coffee Shop,Café,Bakery,Pub,Park,Mexican Restaurant,Restaurant,Theater,Gym / Fitness Center,Breakfast Spot
8,M6H,0,Coffee Shop,Café,Bakery,Pub,Park,Mexican Restaurant,Restaurant,Theater,Gym / Fitness Center,Breakfast Spot
