# Exploring and Clustering the Neighborhoods of Toronto - Parts I, II, III

# Part I

In [2]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import json # library to handle JSON files
import requests # library to handle requests
from bs4 import BeautifulSoup

print('Libraries imported.')

Libraries imported.


### Extracting data from Wikipedia

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
source = requests.get(url).text
soup = BeautifulSoup(source, 'html.parser')

wiki_table = soup.body.table.tbody

In [4]:
def get_cell(element):
    cells = element.find_all('td')
    row = []
    
    for cell in cells:
        if cell.a:            
            if (cell.a.text):
                row.append(cell.a.text)
                continue
        row.append(cell.string.strip())
        
    return row

def get_row(source):    
    data = []  
    
    for tr in source.find_all('tr'):
        row = get_cell(tr)
        if len(row) != 3:
            continue
        data.append(row)        
    
    return data

### Printing dataframe with three columns

In [13]:
data = get_row(wiki_table)
columns = ['Postcode', 'Borough', 'Neighborhood']
df = pd.DataFrame(data, columns=columns)
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Removing 'Not Assigned'

In [14]:
df = df[df.Borough != 'Not assigned']
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Reseting index

In [15]:
df.reset_index(inplace=True)
df.drop('index',axis=1,inplace=True)

df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


### Groupby 

In [17]:
df = df.groupby(['Postcode','Borough'])['Neighborhood'].apply(lambda x: ", ".join(x.astype(str))).reset_index()
df = df.sample(frac=1).reset_index(drop=True)
df.head(12)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1H,Scarborough,Cedarbrae
1,M6H,West Toronto,"Dovercourt Village, Dufferin"
2,M2M,North York,"Newtonbrook, Willowdale"
3,M9B,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ..."
4,M4G,East York,Leaside
5,M1J,Scarborough,Scarborough Village
6,M4M,East Toronto,Studio District
7,M4R,Central Toronto,North Toronto West
8,M8W,Etobicoke,"Alderwood, Long Branch"
9,M6C,York,Humewood-Cedarvale


### Number of rows in dataframe

In [18]:
df.shape

(103, 3)

# Part II - Adding the Latitude Longitude of each Neighborhood 

### Adding Geospatial data

In [19]:
url2="http://cocl.us/Geospatial_data"
geo_data=pd.read_csv(url2)
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Printing dataframe with Latitude and Longitude

In [20]:
data = df.set_index('Postcode').join(geo_data.set_index('Postal Code'))
data = data.reset_index()
data.head(12)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
1,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259
2,M2M,North York,"Newtonbrook, Willowdale",43.789053,-79.408493
3,M9B,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ...",43.650943,-79.554724
4,M4G,East York,Leaside,43.70906,-79.363452
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M4M,East Toronto,Studio District,43.659526,-79.340923
7,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
8,M8W,Etobicoke,"Alderwood, Long Branch",43.602414,-79.543484
9,M6C,York,Humewood-Cedarvale,43.693781,-79.428191


# Part III - Exploring and Clustering the Neighborhoods

In [25]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from geopy.geocoders import Nominatim
from sklearn.decomposition import PCA
!conda install -c conda-forge folium
import folium
from tqdm import tqdm
from collections import deque
import matplotlib.cm as cm
import matplotlib.colors as colors

Solving environment: done

# All requested packages already installed.



In [68]:
Toronto_data = data[data.Borough.str.contains("Toronto")]
Toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
1,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259
6,M4M,East Toronto,Studio District,43.659526,-79.340923
7,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
12,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
13,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191


### Number of boroughs and neighborhoods

In [70]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(Toronto_data['Borough'].unique()),
        Toronto_data.shape[0]
    )
)

The dataframe has 4 boroughs and 38 neighborhoods.


### Geolocator using nominatim

In [31]:
address = 'Toronto, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Map of Toronto with latitude and longitude

In [33]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

neighborhoods = Toronto_data

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'],
                                           neighborhoods['Longitude'],
                                           neighborhoods['Borough'],
                                           neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

In [36]:
CLIENT_ID = 'GTMWFQVICQ3IEWUMXJ1WPOLWMRRHGZVRUSYGUU1Z4E2HH1JR'
CLIENT_SECRET = '4D0VQL4YUZ5ZOAZS2E2R1ZAURWIN5UFZRJLNWF1TXVUHGJKO'
VERSION = '20180605'

### Getting nearby venue info with API request

In [37]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT = 100):
    
    venues_list=[]
    for name, lat, lng in tqdm(zip(names, latitudes, longitudes), total = names.size):
        
            
      
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
   
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [38]:
Toronto_venues = getNearbyVenues(Toronto_data.Neighborhood,
                            Toronto_data.Latitude,
                            Toronto_data.Longitude)

100%|██████████| 38/38 [00:18<00:00,  2.14it/s]


In [39]:
print(Toronto_venues.shape)
Toronto_venues.head()

(1690, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Dovercourt Village, Dufferin",43.669005,-79.442259,The Greater Good Bar,43.669409,-79.439267,Bar
1,"Dovercourt Village, Dufferin",43.669005,-79.442259,Parallel,43.669516,-79.438728,Middle Eastern Restaurant
2,"Dovercourt Village, Dufferin",43.669005,-79.442259,Happy Bakery & Pastries,43.66705,-79.441791,Bakery
3,"Dovercourt Village, Dufferin",43.669005,-79.442259,Planet Fitness Toronto Galleria,43.667588,-79.442574,Gym / Fitness Center
4,"Dovercourt Village, Dufferin",43.669005,-79.442259,FreshCo,43.667918,-79.440754,Supermarket


In [40]:
Toronto_venues.groupby("Neighborhood").Venue.count().sort_values(ascending=False).head()

Neighborhood
Adelaide, King, Richmond                     100
St. James Town                               100
Ryerson, Garden District                     100
Chinatown, Grange Park, Kensington Market    100
Commerce Court, Victoria Hotel               100
Name: Venue, dtype: int64

### Unique categories

In [66]:
print('Number of unique categories {}.'.format(len(Toronto_venues['Venue Category'].unique())))

Number of unique categories 232.


In [48]:
Toronto_onehot = pd.get_dummies(Toronto_venues["Venue Category"],
                             prefix = "",
                             prefix_sep = "")

Toronto_onehot["Neighborhood"] = Toronto_venues["Neighborhood"]


nindex = list(Toronto_onehot.columns).index("Neighborhood")
cols = deque(Toronto_onehot.columns)
cols.rotate(-nindex)
cols = list(cols)
Toronto_onehot = Toronto_onehot[cols]

Toronto_onehot.head()

Unnamed: 0,Neighborhood,New American Restaurant,Nightclub,Noodle House,Office,Opera House,Optical Shop,Organic Grocery,Other Great Outdoors,Park,...,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Modern European Restaurant,Molecular Gastronomy Restaurant,Monument / Landmark,Movie Theater,Museum,Music Store,Music Venue
0,"Dovercourt Village, Dufferin",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Dovercourt Village, Dufferin",0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,"Dovercourt Village, Dufferin",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Dovercourt Village, Dufferin",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Dovercourt Village, Dufferin",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
Toronto_onehot.shape

(1690, 232)

### One hot grouping

In [50]:
Toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()
Toronto_grouped.head()

Unnamed: 0,Neighborhood,New American Restaurant,Nightclub,Noodle House,Office,Opera House,Optical Shop,Organic Grocery,Other Great Outdoors,Park,...,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Modern European Restaurant,Molecular Gastronomy Restaurant,Monument / Landmark,Movie Theater,Museum,Music Store,Music Venue
0,"Adelaide, King, Richmond",0.01,0.0,0.01,0.01,0.01,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017857,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
Toronto_grouped.shape

(38, 232)

### Most common venues

In [46]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Create columns to top number of venues

In [47]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Thai Restaurant,Bar,Steakhouse,Hotel,Asian Restaurant,Restaurant,Gym,Breakfast Spot
1,Berczy Park,Coffee Shop,Cocktail Bar,Cheese Shop,Farmers Market,Beer Bar,Seafood Restaurant,Steakhouse,Café,Bakery,Concert Hall
2,"Brockton, Exhibition Place, Parkdale Village",Café,Coffee Shop,Breakfast Spot,Music Venue,Gym,Grocery Store,Furniture / Home Store,Falafel Restaurant,Convenience Store,Climbing Gym
3,Business Reply Mail Processing Centre 969 Eastern,Garden Center,Pizza Place,Auto Workshop,Spa,Restaurant,Gym / Fitness Center,Comic Shop,Light Rail Station,Smoke Shop,Fast Food Restaurant
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Service,Airport Lounge,Airport Terminal,Airport Gate,Airport,Boat or Ferry,Airport Food Court,Boutique,Bar,Harbor / Marina


### PCA used to reduce noise 

In [51]:
pca = PCA(.95)
Toronto_grouped_clustering = pca.fit_transform(Toronto_grouped.drop('Neighborhood', 1))
Toronto_grouped_clustering = Toronto_grouped.drop('Neighborhood', 1)

In [52]:
Toronto_grouped_clustering.shape

(38, 231)

### Set number of clusters

In [53]:
kclusters = 5
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)
print(kmeans.labels_[0:10])
print(kmeans.labels_.shape)

[0 0 0 0 0 0 0 0 0 0]
(38,)


### Add cluster to labels

In [55]:
Toronto_grouped["Cluster Labels"] = kmeans.labels_

# add clustering labels
Toronto_combined = TorontoData.merge(Toronto_grouped, left_on = "Neighborhood", right_on = "Neighborhood", how = "outer")


Toronto_combined = Toronto_combined.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

Toronto_combined["Cluster Labels"] = Toronto_combined["Cluster Labels"].fillna(5).astype("int")

Toronto_combined.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,New American Restaurant,Nightclub,Noodle House,Office,Opera House,...,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259,0.0,0.0,0.0,0.0,0.0,...,Bakery,Supermarket,Pharmacy,Music Venue,Park,Brewery,Bar,Bank,Discount Store,Gym / Fitness Center
1,M4M,East Toronto,Studio District,43.659526,-79.340923,0.0,0.0,0.0,0.0,0.0,...,Café,Coffee Shop,American Restaurant,Bakery,Gastropub,Italian Restaurant,Seafood Restaurant,Comfort Food Restaurant,Convenience Store,Sandwich Place
2,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,0.0,0.0,0.0,0.0,0.0,...,Sporting Goods Shop,Coffee Shop,Clothing Store,Yoga Studio,Metro Station,Restaurant,Gift Shop,Bagel Shop,Salon / Barbershop,Spa
3,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,0.0,0.0,0.0,0.012195,0.0,...,Coffee Shop,Café,Italian Restaurant,Ice Cream Shop,Sandwich Place,Burger Joint,Bakery,Bar,Salad Place,Bubble Tea Shop
4,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191,0.0,0.0,0.0,0.0,0.0,...,Café,Coffee Shop,Breakfast Spot,Music Venue,Gym,Grocery Store,Furniture / Home Store,Falafel Restaurant,Convenience Store,Climbing Gym


### Creating map

In [62]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

kclusters = kclusters + 1

x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]


markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_combined['Latitude'],
                                  Toronto_combined['Longitude'],
                                  Toronto_combined['Neighborhood'],
                                  Toronto_combined['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
map_clusters

### Sorting clusters based on most common venues: 1st to 10th most common

### Cluster 1

In [57]:
Toronto_combined.loc[Toronto_combined['Cluster Labels'] == 0, 
                     "1st Most Common Venue":"10th Most Common Venue"].head()

Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Bakery,Supermarket,Pharmacy,Music Venue,Park,Brewery,Bar,Bank,Discount Store,Gym / Fitness Center
1,Café,Coffee Shop,American Restaurant,Bakery,Gastropub,Italian Restaurant,Seafood Restaurant,Comfort Food Restaurant,Convenience Store,Sandwich Place
2,Sporting Goods Shop,Coffee Shop,Clothing Store,Yoga Studio,Metro Station,Restaurant,Gift Shop,Bagel Shop,Salon / Barbershop,Spa
3,Coffee Shop,Café,Italian Restaurant,Ice Cream Shop,Sandwich Place,Burger Joint,Bakery,Bar,Salad Place,Bubble Tea Shop
4,Café,Coffee Shop,Breakfast Spot,Music Venue,Gym,Grocery Store,Furniture / Home Store,Falafel Restaurant,Convenience Store,Climbing Gym


### Cluster 2

In [58]:
Toronto_combined.loc[Toronto_combined['Cluster Labels'] == 1, 
                     "1st Most Common Venue":"10th Most Common Venue"].head()

Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
27,Trail,Summer Camp,Playground,Music Venue,Afghan Restaurant,Arts & Crafts Store,Art Gallery,Aquarium,Antique Shop,American Restaurant


### Cluster 3

In [59]:
Toronto_combined.loc[Toronto_combined['Cluster Labels'] == 2, 
                     "1st Most Common Venue":"10th Most Common Venue"].head()

Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Park,Playground,Trail,Building,Afghan Restaurant,Arts & Crafts Store,Art Gallery,Aquarium,Antique Shop,American Restaurant
11,Jewelry Store,Sushi Restaurant,Trail,Park,Music Venue,Airport,Arts & Crafts Store,Art Gallery,Aquarium,Antique Shop


### Cluster 4

In [60]:
Toronto_combined.loc[Toronto_combined['Cluster Labels'] == 3, 
                     "1st Most Common Venue":"10th Most Common Venue"].head()

Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,Music Venue,Home Service,Garden,Bus Line,Wings Joint,Antique Shop,American Restaurant,Airport Terminal,Airport Service,Airport Lounge


### Cluster 5

In [61]:
Toronto_combined.loc[Toronto_combined['Cluster Labels'] == 4, 
                     "1st Most Common Venue":"10th Most Common Venue"].head()

Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
19,Bus Line,Photography Studio,Swim School,Park,Music Venue,Airport Terminal,Airport Food Court,Airport Gate,Airport Lounge,Airport Service


### The End