## This is a Data Science Project on Clustering and segmenting neighbhour hood areas in Toronto, Canada.

Importing Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
dfs = pd.read_html(url)
df1 = dfs[0]
df = df1[['Postal Code','Borough','Neighborhood']]

Pre-Processing Data

Replacing 'Not assigned' with np.nan in borough column
Droping cells with np.nan value in borough column 

In [3]:
df['Borough'].replace('Not assigned', np.nan,inplace = True)
df.dropna(inplace = True)

Replacing Neighborhood 'Not assigned' values to its corrosponding borough value.

In [4]:
df['Neighborhood'].replace('Not assigned',df['Borough'])
df.shape

(103, 3)

In [5]:
df

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


Making a new DataFrame

In [6]:
column_names = ['Postal Code', 'Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

Lets look at the structure of the dataframe

In [7]:
neighborhoods

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude


In [8]:
neighborhoods['Borough'] = df['Borough']
neighborhoods['Neighborhood'] = df['Neighborhood']
neighborhoods['Postal Code'] = df['Postal Code']

In [9]:
neighborhoods

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,,
3,M4A,North York,Victoria Village,,
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",,
5,M6A,North York,"Lawrence Manor, Lawrence Heights",,
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",,
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",,
9,M1B,Scarborough,"Malvern, Rouge",,
11,M3B,North York,Don Mills,,
12,M4B,East York,"Parkview Hill, Woodbine Gardens",,
13,M5B,Downtown Toronto,"Garden District, Ryerson",,


Lets read a csv file containing coordinates of postal codes.

In [10]:
coord = pd.read_csv('https://cocl.us/Geospatial_data')

Now, Lets compare the two dataframes and interate over each row to assign the latitude and longitude value of the postal code in neighborhoods dataframe from coord dataframe. 

In [11]:
for row in coord.iterrows():
    for row1 in neighborhoods.iterrows():
        if(row[1][0] == row1[1][0]):
            row1[1][3] = row[1][1] #latitude
            row1[1][4] = row[1][2] #longitude
   

Lets see how neighborhoods dataframe looks like...

In [12]:
neighborhoods

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.7533,-79.3297
3,M4A,North York,Victoria Village,43.7259,-79.3156
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6543,-79.3606
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7185,-79.4648
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6623,-79.3895
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.6679,-79.5322
9,M1B,Scarborough,"Malvern, Rouge",43.8067,-79.1944
11,M3B,North York,Don Mills,43.7459,-79.3522
12,M4B,East York,"Parkview Hill, Woodbine Gardens",43.7064,-79.3099
13,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3789


In [67]:
#!conda install -c conda-forge geopy --yes #uncomment if you dont have installed  
from geopy.geocoders import Nominatim


#!conda install -c conda-forge folium=0.5.0 --yes
import folium #for map visualisation


# Matplotlib and associated plotting modules
#import matpotlib as plt 
#import matplotlib.cm as cm
#import matplotlib.colors as colors


# import k-means from clustering stage
from sklearn.cluster import KMeans 

import requests
from pandas.io.json import json_normalize

print('Libraries imported.')

Libraries imported.


In [15]:
geolocator = Nominatim(user_agent = "toronto_explorer") 
location = geolocator.geocode("Toronto")
print(location.address) 
print("The Geographical Coordinates of Toronto are {}, {}".format(location.latitude, location.longitude))

Toronto, Golden Horseshoe, Ontario, M5H 2N2, Canada
The Geographical Coordinates of Toronto are 43.6534817, -79.3839347


Creating a map of Toronto!

In [17]:
# create map of Toronto using latitude and longitude values
latitude = location.latitude
longitude = location.longitude
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

Now lets analyse!!

In [48]:
# The code was removed by Watson Studio for sharing.

In [55]:
neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.7533,-79.3297
3,M4A,North York,Victoria Village,43.7259,-79.3156
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6543,-79.3606
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7185,-79.4648
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6623,-79.3895


Lets select one borough!!

In [71]:
neighborhoods.iloc[2,1]

'Downtown Toronto'

In [73]:
neighborhood_latitude = neighborhoods.iloc[2, 3] # neighborhood latitude value
neighborhood_longitude = neighborhoods.iloc[2, 4] # neighborhood longitude value

neighborhood_name = neighborhoods.iloc[2, 2] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Regent Park, Harbourfront are 43.6542599, -79.3606359.


In [74]:
LIMIT = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)


url

'https://api.foursquare.com/v2/venues/explore?&client_id=VXTGALJ5PHZAB0CYF5YHMSEJIYMCGC1THKUZVLCKB3H2NPPE&client_secret=1KNRR0S2YIUM1ZDUJWO3J1KSZ1KSKMHHLADUVPZ4XUXO1S4S&v=20180605&ll=43.6542599,-79.3606359&radius=500&limit=100'

Exploring the region using foursquare API!!

In [75]:
results = requests.get(url).json()

Let us see how this json file looks

In [76]:
results

{'meta': {'code': 200, 'requestId': '5efb890730567d545e17674b'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Corktown',
  'headerFullLocation': 'Corktown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 45,
  'suggestedBounds': {'ne': {'lat': 43.6587599045, 'lng': -79.3544279001486},
   'sw': {'lat': 43.6497598955, 'lng': -79.36684389985142}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '54ea41ad498e9a11e9e13308',
       'name': 'Roselle Desserts',
       'location': {'address': '362 King St E',
        'crossStreet': 'Trinity St',
        'lat': 43.653446723052674,
        'lng': -79.3620167174383,
        'labeledLatLngs': [{'label': 'display',
 

This is a function that extracts the category of the venue!

In [77]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Now we are ready to clean the json and structure it into a pandas dataframe.

In [78]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Cooper Koo Family YMCA,Distribution Center,43.653249,-79.358008
3,Body Blitz Spa East,Spa,43.654735,-79.359874
4,Dominion Pub and Kitchen,Pub,43.656919,-79.358967


 No of venues were returned by Foursquare

In [79]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

45 venues were returned by Foursquare.


### Let's create a function to repeat the same process to all the neighborhoods in Toronto