# Coursera Capstone Course
## Segmenting and Clustering Neighborhoods in Toronto
### This notebook consists of 3 parts corresponding to the 3 assignments

#### PART 1: Create data frame

In [1]:
import pandas as pd
import numpy as np
import sys

# Read in correct table
df=pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

#chose the correct table
df=df[0]

#remove rows where Borough is not assigned
df=df[df.Borough!='Not assigned']

#take care of not assigned Neighborhood
df.loc[df['Neighborhood'] == 'Not assigned', 'Neighborhood'] = df['Borough']

#merge rows with duplicate Postcode as required
df = df.groupby(['Postcode','Borough'])['Neighborhood'].apply(', '.join).reset_index()

df.shape

(103, 3)

#### PART2: Integrate coordinates into dataframe

In [2]:
import geocoder

#initialize latitude & longitute coordinates
df['Latitude']=""
df['Longitude']=""
'''
#loop over all postalcodes:
for postalcode in df['Postcode']:
    #initialize coordinates
    coord = None
    print(postalcode)
    # loop until you get the coordinates
    while(coord is None):
      g = geocoder.google('{}, Toronto, Ontario'.format(postalcode))
      coord = g.latlng
    # Assign Latitude and Longitude variables in data frame
    df.loc[[df['Postcode'] == postalcode],'Latitude'] = coord[0]
    df.loc[[df['Postcode'] == postalcode],'Longitude'] = coord[1]
           
df.head()
'''

"\n#loop over all postalcodes:\nfor postalcode in df['Postcode']:\n    #initialize coordinates\n    coord = None\n    print(postalcode)\n    # loop until you get the coordinates\n    while(coord is None):\n      g = geocoder.google('{}, Toronto, Ontario'.format(postalcode))\n      coord = g.latlng\n    # Assign Latitude and Longitude variables in data frame\n    df.loc[[df['Postcode'] == postalcode],'Latitude'] = coord[0]\n    df.loc[[df['Postcode'] == postalcode],'Longitude'] = coord[1]\n           \ndf.head()\n"

#### I was unable to use the geocoder package as the connection keeped being timed-out. Instead, I used the CSV as advised by the assignment

In [3]:
import io
import requests

url = "https://cocl.us/Geospatial_data"
s = requests.get(url).content
coord_df = pd.read_csv(io.StringIO(s.decode('utf-8')))

coord_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [4]:
#iterate over all postalcodes
for postalcode in df['Postcode']:
    # get latitude from csv for the postalcode
    lat=coord_df.loc[coord_df['Postal Code']==postalcode,'Latitude']
    long=coord_df.loc[coord_df['Postal Code']==postalcode,'Longitude']
    
    # update values in data frame
    df.loc[df['Postcode'] == postalcode,'Latitude'] = lat
    df.loc[df['Postcode'] == postalcode,'Longitude'] = long
    
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.8067,-79.1944
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.7845,-79.1605
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7636,-79.1887
3,M1G,Scarborough,Woburn,43.771,-79.2169
4,M1H,Scarborough,Cedarbrae,43.7731,-79.2395


#### PART 3: Segmenting and Clustering

In [8]:
import json # library to handle JSON files
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium # map rendering library

print('Libraries imported.')

#continue with borouhs containing 'Toronto' as specified in assignment
df=df[df['Borough'].str.contains("Toronto")]
df.head()

Libraries imported.


Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.6764,-79.293
41,M4K,East Toronto,"The Danforth West, Riverdale",43.6796,-79.3522
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.669,-79.3156
43,M4M,East Toronto,Studio District,43.6595,-79.3409
44,M4N,Central Toronto,Lawrence Park,43.728,-79.3888


geocoder parameters assignment

In [6]:
address = 'Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [14]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### Define Foursquare credentials

In [22]:
CLIENT_ID = 'MLRFOGQBIDKOWGGZD3QRAPADTITSHQQLM0J05TIOG1H5KMSZ'
CLIENT_SECRET = 'PTTU5J42WQNOXMYHOZ0MLLZLCFD2ZKHFJ15AFNDZCKEB2FZJ' 
VERSION = '20180605' # Foursquare API version
LIMIT=100

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: MLRFOGQBIDKOWGGZD3QRAPADTITSHQQLM0J05TIOG1H5KMSZ
CLIENT_SECRET:PTTU5J42WQNOXMYHOZ0MLLZLCFD2ZKHFJ15AFNDZCKEB2FZJ


#### Function to get venues for a neighborhood

In [82]:
def getNearbyVenues(names, latitudes, longitudes, radius=800):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Get venues for each neighborhood



In [83]:
toronto_venues = getNearbyVenues(names=df['Neighborhood'],latitudes=df['Latitude'],longitudes=df['Longitude'])
print(toronto_venues.groupby('Neighborhood').count())
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))
toronto_venues.head()

The Beaches
The Danforth West, Riverdale
The Beaches West, India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North, Forest Hill West
The Annex, North Midtown, Yorkville
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie
Dovercourt Village, Dufferin
Little Portugal, Trinity
Brockton, Exhibition Place, Parkdale Village
High Park, The Junction Sout

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,Tori's Bakeshop,43.672114,-79.290331,Vegetarian / Vegan Restaurant
2,The Beaches,43.676357,-79.293031,Beaches Bake Shop,43.680363,-79.289692,Bakery
3,The Beaches,43.676357,-79.293031,The Beech Tree,43.680493,-79.288846,Gastropub
4,The Beaches,43.676357,-79.293031,The Fox Theatre,43.672801,-79.287272,Indie Movie Theater


#### Analyze each neighborhood

In [84]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Amphitheater,...,Udon Restaurant,University,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Function to sort venues in descending order

In [85]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

#### Create data frame with top 10 venues for each neighborhood


In [86]:
# group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Udon Restaurant,University,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,"Adelaide, King, Richmond",0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,...,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.01
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.010101,0.0,0.0,0.0,0.0,0.0,0.0,0.010101,...,0.0,0.0,0.010101,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.034483,0.034483,0.034483,0.034483,0.103448,0.103448,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,...,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0
7,"Chinatown, Grange Park, Kensington Market",0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.06,0.0,0.04,0.0,0.01,0.0,0.0,0.0
8,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.016129,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Church and Wellesley,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,...,0.0,0.0,0.01,0.01,0.01,0.0,0.0,0.01,0.01,0.0


In [87]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Steakhouse,Hotel,Pizza Place,Asian Restaurant,Sushi Restaurant,Gastropub,Theater,Furniture / Home Store
1,Berczy Park,Coffee Shop,Café,Beer Bar,Hotel,Japanese Restaurant,Restaurant,Park,Cocktail Bar,Cheese Shop,Creperie
2,"Brockton, Exhibition Place, Parkdale Village",Coffee Shop,Café,Bar,Bakery,Arts & Crafts Store,Furniture / Home Store,Restaurant,Italian Restaurant,Theme Park Ride / Attraction,Nightclub
3,Business Reply Mail Processing Centre 969 Eastern,Fast Food Restaurant,Light Rail Station,Bar,Brewery,Pizza Place,Burrito Place,Bakery,Pub,Coffee Shop,Grocery Store
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Harbor / Marina,Rental Car Location,Airport Service,Airport Terminal,Sculpture Garden,Coffee Shop,Boat or Ferry,Boutique,Park,Bar


#### K-means clustering

In [88]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([4, 4, 1, 1, 1, 1, 4, 1, 1, 4], dtype=int32)

In [89]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,M4E,East Toronto,The Beaches,43.6764,-79.293,1,Pub,Bar,Sandwich Place,Grocery Store,Breakfast Spot,Café,Bakery,Thai Restaurant,Japanese Restaurant,Ramen Restaurant
41,M4K,East Toronto,"The Danforth West, Riverdale",43.6796,-79.3522,1,Greek Restaurant,Coffee Shop,Pub,Ice Cream Shop,Café,Restaurant,Fast Food Restaurant,Italian Restaurant,Spa,Bakery
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.669,-79.3156,1,Indian Restaurant,Grocery Store,Park,Pet Store,Coffee Shop,Restaurant,Sandwich Place,Café,Fast Food Restaurant,Burger Joint
43,M4M,East Toronto,Studio District,43.6595,-79.3409,1,Coffee Shop,Café,Bar,Bakery,Diner,Brewery,Pizza Place,Gastropub,Sandwich Place,American Restaurant
44,M4N,Central Toronto,Lawrence Park,43.728,-79.3888,2,Bookstore,Café,Gym / Fitness Center,Coffee Shop,Bus Line,Park,Restaurant,Electronics Store,Doner Restaurant,Donut Shop


In [90]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters