## Coursera Capstone: Clustering Toronto Neighborhoods
**Author: Zack Kenyon**
### Part 1: Making the data frame with beautifulSoup

In [1]:
!pip install -q bs4

In [2]:
import bs4
import pandas as pd
import numpy as np
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
from bs4 import SoupStrainer as strain

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
uClient = uReq(url)
page_html = uClient.read()
uClient.close()

table = strain("table", {"class":"wikitable sortable"}) #don't want to parse in more than whats needed
TOR = soup(page_html, 'html.parser', parse_only = table) #Got only the table of interest 

In [4]:
cols = TOR.findAll('th') #getting DF col titles
table_rows = TOR.findAll('tr') #getting DF rows


## Building DataFrame 

In [5]:
i = 0
col1 = []
col2 = []
col3 = []
for row in table_rows:
    if i == 0:
        i = i + 1
    else:
        item = row.findAll('td')
        col1.append(item[0].text)
        col2.append(item[1].text)
        col3.append(item[2].text.strip())
        
All_Borough = pd.DataFrame() #This is the grimey way to build a df since declaring one wholesale was giving me issues
All_Borough.insert(0, cols[0].text.strip(), col1)
All_Borough.insert(1, cols[1].text.strip(), col2)
All_Borough.insert(2, cols[2].text.strip(), col3)
All_Borough.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Cleaning DF 
1. **To remove the 'Not Assigned' Boroughs I will simply select all rows of the DataFrame that are assigned** 
2. **To assign names all 'Not Assigned' Neighbourhoods, I will loop through the Dataframe and set NB = Borough**
3. **To group all neighborhoods, I will loop through the DF collecting each Postcode. The select a DF of only that specific Postcode. Looping through that second DF I will collect the neighbourhoods in a string before assigning it to the original DF. Once complete, I will drop all duplicate PostCodes from the DF.**

In [6]:
cleaned_Borough = All_Borough[All_Borough['Borough'] != 'Not assigned'] #removing not_assigned
cleaned_Borough = cleaned_Borough.reset_index().drop(['index'], axis = 1)
cleaned_Borough.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [7]:
for i in range(0, len(cleaned_Borough)): #looping through all neighbourhoods to re-assign 
    if cleaned_Borough.iloc[i, 2] == 'Not Assigned':
        borough = cleaned_Borough[i, 1]
        cleaned_Borough[i, 2] = borough

for i in range(0, len(cleaned_Borough)): #looping through all post codes
    neigh = cleaned_Borough['Neighbourhood'].iloc[i]
    post = cleaned_Borough['Postcode'].iloc[i]
    duplicate = cleaned_Borough[cleaned_Borough['Postcode'] == post]
    for j in range(1, len(duplicate)): #looping through all duplicates of post codes
        neigh = str(neigh) + ', ' + str(duplicate.iloc[j, 2]) # concat neighbourhood strings
    cleaned_Borough.iloc[i, 2] = neigh #reassign to neighbourhood 
Toronto = cleaned_Borough.drop_duplicates(subset = ['Postcode']).reset_index().drop(['index'], axis = 1) #drop duplicate postcodes

In [8]:
Toronto.shape #display new DF

(103, 3)

# REGENT PARK DOES NOT EXIST

# Part 2: Getting LL data 
**using the pd.read_csv() fuction, I will create a DF and since its the same size as our Toronto DF, I will merge the two on their 'Postcode' cols** 

In [9]:
LL = pd.read_csv('Geospatial_Coordinates.csv')

In [10]:
LL.rename(columns = {'Postal Code': 'Postcode'}, inplace = True)
LL.sort_values('Postcode', inplace = True)
Toronto.sort_values('Postcode', inplace = True)

In [11]:
Toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
6,M1B,Scarborough,"Rouge, Malvern"
12,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
18,M1E,Scarborough,"Guildwood, Morningside, West Hill"
22,M1G,Scarborough,Woburn
26,M1H,Scarborough,Cedarbrae


In [12]:
LL.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
Toronto = Toronto.merge(LL, on = 'Postcode') #Merged DF shows Postcode, Borough, Neighbourhood, Lattitude, Longitude

# Part 3 Clustering Nieghbourhoods
**I'm unsure of what they are wanting for this section so I'm just recreating what we did in our previous lab**

In [14]:
!pip install -q geopy
!pip install -q folium
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import json # library to handle JSON files
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans


In [15]:
#fourSquare credentials
CLIENT_ID = 'HMK4WZSTRPQ03QBMQ2UXPF1O4M4DAW5K0HHWE4BW15OPI3D5'
CLIENT_SECRET = 'XFPVKCL2QM42NJNHFK1A4BAY0BPZ1HTECLBG1JZHQ31BZ21F'
VERSION = '20200316' 

In [16]:
Tor_only = Toronto[Toronto['Borough'].str.contains('Toronto')]
Tor_only = Tor_only.reset_index().drop(['index'], axis =1)
Tor_only.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [17]:
LIMIT = 100
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [18]:
GTA_Venues = getNearbyVenues(Tor_only['Neighbourhood'], Tor_only['Latitude'], Tor_only['Longitude'])

The Beaches
The Danforth West, Riverdale
The Beaches West, India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North, Forest Hill West
The Annex, North Midtown, Yorkville
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie
Dovercourt Village, Dufferin
Little Portugal, Trinity
Brockton, Exhibition Place, Parkdale Village
High Park, The Junction Sout

In [19]:
GTA_Venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [20]:
Hot_Venue = pd.get_dummies(GTA_Venues[['Venue Category']], prefix = '', prefix_sep = '')
Hot_Venue.drop(['Neighborhood'], axis = 1, inplace = True)
Hot_Venue.insert(0, 'Neighborhood', GTA_Venues['Neighborhood'])
Tor_grouped = Hot_Venue.groupby('Neighborhood').mean().reset_index()
Tor_grouped.head()

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,...,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.01,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.066667,0.066667,0.066667,0.133333,0.133333,0.066667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Tor_grouped['Neighborhood']

for ind in np.arange(Tor_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Tor_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Restaurant,Thai Restaurant,Café,Bar,Steakhouse,Sushi Restaurant,Concert Hall,Seafood Restaurant,Bakery
1,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Beer Bar,Seafood Restaurant,Restaurant,Farmers Market,Café,Cheese Shop,Comfort Food Restaurant
2,"Brockton, Exhibition Place, Parkdale Village",Café,Performing Arts Venue,Nightclub,Coffee Shop,Breakfast Spot,Bakery,Pet Store,Climbing Gym,Restaurant,Burrito Place
3,Business Reply Mail Processing Centre 969 Eastern,Yoga Studio,Auto Workshop,Skate Park,Brewery,Spa,Burrito Place,Farmers Market,Fast Food Restaurant,Restaurant,Light Rail Station
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Lounge,Airport Service,Boutique,Harbor / Marina,Boat or Ferry,Bar,Coffee Shop,Plane,Sculpture Garden,Airport Terminal


In [22]:
kclusters = 5

Tor_clustering = Tor_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Tor_clustering)

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

Tor_merged = Toronto.rename(columns = {'Neighbourhood':'Neighborhood'})
Tor_merged = Tor_merged.merge(neighborhoods_venues_sorted, on='Neighborhood')
Tor_merged.head() 

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Trail,Pub,Health Food Store,Yoga Studio,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,1,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store,Frozen Yogurt Shop,Pub,Pizza Place,Lounge,Liquor Store
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,1,Park,Pizza Place,Movie Theater,Fish & Chips Shop,Sushi Restaurant,Steakhouse,Brewery,Pub,Fast Food Restaurant,Italian Restaurant
3,M4M,East Toronto,Studio District,43.659526,-79.340923,1,Café,Coffee Shop,Bakery,Italian Restaurant,Brewery,American Restaurant,Park,Seafood Restaurant,Sandwich Place,Cheese Shop
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,1,Dim Sum Restaurant,Park,Swim School,Bus Line,Yoga Studio,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop


In [23]:
# create map
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
map_clusters = folium.Map(location=[43.728020, -79.388790], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Tor_merged['Latitude'], Tor_merged['Longitude'], Tor_merged['Neighborhood'], Tor_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [28]:
Tor_merged.loc[Tor_merged['Cluster Labels'] == 1, Tor_merged.columns[[1] + list(range(5, Tor_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,1,Trail,Pub,Health Food Store,Yoga Studio,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant
1,East Toronto,1,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store,Frozen Yogurt Shop,Pub,Pizza Place,Lounge,Liquor Store
2,East Toronto,1,Park,Pizza Place,Movie Theater,Fish & Chips Shop,Sushi Restaurant,Steakhouse,Brewery,Pub,Fast Food Restaurant,Italian Restaurant
3,East Toronto,1,Café,Coffee Shop,Bakery,Italian Restaurant,Brewery,American Restaurant,Park,Seafood Restaurant,Sandwich Place,Cheese Shop
4,Central Toronto,1,Dim Sum Restaurant,Park,Swim School,Bus Line,Yoga Studio,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
5,Central Toronto,1,Park,Hotel,Breakfast Spot,Sandwich Place,Dog Run,Food & Drink Shop,Department Store,Gym,Costume Shop,Coworking Space
6,Central Toronto,1,Coffee Shop,Clothing Store,Seafood Restaurant,Salon / Barbershop,Restaurant,Rental Car Location,Café,Chinese Restaurant,Park,Sporting Goods Shop
7,Central Toronto,1,Pizza Place,Sandwich Place,Dessert Shop,Gym,Italian Restaurant,Café,Sushi Restaurant,Coffee Shop,Greek Restaurant,Seafood Restaurant
9,Central Toronto,1,Pub,Coffee Shop,Supermarket,Restaurant,Light Rail Station,Vietnamese Restaurant,Liquor Store,Pizza Place,American Restaurant,Sushi Restaurant
11,Downtown Toronto,1,Coffee Shop,Restaurant,Café,Pizza Place,Italian Restaurant,Convenience Store,Pub,Bakery,Caribbean Restaurant,Playground


In [29]:
Tor_merged.loc[Tor_merged['Cluster Labels'] == 2, Tor_merged.columns[[1] + list(range(5, Tor_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,Downtown Toronto,2,Park,Playground,Trail,Yoga Studio,Department Store,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


In [30]:
Tor_merged.loc[Tor_merged['Cluster Labels'] == 3, Tor_merged.columns[[1] + list(range(5, Tor_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
23,Central Toronto,3,Jewelry Store,Trail,Sushi Restaurant,Mexican Restaurant,Yoga Studio,Dim Sum Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant


In [31]:
Tor_merged.loc[Tor_merged['Cluster Labels'] == 4, Tor_merged.columns[[1] + list(range(5, Tor_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,Central Toronto,4,Pool,Garden,Yoga Studio,Department Store,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


In [32]:
Tor_merged.loc[Tor_merged['Cluster Labels'] == 5, Tor_merged.columns[[1] + list(range(5, Tor_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
