### This notebook contains all three parts of the Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto.





# PART ONE

In [1]:
# Import required libraries


import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import json
from pandas.io.json import json_normalize

In [2]:
# Retrieve url from wikipedia

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url).text

In [13]:
# Parse the url with BeautifulSoup parser=xml

soup = BeautifulSoup(page,'xml')
table = soup.find('table')

# Set the colums as ib the exampele in the assignment, and make a empty dataframe with the column names

columns = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(columns=columns)

# Make a for lopp to retrieve the text for PostalCode, Borough, Neighborhood and append it to df

for tr in table.find_all('tr'):
    counter = 1
    for td in tr.find_all('td'):
        if counter == 1:
            code = td.text
            counter = counter + 1
        elif counter == 2:
            br = td.text
            counter = counter + 1
        elif counter == 3:
            t = td.text
            nigh = t[:-2]
            
    df = df.append({'PostalCode':code, 'Borough':br, 'Neighborhood': nigh }, ignore_index=True)

# Remove instances of Boroughs=not assigned + assign the name of the Borough where neighborhoods=not assigned 


df = df[df.Borough != 'Not assigned']

df.loc[df['Neighborhood'] == 'Not assigned', 'Neighborhood'] = df['Borough']

# Use group.by, apply and join to add together neighborhoods where more 
# than one neighborhood  exists in one postal code area, reset index

df = df.groupby(['PostalCode','Borough'])['Neighborhood'].apply(', '.join).reset_index()
df




Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Roug, Malver"
1,M1C,Scarborough,"Highland Cree, Rouge Hil, Port Unio"
2,M1E,Scarborough,"Guildwoo, Morningsid, West Hil"
3,M1G,Scarborough,Wobur
4,M1H,Scarborough,Cedarbra
5,M1J,Scarborough,Scarborough Villag
6,M1K,Scarborough,"East Birchmount Par, Ionvie, Kennedy Par"
7,M1L,Scarborough,"Clairle, Golden Mil, Oakridg"
8,M1M,Scarborough,"Cliffcres, Cliffsid, Scarborough Village Wes"
9,M1N,Scarborough,"Birch Clif, Cliffside Wes"


# PART 2

In [14]:
# Get csv file with coordinates an read it into a dataframe
geo = "http://cocl.us/Geospatial_data"
df_geo = pd.read_csv(geo)


In [15]:
# Merge the two dataframes, to insert columns with coordinates to df, and drop the second column 
# containing extra postal codes from df_geo 

df = df.merge(df_geo, left_on='PostalCode', right_on='Postal Code')
df = df.drop(columns='Postal Code')

df


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Roug, Malver",43.806686,-79.194353
1,M1C,Scarborough,"Highland Cree, Rouge Hil, Port Unio",43.784535,-79.160497
2,M1E,Scarborough,"Guildwoo, Morningsid, West Hil",43.763573,-79.188711
3,M1G,Scarborough,Wobur,43.770992,-79.216917
4,M1H,Scarborough,Cedarbra,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Villag,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Par, Ionvie, Kennedy Par",43.727929,-79.262029
7,M1L,Scarborough,"Clairle, Golden Mil, Oakridg",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcres, Cliffsid, Scarborough Village Wes",43.716316,-79.239476
9,M1N,Scarborough,"Birch Clif, Cliffside Wes",43.692657,-79.264848


# PART 3

In [16]:
# Create new dataframe containing only the Boroughs with Toronto in the name

df_to = df[df['Borough'].str.contains('Toronto', na = False)].reset_index(drop=True)
df_to.shape

(38, 5)

In [17]:
# Import the nessecary libraries for clustering and mapmaking

import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library


Collecting package metadata: ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [18]:
# Coordinates for toronto obtanined from https://latitudelongitude.org/ca/toronto/

lat_tor = 43.70011
long_tor = -79.4163

In [19]:
# Use folium to create map of Toronto with neighborhoods amd boroughs

map_tor = folium.Map(location=[lat_tor,long_tor ], zoom_start=10)

for lat, lng, borough, neighborhood in zip(df_to['Latitude'],df_to['Longitude'],df_to['Borough'],df_to['Neighborhood']):
        label = '{}, {}'.format(neighborhood, borough)
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_tor)

map_tor


In [20]:
# Add user credetials for foursqare + Version and LIMIT

CLIENT_ID = 'C4TKHUWCRCSH1NYV0LBALUUI5XTUQKNGA0QYZOFT3G3ADJEY' 
CLIENT_SECRET = 'SRW34K0JP4SR1ZLMM3DPLPY5JW45ASSHH25CJZVQLGMX4VEE'
VERSION = '20180605'
LIMIT = 100

In [21]:
# Make function "getNear" to extract the category of the venues 

def getNear(names, latitudes, longitudes, radius=500):
    
    ven_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        url =  'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        lng,
        radius,
        LIMIT)
    
        results = requests.get(url).json()['response']['groups'][0]['items']
    
        ven_list.append([(
        name,
        lat,
        lng,
        v['venue']['name'],
        v['venue']['location']['lat'],
        v['venue']['location']['lng'],
        v['venue']['categories'][0]['name']) for v in results])
    
    near_venues = pd.DataFrame([item for ven_list in ven_list for item in ven_list])
    near_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return(near_venues)
        



In [22]:
# Use getNear from above on df_to to get the venues near the neighborhoods

toronto_ven = getNear(names=df_to['Neighborhood'],
                      latitudes=df_to['Latitude'],
                      longitudes=df_to['Longitude']
                     )

The Beache
The Danforth Wes, Riverdal
The Beaches Wes, India Bazaa
Studio Distric
Lawrence Par
Davisville Nort
North Toronto Wes
Davisvill
Moore Par, Summerhill Eas
Deer Par, Forest Hill S, Rathnell, South Hil, Summerhill Wes
Rosedal
Cabbagetow, St. James Tow
Church and Wellesle
Harbourfron, Regent Par
Ryerso, Garden Distric
St. James Tow
Berczy Par
Central Bay Stree
Adelaid, Kin, Richmon
Harbourfront Eas, Toronto Island, Union Statio
Design Exchang, Toronto Dominion Centr
Commerce Cour, Victoria Hote
Roselaw
Forest Hill Nort, Forest Hill Wes
The Anne, North Midtow, Yorkvill
Harbor, University of Toront
Chinatow, Grange Par, Kensington Marke
CN Towe, Bathurst Qua, Island airpor, Harbourfront Wes, King and Spadin, Railway Land, South Niagar
Stn A PO Boxes 25 The Esplanad
First Canadian Plac, Underground cit
Christi
Dovercourt Villag, Dufferi
Little Portuga, Trinit
Brockto, Exhibition Plac, Parkdale Villag
High Par, The Junction Sout
Parkdal, Roncesvalle
Runnymed, Swanse
Business Reply M

In [23]:
# Use hot encoding on toronto_ven, and add the new column to existing dataframe, 
# + moving neigbhorhood column to the  first column

toront_hot = pd.get_dummies(toronto_ven[['Venue Category']], prefix="")

toront_hot['Neighborhood'] = toronto_ven['Neighborhood']

fixed_columns = [toront_hot.columns[-1]] + list(toront_hot.columns[:-1])

toront_hot = toront_hot[fixed_columns]

#toront_hot.head(30)

print(toront_hot.shape)

(1700, 239)


In [24]:
# Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

toront_grouped = toront_hot.groupby('Neighborhood').mean().reset_index()

toront_grouped.shape

(38, 239)

In [25]:
# Make a funtion sort_ven to sort the venues in descending order

def sort_ven(row, top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:top_venues]


In [26]:
# Use the function in a for loop to get top venues, and create a new dataframe neigh_sorted
# based on the results

top_venues = 10

indicators = ['st', 'nd', 'rd']

columns = ['Neighborhood']
for ind in np.arange(top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

neigh_sorted = pd.DataFrame(columns=columns)
neigh_sorted['Neighborhood'] = toront_grouped['Neighborhood']

for ind in np.arange(toront_grouped.shape[0]):
    neigh_sorted.iloc[ind, 1:] = sort_ven(toront_grouped.iloc[ind, :], top_venues)
    

#neigh_sorted.head(20)

In [28]:
# Use k.means to create 10 clusters of the neighborhoods

klusters = 10



toront_clustering = toront_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=klusters, random_state=0).fit(toront_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 


array([2, 2, 2, 0, 2, 2, 2, 2, 2, 2])

In [59]:
#neigh_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toront_merged = df_to

toront_merged = toront_merged.join(neigh_sorted.set_index('Neighborhood'), on='Neighborhood') 
                                   
toront_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beache,43.676357,-79.293031,7,_Health Food Store,_Pub,_Trail,_Neighborhood,_Other Great Outdoors,_Falafel Restaurant,_Event Space,_Farmers Market,_Fast Food Restaurant,_Filipino Restaurant
1,M4K,East Toronto,"The Danforth Wes, Riverdal",43.679557,-79.352188,2,_Greek Restaurant,_Coffee Shop,_Italian Restaurant,_Ice Cream Shop,_Furniture / Home Store,_Bubble Tea Shop,_Fruit & Vegetable Store,_Juice Bar,_Liquor Store,_Spa
2,M4L,East Toronto,"The Beaches Wes, India Bazaa",43.668999,-79.315572,0,_Park,_Gym,_Italian Restaurant,_Pizza Place,_Pub,_Movie Theater,_Sandwich Place,_Burrito Place,_Burger Joint,_Brewery
3,M4M,East Toronto,Studio Distric,43.659526,-79.340923,2,_Café,_Coffee Shop,_Gastropub,_Italian Restaurant,_Bakery,_American Restaurant,_Yoga Studio,_Comfort Food Restaurant,_Brewery,_Seafood Restaurant
4,M4N,Central Toronto,Lawrence Par,43.72802,-79.38879,6,_Park,_Swim School,_Bus Line,_Yoga Studio,_Doner Restaurant,_Fish & Chips Shop,_Filipino Restaurant,_Fast Food Restaurant,_Farmers Market,_Falafel Restaurant


In [30]:
# Create map with the clusters

map_clusters = folium.Map(location=[lat, lng], zoom_start=11)

x = np.arange(klusters)
ys = [i + x + (i*x)**2 for i in range(klusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(toront_merged['Latitude'], toront_merged['Longitude'], toront_merged['Neighborhood'], toront_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## View the first 5 clusters

In [None]:
# Make a list to use for indexing below, so it's only Borough and Neighborhood and top 10 venues witch gets shown
ind = [1, 2, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

### CLUSTER 1

In [158]:
cluster_1 = toront_merged.loc[toront_merged['Cluster Labels'] == 0, toront_merged.columns[ind]]
cluster_1

Unnamed: 0,Borough,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,East Toronto,"The Beaches Wes, India Bazaa",_Park,_Gym,_Italian Restaurant,_Pizza Place,_Pub,_Movie Theater,_Sandwich Place,_Burrito Place,_Burger Joint,_Brewery
37,East Toronto,Business Reply Mail Processing Centre 969 Easter,_Yoga Studio,_Auto Workshop,_Comic Shop,_Pizza Place,_Recording Studio,_Restaurant,_Butcher,_Burrito Place,_Skate Park,_Brewery


### CLUSTER 2

In [157]:
cluster_2 = toront_merged.loc[toront_merged['Cluster Labels'] == 1, toront_merged.columns[ind]]
cluster_2

Unnamed: 0,Borough,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,Central Toronto,"Moore Par, Summerhill Eas",_Tennis Court,_Playground,_Convenience Store,_Doner Restaurant,_Fish Market,_Fish & Chips Shop,_Filipino Restaurant,_Fast Food Restaurant,_Farmers Market,_Falafel Restaurant


### CLUSTER 3

In [154]:
cluster_3 = toront_merged.loc[toront_merged['Cluster Labels'] == 2, toront_merged.columns[ind]]
cluster_3

Unnamed: 0,Borough,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,East Toronto,"The Danforth Wes, Riverdal",_Greek Restaurant,_Coffee Shop,_Italian Restaurant,_Ice Cream Shop,_Furniture / Home Store,_Bubble Tea Shop,_Fruit & Vegetable Store,_Juice Bar,_Liquor Store,_Spa
3,East Toronto,Studio Distric,_Café,_Coffee Shop,_Gastropub,_Italian Restaurant,_Bakery,_American Restaurant,_Yoga Studio,_Comfort Food Restaurant,_Brewery,_Seafood Restaurant
6,Central Toronto,North Toronto Wes,_Coffee Shop,_Yoga Studio,_Bagel Shop,_Park,_Clothing Store,_Dessert Shop,_Chinese Restaurant,_Rental Car Location,_Diner,_Salon / Barbershop
7,Central Toronto,Davisvill,_Pizza Place,_Dessert Shop,_Sandwich Place,_Italian Restaurant,_Café,_Thai Restaurant,_Sushi Restaurant,_Coffee Shop,_Restaurant,_Deli / Bodega
11,Downtown Toronto,"Cabbagetow, St. James Tow",_Coffee Shop,_Restaurant,_Park,_Café,_Italian Restaurant,_Bakery,_Pub,_Pizza Place,_Pet Store,_Breakfast Spot
12,Downtown Toronto,Church and Wellesle,_Japanese Restaurant,_Coffee Shop,_Sushi Restaurant,_Restaurant,_Gay Bar,_Men's Store,_Gym,_Pub,_Mediterranean Restaurant,_Bubble Tea Shop
13,Downtown Toronto,"Harbourfron, Regent Par",_Coffee Shop,_Bakery,_Pub,_Park,_Theater,_Breakfast Spot,_Restaurant,_Mexican Restaurant,_Café,_Yoga Studio
14,Downtown Toronto,"Ryerso, Garden Distric",_Coffee Shop,_Clothing Store,_Cosmetics Shop,_Café,_Middle Eastern Restaurant,_Fast Food Restaurant,_Pizza Place,_Ramen Restaurant,_Italian Restaurant,_Diner
15,Downtown Toronto,St. James Tow,_Coffee Shop,_Café,_Hotel,_Restaurant,_Breakfast Spot,_Gastropub,_Cosmetics Shop,_Clothing Store,_Cocktail Bar,_Bakery
16,Downtown Toronto,Berczy Par,_Coffee Shop,_Cocktail Bar,_Steakhouse,_Bakery,_Café,_Cheese Shop,_Seafood Restaurant,_Beer Bar,_Italian Restaurant,_Farmers Market


### CLUSTER 4

In [151]:
cluster_4 = toront_merged.loc[toront_merged['Cluster Labels'] == 3, toront_merged.columns[ind]]
cluster_4

Unnamed: 0,Borough,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,Central Toronto,Roselaw,_Ice Cream Shop,_Garden,_Yoga Studio,_Doner Restaurant,_Fish & Chips Shop,_Filipino Restaurant,_Fast Food Restaurant,_Farmers Market,_Falafel Restaurant,_Event Space


### CLUSTER 5

In [152]:
ind = [1, 2, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
cluster_5 = toront_merged.loc[toront_merged['Cluster Labels'] == 4, toront_merged.columns[ind]]

cluster_5

Unnamed: 0,Borough,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
23,Central Toronto,"Forest Hill Nort, Forest Hill Wes",_Jewelry Store,_Trail,_Park,_Sushi Restaurant,_Donut Shop,_Dumpling Restaurant,_Eastern European Restaurant,_Electronics Store,_Ethiopian Restaurant,_Yoga Studio


### CLUSTER 6

In [159]:
cluster_6 = toront_merged.loc[toront_merged['Cluster Labels'] == 5, toront_merged.columns[ind]]
cluster_6

Unnamed: 0,Borough,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,Downtown Toronto,Rosedal,_Park,_Playground,_Trail,_Dog Run,_Fish & Chips Shop,_Filipino Restaurant,_Fast Food Restaurant,_Farmers Market,_Falafel Restaurant,_Event Space


### CLUSTER 7

In [160]:
cluster_7 = toront_merged.loc[toront_merged['Cluster Labels'] == 6, toront_merged.columns[ind]]
cluster_7

Unnamed: 0,Borough,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Central Toronto,Lawrence Par,_Park,_Swim School,_Bus Line,_Yoga Studio,_Doner Restaurant,_Fish & Chips Shop,_Filipino Restaurant,_Fast Food Restaurant,_Farmers Market,_Falafel Restaurant


### CLUSTER 8

In [162]:
cluster_8 = toront_merged.loc[toront_merged['Cluster Labels'] == 7, toront_merged.columns[ind]]
cluster_8

Unnamed: 0,Borough,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,The Beache,_Health Food Store,_Pub,_Trail,_Neighborhood,_Other Great Outdoors,_Falafel Restaurant,_Event Space,_Farmers Market,_Fast Food Restaurant,_Filipino Restaurant


### CLUSTER 9

In [163]:
cluster_9 = toront_merged.loc[toront_merged['Cluster Labels'] == 8, toront_merged.columns[ind]]
cluster_9

Unnamed: 0,Borough,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Central Toronto,Davisville Nort,_Playground,_Hotel,_Clothing Store,_Food & Drink Shop,_Grocery Store,_Park,_Gym,_Breakfast Spot,_Sandwich Place,_Falafel Restaurant


### CLUSTER 10

In [164]:
cluster_10 = toront_merged.loc[toront_merged['Cluster Labels'] == 9, toront_merged.columns[ind]]
cluster_10

Unnamed: 0,Borough,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,Central Toronto,"Deer Par, Forest Hill S, Rathnell, South Hil, ...",_Pub,_Coffee Shop,_Liquor Store,_Light Rail Station,_Sushi Restaurant,_Supermarket,_Sports Bar,_Fried Chicken Joint,_American Restaurant,_Vietnamese Restaurant
