In [254]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# get the table info
r  = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", "lxml")

data = r.text

soup = BeautifulSoup(data)

table = soup.find_all('table')[0] 

# convert list to dataframe
dfs = pd.read_html(str(table))

df = pd.DataFrame(dfs[0])


In [255]:
# set the first row as columns
new_header = df.iloc[0]
df=df[1:]
df.columns = new_header
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [256]:
#drop the invalid rows
indexRows = df[df['Borough'] == 'Not assigned'].index
df.drop(indexRows, inplace=True)

In [257]:
# group the rows with same postcode, and list all neighbourhood
grouped_df = df.groupby(['Postcode','Borough'])
grouped_lists = grouped_df['Neighbourhood'].agg(lambda x: ",".join(x))
grouped_lists = grouped_lists.reset_index()
grouped_lists.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [258]:
grouped_lists['Neighbourhood'].replace({"Not assigned": grouped_lists['Borough']}, inplace=True)

In [259]:
# get the number of rows
grouped_lists.shape

(103, 3)

In [260]:
grouped_lists.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [261]:
!wget -O coor.csv https://cocl.us/Geospatial_data

--2020-03-08 21:31:54--  https://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 158.85.108.86, 158.85.108.83, 169.48.113.194
Connecting to cocl.us (cocl.us)|158.85.108.86|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-03-08 21:31:57--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 103.116.4.197
Connecting to ibm.box.com (ibm.box.com)|103.116.4.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-03-08 21:31:59--  https://ibm.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Reusing existing connection to ibm.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.ent.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.cs

In [262]:
pdf = pd.read_csv('coor.csv')
pdf.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [263]:
result = grouped_lists.join(pdf)
del result['Postal Code']
result.head(60)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


In [264]:
# explore and cluster the toronto neighbourhoods
toronto_neigh = result[result['Borough'].str.contains('Toronto')]
toronto_neigh.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [265]:
from sklearn.preprocessing import StandardScaler
import numpy as np
# normalizing over the standard deviation of coordinates
X = toronto_neigh.values[:,3:]
X = np.nan_to_num(X)
Clus_dataSet = StandardScaler().fit_transform(X)
Clus_dataSet

array([[ 0.39793749,  2.61964756],
       [ 0.53600121,  1.01941327],
       [ 0.08040863,  2.00991342],
       [-0.32834143,  1.32413968],
       [ 2.62714635,  0.02929996],
       [ 1.96828767, -0.00877123],
       [ 2.08186867, -0.42754079],
       [ 1.60468436,  0.02929996],
       [ 0.96823293,  0.18160095],
       [ 0.83179593, -0.27526956],
       [ 0.53623853,  0.33391005],
       [ 0.03590049,  0.60047059],
       [-0.05501868,  0.18160095],
       [-0.55554658,  0.79089146],
       [-0.43033262,  0.29583074],
       [-0.67489659,  0.39102765],
       [-0.96499134,  0.44814526],
       [-0.39621906,  0.06737385],
       [-0.71471013,  0.14352435],
       [-1.13564978,  0.21967755],
       [-0.86117495,  0.22443577],
       [-0.81708966,  0.27203152],
       [ 1.92270943, -0.7320562 ],
       [ 1.28638314, -0.57980391],
       [ 0.24054304, -0.42754079],
       [-0.19155492, -0.27526956],
       [-0.60103421, -0.27526956],
       [-1.64778475, -0.12299021],
       [-0.89317422,

In [266]:
from sklearn.cluster import KMeans 
# modelling 
clusterNum = 3
k_means = KMeans(init="k-means++", n_clusters = clusterNum, n_init=12)
k_means.fit(X)
labels = k_means.labels_
# labels.shape
print(labels)


[1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 2 2 2 2 2 2 2
 1 1]


In [267]:
# add the label of cluster as the last column
toronto_neigh["cluster_kn"] = labels
toronto_neigh.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,cluster_kn
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,1
41,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,1
42,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572,1
43,M4M,East Toronto,Studio District,43.659526,-79.340923,1
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0


In [268]:
from pandas.io.json import json_normalize
import folium
from geopy.geocoders import Nominatim 
import requests
import folium 

CLIENT_ID = 'DIHBSCAXS0A3HJJFKH14HTPH0MXS0CVDXCPBV5C1KFZ4QGHK' # your Foursquare ID
CLIENT_SECRET = 'HJCAN3I3D0ZG4EY4ULIPGZ2EWSAK2STHZ5PDP1PN2ZIB5YSE' # your Foursquare Secret
VERSION = '20191004'
LIMIT = 30

# select relevant columns
neighborhoods_subset = toronto_neigh[['Neighbourhood','Latitude','Longitude','cluster_kn']]
neighborhoods_subset.head()

neighborhoods_subset.columns = [col.split(".")[-1] for col in neighborhoods_subset.columns]

neighborhoods_subset.head()

Unnamed: 0,Neighbourhood,Latitude,Longitude,cluster_kn
37,The Beaches,43.676357,-79.293031,1
41,"The Danforth West,Riverdale",43.679557,-79.352188,1
42,"The Beaches West,India Bazaar",43.668999,-79.315572,1
43,Studio District,43.659526,-79.340923,1
44,Lawrence Park,43.72802,-79.38879,0


In [284]:
# select first neighborhood
neighborhood_name = neighborhoods_subset.loc[37,['Neighbourhood']]
neighborhood_latitude = neighborhoods_subset.loc[37, 'Latitude'] 
neighborhood_longitude = neighborhoods_subset.loc[37, 'Longitude'] 
# limit of number of venues returned by Foursquare API
LIMIT = 100 
radius = 500



In [270]:
# send the get request and examine the result
# url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
#     CLIENT_ID, 
#     CLIENT_SECRET, 
#     VERSION, 
#     neighborhood_latitude, 
#     neighborhood_longitude, 
#     radius, 
#     LIMIT)
# results = requests.get(url).json()

In [285]:
venues_map = folium.Map(location=[neighborhood_latitude, neighborhood_longitude], zoom_start=15)

In [286]:
# add a red circle marker to represent The Beaches
folium.features.CircleMarker(
    [neighborhood_latitude, neighborhood_longitude],
    radius=5,
    color='red',
#     popup='The Beaches',
    fill = True,
    fill_color = 'red',
    fill_opacity = 0.6
).add_to(venues_map)

<folium.features.CircleMarker at 0x1a174f5fd0>

In [287]:
venues_map 

In [294]:
colors = ['red','blue','yellow']

# plot the nearly neighborhoods
for lat, lng, clr in zip(neighborhoods_subset.Latitude, neighborhoods_subset.Longitude, neighborhoods_subset.cluster_kn):
    folium.features.CircleMarker(
        [lat, lng],
        radius=5,
        color=colors[clr],
#         popup=label,
        fill = True,
        fill_color=colors[clr],
        fill_opacity=0.6
    ).add_to(venues_map)

In [295]:
venues_map 