In [15]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [16]:
r  = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
data = r.text
soup = BeautifulSoup(data)

In [56]:
table = soup.find('table', attrs={'class':'wikitable sortable'})
dfs = pd.read_html(str(table))
df = dfs[0]

In [60]:
df = df[~df['Borough'].isin(['Not assigned'])]
df.loc[df.Borough == "Queen's Park", 'Neighbourhood'] = "Queen's Park"
df = df.groupby(['Postcode','Borough'])['Neighbourhood'].apply(','.join).reset_index()
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [61]:
df.shape

(103, 3)

In [62]:
df_loc = pd.read_csv('http://cocl.us/Geospatial_data')

In [63]:
df['Latitude'] = df_loc['Latitude']
df['Longitude'] = df_loc['Longitude']
df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


## Now we are going to cluster the neighborhoods based on their location

We chose k as 10 based on visual testing

In [64]:
from sklearn.cluster import KMeans

In [65]:
df_clustering = df.drop(['Neighbourhood','Borough', 'Postcode'], 1)
clusters_num = 10
kmeans = KMeans(n_clusters=clusters_num, random_state=0).fit(df_clustering)

kmeans.labels_

array([4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 6, 6, 6, 6, 6,
       6, 6, 8, 1, 6, 3, 8, 8, 8, 8, 8, 8, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 7,
       7, 7, 7, 7, 7, 8, 0, 0, 9, 0, 9, 7, 7, 8, 9, 9, 9, 9, 9, 7, 5, 3,
       5, 5, 9, 9, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2], dtype=int32)

In [41]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium # map rendering library

In [69]:
# add clustering labels
df.insert(0, 'Cluster Labels', kmeans.labels_)
df.head() # check the last columns!

Unnamed: 0,Cluster Labels,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,4,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,4,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,4,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,4,M1G,Scarborough,Woburn,43.770992,-79.216917
4,1,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [66]:
# create map
import numpy as np
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="toronto")
location_toronto = geolocator.geocode('Toronto')
latitude_toronto = location_toronto.latitude
longitude_toronto = location_toronto.longitude
map_clusters = folium.Map(location=[latitude_toronto, longitude_toronto], zoom_start=11)

In [70]:
x = np.arange(clusters_num)
ys = [i + x + (i*x)**2 for i in range(clusters_num)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array] 

markers_colors = []
for latitude, longitude, neigh, cluster in zip(df['Latitude'], df['Longitude'], df['Neighbourhood'], df['Cluster Labels']):
    label = folium.Popup(str(neigh) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters