In [1]:
import pandas as pd
import numpy as np

In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
!conda install -c conda-forge folium=0.5.0
import folium

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [4]:
from sklearn.cluster import KMeans

In [18]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [26]:
# Download the Wikipedia list of Canadian postal codes starting with "M"
# and keep the first <table> element found on the page.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
src = response.text
# The page is HTML, so it is parsed with an HTML parser rather than 'xml'.
soup = BeautifulSoup(src, 'html.parser')
nei_ = soup.find('table')
if nei_ is None:
    raise ValueError('No <table> element found at {}'.format(url))

In [27]:
# Column layout of the postal-code table; the frame starts out empty and is
# filled from the scraped table rows afterwards.
cols = [
    'Postalcode',
    'Borough',
    'Neighborhood',
]
df = pd.DataFrame(columns=cols)
# Show the (still empty) frame so the header can be checked.
df.head(0)

Unnamed: 0,Postalcode,Borough,Neighborhood


In [29]:
# Collect the stripped text of every table row that has exactly three <td>
# cells; rows with any other cell count are skipped.
table_rows = []
for tr in nei_.find_all('tr'):
    tab_row = [td.text.strip() for td in tr.find_all('td')]
    if len(tab_row) == 3:
        table_rows.append(tab_row)

# Build the frame in one step rather than growing it row by row with
# df.loc[len(df)]: that pattern is quadratic and appends duplicate rows when
# the cell is executed a second time.
df = pd.DataFrame(table_rows, columns=df.columns)

In [30]:
# Preview the first rows of the scraped table.
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [31]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
has_borough = df['Borough'].ne('Not assigned')
df1 = df.loc[has_borough]

# Collapse to one row per (postal code, borough) pair, joining the
# neighborhood names of each pair with ', '.
df2 = (
    df1.groupby(['Postalcode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)
df2.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [32]:
# Use the borough name wherever a postal code has no neighborhood of its own.
# Besides the literal 'Not assigned', blank values are treated as missing:
# the df.head() preview shows unassigned cells scraped as empty strings, which
# a plain == 'Not assigned' comparison would leave untouched.
_neighborhood_missing = df2['Neighborhood'].str.strip().isin(['', 'Not assigned'])
df2['Neighborhood'] = np.where(_neighborhood_missing, df2['Borough'], df2['Neighborhood'])

In [33]:
# Display the cleaned postal-code table.
df2

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [34]:
# (rows, columns) of the cleaned postal-code table.
df2.shape


(103, 3)

# Second Part: Adding latitude/longitude and clustering the Toronto neighborhoods

In [39]:
# Load the CSV that lists a latitude and longitude for each postal code.
geospatial_csv_url = 'https://cocl.us/Geospatial_data'
lattlong = pd.read_csv(geospatial_csv_url)
lattlong.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [40]:
# (rows, columns) of the coordinate table.
lattlong.shape

(103, 3)

In [41]:
# Rename the key column so it matches df2, then join the coordinates onto the
# neighborhood table to get a single frame holding all the data.
postal_code_rename = {'Postal Code': 'Postalcode'}
lattlong = lattlong.rename(columns=postal_code_rename)
merg_df = df2.merge(lattlong, how='inner', on='Postalcode')
merg_df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [42]:
# Keep only the boroughs whose name contains the literal text 'Toronto'.
# The mask is built from merg_df, the frame being filtered, so the cell runs
# on a fresh kernel without depending on any other variable.
is_toronto = merg_df['Borough'].str.contains('Toronto', regex=False)
# .copy() gives df3 its own data, so columns added to it later do not act on
# a slice of merg_df.
df3 = merg_df[is_toronto].copy()
df3.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [19]:
# (rows, columns) of the Toronto-only subset.
df3.shape

(39, 5)

In [43]:
# Base map centred on the fixed coordinates below.
map_center = [43.6532, -79.3832]
map1 = folium.Map(location=map_center, zoom_start=11)

# One blue circle per row of df3; its popup reads "<neighborhood>, <borough>".
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for latt, long, borough, neighborhood in df3[marker_columns].itertuples(index=False, name=None):
    popup_text = '{}, {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [latt, long],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.75,
        parse_html=False,
    )
    marker.add_to(map1)
map1

In [21]:
# Run k-means to cluster the neighborhoods into 5 clusters.
kclusters = 5

# Cluster on the coordinates only. Selecting the two feature columns by name
# (instead of dropping the text columns) keeps the features unchanged when df3
# later gains extra columns such as 'Cluster Labels', and avoids the
# positional `axis` argument of DataFrame.drop, which pandas 2.0 removed.
toronto_grouped_clustering = df3[['Latitude', 'Longitude']]

# run k-means clustering
# n_init is pinned to 10, the long-standing scikit-learn default, so results
# do not shift on versions where the default became 'auto'.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# check cluster labels generated for the first ten rows in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1], dtype=int32)

In [46]:
# add clustering labels as the first column
# Any 'Cluster Labels' column left over from a previous run is dropped first:
# DataFrame.insert raises ValueError when the column already exists, so the
# cell would otherwise fail when executed a second time. drop() also returns
# a new frame, so the insert never writes into a slice of another frame.
df3 = df3.drop(columns=['Cluster Labels'], errors='ignore')
df3.insert(0, 'Cluster Labels', kmeans.labels_)

In [47]:
# Display the Toronto subset with its cluster label column.
df3

Unnamed: 0,Cluster Labels,Postalcode,Borough,Neighborhood,Latitude,Longitude
37,0,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,0,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,0,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,0,M4M,East Toronto,Studio District,43.659526,-79.340923
44,1,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,1,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,1,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
47,1,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,1,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,1,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


In [48]:
map2 = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# set color scheme for the clusters: one evenly spaced colour from the
# rainbow colormap per cluster, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# One circle per row of df3, coloured by its cluster label.
for latt, long, cluster in zip(df3['Latitude'], df3['Longitude'], df3['Cluster Labels']):
    # The cluster labels shown in df3 start at 0, so a label indexes the
    # palette directly; `cluster - 1` only worked because index -1 wraps
    # around to the last colour.
    cluster_color = rainbow[cluster]
    label = folium.Popup('Cluster' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [latt, long],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.75,
        parse_html=False,
    ).add_to(map2)
map2