### Read the data from webpage into panda data frame

In [1]:
import pandas as pd

df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]
df

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
7,M8A,Not assigned,
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,Malvern / Rouge


### Get rid off the cells with Borough "Not assigned"

In [2]:
df1=df[df['Borough'] != 'Not assigned'].reset_index(drop=True)
df1

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,Malvern / Rouge
7,M3B,North York,Don Mills
8,M4B,East York,Parkview Hill / Woodbine Gardens
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Check if any postal code is listed more than once

In [3]:
df1.groupby('Postal code').count().sort_values(by=['Neighborhood'], ascending = False)

Unnamed: 0_level_0,Borough,Neighborhood
Postal code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,1,1
M5R,1,1
M6G,1,1
M6E,1,1
M6C,1,1
M6B,1,1
M6A,1,1
M5X,1,1
M5W,1,1
M5V,1,1


In [4]:
df1['Neighborhood'] = df1.Neighborhood.str.replace(' / ', ', ')
df1

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### import the library I need

In [5]:
#!conda install -c conda-forge geocoder

When I try this I get access denied, so I use the file instead

In [6]:
# import geocoder

# def get_geocode(postal_code):
#     lat_lng = None
#     while lat_lng is None:
#         g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#         lat_lng = g.latlng

#     return lat_lng

# codes = df1['Postal code'].apply(get_geocode)
# codes

In [7]:
codes_df = pd.read_csv('http://cocl.us/Geospatial_data')
codes_df

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [8]:
df1.rename(columns={'Postal code': 'Postal Code'}, inplace=True)

In [9]:
final_df = pd.merge(df1, codes_df, on='Postal Code')
final_df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [10]:
final_df.loc[final_df['Postal Code'] == 'M5G', :]

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383


After check this, I can make sure the data is the same as listed in the project instructions,just the order is different.

## Segment and Explore

In [11]:
from sklearn.cluster import KMeans

In [12]:
toronto_df = final_df.loc[final_df.Borough.str.contains('Toronto'), :]
toronto_df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [13]:
toronto_df.Borough.nunique()

4

In [14]:
N_CLUSTERS = 4
k_means_model = KMeans(n_clusters=N_CLUSTERS, random_state=0).fit(toronto_df[['Latitude', 'Longitude']])
k_means_model.labels_

array([1, 1, 1, 1, 3, 1, 1, 2, 1, 2, 1, 2, 3, 1, 2, 3, 1, 3, 0, 0, 0, 0,
       2, 0, 1, 2, 0, 1, 2, 0, 1, 0, 1, 1, 1, 1, 1, 1, 3], dtype=int32)

There is generally good overlap between the kmeans clustering of the Lat/Lon coordinates and the given labels. East Toronto and Cluster 2 map perfectly, where there are only two errors in cluster 1 and 2. Since the k-means clustering is based on physical distance (from lat/long coordinates) while the borough names are administratively defined, this suggests that the Borough definitions are quite reasonable.

In [15]:
toronto_df['ClusterLabel'] = k_means_model.labels_
pd.crosstab(toronto_df.Borough, toronto_df.ClusterLabel)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


ClusterLabel,0,1,2,3
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,8,1,0,0
Downtown Toronto,0,18,1,0
East Toronto,0,0,0,5
West Toronto,0,0,6,0


In [16]:
label_cluster_corr_df = pd.concat([
    pd.get_dummies(toronto_df.Borough),
    pd.get_dummies(toronto_df.ClusterLabel)
], axis=1).corr().iloc[:4,:].iloc[:,4:]
label_cluster_corr_df

Unnamed: 0,0,1,2,3
Central Toronto,0.927478,-0.412098,-0.256174,-0.210042
Downtown Toronto,-0.495138,0.897368,-0.32219,-0.373773
East Toronto,-0.194809,-0.373773,-0.179358,1.0
West Toronto,-0.216612,-0.415605,0.911685,-0.163517


Let us manually write down a mapping between the Borough and ClusterLabel so that we can match the color-codes in the map later:

In [17]:
borough_to_cluster = {
    'Central Toronto': 0,
    'Downtown Toronto': 1,
    'East Toronto': 3,
    'West Toronto': 2
}

In [18]:
# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab

Use the code from previous lab to draw a map. We draw the map based on official neighborhood labels and again based on our clustering.

In [19]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np

In [20]:
center_lat = 0.5 * (toronto_df.Latitude.min() +  toronto_df.Latitude.max())
center_lon = 0.5 * (toronto_df.Longitude.min() + toronto_df.Longitude.max())

## Neighborhood Map

Here we plot the centers of each neighborhood. The **outside** color is the label according to the clustering algorithm and the **inside** color is the label according to the official Borough. When the two colors disagree, then the k-means cluster label is not the same as the official Borough label. We can observe again that k-means perfectly recovers East Toronto, and the only two mistakes k-means makes are on two central neighborhoods at the tripoint of three Boroughs. 


In [21]:
map_clusters = folium.Map(location=[center_lat, center_lon], zoom_start=12)

# set color scheme for the clusters
x = np.arange(N_CLUSTERS)
ys = [i + x + (i*x)**2 for i in range(N_CLUSTERS)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

for _, row in toronto_df.iterrows():
    label = folium.Popup(f'{row.Borough}, Cluster {row.ClusterLabel}')
    folium.CircleMarker(
        [row.Latitude, row.Longitude],
        radius=8,
        popup=label,
        color=rainbow[row.ClusterLabel],
        fill=True,
        fill_color=rainbow[borough_to_cluster[row.Borough]],
        fill_opacity=0.7).add_to(map_clusters)
    
map_clusters