# Synopsis
This notebook supports the IBM Applied Data Science Capstone project on the Coursera platform.

### 1. Scrape the table of Toronto neighbourhoods from the [Wikipedia page](https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M)

In [83]:
import pandas as pd
import numpy as np

In [84]:
# pd.set_option('display.max_rows', 10)

#### Get the data from the web page

In [85]:
# Scrape every HTML table on the Wikipedia page; the first row of each table is used as its header
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page_tables = pd.read_html(url, header=0)

# Keep the first table on the page as the working dataframe
neighDf = pd.DataFrame(page_tables[0])
neighDf

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


#### Cleaning data

In [86]:
# Drop the rows whose borough is 'Not assigned', then renumber the remaining rows from 0
filt = neighDf['Borough'] == 'Not assigned'
neighDf = neighDf.loc[~filt].reset_index(drop=True)
neighDf

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [87]:
# Check whether any postal code appears on more than one row
postal_codes = neighDf['Postal Code']
postal_codes.duplicated().value_counts()

False    103
Name: Postal Code, dtype: int64

All duplicated neighbourhoods are already handled in the Wikipedia table: no postal code appears more than once.

In [88]:
# Count the rows whose neighbourhood is still 'Not assigned'
filt = neighDf['Neighbourhood'].eq('Not assigned')
filt.value_counts()

False    103
Name: Neighbourhood, dtype: int64

No remaining row has an unassigned neighbourhood.

### 2. Get longitude and latitude of each neighbourhood

In [89]:
# !pip install geocoder

In [90]:
import geocoder

# Geocode every postal code with ArcGIS and store the result in the
# 'latitude' / 'longitude' columns of neighDf.

# Upper bound on lookups per address, so a persistent failure cannot loop forever
MAX_GEOCODE_ATTEMPTS = 5

addresses = neighDf['Postal Code'] + ', Toronto, Ontario'

# Iterate over (index label, address) pairs so each result is written to the
# row it belongs to, whatever the dataframe index looks like.
for i, address in addresses.items():
    # A failed lookup yields no usable latlng (None or empty), so retry a few
    # times instead of crashing on the subscript below.
    lat_lng_coords = None
    for _ in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.arcgis(address)
        lat_lng_coords = g.latlng
        if lat_lng_coords:
            break
    if not lat_lng_coords:
        raise RuntimeError(
            'Could not geocode {!r} after {} attempts'.format(address, MAX_GEOCODE_ATTEMPTS)
        )
    neighDf.loc[i, 'latitude'] = lat_lng_coords[0]
    neighDf.loc[i, 'longitude'] = lat_lng_coords[1]

In [91]:
neighDf

Unnamed: 0,Postal Code,Borough,Neighbourhood,latitude,longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.65319,-79.51113
99,M4Y,Downtown Toronto,Church and Wellesley,43.66659,-79.38133
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.64869,-79.38544
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.63278,-79.48945


### 3. Explore and cluster the neighborhoods in Toronto

#### Create a map of Toronto with neighborhoods superimposed on top

In [92]:
import folium

In [93]:
# Geocode the city itself; its coordinates are used as the centre of the map
g = geocoder.arcgis('Toronto, Ontario')
lat_lng_coords = g.latlng
if not lat_lng_coords:
    # Fail with a clear message rather than a TypeError on the subscripts below
    raise RuntimeError("Could not geocode 'Toronto, Ontario'")
latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row, with a "<neighbourhood>, <borough>" popup
for lat, lng, borough, neighborhood in zip(neighDf['latitude'], neighDf['longitude'], neighDf['Borough'], neighDf['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    # NOTE(review): parse_html looks like a Popup option; passing it to
    # CircleMarker is presumably a leftover - confirm before removing.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto

#### Cluster the neighbourhoods by latitude and longitude with a k-Means algorithm

In [94]:
from sklearn.cluster import KMeans

# Number of clusters to split the neighbourhoods into
kclusters = 6

# Cluster on the coordinates only
neigh_clustering = neighDf[['latitude', 'longitude']]

# run k-means clustering
# n_init is pinned explicitly: the library default changed between
# scikit-learn releases, so relying on it makes the labels depend on the
# installed version even with a fixed random_state.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(neigh_clustering)

# Attach the cluster label of each row to the dataframe
neighDf['Cluster Labels'] = kmeans.labels_

In [95]:
neighDf

Unnamed: 0,Postal Code,Borough,Neighbourhood,latitude,longitude,Cluster Labels
0,M3A,North York,Parkwoods,43.75245,-79.32991,1
1,M4A,North York,Victoria Village,43.73057,-79.31306,1
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264,3
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042,5
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188,3
...,...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.65319,-79.51113,2
99,M4Y,Downtown Toronto,Church and Wellesley,43.66659,-79.38133,3
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.64869,-79.38544,3
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.63278,-79.48945,2


#### Create a map of Toronto with neighborhoods colorized by cluster type

In [96]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# Base map centred on the coordinates computed for Toronto
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(neighDf['latitude'], neighDf['longitude'], neighDf['Neighbourhood'], neighDf['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels are 0-based, so they index the colour list directly
    # (cluster - 1 only worked for cluster 0 through negative-index wrap-around).
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters