In [9]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd

### Get information from Wiki

In [10]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an error
# page be parsed as if it were the article.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'html.parser')

In [11]:
# One accumulator list per table column; they are filled by the collection loop.
PostalCodeList, BoroughList, NeighborhoodList = [], [], []

# First <tbody> element found in the parsed page.
tbody = soup.find('tbody')

### Collect the Data

In [12]:
# The <td> cells are read as a flat sequence of (postal code, borough,
# neighborhood) triples, so a cell's position modulo 3 selects its list.
column_lists = (PostalCodeList, BoroughList, NeighborhoodList)
for position, cell in enumerate(tbody.find_all('td')):
    column_lists[position % 3].append(cell.text.strip())

dataDic = {
    "PostalCode": PostalCodeList,
    "Borough": BoroughList,
    "Neighborhood": NeighborhoodList,
}

### Translate to DataFrame

In [13]:
# Assemble the three scraped lists into one table, one column per dict key.
df = pd.DataFrame(data=dataDic)

### Remove rows where Borough is "Not assigned"

In [14]:
# Keep only the rows whose borough is assigned, then renumber the index from 0.
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough].reset_index(drop=True)

In [15]:
def _join_neighborhoods(names):
    """Concatenate the neighborhood names of one group, separated by commas."""
    return ','.join(names)


# Collapse rows sharing a postal code into one row: keep the first postal code
# and borough of each group and merge the neighborhood names into one string.
aggregate_fun = {
    'PostalCode': 'first',
    "Borough": 'first',
    "Neighborhood": _join_neighborhoods,
}
grouped_by_code = df.groupby(df['PostalCode'])

# The group key is also kept as an aggregated column, so the index is discarded.
df_new = grouped_by_code.aggregate(aggregate_fun).reset_index(drop=True)

In [16]:
# Where the neighborhood is "Not assigned", fall back to the borough name.
# The Series yielded by iterrows() may be a copy, so assigning to it is not
# guaranteed to change df_new; a masked .loc assignment writes to the frame
# itself.
unassigned_mask = df_new['Neighborhood'] == 'Not assigned'
df_new.loc[unassigned_mask, 'Neighborhood'] = df_new.loc[unassigned_mask, 'Borough']


In [17]:
#print(df_new.iloc[85])

### The shape of List of postal codes of Canada

In [18]:
# Display the (rows, columns) dimensions of the per-postal-code table.
df_new.shape

(103, 3)

### Get the latitude and longitude coordinates from the downloaded geospatial CSV

In [19]:
# Shell command: download the file at the URL and save it as geospatial.csv in the working directory.
!wget -O geospatial.csv http://cocl.us/Geospatial_data

--2019-03-19 05:11:07--  http://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 169.48.113.201
Connecting to cocl.us (cocl.us)|169.48.113.201|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cocl.us/Geospatial_data [following]
--2019-03-19 05:11:07--  https://cocl.us/Geospatial_data
Connecting to cocl.us (cocl.us)|169.48.113.201|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-03-19 05:11:09--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 107.152.26.197
Connecting to ibm.box.com (ibm.box.com)|107.152.26.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-03-19 05:11:09--  https://ibm.box.com/public/static/9afzr83pps4pwf2smjjc

In [20]:
# Load the downloaded coordinates table.
df_geo = pd.read_csv('geospatial.csv')

# Duplicate 'Postal Code' under the name 'PostalCode' (the column name used by
# df_new), then drop the original spelling from the working copy.
df_geo = df_geo.assign(PostalCode=df_geo['Postal Code'])
df_geo_new = df_geo.drop('Postal Code', axis=1)


In [21]:
# Index the coordinates by postal code so they can be joined onto df_new.
df_geo_new_index = df_geo_new.set_index(keys='PostalCode', drop=True)

In [29]:
# Attach the coordinates to each postal-code row: left merge of df_new's
# 'PostalCode' column against the postal-code index of the coordinates table.
toronto_fsa = pd.merge(
    df_new,
    df_geo_new_index,
    how='left',
    left_on='PostalCode',
    right_index=True,
)
toronto_fsa.head(5)

Unnamed: 0,Neighborhood,PostalCode,Borough,Latitude,Longitude
0,"Rouge,Malvern",M1B,Scarborough,43.806686,-79.194353
1,"Highland Creek,Rouge Hill,Port Union",M1C,Scarborough,43.784535,-79.160497
2,"Guildwood,Morningside,West Hill",M1E,Scarborough,43.763573,-79.188711
3,Woburn,M1G,Scarborough,43.770992,-79.216917
4,Cedarbrae,M1H,Scarborough,43.773136,-79.239476


#### Create a map of Toronto with neighborhoods superimposed on top.

In [33]:
# Shell command: install geopy from the conda-forge channel; --yes skips the confirmation prompt.
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values


Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
geopy                     1.18.1                     py_0    conda-forge


ImportError: No module named 'folium'

In [34]:
# Shell command: install folium pinned to version 0.5.0 from the conda-forge channel; --yes skips the confirmation prompt.
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  42.98 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  36.70 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  40.67 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  45.69 MB/s


In [35]:
# Look up the coordinates of Toronto; they are used to centre the maps below.
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim returned no result for address {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Tornoto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Tornoto are 43.653963, -79.387207.


In [36]:
# Base map centred on the geocoded Toronto coordinates.
map_tornoto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per postal-code row, with a "neighborhood, borough" popup.
marker_fields = zip(
    toronto_fsa['Latitude'],
    toronto_fsa['Longitude'],
    toronto_fsa['Borough'],
    toronto_fsa['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_fields:
    popup_text = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(popup_text, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_tornoto)

# Last expression of the cell: render the map.
map_tornoto

### Cluster

In [41]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

### Add a 'Cluster Labels' column marking whether the Borough name contains 'Toronto'

In [95]:
# Flag rows whose borough name contains the literal text 'Toronto'
# (regex=False: plain substring match), stored as 1/0 rather than True/False.
is_toronto_borough = toronto_fsa['Borough'].str.contains('Toronto', regex=False)
toronto_fsa['Cluster Labels'] = is_toronto_borough * 1

toronto_fsa.head(120)

Unnamed: 0,Neighborhood,PostalCode,Borough,Latitude,Longitude,Cluster Labels
0,"Rouge,Malvern",M1B,Scarborough,43.806686,-79.194353,0
1,"Highland Creek,Rouge Hill,Port Union",M1C,Scarborough,43.784535,-79.160497,0
2,"Guildwood,Morningside,West Hill",M1E,Scarborough,43.763573,-79.188711,0
3,Woburn,M1G,Scarborough,43.770992,-79.216917,0
4,Cedarbrae,M1H,Scarborough,43.773136,-79.239476,0
5,Scarborough Village,M1J,Scarborough,43.744734,-79.239476,0
6,"East Birchmount Park,Ionview,Kennedy Park",M1K,Scarborough,43.727929,-79.262029,0
7,"Clairlea,Golden Mile,Oakridge",M1L,Scarborough,43.711112,-79.284577,0
8,"Cliffcrest,Cliffside,Scarborough Village West",M1M,Scarborough,43.716316,-79.239476,0
9,"Birch Cliff,Cliffside West",M1N,Scarborough,43.692657,-79.264848,0


### Remove unused columns.

In [None]:
# Keep only the numeric columns (Latitude, Longitude, Cluster Labels) as input
# for k-means. The axis is passed by keyword because newer pandas releases no
# longer accept it positionally, and all three columns are dropped in one call
# so no intermediate frames are created.
toronto_grouped_clustering = toronto_fsa.drop(
    ['Neighborhood', 'PostalCode', 'Borough'], axis=1)

#toronto_grouped_clustering


### Prepare to cluster the data into two groups.
### The expectation is that, with 'Cluster Labels' as a feature, the data will separate into two groups:
### one where the Borough name contains 'Toronto', and one where it does not.

In [114]:
# Number of clusters to fit.
kclusters = 2

# Fit k-means on the numeric table; random_state fixes the initialisation seed.
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(toronto_grouped_clustering)

# Show the cluster index assigned to each row (first 120 entries at most).
kmeans.labels_[:120]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

### Prepare to draw map

In [115]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [116]:
# Base map for the cluster view, centred on the geocoded coordinates.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One hex colour per cluster, sampled evenly from matplotlib's rainbow
# colormap. `ys` is only used for its length (one entry per cluster).
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = list(map(colors.rgb2hex, colors_array))

# One marker per row. The colour comes from the row's 'Cluster Labels' column
# value, not from kmeans.labels_.
markers_colors = []
cluster_rows = zip(
    toronto_grouped_clustering['Latitude'],
    toronto_grouped_clustering['Longitude'],
    toronto_grouped_clustering['Cluster Labels'],
)
for lat, lon, cluster in cluster_rows:
    marker_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
    ).add_to(map_clusters)

# Last expression of the cell: render the map.
map_clusters