In [1]:
# @hidden_cell
# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.
#
# SECURITY: the access token is a credential, so it must not be stored in the
# notebook itself. It is read from the PROJECT_ACCESS_TOKEN environment variable,
# falling back to an interactive prompt when the variable is not set.
# NOTE(review): any token that was ever saved in this cell should be revoked and
# regenerated.
import getpass
import os

from project_lib import Project

project = Project(
    # The project id is an identifier rather than a secret, so it keeps a default.
    project_id=os.environ.get('PROJECT_ID', 'ebd70894-9003-44bd-9234-9a5f3cc0b56d'),
    project_access_token=(
        os.environ.get('PROJECT_ACCESS_TOKEN')
        or getpass.getpass('Project access token: ')
    ),
)
pc = project.project_context


# Import Libraries

In [2]:
import requests
import lxml.html as lh
import pandas as pd
import numpy as np

import matplotlib.cm as cm
import matplotlib.colors as colors

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.9.11          |           py36_0         147 KB  conda-forge
    ca-certificates-2019.9.11  |       hecc5488_0         144 KB  conda-forge
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1.20.0-py_0       conda-forge

The following packages will be UPDATED:

    ca-

# Scrape the table on the link

In [13]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fail fast on network problems: without a timeout the request can hang
# indefinitely, and without raise_for_status() an HTTP error page would be
# parsed below as if it were the article.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Parse the HTML and collect every <tr> element found anywhere in the document.
doc = lh.fromstring(page.content)
tr_elements = doc.xpath('//tr')

# Check the number of cells in each of the first 11 rows
[len(T) for T in tr_elements[:11]]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

# Parse Table Header

In [14]:
# Build one (header_text, values) pair per cell of the first table row.
# Each values list starts empty; the header text is stored exactly as scraped.
col = []

for position, header_cell in enumerate(tr_elements[0], start=1):
    header_text = header_cell.text_content()
    print('%d:"%s"' % (position, header_text))
    col.append((header_text, []))

1:"Postcode"
2:"Borough"
3:"Neighbourhood
"


# Create Pandas DataFrame
**Each header was stored as a `(name, [])` tuple in the list; the loop below appends every row's values to the list of the matching column.**

In [15]:
# The first row is the header, so data rows start at the second row.
for row in tr_elements[1:]:
    # Rows of the table have exactly 3 cells. The first row with a different
    # size is treated as the end of the table, so parsing stops there.
    if len(row) != 3:
        break

    # Pair each cell with the value list of its column.
    for (title, values), cell in zip(col, row.iterchildren()):
        # Remove only trailing newlines (the last cell of a row ends with one).
        # Unlike slicing off the final character unconditionally, this never
        # eats a real character when the newline is absent.
        values.append(cell.text_content().rstrip('\n'))

# Sanity check: every column must have collected the same number of values.
print([len(C) for (title, C) in col])

Dict = {title: column for (title, column) in col}
df = pd.DataFrame(Dict)
# Replace the scraped header texts with clean, consistently spelled names.
df.columns = ['Postcode', 'Borough', 'Neighborhood']
df.head()
    

[288, 288, 288]


Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


# Refine dataframe

**<font size = 4>Since the data above were scraped straight from the web page, the dataframe contains a lot of unnecessary entries (e.g. 'Not assigned'). We need to remove them so that the dataframe is easier to visualize.</font>**

## 1. Drop the 'Not assigned' cells on 'Borough' column

In [16]:
# Keep only the rows whose borough is assigned.
# A boolean filter is used instead of dropping labels inside a
# `range(len(df))` loop: that loop indexes by label, so it raises KeyError when
# the cell is re-run after rows have already been dropped. Filtering is
# idempotent, and the index is reset here so the result is actually stored.
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


## 2. Change 'Not assigned' Neighborhoods into the borough's name

In [17]:
# Rows without an assigned neighbourhood take the name of their borough.
unnamed = df['Neighborhood'] == 'Not assigned'
df.loc[unnamed, 'Neighborhood'] = df.loc[unnamed, 'Borough']

# Renumber the rows 0..n-1 now that the cleaning steps are done.
df.reset_index(drop=True, inplace=True)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


## 3. Merge neighborhoods with same postcode and borough
**<font size = 4> Note: Unlike the sample dataframe shown in the guideline, the rows in this notebook are sorted alphabetically by postcode. The content of both is the same! </font>**

In [18]:
#First Trial 
#neighbor = df.copy()

#for n in range(len(neighbor)): Error occurred! - indexer is out of bounds
#    for i in range(len(neighbor)):
#        if (neighbor.iloc[n, 0] == neighbor.iloc[i, 0])&(neighbor.iloc[n, 1] == neighbor.iloc[i, 1]):
#            neighbor.iloc[i, 2] = neighbor.iloc[i, 2] + ' , ' + neighbor.iloc[i-1, 2]
#            neighbor = neighbor.drop([i-1], axis = 0)
    
#neighbor.reset_index(drop = True, inplace = True)
#neighbor.head(20)

# Collapse rows that share a postcode and borough into a single row whose
# Neighborhood is the comma-joined names of that group.
nn = (
    df.groupby(['Postcode', 'Borough'])['Neighborhood']
    .agg(','.join)
    .reset_index()
)
nn.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


## Shape of the dataframe

In [19]:
# (rows, columns) of nn, the dataframe grouped by postcode and borough
nn.shape

(103, 3)

# Visualization
<font size = 4> Now we switch gears and visualize the dataframe with several visualization tools. 
    [seconds later...] <br>
    Oops, I ran into trouble with the geocoder -- it fails to return the location data (latitude, longitude), so instead I load the coordinates from a csv file here.</font>

## 1. Get the latitude and longitude of each neighborhood


In [20]:
!conda install -c conda-forge geocoder --yes
import geocoder

In [21]:
lat_long_coords = None

# Latitude / longitude looked up for each postcode in df, in row order.
# A postcode whose lookup returns nothing gets None in both lists.
lat_list = []
long_list = []
for postcode in df['Postcode']:
    g = geocoder.google('{}, Toronto, Ontario'.format(postcode))
    # `latlng` is the [latitude, longitude] pair of the result.
    # NOTE(review): a failed lookup appears to yield None or an empty list here,
    # so both are handled by the truthiness check below - confirm against geocoder.
    lat_long_coords = g.latlng

    if lat_long_coords:
        lat_list.append(lat_long_coords[0])
        long_list.append(lat_long_coords[1])
    else:
        lat_list.append(None)
        long_list.append(None)

# NOTE(review): in the recorded run every lookup came back empty (presumably the
# Google provider needs an API key), so the coordinates are loaded from a CSV
# file in a later cell instead.
print(lat_list[:5])

[None, None, None, None, None]


In [22]:
# Read the coordinates CSV stored as a project asset.
file = project.get_file('Geospatial_Coordinates.csv')
df_location = pd.read_csv(file)

# Rename the columns so the key column matches nn's "Postcode".
df_location.columns = ["Postcode", "Latitude", "Longitude"]

# Attach the coordinates to each postcode row of nn.
df_merged = nn.merge(df_location, on="Postcode")
df_merged.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [23]:
from geopy.geocoders import Nominatim
import folium

# Look up the coordinates of the city to centre the map on.
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent='Mino')
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude

toronto_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# One green circle per row of df_merged, with a "borough, neighborhoods" popup.
marker_rows = zip(
    df_merged.Latitude,
    df_merged.Longitude,
    df_merged.Borough,
    df_merged.Neighborhood,
)
for lat, long, borough, neighbor in marker_rows:
    popup = folium.Popup('{}, {}'.format(borough, neighbor), parse_html=True)
    marker = folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=popup,
        color='green',
        fill=True,
        fill_color='#32a852',
        fill_opacity=0.6,
        parse_html=False,
    )
    marker.add_to(toronto_map)

toronto_map

## 2. Clustering

**<font size = 4>Since we have no data other than the borough and neighborhood names and their locations, clustering will proceed based on the location data. Let's see how it works.</font>**

In [24]:
from sklearn.cluster import KMeans

# Number of clusters to split the postcodes into.
kclusters = 3

# Cluster on the coordinates only.
df_cluster = df_merged[['Latitude', 'Longitude']]
kmeans = KMeans(n_clusters=kclusters, init="k-means++", n_init=5, random_state=0)
kmeans.fit(df_cluster)

# Store each row's cluster label next to its coordinates.
df_merged['Labels'] = kmeans.labels_
df_merged.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Labels
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353,2
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,2
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711,2
3,M1G,Scarborough,Woburn,43.770992,-79.216917,2
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,2


In [25]:
#create map
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=11)

#set color scheme for the clusters
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

# One evenly spaced rainbow colour per cluster, converted to hex strings.
color_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in color_array]

# Draw one circle per row of df_merged, coloured by its cluster label.
for lat, long, borough, cluster in zip(df_merged.Latitude, df_merged.Longitude, df_merged.Borough, df_merged.Labels):
    label = folium.Popup(borough + ' Cluster ' + str(cluster), parse_html=True)
    # The labels come from kmeans.labels_ and are 0-based, so they index the
    # colour list directly. `cluster - 1` would send cluster 0 to index -1,
    # i.e. silently wrap around to the last colour.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.6,
        parse_html=False,
    ).add_to(toronto_map)

toronto_map